diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt new file mode 100644 index 0000000..2719005 --- /dev/null +++ b/CONTRIBUTORS.txt @@ -0,0 +1,98 @@ +The Red Hat VDO Team: + Principal Engineer/Lead Architect: + J. corwin Coburn + + Primary Authors: + Joseph Chapman + Sweet Tea Dorminy + *Thomas Jaskiewicz + Bruce Johnston + Susan McGhee + Ken Raeburn + Michael Sclafani + Matthew Sakai + Joseph Shimkus + John Wiele + + Support, Testing, Documentation, and other things too numerous to mention: + Chung Chung : + Bryan Gurney + *Simon J. Hernandez + Jakub Krysl + Marek Suchanek + + Project Management & Technical Direction: + Jered Floyd + Louis Imershein + Dennis Keefe + Andrew Walsh + + *former team members + +Other Contributors: + Ji-Hyeon Gim : + Updates for FC26/Kernel 4.13 + Vojtech Trefny + Getting correct size of partitions + Achilles Gaikwad + Bash completion for the vdo and vdostats commands + Jin-young Kwon + Adding vdo --version command, and documentation fixes + +VDO was originally created at Permabit Technology Corporation, and was +subsequently acquired and open-sourced by Red Hat. + +Former Members of the Permabit VDO Team: + Engineers: + Mark Amidon + David Buckle + Jacky Chu + Joel Hoff + Dimitri Kountourogianni + Alexis Layton + Michael Lee + Rich Macchi + Dave Paniriti + Karl Ramm + Hooman Vassef + Assar Westurlund + + Support, Testing, Documentation, etc. + Carl Alexander + Mike Chu + Mark Iskra + Farid Jahanmir + Francesca Koulikov + Erik Lattimore + Jennifer Levine + Randy Long + Steve Looby + Uche Onyekwuluje + Catherine Powell + Jeff Pozz + Sarmad Sada + John Schmidt + Omri Schwarz + Jay Splaine + John Welle + Mary-Anne Wolf + Devon Yablonski + Robert Zupko + + Interns: + Ari Entlich + Lori Monteleone + + Project Management & Technical Direction: + Michael Fortson + +Other Past Permabit Contributors (for early work on the index): + James Clough + Dave Golombek + Albert Lin + Edwin Olson + Dave Pinkney + Rich Brennan + +And Very Special Thanks To: + Norman Margolis, who started the whole thing diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..7d5393a --- /dev/null +++ b/COPYING @@ -0,0 +1,278 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. 
+ + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. 
You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) 
+ +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4084615 --- /dev/null +++ b/Makefile @@ -0,0 +1,2 @@ +obj-y += uds/ +obj-y += vdo/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..528277d --- /dev/null +++ b/README.md @@ -0,0 +1,125 @@ +# kvdo + +A pair of kernel modules which provide pools of deduplicated and/or compressed +block storage. + +## Background + +VDO (which includes [kvdo](https://github.com/dm-vdo/kvdo) and +[vdo](https://github.com/dm-vdo/vdo)) is software that provides inline +block-level deduplication, compression, and thin provisioning capabilities for +primary storage. VDO installs within the Linux device mapper framework, where +it takes ownership of existing physical block devices and remaps these to new, +higher-level block devices with data-efficiency capabilities. + +Deduplication is a technique for reducing the consumption of storage resources +by eliminating multiple copies of duplicate blocks. Compression takes the +individual unique blocks and shrinks them with coding algorithms; these reduced +blocks are then efficiently packed together into physical blocks. Thin +provisioning manages the mapping from LBAs presented by VDO to where the data +has actually been stored, and also eliminates any blocks of all zeroes. + +With deduplication, instead of writing the same data more than once each +duplicate block is detected and recorded as a reference to the original +block. VDO maintains a mapping from logical block addresses (used by the +storage layer above VDO) to physical block addresses (used by the storage layer +under VDO). After deduplication, multiple logical block addresses may be mapped +to the same physical block address; these are called shared blocks and are +reference-counted by the software. + +With VDO's compression, multiple blocks (or shared blocks) are compressed with +the fast LZ4 algorithm, and binned together where possible so that multiple +compressed blocks fit within a 4 KB block on the underlying storage. Mapping +from LBA is to a physical block address and index within it for the desired +compressed data. All compressed blocks are individually reference counted for +correctness. + +Block sharing and block compression are invisible to applications using the +storage, which read and write blocks as they would if VDO were not +present. When a shared block is overwritten, a new physical block is allocated +for storing the new block data to ensure that other logical block addresses +that are mapped to the shared physical block are not modified. + +This public source release of VDO includes two kernel modules, and a set of +userspace tools for managing them. The "kvdo" module implements fine-grained +storage virtualization, thin provisioning, block sharing, and compression; the +"uds" module provides memory-efficient duplicate identification. 
The userspace +tools include a pair of python scripts, "vdo" for creating and managing VDO +volumes, and "vdostats" for extracting statistics from those volumes. + +## Documentation + +- [RHEL8 VDO Documentation](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html/deduplicating_and_compressing_storage/index) +- [RHEL7 VDO Integration Guide](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/storage_administration_guide/vdo-integration) +- [RHEL7 VDO Evaluation Guide](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/storage_administration_guide/vdo-evaluation) + +## Releases + +Each branch on this project is intended to work with a specific release of +Enterprise Linux (Red Hat Enterprise Linux, CentOS, etc.). We try to maintain +compatibility with active Fedora releases, but some modifications may be +required. + +Version | Intended Enterprise Linux Release | Supported With Modifications +------- | --------------------------------- | ------------------------------- +6.1.x.x | EL7 (3.10.0-*.el7) | +6.2.x.x | EL8 (4.18.0-*.el8) | Fedora 28, Fedora 29, Fedora 30, Rawhide +* Pre-built versions with the required modifications for the referenced Fedora + releases can be found + [here](https://copr.fedorainfracloud.org/coprs/rhawalsh/dm-vdo) and can be + used by running `dnf copr enable rhawalsh/dm-vdo`. + +## Status + +VDO was originally developed by Permabit Technology Corp. as a proprietary set +of kernel modules and userspace tools. This software and technology has been +acquired by Red Hat, has been relicensed under the GPL (v2 or later), and this +repository begins the process of preparing for integration with the upstream +kernel. + +While this software has been relicensed there are a number of issues that must +still be addressed to be ready for upstream. These include: + +- Conformance with kernel coding standards +- Use of existing EXPORT_SYMBOL_GPL kernel interfaces where appropriate +- Refactoring of primitives (e.g. cryptographic) to appropriate kernel + subsystems +- Support for non-x86-64 platforms +- Refactoring of platform layer abstractions and other changes requested by + upstream maintainers + +We expect addressing these issues to take some time. In the meanwhile, this +project allows interested parties to begin using VDO immediately. The +technology itself is thoroughly tested, mature, and in production use since +2014 in its previous proprietary form. + +## Building + +In order to build the kernel modules, invoke the following command +from the top directory of this tree: + + make -C /usr/src/kernels/`uname -r` M=`pwd` + +* Patched sources that work with the most recent upstream kernels can be found + [here](https://github.com/rhawalsh/kvdo). + +## Communication channels + +Community feedback, participation and patches are welcome to the +vdo-devel@redhat.com mailing list -- subscribe +[here](https://www.redhat.com/mailman/listinfo/vdo-devel). + +## Contributing + +This project is currently a stepping stone towards integration with the Linux +kernel. As such, contributions are welcome via a process similar to that for +Linux kernel development. Patches should be submitted to the +vdo-devel@redhat.com mailing list, where they will be considered for +inclusion. This project does not accept pull requests. + +## Licensing + +[GPL v2.0 or later](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html). 
+All contributions retain ownership by their original author, but must also +be licensed under the GPL 2.0 or later to be merged. + diff --git a/TODO b/TODO new file mode 100644 index 0000000..d2d5cf5 --- /dev/null +++ b/TODO @@ -0,0 +1,6 @@ +- Conform to kernel coding standards +- Use existing EXPORT_SYMBOL_GPL kernel interfaces where appropriate +- Refactor primitives (e.g. cryptographic) to appropriate kernel subsystems +- Support non-x86-64 platforms +- Refactor platform layer abstractions and other changes requested by upstream + maintainers diff --git a/kvdo.spec b/kvdo.spec new file mode 100644 index 0000000..e340b2c --- /dev/null +++ b/kvdo.spec @@ -0,0 +1,89 @@ +%define spec_release 1 +%define kmod_name kvdo +%define kmod_driver_version 6.2.4.26 +%define kmod_rpm_release %{spec_release} +%define kmod_kernel_version 3.10.0-693.el7 + +# Disable the scanning for a debug package +%global debug_package %{nil} + +Source0: kmod-%{kmod_name}-%{kmod_driver_version}.tgz + +Name: kmod-kvdo +Version: %{kmod_driver_version} +Release: %{kmod_rpm_release}%{?dist} +Summary: Kernel Modules for Virtual Data Optimizer +License: GPLv2+ +URL: http://github.com/dm-vdo/kvdo +BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) +Requires: dkms +Requires: kernel-devel >= %{kmod_kernel_version} +Requires: make +ExclusiveArch: x86_64 +ExcludeArch: s390 +ExcludeArch: s390x +ExcludeArch: ppc +ExcludeArch: ppc64 +ExcludeArch: ppc64le +ExcludeArch: aarch64 +ExcludeArch: i686 + +%description +Virtual Data Optimizer (VDO) is a device mapper target that delivers +block-level deduplication, compression, and thin provisioning. + +This package provides the kernel modules for VDO. + +%post +set -x +/usr/sbin/dkms --rpm_safe_upgrade add -m %{kmod_name} -v %{version}-%{kmod_driver_version} +/usr/sbin/dkms --rpm_safe_upgrade build -m %{kmod_name} -v %{version}-%{kmod_driver_version} +/usr/sbin/dkms --rpm_safe_upgrade install -m %{kmod_name} -v %{version}-%{kmod_driver_version} + +%preun +# Check whether kvdo or uds is loaded, and if so attempt to remove it. A +# failure here means there is still something using the module, which should be +# cleared up before attempting to remove again. +for module in kvdo uds; do + if grep -q "^${module}" /proc/modules; then + modprobe -r ${module} + fi +done +/usr/sbin/dkms --rpm_safe_upgrade remove -m %{kmod_name} -v %{version}-%{kmod_driver_version} --all || : + +%prep +%setup -n kmod-%{kmod_name}-%{kmod_driver_version} + +%build +# Nothing doing here, as we're going to build on whatever kernel we end up +# running inside. 
+ +%install +mkdir -p $RPM_BUILD_ROOT/%{_usr}/src/%{kmod_name}-%{version}-%{kmod_driver_version} +cp -r * $RPM_BUILD_ROOT/%{_usr}/src/%{kmod_name}-%{version}-%{kmod_driver_version}/ +cat > $RPM_BUILD_ROOT/%{_usr}/src/%{kmod_name}-%{version}-%{kmod_driver_version}/dkms.conf < - 6.2.4.26-1 +HASH(0x5645fb62bab0) \ No newline at end of file diff --git a/uds/Makefile b/uds/Makefile new file mode 100644 index 0000000..5afc64a --- /dev/null +++ b/uds/Makefile @@ -0,0 +1,21 @@ +UDS_VERSION = 8.0.2.4 + +SOURCES = $(notdir $(wildcard $(src)/*.c)) murmur/MurmurHash3.c +SOURCES += $(addprefix util/,$(notdir $(wildcard $(src)/util/*.c))) +OBJECTS = $(SOURCES:%.c=%.o) +INCLUDES = -I$(src) + +EXTRA_CFLAGS = -std=gnu99 \ + -fno-builtin-memset \ + -Werror \ + -Wframe-larger-than=400 \ + -Wno-declaration-after-statement \ + -DUDS_VERSION=\"$(UDS_VERSION)\" \ + $(INCLUDES) + +CFLAGS_REMOVE_deltaIndex.o = -std=gnu99 +CFLAGS_REMOVE_masterIndex005.o = -std=gnu99 + +obj-m += uds.o + +uds-objs = $(OBJECTS) diff --git a/uds/atomicDefs.h b/uds/atomicDefs.h new file mode 100644 index 0000000..0c82bca --- /dev/null +++ b/uds/atomicDefs.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/atomicDefs.h#2 $ + */ + +#ifndef LINUX_KERNEL_ATOMIC_DEFS_H +#define LINUX_KERNEL_ATOMIC_DEFS_H + +#include + +#endif /* LINUX_KERNEL_ATOMIC_DEFS_H */ diff --git a/uds/bits.c b/uds/bits.c new file mode 100644 index 0000000..eea4912 --- /dev/null +++ b/uds/bits.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/bits.c#1 $ + */ + +#include "bits.h" + +#include "compiler.h" + +/** + * This is the largest field size supported by getBigField & setBigField. + * Any field that is larger is not guaranteed to fit in a single, byte + * aligned uint64_t. 
+ **/ +enum { MAX_BIG_FIELD_BITS = (sizeof(uint64_t) - 1) * CHAR_BIT + 1 }; + +/** + * Get a big bit field from a bit stream + * + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE uint64_t getBigField(const byte *memory, + uint64_t offset, + int size) +{ + const void *addr = memory + offset / CHAR_BIT; + return (getUInt64LE(addr) >> (offset % CHAR_BIT)) & ((1UL << size) - 1); +} + +/** + * Set a big bit field in a bit stream + * + * @param value The value to put into the field + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE void setBigField(uint64_t value, byte *memory, uint64_t offset, + int size) +{ + void *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + uint64_t data = getUInt64LE(addr); + data &= ~(((1UL << size) - 1) << shift); + data |= value << shift; + storeUInt64LE(addr, data); +} + +/***********************************************************************/ +void getBytes(const byte *memory, uint64_t offset, byte *destination, int size) +{ + const byte *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + while (--size >= 0) { + *destination++ = getUInt16LE(addr++) >> shift; + } +} + +/***********************************************************************/ +void setBytes(byte *memory, uint64_t offset, const byte *source, int size) +{ + byte *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + uint16_t mask = ~((uint16_t) 0xFF << shift); + while (--size >= 0) { + uint16_t data = (getUInt16LE(addr) & mask) | (*source++ << shift); + storeUInt16LE(addr++, data); + } +} + +/***********************************************************************/ +void moveBits(const byte *sMemory, uint64_t source, byte *dMemory, + uint64_t destination, int size) +{ + enum { UINT32_BIT = sizeof(uint32_t) * CHAR_BIT }; + if (size > MAX_BIG_FIELD_BITS) { + if (source > destination) { + // This is a large move from a higher to a lower address. We move + // the lower addressed bits first. Start by moving one field that + // ends on a destination int boundary + int count + = MAX_BIG_FIELD_BITS - (destination + MAX_BIG_FIELD_BITS) % UINT32_BIT; + uint64_t field = getBigField(sMemory, source, count); + setBigField(field, dMemory, destination, count); + source += count; + destination += count; + size -= count; + // Now do the main loop to copy 32 bit chunks that are int-aligned + // at the destination. + int offset = source % UINT32_BIT; + const byte *src = sMemory + (source - offset) / CHAR_BIT; + byte *dest = dMemory + destination / CHAR_BIT; + while (size > MAX_BIG_FIELD_BITS) { + storeUInt32LE(dest, getUInt64LE(src) >> offset); + src += sizeof(uint32_t); + dest += sizeof(uint32_t); + source += UINT32_BIT; + destination += UINT32_BIT; + size -= UINT32_BIT; + } + } else { + // This is a large move from a lower to a higher address. We move + // the higher addressed bits first. 
Start by moving one field that + // begins on a destination int boundary + int count = (destination + size) % UINT32_BIT; + if (count > 0) { + size -= count; + uint64_t field = getBigField(sMemory, source + size, count); + setBigField(field, dMemory, destination + size, count); + } + // Now do the main loop to copy 32 bit chunks that are int-aligned + // at the destination. + int offset = (source + size) % UINT32_BIT; + const byte *src = sMemory + (source + size - offset) / CHAR_BIT; + byte *dest = dMemory + (destination + size) / CHAR_BIT; + while (size > MAX_BIG_FIELD_BITS) { + src -= sizeof(uint32_t); + dest -= sizeof(uint32_t); + size -= UINT32_BIT; + storeUInt32LE(dest, getUInt64LE(src) >> offset); + } + } + } + // Finish up by doing the last chunk, which can have any arbitrary alignment + if (size > 0) { + uint64_t field = getBigField(sMemory, source, size); + setBigField(field, dMemory, destination, size); + } +} + +/***********************************************************************/ +bool sameBits(const byte *mem1, uint64_t offset1, const byte *mem2, + uint64_t offset2, int size) +{ + while (size >= MAX_FIELD_BITS) { + unsigned int field1 = getField(mem1, offset1, MAX_FIELD_BITS); + unsigned int field2 = getField(mem2, offset2, MAX_FIELD_BITS); + if (field1 != field2) return false; + offset1 += MAX_FIELD_BITS; + offset2 += MAX_FIELD_BITS; + size -= MAX_FIELD_BITS; + } + if (size > 0) { + unsigned int field1 = getField(mem1, offset1, size); + unsigned int field2 = getField(mem2, offset2, size); + if (field1 != field2) return false; + } + return true; +} diff --git a/uds/bits.h b/uds/bits.h new file mode 100644 index 0000000..2c2d4ea --- /dev/null +++ b/uds/bits.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/bits.h#1 $ + */ + +#ifndef BITS_H +#define BITS_H 1 + +#include "compiler.h" +#include "numeric.h" +#include "typeDefs.h" + +/* + * These bit stream and bit field utility routines are used for the + * non-byte aligned delta indices. + * + * Bits and bytes are numbered in little endian order. For example: Within + * a byte, bit 0 is the least significant bit (0x1), and bit 7 is the most + * significant bit (0x80). Within a bit stream, bit 7 is the most + * signficant bit of byte 0, and bit 8 is the least significant bit of byte + * 1. Within a byte array, a byte's number corresponds to it's index in + * the array. + * + * The implementation assumes that the native machine is little endian, and + * that performance is very important. These assumptions match our current + * operating environment. + */ + +/** + * This is the largest field size supported by getField & setField. Any + * field that is larger is not guaranteed to fit in a single, byte aligned + * uint32_t. 
+ **/ +enum { MAX_FIELD_BITS = (sizeof(uint32_t) - 1) * CHAR_BIT + 1 }; + +/** + * This is the number of guard bytes needed at the end of the memory byte + * array when using the bit utilities. 3 bytes are needed when getField & + * setField access a field, because they will access some "extra" bytes + * past the end of the field. And 7 bytes are needed when getBigField & + * setBigField access a big field, for the same reason. Note that moveBits + * calls getBigField & setBigField. 7 is rewritten to make it clear how it + * is derived. + **/ +enum { POST_FIELD_GUARD_BYTES = sizeof(uint64_t) - 1 }; + +/** + * Get a bit field from a bit stream + * + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE unsigned int getField(const byte *memory, uint64_t offset, + int size) +{ + const void *addr = memory + offset / CHAR_BIT; + return (getUInt32LE(addr) >> (offset % CHAR_BIT)) & ((1 << size) - 1); +} + +/** + * Set a bit field in a bit stream + * + * @param value The value to put into the field + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE void setField(unsigned int value, byte *memory, uint64_t offset, + int size) +{ + void *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + uint32_t data = getUInt32LE(addr); + data &= ~(((1 << size) - 1) << shift); + data |= value << shift; + storeUInt32LE(addr, data); +} + +/** + * Set a bit field in a bit stream to all ones + * + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE void setOne(byte *memory, uint64_t offset, int size) +{ + if (size > 0) { + byte *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + int count = size + shift > CHAR_BIT ? CHAR_BIT - shift : size; + *addr++ |= ((1 << count) - 1) << shift; + for (size -= count; size > CHAR_BIT; size -= CHAR_BIT) { + *addr++ = 0xFF; + } + if (size) { + *addr |= ~(0xFF << size); + } + } +} + +/** + * Set a bit field in a bit stream to all zeros + * + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE void setZero(byte *memory, uint64_t offset, int size) +{ + if (size > 0) { + byte *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + int count = size + shift > CHAR_BIT ? CHAR_BIT - shift : size; + *addr++ &= ~(((1 << count) - 1) << shift); + for (size -= count; size > CHAR_BIT; size -= CHAR_BIT) { + *addr++ = 0; + } + if (size) { + *addr &= 0xFF << size; + } + } +} + +/** + * Get a byte stream from a bit stream, reading a whole number of bytes + * from an arbitrary bit boundary. 
+ * + * @param memory The base memory byte address for the bit stream + * @param offset The bit offset of the start of the bit stream + * @param destination Where to store the bytes + * @param size The number of bytes + **/ +void getBytes(const byte *memory, uint64_t offset, byte *destination, int size); + +/** + * Store a byte stream into a bit stream, writing a whole number of bytes + * to an arbitrary bit boundary. + * + * @param memory The base memory byte address for the bit stream + * @param offset The bit offset of the start of the bit stream + * @param source Where to read the bytes + * @param size The number of bytes + **/ +void setBytes(byte *memory, uint64_t offset, const byte *source, int size); + +/** + * Move bits from one field to another. When the fields overlap, behave as + * if we first move all the bits from the source to a temporary value, and + * then move all the bits from the temporary value to the destination. + * + * @param sMemory The base source memory byte address + * @param source Bit offset into memory for the source start + * @param dMemory The base destination memory byte address + * @param destination Bit offset into memory for the destination start + * @param size The number of bits in the field + **/ +void moveBits(const byte *sMemory, uint64_t source, byte *dMemory, + uint64_t destination, int size); + +/** + * Compare bits from one field to another, testing for sameness + * + * @param mem1 The base memory byte address (first field) + * @param offset1 Bit offset into the memory for the start (first field) + * @param mem2 The base memory byte address (second field) + * @param offset2 Bit offset into the memory for the start (second field) + * @param size The number of bits in the field + * + * @return true if fields are the same, false if different + **/ +bool sameBits(const byte *mem1, uint64_t offset1, const byte *mem2, + uint64_t offset2, int size) + __attribute__((warn_unused_result)); + +#endif /* BITS_H */ diff --git a/uds/buffer.c b/uds/buffer.c new file mode 100644 index 0000000..2bf6d20 --- /dev/null +++ b/uds/buffer.c @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/buffer.c#3 $ + */ + +#include "buffer.h" + +#include "bufferPrivate.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "permassert.h" +#include "typeDefs.h" + +/**********************************************************************/ +int wrapBuffer(byte *bytes, + size_t length, + size_t contentLength, + Buffer **bufferPtr) +{ + int result = ASSERT((contentLength <= length), + "content length, %zu, fits in buffer size, %zu", + length, contentLength); + Buffer *buffer; + result = ALLOCATE(1, Buffer, "buffer", &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + buffer->data = bytes; + buffer->start = 0; + buffer->end = contentLength; + buffer->length = length; + buffer->wrapped = true; + + *bufferPtr = buffer; + return UDS_SUCCESS; +} + +/***********************************************************************/ +int makeBuffer(size_t size, Buffer **newBuffer) +{ + byte *data; + int result = ALLOCATE(size, byte, "buffer data", &data); + if (result != UDS_SUCCESS) { + return result; + } + + Buffer *buffer; + result = wrapBuffer(data, size, 0, &buffer); + if (result != UDS_SUCCESS) { + FREE(data); + return result; + } + + buffer->wrapped = false; + *newBuffer = buffer; + return UDS_SUCCESS; +} + +/***********************************************************************/ +void freeBuffer(Buffer **pBuffer) +{ + Buffer *buffer = *pBuffer; + *pBuffer = NULL; + if (buffer == NULL) { + return; + } + if (!buffer->wrapped) { + FREE(buffer->data); + } + FREE(buffer); +} + +/**********************************************************************/ +size_t bufferLength(Buffer *buffer) +{ + return buffer->length; +} + +/**********************************************************************/ +size_t contentLength(Buffer *buffer) +{ + return buffer->end - buffer->start; +} + +/**********************************************************************/ +size_t uncompactedAmount(Buffer *buffer) +{ + return buffer->start; +} + +/**********************************************************************/ +size_t availableSpace(Buffer *buffer) +{ + return buffer->length - buffer->end; +} + +/**********************************************************************/ +size_t bufferUsed(Buffer *buffer) +{ + return buffer->end; +} + +/***********************************************************************/ +int growBuffer(Buffer *buffer, size_t length) +{ + if (buffer == NULL) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot resize NULL buffer"); + } + + if (buffer->wrapped) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot resize wrapped buffer"); + } + if (buffer->end > length) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot shrink buffer"); + } + + byte *data; + int result = reallocateMemory(buffer->data, buffer->length, length, + "buffer data", &data); + if (result != UDS_SUCCESS) { + return result; + } + + buffer->data = data; + buffer->length = length; + return UDS_SUCCESS; +} + +/***********************************************************************/ +bool ensureAvailableSpace(Buffer *buffer, size_t bytes) +{ + if (availableSpace(buffer) >= bytes) { + return true; + } + compactBuffer(buffer); + return (availableSpace(buffer) >= bytes); +} + +/***********************************************************************/ +void clearBuffer(Buffer *buffer) +{ + buffer->start = 0; + buffer->end = buffer->length; +} + 
+/***********************************************************************/ +void compactBuffer(Buffer *buffer) +{ + if ((buffer->start == 0) || (buffer->end == 0)) { + return; + } + size_t bytesToMove = buffer->end - buffer->start; + memmove(buffer->data, buffer->data + buffer->start, bytesToMove); + buffer->start = 0; + buffer->end = bytesToMove; +} + +/**********************************************************************/ +int resetBufferEnd(Buffer *buffer, size_t end) +{ + if (end > buffer->length) { + return UDS_BUFFER_ERROR; + } + buffer->end = end; + if (buffer->start > buffer->end) { + buffer->start = buffer->end; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int skipForward(Buffer *buffer, size_t bytesToSkip) +{ + if (contentLength(buffer) < bytesToSkip) { + return UDS_BUFFER_ERROR; + } + + buffer->start += bytesToSkip; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int rewindBuffer(Buffer *buffer, size_t bytesToRewind) +{ + if (buffer->start < bytesToRewind) { + return UDS_BUFFER_ERROR; + } + + buffer->start -= bytesToRewind; + return UDS_SUCCESS; +} + +/**********************************************************************/ +bool hasSameBytes(Buffer *buffer, const byte *data, size_t length) +{ + return ((contentLength(buffer) >= length) + && (memcmp(buffer->data + buffer->start, data, length) == 0)); +} + +/**********************************************************************/ +bool equalBuffers(Buffer *buffer1, Buffer *buffer2) +{ + return hasSameBytes(buffer1, buffer2->data + buffer2->start, + contentLength(buffer2)); +} + +/**********************************************************************/ +int getByte(Buffer *buffer, byte *bytePtr) +{ + if (contentLength(buffer) < sizeof(byte)) { + return UDS_BUFFER_ERROR; + } + + *bytePtr = buffer->data[buffer->start++]; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int peekByte(Buffer *buffer, size_t offset, byte *bytePtr) +{ + if (contentLength(buffer) < (offset + sizeof(byte))) { + return UDS_BUFFER_ERROR; + } + + *bytePtr = buffer->data[buffer->start + offset]; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putByte(Buffer *buffer, byte b) +{ + if (!ensureAvailableSpace(buffer, sizeof(byte))) { + return UDS_BUFFER_ERROR; + } + + buffer->data[buffer->end++] = b; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getBytesFromBuffer(Buffer *buffer, size_t length, void *destination) +{ + if (contentLength(buffer) < length) { + return UDS_BUFFER_ERROR; + } + + memcpy(destination, buffer->data + buffer->start, length); + buffer->start += length; + return UDS_SUCCESS; +} + +/**********************************************************************/ +byte *getBufferContents(Buffer *buffer) +{ + return buffer->data + buffer->start; +} + +/**********************************************************************/ +int copyBytes(Buffer *buffer, size_t length, byte **destinationPtr) +{ + byte *destination; + int result = ALLOCATE(length, byte, "copyBytes() buffer", + &destination); + if (result != UDS_SUCCESS) { + return result; + } + + result = getBytesFromBuffer(buffer, length, destination); + if (result != UDS_SUCCESS) { + FREE(destination); + } else { + *destinationPtr = destination; + } + return result; +} + 
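/*
 * Editor's illustration (not part of the original buffer.c): appending to
 * a buffer that is also being drained.  putBytes() calls
 * ensureAvailableSpace(), which compacts the buffer (reclaiming bytes the
 * reader has already consumed past the start pointer) before deciding
 * whether the new data fits, so a writer never needs to call
 * compactBuffer() directly.
 */
static int __attribute__((unused)) appendChunkSketch(Buffer     *buffer,
                                                     const byte *chunk,
                                                     size_t      length)
{
  int result = putBytes(buffer, length, chunk);  /* may compact internally */
  if (result != UDS_SUCCESS) {
    /* Even after compaction there was not enough room for 'length' bytes. */
    return result;
  }
  /* contentLength(buffer) has grown by 'length'; uncompactedAmount(buffer)
   * drops to zero whenever a compaction happened. */
  return UDS_SUCCESS;
}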
+/**********************************************************************/ +int putBytes(Buffer *buffer, size_t length, const void *source) +{ + if (!ensureAvailableSpace(buffer, length)) { + return UDS_BUFFER_ERROR; + } + memcpy(buffer->data + buffer->end, source, length); + buffer->end += length; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putBuffer(Buffer *target, Buffer *source, size_t length) +{ + if (contentLength(source) < length) { + return UDS_BUFFER_ERROR; + } + + int result = putBytes(target, length, getBufferContents(source)); + if (result != UDS_SUCCESS) { + return result; + } + + source->start += length; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int zeroBytes(Buffer *buffer, size_t length) +{ + if (!ensureAvailableSpace(buffer, length)) { + return UDS_BUFFER_ERROR; + } + memset(buffer->data + buffer->end, 0, length); + buffer->end += length; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getBoolean(Buffer *buffer, bool *b) +{ + byte by; + int result = getByte(buffer, &by); + if (result == UDS_SUCCESS) { + *b = (by == 1); + } + return result; +} + +/**********************************************************************/ +int putBoolean(Buffer *buffer, bool b) +{ + return putByte(buffer, (byte) (b ? 1 : 0)); +} + +/**********************************************************************/ +int getUInt16BEFromBuffer(Buffer *buffer, uint16_t *ui) +{ + if (contentLength(buffer) < sizeof(uint16_t)) { + return UDS_BUFFER_ERROR; + } + + decodeUInt16BE(buffer->data, &buffer->start, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt16BEIntoBuffer(Buffer *buffer, uint16_t ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint16_t))) { + return UDS_BUFFER_ERROR; + } + + encodeUInt16BE(buffer->data, &buffer->end, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt32BEFromBuffer(Buffer *buffer, uint32_t *ui) +{ + if (contentLength(buffer) < sizeof(uint32_t)) { + return UDS_BUFFER_ERROR; + } + + decodeUInt32BE(buffer->data, &buffer->start, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt32BEIntoBuffer(Buffer *buffer, uint32_t ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint32_t))) { + return UDS_BUFFER_ERROR; + } + + encodeUInt32BE(buffer->data, &buffer->end, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt32BEsFromBuffer(Buffer *buffer, size_t count, uint32_t *ui) +{ + if (contentLength(buffer) < (sizeof(uint32_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + decodeUInt32BE(buffer->data, &buffer->start, ui + i); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt32BEsIntoBuffer(Buffer *buffer, size_t count, const uint32_t *ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint32_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + encodeUInt32BE(buffer->data, &buffer->end, ui[i]); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt64BEsFromBuffer(Buffer *buffer, size_t count, uint64_t *ui) +{ + if 
(contentLength(buffer) < (sizeof(uint64_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + decodeUInt64BE(buffer->data, &buffer->start, ui + i); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt64BEsIntoBuffer(Buffer *buffer, size_t count, const uint64_t *ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint64_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + encodeUInt64BE(buffer->data, &buffer->end, ui[i]); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt16LEFromBuffer(Buffer *buffer, uint16_t *ui) +{ + if (contentLength(buffer) < sizeof(uint16_t)) { + return UDS_BUFFER_ERROR; + } + + decodeUInt16LE(buffer->data, &buffer->start, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt16LEIntoBuffer(Buffer *buffer, uint16_t ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint16_t))) { + return UDS_BUFFER_ERROR; + } + + encodeUInt16LE(buffer->data, &buffer->end, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt16LEsFromBuffer(Buffer *buffer, size_t count, uint16_t *ui) +{ + if (contentLength(buffer) < (sizeof(uint16_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + decodeUInt16LE(buffer->data, &buffer->start, ui + i); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt16LEsIntoBuffer(Buffer *buffer, size_t count, const uint16_t *ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint16_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + encodeUInt16LE(buffer->data, &buffer->end, ui[i]); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getInt32LEFromBuffer(Buffer *buffer, int32_t *i) +{ + if (contentLength(buffer) < sizeof(int32_t)) { + return UDS_BUFFER_ERROR; + } + + decodeInt32LE(buffer->data, &buffer->start, i); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt32LEFromBuffer(Buffer *buffer, uint32_t *ui) +{ + if (contentLength(buffer) < sizeof(uint32_t)) { + return UDS_BUFFER_ERROR; + } + + decodeUInt32LE(buffer->data, &buffer->start, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt32LEIntoBuffer(Buffer *buffer, uint32_t ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint32_t))) { + return UDS_BUFFER_ERROR; + } + + encodeUInt32LE(buffer->data, &buffer->end, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putInt64LEIntoBuffer(Buffer *buffer, int64_t i) +{ + if (!ensureAvailableSpace(buffer, sizeof(int64_t))) { + return UDS_BUFFER_ERROR; + } + + encodeInt64LE(buffer->data, &buffer->end, i); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt64LEFromBuffer(Buffer *buffer, uint64_t *ui) +{ + if (contentLength(buffer) < sizeof(uint64_t)) { + return UDS_BUFFER_ERROR; + } + + decodeUInt64LE(buffer->data, &buffer->start, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int 
putUInt64LEIntoBuffer(Buffer *buffer, uint64_t ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint64_t))) { + return UDS_BUFFER_ERROR; + } + + encodeUInt64LE(buffer->data, &buffer->end, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt64LEsFromBuffer(Buffer *buffer, size_t count, uint64_t *ui) +{ + if (contentLength(buffer) < (sizeof(uint64_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + decodeUInt64LE(buffer->data, &buffer->start, ui + i); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt64LEsIntoBuffer(Buffer *buffer, size_t count, const uint64_t *ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint64_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + encodeUInt64LE(buffer->data, &buffer->end, ui[i]); + } + return UDS_SUCCESS; +} + diff --git a/uds/buffer.h b/uds/buffer.h new file mode 100644 index 0000000..22df042 --- /dev/null +++ b/uds/buffer.h @@ -0,0 +1,614 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/buffer.h#2 $ + */ + +#ifndef BUFFER_H +#define BUFFER_H + +#include "common.h" + +typedef struct buffer Buffer; + +/** + * Create a buffer which wraps an existing byte array. + * + * @param bytes The bytes to wrap + * @param length The length of the buffer + * @param contentLength The length of the current contents of the buffer + * @param bufferPtr A pointer to hold the buffer + * + * @return UDS_SUCCESS or an error code + **/ +int wrapBuffer(byte *bytes, + size_t length, + size_t contentLength, + Buffer **bufferPtr) + __attribute__((warn_unused_result)); + +/** + * Create a new buffer and allocate its memory. + * + * @param length The length of the buffer + * @param bufferPtr A pointer to hold the buffer + * + * @return UDS_SUCCESS or an error code + **/ +int makeBuffer(size_t length, Buffer **bufferPtr) + __attribute__((warn_unused_result)); + +/** + * Release a buffer and, if not wrapped, free its memory. + * + * @param pBuffer Pointer to the buffer to release + **/ +void freeBuffer(Buffer **pBuffer); + +/** + * Grow a non-wrapped buffer. + * + * @param buffer The buffer to resize + * @param length The new length of the buffer + * + * @return UDS_SUCCESS or an error code + **/ +int growBuffer(Buffer *buffer, size_t length) + __attribute__((warn_unused_result)); + +/** + * Ensure that a buffer has a given amount of space available, compacting the + * buffer if necessary. 
+ * + * @param buffer The buffer + * @param bytes The number of available bytes desired + * + * @return true if the requested number of bytes are now available + **/ +bool ensureAvailableSpace(Buffer *buffer, size_t bytes) + __attribute__((warn_unused_result)); + +/** + * Clear the buffer. The start position is set to zero and the end position + * is set to the buffer length. + **/ +void clearBuffer(Buffer *buffer); + +/** + * Eliminate buffer contents which have been extracted. This function copies + * any data between the start and end pointers to the beginning of the buffer, + * moves the start pointer to the beginning, and the end pointer to the end + * of the copied data. + * + * @param buffer The buffer to compact + **/ +void compactBuffer(Buffer *buffer); + +/** + * Skip forward the specified number of bytes in a buffer (advance the + * start pointer). + * + * @param buffer The buffer + * @param bytesToSkip The number of bytes to skip + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer is not long + * enough to skip forward the requested number of bytes + **/ +int skipForward(Buffer *buffer, size_t bytesToSkip) + __attribute__((warn_unused_result)); + +/** + * Rewind the specified number of bytes in a buffer (back up the start + * pointer). + * + * @param buffer The buffer + * @param bytesToRewind The number of bytes to rewind + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer is not long + * enough to rewind backward the requested number of bytes + **/ +int rewindBuffer(Buffer *buffer, size_t bytesToRewind) + __attribute__((warn_unused_result)); + +/** + * Return the length of the buffer. + * + * @param buffer the buffer + * + * @return the buffer length + **/ +size_t bufferLength(Buffer *buffer); + +/** + * Compute the amount of data current in the buffer. + * + * @param buffer The buffer to examine + * + * @return The number of bytes between the start and end pointers of the buffer + **/ +size_t contentLength(Buffer *buffer); + +/** + * Compute the amount of available space in this buffer. + * + * @param buffer The buffer to examine + * + * @return The number of bytes between the end pointer and the end of the buffer + **/ +size_t availableSpace(Buffer *buffer); + +/** + * Amount of buffer that has already been processed. + * + * @param buffer the buffer to examine + * + * @return The number of bytes between the beginning of the buffer and the + * start pointer. + **/ +size_t uncompactedAmount(Buffer *buffer); + +/** + * Return the amount of the buffer that is currently utilized. + * + * @param buffer the buffer to examine + * + * @return The number of bytes between the beginning of the buffer and + * the end pointer. + **/ +size_t bufferUsed(Buffer *buffer); + +/** + * Reset the end of buffer to a different position. + * + * @param buffer the buffer + * @param end the new end of the buffer + * + * @return UDS_SUCCESS unless the end is larger than can fit + **/ +int resetBufferEnd(Buffer *buffer, size_t end) + __attribute__((warn_unused_result)); + +/** + * Check whether the start of the content of a buffer matches a specified + * array of bytes. + * + * @param buffer The buffer to check + * @param data The desired data + * @param length The length of the desired data + * + * @return true if the first length bytes of the buffer's + * contents match data + **/ +bool hasSameBytes(Buffer *buffer, const byte *data, size_t length) + __attribute__((warn_unused_result)); + +/** + * Check whether two buffers have the same contents. 
+ * + * @param buffer1 The first buffer + * @param buffer2 The second buffer + * + * @return true if the contents of the two buffers are the + * same + **/ +bool equalBuffers(Buffer *buffer1, Buffer *buffer2); + +/** + * Get a single byte from a buffer and advance the start pointer. + * + * @param buffer The buffer + * @param bytePtr A pointer to hold the byte + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are no bytes to + * retrieve + **/ +int getByte(Buffer *buffer, byte *bytePtr) __attribute__((warn_unused_result)); + +/** + * Get a single byte from a buffer without advancing the start pointer. + * + * @param buffer The buffer + * @param offset The offset past the start pointer of the desired byte + * @param bytePtr A pointer to hold the byte + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the offset is past the end + * of the buffer + **/ +int peekByte(Buffer *buffer, size_t offset, byte *bytePtr) + __attribute__((warn_unused_result)); + +/** + * Put a single byte into a buffer and advance the end pointer. + * + * @param buffer The buffer + * @param b The byte to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is no space in the buffer + **/ +int putByte(Buffer *buffer, byte b) __attribute__((warn_unused_result)); + +/** + * Get bytes out of a buffer and advance the start of the buffer past the + * copied data. + * + * @param buffer The buffer from which to copy + * @param length The number of bytes to copy + * @param destination A pointer to hold the data + * + * @return UDS_SUCCESS or an error code + **/ +int getBytesFromBuffer(Buffer *buffer, size_t length, void *destination) + __attribute__((warn_unused_result)); + +/** + * Get a pointer to the current contents of the buffer. This will be a pointer + * to the actual memory managed by the buffer. It is the caller's responsibility + * to ensure that the buffer is not modified while this pointer is in use. + * + * @param buffer The buffer from which to get the contents + * + * @return a pointer to the current contents of the buffer + **/ +byte *getBufferContents(Buffer *buffer); + +/** + * Copy bytes out of a buffer and advance the start of the buffer past the + * copied data. Memory will be allocated to hold the copy. + * + * @param buffer The buffer from which to copy + * @param length The number of bytes to copy + * @param destinationPtr A pointer to hold the copied data + * + * @return UDS_SUCCESS or an error code + **/ +int copyBytes(Buffer *buffer, size_t length, byte **destinationPtr) + __attribute__((warn_unused_result)); + +/** + * Copy bytes into a buffer and advance the end of the buffer past the + * copied data. + * + * @param buffer The buffer to copy into + * @param length The length of the data to copy + * @param source The data to copy + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer does not have + * length bytes available + **/ +int putBytes(Buffer *buffer, size_t length, const void *source) + __attribute__((warn_unused_result)); + +/** + * Copy the contents of a source buffer into the target buffer. Advances the + * start of the source buffer and the end of the target buffer past the copied + * data. 
+ * + * @param target The buffer to receive the copy of the data + * @param source The buffer containing the data to copy + * @param length The length of the data to copy + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the target buffer does not have + * length bytes available or if the source buffer does not have length + * bytes of content + **/ +int putBuffer(Buffer *target, Buffer *source, size_t length) + __attribute__((warn_unused_result)); + +/** + * Zero bytes in a buffer starting at the start pointer, and advance the + * end of the buffer past the zeros. + * + * @param buffer The buffer to zero + * @param length The number of bytes to zero + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer does not have + * length bytes available + **/ +int zeroBytes(Buffer *buffer, size_t length) + __attribute__((warn_unused_result)); + +/** + * Get a boolean value from a buffer and advance the start pointer. + * + * @param buffer The buffer + * @param b A pointer to hold the boolean value + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data + * in the buffer + **/ +int getBoolean(Buffer *buffer, bool *b) __attribute__((warn_unused_result)); + +/** + * Put a boolean value into a buffer and advance the end pointer. + * + * @param buffer The buffer + * @param b The boolean to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is no space in the buffer + **/ +int putBoolean(Buffer *buffer, bool b) __attribute__((warn_unused_result)); + +/** + * Get a 2 byte, big endian encoded integer from a buffer and advance the + * start pointer past it. + * + * @param buffer The buffer + * @param ui A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 + * bytes available + **/ +int getUInt16BEFromBuffer(Buffer *buffer, uint16_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a 2 byte, big endian encoded integer into a buffer and advance the + * end pointer past it. + * + * @param buffer The buffer + * @param ui The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 + * bytes available + **/ +int putUInt16BEIntoBuffer(Buffer *buffer, uint16_t ui) + __attribute__((warn_unused_result)); + +/** + * Get a 4 byte, big endian encoded integer from a buffer and advance the + * start pointer past it. + * + * @param buffer The buffer + * @param ui A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 + * bytes available + **/ +int getUInt32BEFromBuffer(Buffer *buffer, uint32_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a 4 byte, big endian encoded integer into a buffer and advance the + * end pointer past it. + * + * @param buffer The buffer + * @param ui The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 + * bytes available + **/ +int putUInt32BEIntoBuffer(Buffer *buffer, uint32_t ui) + __attribute__((warn_unused_result)); + +/** + * Get a series of 4 byte, big endian encoded integer from a buffer and + * advance the start pointer past them. 
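+ * The decoded values are stored in host byte order.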
+ * + * @param buffer The buffer + * @param count The number of integers to get + * @param ui A pointer to hold the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data + * in the buffer + **/ +int getUInt32BEsFromBuffer(Buffer *buffer, size_t count, uint32_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a series of 4 byte, big endian encoded integers into a buffer and + * advance the end pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to put + * @param ui A pointer to the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space + * in the buffer + **/ +int putUInt32BEsIntoBuffer(Buffer *buffer, size_t count, const uint32_t *ui) + __attribute__((warn_unused_result)); + +/** + * Get a series of 8 byte, big endian encoded integer from a buffer and + * advance the start pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to get + * @param ui A pointer to hold the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data + * in the buffer + **/ +int getUInt64BEsFromBuffer(Buffer *buffer, size_t count, uint64_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a series of 8 byte, big endian encoded integers into a buffer and + * advance the end pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to put + * @param ui A pointer to the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space + * in the buffer + **/ +int putUInt64BEsIntoBuffer(Buffer *buffer, size_t count, const uint64_t *ui) + __attribute__((warn_unused_result)); + +/** + * Get a 2 byte, little endian encoded integer from a buffer and + * advance the start pointer past it. + * + * @param buffer The buffer + * @param ui A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 + * bytes available + **/ +int getUInt16LEFromBuffer(Buffer *buffer, uint16_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a 2 byte, little endian encoded integer into a buffer and advance the + * end pointer past it. + * + * @param buffer The buffer + * @param ui The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 + * bytes available + **/ +int putUInt16LEIntoBuffer(Buffer *buffer, uint16_t ui) + __attribute__((warn_unused_result)); + +/** + * Get a series of 2 byte, little endian encoded integer from a buffer + * and advance the start pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to get + * @param ui A pointer to hold the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data + * in the buffer + **/ +int getUInt16LEsFromBuffer(Buffer *buffer, size_t count, uint16_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a series of 2 byte, little endian encoded integers into a + * buffer and advance the end pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to put + * @param ui A pointer to the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space + * in the buffer + **/ +int putUInt16LEsIntoBuffer(Buffer *buffer, size_t count, const uint16_t *ui) + __attribute__((warn_unused_result)); + +/** + * Get a 4 byte, little endian encoded integer from a buffer and advance the + * start pointer past it. 
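+ * This is the signed counterpart of getUInt32LEFromBuffer().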
+ * + * @param buffer The buffer + * @param i A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 + * bytes available + **/ +int getInt32LEFromBuffer(Buffer *buffer, int32_t *i) + __attribute__((warn_unused_result)); + +/** + * Get a 4 byte, little endian encoded integer from a buffer and advance the + * start pointer past it. + * + * @param buffer The buffer + * @param ui A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 + * bytes available + **/ +int getUInt32LEFromBuffer(Buffer *buffer, uint32_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a 4 byte, little endian encoded integer into a buffer and advance the + * end pointer past it. + * + * @param buffer The buffer + * @param ui The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 + * bytes available + **/ +int putUInt32LEIntoBuffer(Buffer *buffer, uint32_t ui) + __attribute__((warn_unused_result)); + +/** + * Get an 8 byte, little endian encoded, unsigned integer from a + * buffer and advance the start pointer past it. + * + * @param buffer The buffer + * @param ui A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 8 + * bytes available + **/ +int getUInt64LEFromBuffer(Buffer *buffer, uint64_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put an 8 byte, little endian encoded signed integer into a buffer + * and advance the end pointer past it. + * + * @param buffer The buffer + * @param i The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 8 + * bytes available + **/ +int putInt64LEIntoBuffer(Buffer *buffer, int64_t i) + __attribute__((warn_unused_result)); + + /** + * Put an 8 byte, little endian encoded integer into a buffer and advance the + * end pointer past it. + * + * @param buffer The buffer + * @param ui The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 8 + * bytes available + **/ +int putUInt64LEIntoBuffer(Buffer *buffer, uint64_t ui) + __attribute__((warn_unused_result)); + +/** + * Get a series of 8 byte, little endian encoded integer from a buffer + * and advance the start pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to get + * @param ui A pointer to hold the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data + * in the buffer + **/ +int getUInt64LEsFromBuffer(Buffer *buffer, size_t count, uint64_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a series of 8 byte, little endian encoded integers into a buffer and + * advance the end pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to put + * @param ui A pointer to the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space + * in the buffer + **/ +int putUInt64LEsIntoBuffer(Buffer *buffer, size_t count, const uint64_t *ui) + __attribute__((warn_unused_result)); + +#endif /* BUFFER_H */ diff --git a/uds/bufferPrivate.h b/uds/bufferPrivate.h new file mode 100644 index 0000000..8a0f46a --- /dev/null +++ b/uds/bufferPrivate.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/bufferPrivate.h#1 $ + */ + +#ifndef BUFFER_PRIVATE_H +#define BUFFER_PRIVATE_H + +#include "common.h" + +struct buffer { + size_t start; + size_t end; + size_t length; + byte *data; + bool wrapped; +}; + +#endif /* BUFFER_PRIVATE_H */ diff --git a/uds/bufferedReader.c b/uds/bufferedReader.c new file mode 100644 index 0000000..b67d33d --- /dev/null +++ b/uds/bufferedReader.c @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/bufferedReader.c#5 $ + */ + +#include "bufferedReader.h" + +#include "compiler.h" +#include "ioFactory.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" + +#ifndef __KERNEL__ +/* + * Define sector_t. The kernel really wants us to use it. The code becomes + * ugly if we need to #ifdef every usage of sector_t. Note that the use of #define + * means that even if a user mode include typedefs sector_t, it will not affect + * this module. 
+ */ +#define sector_t uint64_t +#endif + +struct bufferedReader { +#ifdef __KERNEL__ + // IOFactory owning the block device + IOFactory *br_factory; + // The dm_bufio_client to read from + struct dm_bufio_client *br_client; + // The current dm_buffer + struct dm_buffer *br_buffer; + // The number of blocks that can be read from + sector_t br_limit; + // Number of the current block + sector_t br_blockNumber; +#else + // Region to read from + IORegion *br_region; + // Number of the current block + uint64_t br_blockNumber; +#endif + // Start of the buffer + byte *br_start; + // End of the data read from the buffer + byte *br_pointer; +}; + +#ifdef __KERNEL__ +/*****************************************************************************/ +static void readAhead(BufferedReader *br, sector_t blockNumber) +{ + if (blockNumber < br->br_limit) { + enum { MAX_READ_AHEAD = 4 }; + size_t readAhead = minSizeT(MAX_READ_AHEAD, br->br_limit - blockNumber); + dm_bufio_prefetch(br->br_client, blockNumber, readAhead); + } +} +#endif + +/*****************************************************************************/ +#ifdef __KERNEL__ +int makeBufferedReader(IOFactory *factory, + struct dm_bufio_client *client, + sector_t blockLimit, + BufferedReader **readerPtr) +{ + BufferedReader *reader = NULL; + int result = ALLOCATE(1, BufferedReader, "buffered reader", &reader); + if (result != UDS_SUCCESS) { + return result; + } + + *reader = (BufferedReader) { + .br_factory = factory, + .br_client = client, + .br_buffer = NULL, + .br_limit = blockLimit, + .br_blockNumber = 0, + .br_start = NULL, + .br_pointer = NULL, + }; + + readAhead(reader,0); + getIOFactory(factory); + *readerPtr = reader; + return UDS_SUCCESS; +} +#else +int makeBufferedReader(IORegion *region, BufferedReader **readerPtr) +{ + byte *data; + int result = ALLOCATE_IO_ALIGNED(UDS_BLOCK_SIZE, byte, + "buffer writer buffer", &data); + if (result != UDS_SUCCESS) { + return result; + } + + BufferedReader *reader = NULL; + result = ALLOCATE(1, BufferedReader, "buffered reader", &reader); + if (result != UDS_SUCCESS) { + FREE(data); + return result; + } + + *reader = (BufferedReader) { + .br_region = region, + .br_blockNumber = 0, + .br_start = data, + .br_pointer = NULL, + }; + + getIORegion(region); + *readerPtr = reader; + return UDS_SUCCESS; +} +#endif + +/*****************************************************************************/ +void freeBufferedReader(BufferedReader *br) +{ + if (br == NULL) { + return; + } +#ifdef __KERNEL__ + if (br->br_buffer != NULL) { + dm_bufio_release(br->br_buffer); + } + dm_bufio_client_destroy(br->br_client); + putIOFactory(br->br_factory); +#else + putIORegion(br->br_region); + FREE(br->br_start); +#endif + FREE(br); +} + +/*****************************************************************************/ +static int positionReader(BufferedReader *br, + sector_t blockNumber, + off_t offset) +{ + if ((br->br_pointer == NULL) || (blockNumber != br->br_blockNumber)) { +#ifdef __KERNEL__ + if (blockNumber >= br->br_limit) { + return UDS_OUT_OF_RANGE; + } + if (br->br_buffer != NULL) { + dm_bufio_release(br->br_buffer); + br->br_buffer = NULL; + } + struct dm_buffer *buffer = NULL; + void *data = dm_bufio_read(br->br_client, blockNumber, &buffer); + if (IS_ERR(data)) { + return -PTR_ERR(data); + } + br->br_buffer = buffer; + br->br_start = data; + if (blockNumber == br->br_blockNumber + 1) { + readAhead(br, blockNumber + 1); + } +#else + int result = readFromRegion(br->br_region, blockNumber * UDS_BLOCK_SIZE, + br->br_start, 
UDS_BLOCK_SIZE, NULL); + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "%s got readFromRegion error", + __func__); + return result; + } +#endif + } + br->br_blockNumber = blockNumber; + br->br_pointer = br->br_start + offset; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static size_t bytesRemainingInReadBuffer(BufferedReader *br) +{ + return (br->br_pointer == NULL + ? 0 + : br->br_start + UDS_BLOCK_SIZE - br->br_pointer); +} + +/*****************************************************************************/ +int readFromBufferedReader(BufferedReader *br, void *data, size_t length) +{ + byte *dp = data; + int result = UDS_SUCCESS; + while (length > 0) { + if (bytesRemainingInReadBuffer(br) == 0) { + sector_t blockNumber = br->br_blockNumber; + if (br->br_pointer != NULL) { + ++blockNumber; + } + result = positionReader(br, blockNumber, 0); + if (result != UDS_SUCCESS) { + break; + } + } + + size_t avail = bytesRemainingInReadBuffer(br); + size_t chunk = minSizeT(length, avail); + memcpy(dp, br->br_pointer, chunk); + length -= chunk; + dp += chunk; + br->br_pointer += chunk; + } + + if (((result == UDS_OUT_OF_RANGE) || (result == UDS_END_OF_FILE)) + && (dp - (byte *) data > 0)) { + result = UDS_SHORT_READ; + } + return result; +} + +/*****************************************************************************/ +int verifyBufferedData(BufferedReader *br, + const void *value, + size_t length) +{ + const byte *vp = value; + sector_t startingBlockNumber = br->br_blockNumber; + int startingOffset = br->br_pointer - br->br_start; + while (length > 0) { + if (bytesRemainingInReadBuffer(br) == 0) { + sector_t blockNumber = br->br_blockNumber; + if (br->br_pointer != NULL) { + ++blockNumber; + } + int result = positionReader(br, blockNumber, 0); + if (result != UDS_SUCCESS) { + positionReader(br, startingBlockNumber, startingOffset); + return UDS_CORRUPT_FILE; + } + } + + size_t avail = bytesRemainingInReadBuffer(br); + size_t chunk = minSizeT(length, avail); + if (memcmp(vp, br->br_pointer, chunk) != 0) { + positionReader(br, startingBlockNumber, startingOffset); + return UDS_CORRUPT_FILE; + } + length -= chunk; + vp += chunk; + br->br_pointer += chunk; + } + + return UDS_SUCCESS; +} diff --git a/uds/bufferedReader.h b/uds/bufferedReader.h new file mode 100644 index 0000000..4da8119 --- /dev/null +++ b/uds/bufferedReader.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/bufferedReader.h#3 $ + */ + +#ifndef BUFFERED_READER_H +#define BUFFERED_READER_H 1 + +#include "common.h" + +#ifdef __KERNEL__ +struct dm_bufio_client; +struct ioFactory; +#else +struct ioRegion; +#endif + +/** + * The buffered reader allows efficient IO for IORegions, which may be + * file- or block-based. The internal buffer always reads aligned data + * from the underlying region. + **/ +typedef struct bufferedReader BufferedReader; + +#ifdef __KERNEL__ +/** + * Make a new buffered reader. + * + * @param factory The IOFactory creating the buffered reader. + * @param client The dm_bufio_client to read from. + * @param blockLimit The number of blocks that may be read. + * @param readerPtr The pointer to hold the newly allocated buffered reader + * + * @return UDS_SUCCESS or error code. + **/ +int makeBufferedReader(struct ioFactory *factory, + struct dm_bufio_client *client, + sector_t blockLimit, + BufferedReader **readerPtr) + __attribute__((warn_unused_result)); +#else +/** + * Make a new buffered reader. + * + * @param region An IORegion to read from. + * @param readerPtr The pointer to hold the newly allocated buffered reader. + * + * @return UDS_SUCCESS or error code. + **/ +int makeBufferedReader(struct ioRegion *region, BufferedReader **readerPtr) + __attribute__((warn_unused_result)); +#endif + +/** + * Free a buffered reader. + * + * @param reader The buffered reader + **/ +void freeBufferedReader(BufferedReader *reader); + +/** + * Retrieve data from a buffered reader, reading from the region when needed. + * + * @param reader The buffered reader + * @param data The buffer to read data into + * @param length The length of the data to read + * + * @return UDS_SUCCESS or an error code. + **/ +int readFromBufferedReader(BufferedReader *reader, void *data, size_t length) + __attribute__((warn_unused_result)); + +/** + * Verify that the data currently in the buffer matches the required value. + * + * @param reader The buffered reader. + * @param value The value that must match the buffer contents. + * @param length The length of the value that must match. + * + * @return UDS_SUCCESS or an error code, specifically UDS_CORRUPT_FILE + * if the required value fails to match. + * + * @note If the value matches, the matching contents are consumed. However, + * if the match fails, any buffer contents are left as is. + **/ +int verifyBufferedData(BufferedReader *reader, + const void *value, + size_t length) + __attribute__((warn_unused_result)); + +#endif // BUFFERED_READER_H diff --git a/uds/bufferedWriter.c b/uds/bufferedWriter.c new file mode 100644 index 0000000..abfb9cf --- /dev/null +++ b/uds/bufferedWriter.c @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/bufferedWriter.c#6 $ + */ + +#include "bufferedWriter.h" + +#include "compiler.h" +#include "errors.h" +#include "ioFactory.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" + + +struct bufferedWriter { +#ifdef __KERNEL__ + // IOFactory owning the block device + IOFactory *bw_factory; + // The dm_bufio_client to write to + struct dm_bufio_client *bw_client; + // The current dm_buffer + struct dm_buffer *bw_buffer; + // The number of blocks that can be written to + sector_t bw_limit; + // Number of the current block + sector_t bw_blockNumber; +#else + // Region to write to + IORegion *bw_region; + // Number of the current block + uint64_t bw_blockNumber; +#endif + // Start of the buffer + byte *bw_start; + // End of the data written to the buffer + byte *bw_pointer; + // Error code + int bw_error; + // Have writes been done? + bool bw_used; +}; + +#ifdef __KERNEL__ +/*****************************************************************************/ +__attribute__((warn_unused_result)) +int prepareNextBuffer(BufferedWriter *bw) +{ + if (bw->bw_blockNumber >= bw->bw_limit) { + bw->bw_error = UDS_OUT_OF_RANGE; + return UDS_OUT_OF_RANGE; + } + + struct dm_buffer *buffer = NULL; + void *data = dm_bufio_new(bw->bw_client, bw->bw_blockNumber, &buffer); + if (IS_ERR(data)) { + bw->bw_error = -PTR_ERR(data); + return bw->bw_error; + } + bw->bw_buffer = buffer; + bw->bw_start = data; + bw->bw_pointer = data; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int flushPreviousBuffer(BufferedWriter *bw) +{ + if (bw->bw_buffer != NULL) { + if (bw->bw_error == UDS_SUCCESS) { + size_t avail = spaceRemainingInWriteBuffer(bw); + if (avail > 0) { + memset(bw->bw_pointer, 0, avail); + } + dm_bufio_mark_buffer_dirty(bw->bw_buffer); + } + dm_bufio_release(bw->bw_buffer); + bw->bw_buffer = NULL; + bw->bw_start = NULL; + bw->bw_pointer = NULL; + bw->bw_blockNumber++; + } + return bw->bw_error; +} +#endif + +/*****************************************************************************/ +#ifdef __KERNEL__ +int makeBufferedWriter(IOFactory *factory, + struct dm_bufio_client *client, + sector_t blockLimit, + BufferedWriter **writerPtr) +{ + BufferedWriter *writer; + int result = ALLOCATE(1, BufferedWriter, "buffered writer", &writer); + if (result != UDS_SUCCESS) { + return result; + } + + *writer = (BufferedWriter) { + .bw_factory = factory, + .bw_client = client, + .bw_buffer = NULL, + .bw_limit = blockLimit, + .bw_start = NULL, + .bw_pointer = NULL, + .bw_blockNumber = 0, + .bw_error = UDS_SUCCESS, + .bw_used = false, + }; + + getIOFactory(factory); + *writerPtr = writer; + return UDS_SUCCESS; +} +#else +int makeBufferedWriter(IORegion *region, BufferedWriter **writerPtr) +{ + byte *data; + int result = ALLOCATE_IO_ALIGNED(UDS_BLOCK_SIZE, byte, + "buffer writer buffer", &data); + if (result != UDS_SUCCESS) { + return result; + } + + BufferedWriter *writer; + result = ALLOCATE(1, BufferedWriter, "buffered writer", &writer); + if (result != UDS_SUCCESS) { + FREE(data); + return result; + } + + *writer = (BufferedWriter) { + .bw_region = region, + .bw_start = data, + .bw_pointer = data, + .bw_blockNumber = 0, + .bw_error = UDS_SUCCESS, + .bw_used = false, + }; + + getIORegion(region); + *writerPtr = writer; + return UDS_SUCCESS; +} +#endif + +/*****************************************************************************/ +void freeBufferedWriter(BufferedWriter *bw) +{ + if (bw == NULL) { + 
return; + } +#ifdef __KERNEL__ + flushPreviousBuffer(bw); + int result = -dm_bufio_write_dirty_buffers(bw->bw_client); +#else + int result = syncRegionContents(bw->bw_region); +#endif + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "%s cannot sync storage", __func__); + } +#ifdef __KERNEL__ + dm_bufio_client_destroy(bw->bw_client); + putIOFactory(bw->bw_factory); +#else + putIORegion(bw->bw_region); + FREE(bw->bw_start); +#endif + FREE(bw); +} + +/*****************************************************************************/ +static INLINE size_t spaceUsedInBuffer(BufferedWriter *bw) +{ + return bw->bw_pointer - bw->bw_start; +} + +/*****************************************************************************/ +size_t spaceRemainingInWriteBuffer(BufferedWriter *bw) +{ + return UDS_BLOCK_SIZE - spaceUsedInBuffer(bw); +} + +/*****************************************************************************/ +int writeToBufferedWriter(BufferedWriter *bw, const void *data, size_t len) +{ + if (bw->bw_error != UDS_SUCCESS) { + return bw->bw_error; + } + + const byte *dp = data; + int result = UDS_SUCCESS; + while ((len > 0) && (result == UDS_SUCCESS)) { +#ifdef __KERNEL__ + if (bw->bw_buffer == NULL) { + result = prepareNextBuffer(bw); + continue; + } +#endif + + size_t avail = spaceRemainingInWriteBuffer(bw); + size_t chunk = minSizeT(len, avail); + memcpy(bw->bw_pointer, dp, chunk); + len -= chunk; + dp += chunk; + bw->bw_pointer += chunk; + + if (spaceRemainingInWriteBuffer(bw) == 0) { + result = flushBufferedWriter(bw); + } + } + + bw->bw_used = true; + return result; +} + +/*****************************************************************************/ +int writeZerosToBufferedWriter(BufferedWriter *bw, size_t len) +{ + if (bw->bw_error != UDS_SUCCESS) { + return bw->bw_error; + } + + int result = UDS_SUCCESS; + while ((len > 0) && (result == UDS_SUCCESS)) { +#ifdef __KERNEL__ + if (bw->bw_buffer == NULL) { + result = prepareNextBuffer(bw); + continue; + } +#endif + + size_t avail = spaceRemainingInWriteBuffer(bw); + size_t chunk = minSizeT(len, avail); + memset(bw->bw_pointer, 0, chunk); + len -= chunk; + bw->bw_pointer += chunk; + + if (spaceRemainingInWriteBuffer(bw) == 0) { + result = flushBufferedWriter(bw); + } + } + + bw->bw_used = true; + return result; +} + +/*****************************************************************************/ +int flushBufferedWriter(BufferedWriter *bw) +{ + if (bw->bw_error != UDS_SUCCESS) { + return bw->bw_error; + } + +#ifdef __KERNEL__ + return flushPreviousBuffer(bw); +#else + size_t n = spaceUsedInBuffer(bw); + if (n > 0) { + int result = writeToRegion(bw->bw_region, + bw->bw_blockNumber * UDS_BLOCK_SIZE, + bw->bw_start, UDS_BLOCK_SIZE, n); + if (result != UDS_SUCCESS) { + return bw->bw_error = result; + } else { + bw->bw_pointer = bw->bw_start; + bw->bw_blockNumber++; + } + } + return UDS_SUCCESS; +#endif +} + +/*****************************************************************************/ +bool wasBufferedWriterUsed(const BufferedWriter *bw) +{ + return bw->bw_used; +} + +/*****************************************************************************/ +void noteBufferedWriterUsed(BufferedWriter *bw) +{ + bw->bw_used = true; +} diff --git a/uds/bufferedWriter.h b/uds/bufferedWriter.h new file mode 100644 index 0000000..8774b5b --- /dev/null +++ b/uds/bufferedWriter.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/bufferedWriter.h#5 $ + */ + +#ifndef BUFFERED_WRITER_H +#define BUFFERED_WRITER_H 1 + +#include "common.h" + +#ifdef __KERNEL__ +struct dm_bufio_client; +struct ioFactory; +#else +struct ioRegion; +#endif + +typedef struct bufferedWriter BufferedWriter; + +#ifdef __KERNEL__ +/** + * Make a new buffered writer. + * + * @param factory The IOFactory creating the buffered writer + * @param client The dm_bufio_client to write to. + * @param blockLimit The number of blocks that may be written to. + * @param writerPtr The new buffered writer goes here. + * + * @return UDS_SUCCESS or an error code. + **/ +int makeBufferedWriter(struct ioFactory *factory, + struct dm_bufio_client *client, + sector_t blockLimit, + BufferedWriter **writerPtr) + __attribute__((warn_unused_result)); +#else +/** + * Make a new buffered writer. + * + * @param region The IOregion to write to. + * @param writerPtr The new buffered writer goes here. + * + * @return UDS_SUCCESS or an error code. + **/ +int makeBufferedWriter(struct ioRegion *region, BufferedWriter **writerPtr) + __attribute__((warn_unused_result)); +#endif + +/** + * Free a buffered writer, without flushing. + * + * @param [in] buffer The buffered writer object. + **/ +void freeBufferedWriter(BufferedWriter *buffer); + +/** + * Append data to buffer, writing as needed. + * + * @param buffer The buffered writer object. + * @param data The data to write. + * @param len The length of the data written. + * + * @return UDS_SUCCESS or an error code. + * The error may reflect previous attempts to write + * or flush the buffer. Once a write or flush error + * occurs it is sticky. + **/ +int writeToBufferedWriter(BufferedWriter *buffer, const void *data, size_t len) + __attribute__((warn_unused_result)); + +/** + * Zero data in the buffer, writing as needed. + * + * @param buffer The buffered writer object. + * @param len The number of zero bytes to write. + * + * @return UDS_SUCCESS or an error code. + * The error may reflect previous attempts to write + * or flush the buffer. Once a write or flush error + * occurs it is sticky. + **/ +int writeZerosToBufferedWriter(BufferedWriter *bw, size_t len) + __attribute__((warn_unused_result)); + + +/** + * Flush any partial data from the buffer. + * + * @param buffer The buffered writer object. + * + * @return UDS_SUCCESS or an error code. + * The error may reflect previous attempts to write + * or flush the buffer. Once a write or flush error + * occurs it is sticky. + **/ +int flushBufferedWriter(BufferedWriter *buffer) + __attribute__((warn_unused_result)); + +/** + * Return the size of the remaining space in the buffer (for testing) + * + * @param [in] buffer The buffered writer object. + * + * @return The number of available bytes in the buffer. 
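+ * A full buffer is flushed automatically by writeToBufferedWriter() and + * writeZerosToBufferedWriter(), so this is primarily of interest to tests.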
+ **/ +size_t spaceRemainingInWriteBuffer(BufferedWriter *buffer) + __attribute__((warn_unused_result)); + +/** + * Return whether the buffer was ever written to. + * + * @param buffer The buffered writer object. + * + * @return True if at least one call to writeToBufferedWriter + * was made. + **/ +bool wasBufferedWriterUsed(const BufferedWriter *buffer) + __attribute__((warn_unused_result)); + +/** + * Note the buffer has been used. + * + * @param buffer The buffered writer object. + **/ +void noteBufferedWriterUsed(BufferedWriter *buffer); + +#endif // BUFFERED_WRITER_H diff --git a/uds/cacheCounters.c b/uds/cacheCounters.c new file mode 100644 index 0000000..8bf7ad4 --- /dev/null +++ b/uds/cacheCounters.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/cacheCounters.c#1 $ + */ + +#include "cacheCounters.h" + +#include "atomicDefs.h" +#include "compiler.h" +#include "errors.h" +#include "permassert.h" +#include "stringUtils.h" +#include "uds.h" + +/**********************************************************************/ +void incrementCacheCounter(CacheCounters *counters, + int probeType, + CacheResultKind kind) +{ + CacheProbeType basicProbeType = probeType & ~CACHE_PROBE_IGNORE_FAILURE; + int result = ASSERT(basicProbeType <= CACHE_PROBE_RECORD_RETRY, + "invalid cache probe type %#x", probeType); + if (result != UDS_SUCCESS) { + return; + } + result = ASSERT(kind <= CACHE_RESULT_QUEUED, + "invalid cache probe result type %#x", kind); + if (result != UDS_SUCCESS) { + return; + } + + if (((probeType & CACHE_PROBE_IGNORE_FAILURE) != 0) + && (kind != CACHE_RESULT_HIT)) { + return; + } + + CacheCountsByKind *kindCounts; + switch (basicProbeType) { + case CACHE_PROBE_INDEX_FIRST: + kindCounts = &counters->firstTime.indexPage; + break; + case CACHE_PROBE_RECORD_FIRST: + kindCounts = &counters->firstTime.recordPage; + break; + case CACHE_PROBE_INDEX_RETRY: + kindCounts = &counters->retried.indexPage; + break; + case CACHE_PROBE_RECORD_RETRY: + kindCounts = &counters->retried.recordPage; + break; + default: + // Never used but the compiler hasn't figured that out. + return; + } + + uint64_t *myCounter; + switch (kind) { + case CACHE_RESULT_MISS: + myCounter = &kindCounts->misses; + break; + case CACHE_RESULT_QUEUED: + myCounter = &kindCounts->queued; + break; + case CACHE_RESULT_HIT: + myCounter = &kindCounts->hits; + break; + default: + // Never used but the compiler hasn't figured that out. + return; + } + // XXX Vile case makes many assumptions. Counters should be declared atomic. 
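+ // As the XXX above notes, this cast relies on the plain uint64_t counters + // being usable as atomic64_t; only the increment itself is atomic.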
+ atomic64_inc((atomic64_t *) myCounter); +} diff --git a/uds/cacheCounters.h b/uds/cacheCounters.h new file mode 100644 index 0000000..9029453 --- /dev/null +++ b/uds/cacheCounters.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/cacheCounters.h#1 $ + */ + +#ifndef CACHE_COUNTERS_H +#define CACHE_COUNTERS_H + +#include "typeDefs.h" + +/** + * Basic counts of hits and misses for a given type of cache probe. + **/ +typedef struct cacheCountsByKind { + /** Number of hits */ + uint64_t hits; + /** Number of misses */ + uint64_t misses; + /** Number of probes for data already queued for read */ + uint64_t queued; +} CacheCountsByKind; + +/** + * The various types of cache probes we care about. + **/ +typedef enum cacheProbeType { + /** First attempt to look up an index page, for a given request. */ + CACHE_PROBE_INDEX_FIRST = 0, + /** First attempt to look up a record page, for a given request. */ + CACHE_PROBE_RECORD_FIRST, + /** Second or later attempt to look up an index page, for a given request. */ + CACHE_PROBE_INDEX_RETRY, + /** Second or later attempt to look up a record page, for a given request. */ + CACHE_PROBE_RECORD_RETRY +} CacheProbeType; + +enum { + /** Flag bit to indicate that failures shouldn't be recorded. */ + CACHE_PROBE_IGNORE_FAILURE = 128 +}; + +/** + * Result-type counts for both kinds of data pages in the page cache. + **/ +typedef struct cacheCountsByPageType { + /** Hit/miss counts for index pages. */ + CacheCountsByKind indexPage; + /** Hit/miss counts for record pages. */ + CacheCountsByKind recordPage; +} CacheCountsByPageType; + +/** + * All the counters used for an entry cache. + **/ +typedef struct cacheCounters { + // counters for the page cache + /** Hit/miss counts for the first attempt per request */ + CacheCountsByPageType firstTime; + /** Hit/miss counts when a second (or later) attempt is needed */ + CacheCountsByPageType retried; + + /** Number of cache entry invalidations due to single-entry eviction */ + uint64_t evictions; + /** Number of cache entry invalidations due to chapter expiration */ + uint64_t expirations; + + // counters for the sparse chapter index cache + /** Hit/miss counts for the sparse cache chapter probes */ + CacheCountsByKind sparseChapters; + /** Hit/miss counts for the sparse cache name searches */ + CacheCountsByKind sparseSearches; +} CacheCounters; + +/** + * Success/failure assessment of cache probe result. 
+ **/ +typedef enum cacheResultKind { + /** The requested entry was found in the cache */ + CACHE_RESULT_HIT, + /** The requested entry was not found in the cache */ + CACHE_RESULT_MISS, + /** The requested entry wasn't found in the cache but is queued for read */ + CACHE_RESULT_QUEUED +} CacheResultKind; + +/** + * Increment one of the cache counters. + * + * @param counters pointer to the counters + * @param probeType type of access done + * @param kind result of probe + **/ +void incrementCacheCounter(CacheCounters *counters, + int probeType, + CacheResultKind kind); + +#endif /* CACHE_COUNTERS_H */ diff --git a/uds/cachedChapterIndex.c b/uds/cachedChapterIndex.c new file mode 100644 index 0000000..ae0a22c --- /dev/null +++ b/uds/cachedChapterIndex.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/cachedChapterIndex.c#3 $ + */ + +#include "cachedChapterIndex.h" + +#include "memoryAlloc.h" + +/**********************************************************************/ +int initializeCachedChapterIndex(CachedChapterIndex *chapter, + const Geometry *geometry) +{ + chapter->virtualChapter = UINT64_MAX; + chapter->indexPagesCount = geometry->indexPagesPerChapter; + + int result = ALLOCATE(chapter->indexPagesCount, DeltaIndexPage, __func__, + &chapter->indexPages); + if (result != UDS_SUCCESS) { + return result; + } + + result = ALLOCATE(chapter->indexPagesCount, struct volume_page, + "sparse index VolumePages", &chapter->volumePages); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int i; + for (i = 0; i < chapter->indexPagesCount; i++) { + result = initializeVolumePage(geometry, &chapter->volumePages[i]); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +void destroyCachedChapterIndex(CachedChapterIndex *chapter) +{ + if (chapter->volumePages != NULL) { + unsigned int i; + for (i = 0; i < chapter->indexPagesCount; i++) { + destroyVolumePage(&chapter->volumePages[i]); + } + } + FREE(chapter->indexPages); + FREE(chapter->volumePages); +} + +/**********************************************************************/ +int cacheChapterIndex(CachedChapterIndex *chapter, + uint64_t virtualChapter, + const Volume *volume) +{ + // Mark the cached chapter as unused in case the update fails midway. + chapter->virtualChapter = UINT64_MAX; + + // Read all the page data and initialize the entire DeltaIndexPage array. + // (It's not safe for the zone threads to do it lazily--they'll race.) + int result = readChapterIndexFromVolume(volume, virtualChapter, + chapter->volumePages, + chapter->indexPages); + if (result != UDS_SUCCESS) { + return result; + } + + // Reset all chapter counter values to zero. 
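+ // The counters report activity since the chapter was cached, so they must + // start over for the incoming chapter.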
+ chapter->counters.searchHits = 0; + chapter->counters.searchMisses = 0; + chapter->counters.consecutiveMisses = 0; + + // Mark the entry as valid--it's now in the cache. + chapter->virtualChapter = virtualChapter; + chapter->skipSearch = false; + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int searchCachedChapterIndex(CachedChapterIndex *chapter, + const Geometry *geometry, + const IndexPageMap *indexPageMap, + const UdsChunkName *name, + int *recordPagePtr) +{ + // Find the indexPageNumber in the chapter that would have the chunk name. + unsigned int physicalChapter + = mapToPhysicalChapter(geometry, chapter->virtualChapter); + unsigned int indexPageNumber; + int result = findIndexPageNumber(indexPageMap, name, physicalChapter, + &indexPageNumber); + if (result != UDS_SUCCESS) { + return result; + } + + return searchChapterIndexPage(&chapter->indexPages[indexPageNumber], + geometry, name, recordPagePtr); +} diff --git a/uds/cachedChapterIndex.h b/uds/cachedChapterIndex.h new file mode 100644 index 0000000..f759d5d --- /dev/null +++ b/uds/cachedChapterIndex.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/cachedChapterIndex.h#3 $ + */ + +#ifndef CACHED_CHAPTER_INDEX_H +#define CACHED_CHAPTER_INDEX_H + +#include "chapterIndex.h" +#include "common.h" +#include "compiler.h" +#include "cpu.h" +#include "geometry.h" +#include "indexPageMap.h" +#include "typeDefs.h" +#include "volume.h" +#include "volumeStore.h" + +/** + * These counters are essentially fields of the CachedChapterIndex, but are + * segregated into this structure because they are frequently modified. They + * are grouped and aligned to keep them on different cache lines from the + * chapter fields that are accessed far more often than they are updated. + **/ +struct __attribute__((aligned(CACHE_LINE_BYTES))) cachedIndexCounters { + /** the total number of search hits since this chapter was cached */ + uint64_t searchHits; + + /** the total number of search misses since this chapter was cached */ + uint64_t searchMisses; + + /** the number of consecutive search misses since the last cache hit */ + uint64_t consecutiveMisses; +}; +typedef struct cachedIndexCounters CachedIndexCounters; + +/** + * CachedChapterIndex is the structure for a cache entry, representing a + * single cached chapter index in the sparse chapter index cache. + **/ +struct __attribute__((aligned(CACHE_LINE_BYTES))) cachedChapterIndex { + /* + * The virtual chapter number of the cached chapter index. UINT64_MAX means + * this cache entry is unused. Must only be modified in the critical section + * in updateSparseCache(). 
+ */ + uint64_t virtualChapter; + + /* The number of index pages in a chapter */ + unsigned int indexPagesCount; + + /* + * This flag is mutable between cache updates, but it rarely changes and + * is frequently accessed, so it groups with the immutable fields. + * + * If set, skip the chapter when searching the entire cache. This flag is + * just a performance optimization. If we do not see a recent change to it, + * it will be corrected when we pass through a memory barrier while getting + * the next request from the queue. So we may do one extra search of the + * chapter index, or miss one deduplication opportunity. + */ + bool skipSearch; + + // These pointers are immutable during the life of the cache. The contents + // of the arrays change when the cache entry is replaced. + + /* pointer to a cache-aligned array of ChapterIndexPages */ + DeltaIndexPage *indexPages; + + /* pointer to an array of VolumePages containing the index pages */ + struct volume_page *volumePages; + + // The cache-aligned counters change often and are placed at the end of the + // structure to prevent false sharing with the more stable fields above. + + /* counter values updated by the thread servicing zone zero */ + CachedIndexCounters counters; +}; +typedef struct cachedChapterIndex CachedChapterIndex; + +/** + * Initialize a CachedChapterIndex, allocating the memory for the array of + * ChapterIndexPages and the raw index page data. The chapter index will be + * marked as unused (virtualChapter == UINT64_MAX). + * + * @param chapter the chapter index cache entry to initialize + * @param geometry the geometry governing the volume + **/ +int initializeCachedChapterIndex(CachedChapterIndex *chapter, + const Geometry *geometry) + __attribute__((warn_unused_result)); + +/** + * Destroy a CachedChapterIndex, freeing the memory allocated for the + * ChapterIndexPages and raw index page data. + * + * @param chapter the chapter index cache entry to destroy + **/ +void destroyCachedChapterIndex(CachedChapterIndex *chapter); + +/** + * Assign a new value to the skipSearch flag of a cached chapter index. + * + * @param chapter the chapter index cache entry to modify + * @param skipSearch the new value of the skipSearch flag + **/ +static INLINE void setSkipSearch(CachedChapterIndex *chapter, bool skipSearch) +{ + // Explicitly check if the field is set so we don't keep dirtying the memory + // cache line on continued search hits. + if (READ_ONCE(chapter->skipSearch) != skipSearch) { + WRITE_ONCE(chapter->skipSearch, skipSearch); + } +} + +/** + * Check if a cached sparse chapter index should be skipped over in the search + * for a chunk name. Filters out unused, invalid, disabled, and irrelevant + * cache entries. + * + * @param zone the zone doing the check + * @param chapter the cache entry search candidate + * @param virtualChapter the virtualChapter containing a hook, or UINT64_MAX + * if searching the whole cache for a non-hook + * + * @return true if the provided chapter index should be skipped + **/ +static INLINE bool shouldSkipChapterIndex(const IndexZone *zone, + const CachedChapterIndex *chapter, + uint64_t virtualChapter) +{ + // Don't search unused entries (contents undefined) or invalid entries + // (the chapter is no longer the zone's view of the volume). 
+ if ((chapter->virtualChapter == UINT64_MAX) + || (chapter->virtualChapter < zone->oldestVirtualChapter)) { + return true; + } + + if (virtualChapter != UINT64_MAX) { + // If the caller specified a virtual chapter, only search the cache + // entry containing that chapter. + return (virtualChapter != chapter->virtualChapter); + } else { + // When searching the entire cache, save time by skipping over chapters + // that have had too many consecutive misses. + return READ_ONCE(chapter->skipSearch); + } +} + +/** + * Cache a chapter index, reading all the index pages from the volume and + * initializing the array of ChapterIndexPages in the cache entry to represent + * them. The virtualChapter field of the cache entry will be set to UINT64_MAX + * if there is any error since the remaining mutable fields will be in an + * undefined state. + * + * @param chapter the chapter index cache entry to replace + * @param virtualChapter the virtual chapter number of the index to read + * @param volume the volume containing the chapter index + * + * @return UDS_SUCCESS or an error code + **/ +int cacheChapterIndex(CachedChapterIndex *chapter, + uint64_t virtualChapter, + const Volume *volume) + __attribute__((warn_unused_result)); + +/** + * Search a single cached sparse chapter index for a chunk name, returning the + * record page number that may contain the name. + * + * @param [in] chapter the cache entry for the chapter to search + * @param [in] geometry the geometry governing the volume + * @param [in] indexPageMap the index page number map for the volume + * @param [in] name the chunk name to search for + * @param [out] recordPagePtr the record page number of a match, else + * NO_CHAPTER_INDEX_ENTRY if nothing matched + * + * @return UDS_SUCCESS or an error code + **/ +int searchCachedChapterIndex(CachedChapterIndex *chapter, + const Geometry *geometry, + const IndexPageMap *indexPageMap, + const UdsChunkName *name, + int *recordPagePtr) + __attribute__((warn_unused_result)); + +#endif /* CACHED_CHAPTER_INDEX_H */ diff --git a/uds/chapterIndex.c b/uds/chapterIndex.c new file mode 100644 index 0000000..5653a41 --- /dev/null +++ b/uds/chapterIndex.c @@ -0,0 +1,305 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/chapterIndex.c#5 $ + */ + +#include "chapterIndex.h" + +#include "compiler.h" +#include "errors.h" +#include "hashUtils.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "uds.h" + + +/**********************************************************************/ +int makeOpenChapterIndex(OpenChapterIndex **openChapterIndex, + const Geometry *geometry, + bool chapterIndexHeaderNativeEndian, + uint64_t volumeNonce) +{ + + int result = ALLOCATE(1, OpenChapterIndex, "open chapter index", + openChapterIndex); + if (result != UDS_SUCCESS) { + return result; + } + + // The delta index will rebalance delta lists when memory gets tight, so + // give the chapter index one extra page. + size_t memorySize + = (geometry->indexPagesPerChapter + 1) * geometry->bytesPerPage; + (*openChapterIndex)->geometry = geometry; + (*openChapterIndex)->volumeNonce = volumeNonce; + (*openChapterIndex)->headerNativeEndian = chapterIndexHeaderNativeEndian, + result = initializeDeltaIndex(&(*openChapterIndex)->deltaIndex, 1, + geometry->deltaListsPerChapter, + geometry->chapterMeanDelta, + geometry->chapterPayloadBits, memorySize); + if (result != UDS_SUCCESS) { + FREE(*openChapterIndex); + *openChapterIndex = NULL; + } + return result; +} + +/**********************************************************************/ +void freeOpenChapterIndex(OpenChapterIndex *openChapterIndex) +{ + if (openChapterIndex == NULL) { + return; + } + + + uninitializeDeltaIndex(&openChapterIndex->deltaIndex); + FREE(openChapterIndex); +} + +/**********************************************************************/ +void emptyOpenChapterIndex(OpenChapterIndex *openChapterIndex, + uint64_t virtualChapterNumber) +{ + emptyDeltaIndex(&openChapterIndex->deltaIndex); + openChapterIndex->virtualChapterNumber = virtualChapterNumber; +} + +/** + * Check whether a delta list entry reflects a successful search for a given + * address. + * + * @param entry the delta list entry from the search + * @param address the address of the desired entry + * + * @return true iff the address was found + **/ +static INLINE bool wasEntryFound(const DeltaIndexEntry *entry, + unsigned int address) +{ + return (!entry->atEnd && (entry->key == address)); +} + +/**********************************************************************/ +int putOpenChapterIndexRecord(OpenChapterIndex *openChapterIndex, + const UdsChunkName *name, + unsigned int pageNumber) +{ + const Geometry *geometry = openChapterIndex->geometry; + int result + = ASSERT_WITH_ERROR_CODE(pageNumber < geometry->recordPagesPerChapter, + UDS_INVALID_ARGUMENT, + "Page number within chapter (%u) exceeds" + " the maximum value %u", + pageNumber, geometry->recordPagesPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + + DeltaIndexEntry entry; + unsigned int address = hashToChapterDeltaAddress(name, geometry); + result = getDeltaIndexEntry(&openChapterIndex->deltaIndex, + hashToChapterDeltaList(name, geometry), + address, name->name, false, &entry); + if (result != UDS_SUCCESS) { + return result; + } + bool found = wasEntryFound(&entry, address); + result = ASSERT_WITH_ERROR_CODE(!(found && entry.isCollision), + UDS_BAD_STATE, + "Chunk appears more than once in chapter %" + PRIu64, + openChapterIndex->virtualChapterNumber); + if (result != UDS_SUCCESS) { + return result; + } + return putDeltaIndexEntry(&entry, address, pageNumber, + (found ? 
name->name : NULL)); +} + +/**********************************************************************/ +int packOpenChapterIndexPage(OpenChapterIndex *openChapterIndex, + byte *memory, + unsigned int firstList, + bool lastPage, + unsigned int *numLists) +{ + DeltaIndex *deltaIndex = &openChapterIndex->deltaIndex; + const Geometry *geometry = openChapterIndex->geometry; + unsigned int removals = 0; + for (;;) { + int result = packDeltaIndexPage(deltaIndex, openChapterIndex->volumeNonce, + openChapterIndex->headerNativeEndian, + memory, geometry->bytesPerPage, + openChapterIndex->virtualChapterNumber, + firstList, numLists); + if (result != UDS_SUCCESS) { + return result; + } + if ((firstList + *numLists) == geometry->deltaListsPerChapter) { + // All lists are packed + break; + } else if (*numLists == 0) { + // The next delta list does not fit on a page. This delta list will + // be removed. + } else if (lastPage) { + /* + * This is the last page and there are lists left unpacked, but all of + * the remaining lists must fit on the page. Find a list that contains + * entries and remove the entire list. Try the first list that does not + * fit. If it is empty, we will select the last list that already fits + * and has any entries. + */ + } else { + // This page is done + break; + } + if (removals == 0) { + DeltaIndexStats stats; + getDeltaIndexStats(deltaIndex, &stats); + logWarning("The chapter index for chapter %" PRIu64 + " contains %ld entries with %ld collisions", + openChapterIndex->virtualChapterNumber, + stats.recordCount, stats.collisionCount); + } + DeltaIndexEntry entry; + int listNumber = *numLists; + do { + if (listNumber < 0) { + return UDS_OVERFLOW; + } + result = startDeltaIndexSearch(deltaIndex, firstList + listNumber--, + 0, false, &entry); + if (result != UDS_SUCCESS) { + return result; + } + result = nextDeltaIndexEntry(&entry); + if (result != UDS_SUCCESS) { + return result; + } + } while (entry.atEnd); + do { + result = removeDeltaIndexEntry(&entry); + if (result != UDS_SUCCESS) { + return result; + } + removals++; + } while (!entry.atEnd); + } + if (removals > 0) { + logWarning("To avoid chapter index page overflow in chapter %" PRIu64 + ", %u entries were removed from the chapter index", + openChapterIndex->virtualChapterNumber, removals); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getOpenChapterIndexSize(OpenChapterIndex *openChapterIndex) +{ + DeltaIndexStats stats; + getDeltaIndexStats(&openChapterIndex->deltaIndex, &stats); + return stats.recordCount; +} + +/**********************************************************************/ +size_t getOpenChapterIndexMemoryAllocated(OpenChapterIndex *openChapterIndex) +{ + DeltaIndexStats stats; + getDeltaIndexStats(&openChapterIndex->deltaIndex, &stats); + return stats.memoryAllocated + sizeof(OpenChapterIndex); +} + +/**********************************************************************/ +int initializeChapterIndexPage(DeltaIndexPage *chapterIndexPage, + const Geometry *geometry, + byte *indexPage, + uint64_t volumeNonce) +{ + return initializeDeltaIndexPage(chapterIndexPage, volumeNonce, + geometry->chapterMeanDelta, + geometry->chapterPayloadBits, + indexPage, geometry->bytesPerPage); +} + +/**********************************************************************/ +int validateChapterIndexPage(const DeltaIndexPage *chapterIndexPage, + const Geometry *geometry) +{ + const DeltaIndex *deltaIndex = &chapterIndexPage->deltaIndex; + unsigned int first = 
chapterIndexPage->lowestListNumber; + unsigned int last = chapterIndexPage->highestListNumber; + // We walk every delta list from start to finish. + unsigned int listNumber; + for (listNumber = first; listNumber <= last; listNumber++) { + DeltaIndexEntry entry; + int result = startDeltaIndexSearch(deltaIndex, listNumber - first, 0, true, + &entry); + if (result != UDS_SUCCESS) { + return result; + } + for (;;) { + result = nextDeltaIndexEntry(&entry); + if (result != UDS_SUCCESS) { + if (result == UDS_CORRUPT_DATA) { + // A random bit stream is highly likely to arrive here when we go + // past the end of the delta list + return UDS_CORRUPT_COMPONENT; + } + return result; + } + if (entry.atEnd) { + break; + } + // Also make sure that the record page field contains a plausible value + if (getDeltaEntryValue(&entry) >= geometry->recordPagesPerChapter) { + // Do not log this as an error. It happens in normal operation when + // we are doing a rebuild but haven't written the entire volume once. + return UDS_CORRUPT_COMPONENT; + } + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int searchChapterIndexPage(DeltaIndexPage *chapterIndexPage, + const Geometry *geometry, + const UdsChunkName *name, + int *recordPagePtr) +{ + DeltaIndex *deltaIndex = &chapterIndexPage->deltaIndex; + unsigned int address = hashToChapterDeltaAddress(name, geometry); + unsigned int deltaListNumber = hashToChapterDeltaList(name, geometry); + unsigned int subListNumber + = deltaListNumber - chapterIndexPage->lowestListNumber;; + DeltaIndexEntry entry; + int result = getDeltaIndexEntry(deltaIndex, subListNumber, address, + name->name, true, &entry); + if (result != UDS_SUCCESS) { + return result; + } + + if (wasEntryFound(&entry, address)) { + *recordPagePtr = getDeltaEntryValue(&entry); + } else { + *recordPagePtr = NO_CHAPTER_INDEX_ENTRY; + } + return UDS_SUCCESS; +} diff --git a/uds/chapterIndex.h b/uds/chapterIndex.h new file mode 100644 index 0000000..4dd425b --- /dev/null +++ b/uds/chapterIndex.h @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/chapterIndex.h#4 $ + */ + +#ifndef CHAPTER_INDEX_H +#define CHAPTER_INDEX_H 1 + +#include "deltaIndex.h" +#include "geometry.h" + +enum { + // The value returned as the record page number when an entry is not found + // in the chapter index. + NO_CHAPTER_INDEX_ENTRY = -1 +}; + +typedef struct openChapterIndex { + const Geometry *geometry; + DeltaIndex deltaIndex; + uint64_t virtualChapterNumber; + bool headerNativeEndian; + uint64_t volumeNonce; +} OpenChapterIndex; + + +/** + * Make a new open chapter index. 
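+ *
+ * A rough lifecycle sketch, using only functions declared in this header
+ * (illustrative variable names, error checking omitted; the chapter writer
+ * owns the real sequence):
+ *
+ *   OpenChapterIndex *oci;
+ *   makeOpenChapterIndex(&oci, geometry, true, volumeNonce);
+ *   emptyOpenChapterIndex(oci, virtualChapterNumber);
+ *   putOpenChapterIndexRecord(oci, &chunkName, recordPageNumber);
+ *   unsigned int listCount;
+ *   packOpenChapterIndexPage(oci, pageBuffer, 0, false, &listCount);
+ *   freeOpenChapterIndex(oci);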
+ * + * @param openChapterIndex Location to hold new open chapter index pointer + * @param geometry The geometry + * @param chapterIndexHeaderNativeEndian chapter index header format + * @param volumeNonce The volume nonce. + * + * @return error code or UDS_SUCCESS + **/ +int makeOpenChapterIndex(OpenChapterIndex **openChapterIndex, + const Geometry *geometry, + bool chapterIndexHeaderNativeEndian, + uint64_t volumeNonce) + __attribute__((warn_unused_result)); + +/** + * Terminate and clean up an open chapter index. + * + * @param openChapterIndex The open chapter index to terminate + **/ +void freeOpenChapterIndex(OpenChapterIndex *openChapterIndex); + +/** + * Empty an open chapter index, and prepare it for writing a new virtual + * chapter. + * + * @param openChapterIndex The open chapter index to empty + * @param virtualChapterNumber The virtual chapter number + **/ +void emptyOpenChapterIndex(OpenChapterIndex *openChapterIndex, + uint64_t virtualChapterNumber); + +/** + * Create a new record in an open chapter index, associating a chunk name with + * the number of the record page containing the metadata for the chunk. + * + * @param openChapterIndex The open chapter index + * @param name The chunk name + * @param pageNumber The number of the record page containing the name + * + * @return UDS_SUCCESS or an error code + **/ +int putOpenChapterIndexRecord(OpenChapterIndex *openChapterIndex, + const UdsChunkName *name, + unsigned int pageNumber) + __attribute__((warn_unused_result)); + +/** + * Pack a section of an open chapter index into a chapter index page. A + * range of delta lists (starting with a specified list index) is copied + * from the open chapter index into a memory page. The number of lists + * copied onto the page is returned to the caller. + * + * @param openChapterIndex The open chapter index + * @param memory The memory page to use + * @param firstList The first delta list number to be copied + * @param lastPage If true, this is the last page of the chapter + * index and all the remaining lists must be packed + * onto this page + * @param numLists The number of delta lists that were copied + * + * @return error code or UDS_SUCCESS. On UDS_SUCCESS, the numLists + * argument contains the number of lists copied. + **/ +int packOpenChapterIndexPage(OpenChapterIndex *openChapterIndex, + byte *memory, + unsigned int firstList, + bool lastPage, + unsigned int *numLists) + __attribute__((warn_unused_result)); + +/** + * Get the number of records in an open chapter index. + * + * @param openChapterIndex The open chapter index + * + * @return The number of records + **/ +int getOpenChapterIndexSize(OpenChapterIndex *openChapterIndex) + __attribute__((warn_unused_result)); + +/** + * Get the number of bytes allocated for the open chapter index. + * + * @param openChapterIndex The open chapter index + * + * @return the number of bytes allocated + **/ +size_t getOpenChapterIndexMemoryAllocated(OpenChapterIndex *openChapterIndex); + +/** + * Make a new chapter index page, initializing it with the data from the + * given buffer. 
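+ *
+ * A rough read-path sketch (illustrative names, error checking omitted):
+ *
+ *   DeltaIndexPage page;
+ *   initializeChapterIndexPage(&page, geometry, rawIndexPage, volumeNonce);
+ *   int recordPage;
+ *   searchChapterIndexPage(&page, geometry, &chunkName, &recordPage);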
+ * + * @param chapterIndexPage The new chapter index page + * @param geometry The geometry + * @param indexPage The memory page to use + * @param volumeNonce If non-zero, the volume nonce to verify + * + * @return UDS_SUCCESS or an error code + **/ +int initializeChapterIndexPage(DeltaIndexPage *chapterIndexPage, + const Geometry *geometry, + byte *indexPage, + uint64_t volumeNonce) + __attribute__((warn_unused_result)); + +/** + * Validate a chapter index page. This is called at rebuild time to ensure + * that the volume file contains a coherent chapter index. + * + * @param chapterIndexPage The chapter index page + * @param geometry The geometry of the volume + * + * @return The result code: + * UDS_SUCCESS for a good chapter index page + * UDS_CORRUPT_COMPONENT if the chapter index code detects invalid data + * UDS_CORRUPT_DATA if there is a problem in a delta list bit stream + * UDS_BAD_STATE if the code follows an invalid code path + **/ +int validateChapterIndexPage(const DeltaIndexPage *chapterIndexPage, + const Geometry *geometry) + __attribute__((warn_unused_result)); + +/** + * Search a chapter index page for a chunk name, returning the record page + * number that may contain the name. + * + * @param [in] chapterIndexPage The chapter index page + * @param [in] geometry The geometry of the volume + * @param [in] name The chunk name + * @param [out] recordPagePtr The record page number + * or NO_CHAPTER_INDEX_ENTRY if not found + * + * @return UDS_SUCCESS or an error code + **/ +int searchChapterIndexPage(DeltaIndexPage *chapterIndexPage, + const Geometry *geometry, + const UdsChunkName *name, + int *recordPagePtr) + __attribute__((warn_unused_result)); + +#endif /* CHAPTER_INDEX_H */ diff --git a/uds/chapterWriter.c b/uds/chapterWriter.c new file mode 100644 index 0000000..3a926ab --- /dev/null +++ b/uds/chapterWriter.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/chapterWriter.c#2 $
+ */
+
+#include "chapterWriter.h"
+
+#include "errors.h"
+#include "index.h"
+#include "indexCheckpoint.h"
+#include "indexComponent.h"
+#include "logger.h"
+#include "memoryAlloc.h"
+#include "openChapter.h"
+#include "threads.h"
+
+
+struct chapterWriter {
+  /* The index to which we belong */
+  Index *index;
+  /* The thread to do the writing */
+  Thread thread;
+  /* lock protecting the following fields */
+  Mutex mutex;
+  /* condition signalled on state changes */
+  CondVar cond;
+  /* Set to true to stop the thread */
+  bool stop;
+  /* The result from the most recent write */
+  int result;
+  /* The number of bytes allocated by the chapter writer */
+  size_t memoryAllocated;
+  /* The number of zones which have submitted a chapter for writing */
+  unsigned int zonesToWrite;
+  /* Open chapter index used by closeOpenChapter() */
+  OpenChapterIndex *openChapterIndex;
+  /* Collated records used by closeOpenChapter() */
+  UdsChunkRecord *collatedRecords;
+  /* The chapters to write (one per zone) */
+  OpenChapterZone *chapters[];
+};
+
+/**
+ * This is the driver function for the writer thread. It loops until
+ * terminated, waiting for a chapter to be provided to close.
+ **/
+static void closeChapters(void *arg)
+{
+  ChapterWriter *writer = arg;
+  logDebug("chapter writer starting");
+  lockMutex(&writer->mutex);
+  for (;;) {
+    while (writer->zonesToWrite < writer->index->zoneCount) {
+      if (writer->stop && (writer->zonesToWrite == 0)) {
+        // We've been told to stop, and all of the zones are in the same
+        // open chapter, so we can exit now.
+        unlockMutex(&writer->mutex);
+        logDebug("chapter writer stopping");
+        return;
+      }
+      waitCond(&writer->cond, &writer->mutex);
+    }
+
+    /*
+     * Release the lock while closing a chapter. We probably don't need to do
+     * this, but it seems safer in principle. It's OK to access the chapter
+     * and chapterNumber fields without the lock since those aren't allowed to
+     * change until we're done.
+     */
+    unlockMutex(&writer->mutex);
+
+    if (writer->index->hasSavedOpenChapter) {
+      writer->index->hasSavedOpenChapter = false;
+      /*
+       * Remove the saved open chapter as that chapter is about to be written
+       * to the volume. This matters the first time we close the open chapter
+       * after loading from a clean shutdown, or after doing a clean save.
+ */ + IndexComponent *oc = findIndexComponent(writer->index->state, + &OPEN_CHAPTER_INFO); + int result = discardIndexComponent(oc); + if (result == UDS_SUCCESS) { + logDebug("Discarding saved open chapter"); + } + } + + int result = closeOpenChapter(writer->chapters, + writer->index->zoneCount, + writer->index->volume, + writer->openChapterIndex, + writer->collatedRecords, + writer->index->newestVirtualChapter); + + if (result == UDS_SUCCESS) { + result = processChapterWriterCheckpointSaves(writer->index); + } + + + lockMutex(&writer->mutex); + // Note that the index is totally finished with the writing chapter + advanceActiveChapters(writer->index); + writer->result = result; + writer->zonesToWrite = 0; + broadcastCond(&writer->cond); + } +} + +/**********************************************************************/ +int makeChapterWriter(Index *index, + const struct index_version *indexVersion, + ChapterWriter **writerPtr) +{ + size_t collatedRecordsSize + = (sizeof(UdsChunkRecord) + * (1 + index->volume->geometry->recordsPerChapter)); + ChapterWriter *writer; + int result = ALLOCATE_EXTENDED(ChapterWriter, + index->zoneCount, OpenChapterZone *, + "Chapter Writer", &writer); + if (result != UDS_SUCCESS) { + return result; + } + writer->index = index; + + result = initMutex(&writer->mutex); + if (result != UDS_SUCCESS) { + FREE(writer); + return result; + } + result = initCond(&writer->cond); + if (result != UDS_SUCCESS) { + destroyMutex(&writer->mutex); + FREE(writer); + return result; + } + + // Now that we have the mutex+cond, it is safe to call freeChapterWriter. + result = allocateCacheAligned(collatedRecordsSize, "collated records", + &writer->collatedRecords); + if (result != UDS_SUCCESS) { + freeChapterWriter(writer); + return makeUnrecoverable(result); + } + result = makeOpenChapterIndex(&writer->openChapterIndex, + index->volume->geometry, + indexVersion->chapterIndexHeaderNativeEndian, + index->volume->nonce); + if (result != UDS_SUCCESS) { + freeChapterWriter(writer); + return makeUnrecoverable(result); + } + + size_t openChapterIndexMemoryAllocated + = getOpenChapterIndexMemoryAllocated(writer->openChapterIndex); + writer->memoryAllocated = (sizeof(ChapterWriter) + + index->zoneCount * sizeof(OpenChapterZone *) + + collatedRecordsSize + + openChapterIndexMemoryAllocated); + + // We're initialized, so now it's safe to start the writer thread. 
+ result = createThread(closeChapters, writer, "writer", &writer->thread); + if (result != UDS_SUCCESS) { + freeChapterWriter(writer); + return makeUnrecoverable(result); + } + + *writerPtr = writer; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeChapterWriter(ChapterWriter *writer) +{ + if (writer == NULL) { + return; + } + + int result __attribute__((unused)) = stopChapterWriter(writer); + destroyMutex(&writer->mutex); + destroyCond(&writer->cond); + freeOpenChapterIndex(writer->openChapterIndex); + FREE(writer->collatedRecords); + FREE(writer); +} + +/**********************************************************************/ +unsigned int startClosingChapter(ChapterWriter *writer, + unsigned int zoneNumber, + OpenChapterZone *chapter) +{ + lockMutex(&writer->mutex); + unsigned int finishedZones = ++writer->zonesToWrite; + writer->chapters[zoneNumber] = chapter; + broadcastCond(&writer->cond); + unlockMutex(&writer->mutex); + + return finishedZones; +} + +/**********************************************************************/ +int finishPreviousChapter(ChapterWriter *writer, uint64_t currentChapterNumber) +{ + int result; + lockMutex(&writer->mutex); + while (writer->index->newestVirtualChapter < currentChapterNumber) { + waitCond(&writer->cond, &writer->mutex); + } + result = writer->result; + unlockMutex(&writer->mutex); + + if (result != UDS_SUCCESS) { + return logUnrecoverable(result, "Writing of previous open chapter failed"); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +void waitForIdleChapterWriter(ChapterWriter *writer) +{ + lockMutex(&writer->mutex); + while (writer->zonesToWrite > 0) { + // The chapter writer is probably writing a chapter. If it is not, it will + // soon wake up and write a chapter. + waitCond(&writer->cond, &writer->mutex); + } + unlockMutex(&writer->mutex); +} + +/**********************************************************************/ +int stopChapterWriter(ChapterWriter *writer) +{ + Thread writerThread = 0; + + lockMutex(&writer->mutex); + if (writer->thread != 0) { + writerThread = writer->thread; + writer->thread = 0; + writer->stop = true; + broadcastCond(&writer->cond); + } + int result = writer->result; + unlockMutex(&writer->mutex); + + if (writerThread != 0) { + joinThreads(writerThread); + } + + if (result != UDS_SUCCESS) { + return logUnrecoverable(result, "Writing of previous open chapter failed"); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +size_t getChapterWriterMemoryAllocated(ChapterWriter *writer) +{ + return writer->memoryAllocated; +} diff --git a/uds/chapterWriter.h b/uds/chapterWriter.h new file mode 100644 index 0000000..85c1f42 --- /dev/null +++ b/uds/chapterWriter.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/chapterWriter.h#2 $
+ */
+
+#ifndef CHAPTER_WRITER_H
+#define CHAPTER_WRITER_H
+
+#include "atomicDefs.h"
+#include "indexVersion.h"
+#include "openChapterZone.h"
+
+typedef struct chapterWriter ChapterWriter;
+
+// This opaque declaration breaks the dependency loop with index.h
+struct index;
+
+
+/**
+ * Create a chapter writer and start its thread.
+ *
+ * @param index the index containing the chapters to be written
+ * @param indexVersion the index version parameters
+ * @param writerPtr pointer to hold the new writer
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int makeChapterWriter(struct index *index,
+                      const struct index_version *indexVersion,
+                      ChapterWriter **writerPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Free a chapter writer, waiting for its thread to finish.
+ *
+ * @param writer the chapter writer to destroy
+ **/
+void freeChapterWriter(ChapterWriter *writer);
+
+/**
+ * Asynchronously close and write a chapter by passing it to the writer
+ * thread. Writing won't start until all zones have submitted a chapter.
+ *
+ * @param writer the chapter writer
+ * @param zoneNumber the number of the zone submitting a chapter
+ * @param chapter the chapter to write
+ *
+ * @return The number of zones which have submitted the current chapter
+ **/
+unsigned int startClosingChapter(ChapterWriter *writer,
+                                 unsigned int zoneNumber,
+                                 OpenChapterZone *chapter)
+  __attribute__((warn_unused_result));
+
+/**
+ * Wait for the chapter writer thread to finish closing the chapter previous
+ * to the one specified.
+ *
+ * @param writer the chapter writer
+ * @param currentChapterNumber the current chapter number
+ *
+ * @return UDS_SUCCESS or an error code from the most recent write
+ *         request
+ **/
+int finishPreviousChapter(ChapterWriter *writer, uint64_t currentChapterNumber)
+  __attribute__((warn_unused_result));
+
+
+/**
+ * Wait for the chapter writer thread to finish all writes to storage.
+ *
+ * @param writer the chapter writer
+ **/
+void waitForIdleChapterWriter(ChapterWriter *writer);
+
+/**
+ * Stop the chapter writer and wait for it to finish.
+ *
+ * @param writer the chapter writer to stop
+ *
+ * @return UDS_SUCCESS or an error code from the most recent write
+ *         request
+ **/
+int stopChapterWriter(ChapterWriter *writer)
+  __attribute__((warn_unused_result));
+
+/**
+ * Get the number of bytes allocated for the chapter writer.
+ *
+ * @param writer the chapter writer
+ *
+ * @return the number of bytes allocated
+ **/
+size_t getChapterWriterMemoryAllocated(ChapterWriter *writer);
+
+#endif /* CHAPTER_WRITER_H */
diff --git a/uds/common.h b/uds/common.h
new file mode 100644
index 0000000..bea27e5
--- /dev/null
+++ b/uds/common.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/common.h#1 $
+ */
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include "stringUtils.h"
+#include "typeDefs.h"
+#include "uds.h"
+#include "uds-block.h"
+
+enum {
+  KILOBYTE = 1024,
+  MEGABYTE = KILOBYTE * KILOBYTE,
+  GIGABYTE = KILOBYTE * MEGABYTE
+};
+
+typedef struct udsChunkData UdsChunkData;
+
+typedef struct {
+  UdsChunkName name;
+  UdsChunkData data;
+} UdsChunkRecord;
+
+#endif /* COMMON_H */
diff --git a/uds/compiler.h b/uds/compiler.h
new file mode 100644
index 0000000..cd57590
--- /dev/null
+++ b/uds/compiler.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/compiler.h#1 $
+ */
+
+#ifndef COMMON_COMPILER_H
+#define COMMON_COMPILER_H
+
+#include "compilerDefs.h"
+
+// Count the elements in a static array while attempting to catch some type
+// errors. (See http://stackoverflow.com/a/1598827 for an explanation.)
+#define COUNT_OF(x) ((sizeof(x) / sizeof(0[x])) \
+                     / ((size_t) (!(sizeof(x) % sizeof(0[x])))))
+
+#define const_container_of(ptr, type, member)                       \
+  __extension__ ({                                                  \
+      const __typeof__(((type *)0)->member) *__mptr = (ptr);        \
+      (const type *)((const char *)__mptr - offsetof(type,member)); \
+    })
+
+// The "inline" keyword alone takes effect only when the optimization level
+// is high enough. Define INLINE to force gcc to "always inline".
+#define INLINE __attribute__((always_inline)) inline
+
+#endif /* COMMON_COMPILER_H */
diff --git a/uds/compilerDefs.h b/uds/compilerDefs.h
new file mode 100644
index 0000000..cc81ce2
--- /dev/null
+++ b/uds/compilerDefs.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/kernelLinux/uds/compilerDefs.h#1 $
+ */
+
+#ifndef LINUX_KERNEL_COMPILER_DEFS_H
+#define LINUX_KERNEL_COMPILER_DEFS_H
+
+#include <linux/compiler.h>
+
+#define __STRING(x) #x
+
+#endif /* LINUX_KERNEL_COMPILER_DEFS_H */
diff --git a/uds/config.c b/uds/config.c
new file mode 100644
index 0000000..a953da3
--- /dev/null
+++ b/uds/config.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/config.c#2 $
+ */
+
+#include "config.h"
+
+#include "logger.h"
+#include "memoryAlloc.h"
+#include "stringUtils.h"
+
+/**********************************************************************/
+void freeIndexLocation(IndexLocation *loc)
+{
+  if (loc == NULL) {
+    return;
+  }
+
+  FREE(loc->host);
+  FREE(loc->port);
+  FREE(loc->directory);
+}
+
+/**********************************************************************/
+bool areUdsConfigurationsEqual(UdsConfiguration a, UdsConfiguration b)
+{
+  bool result = true;
+  if (a->recordPagesPerChapter != b->recordPagesPerChapter) {
+    logError("Record pages per chapter (%u) does not match (%u)",
+             a->recordPagesPerChapter, b->recordPagesPerChapter);
+    result = false;
+  }
+  if (a->chaptersPerVolume != b->chaptersPerVolume) {
+    logError("Chapter count (%u) does not match (%u)",
+             a->chaptersPerVolume, b->chaptersPerVolume);
+    result = false;
+  }
+  if (a->sparseChaptersPerVolume != b->sparseChaptersPerVolume) {
+    logError("Sparse chapter count (%u) does not match (%u)",
+             a->sparseChaptersPerVolume, b->sparseChaptersPerVolume);
+    result = false;
+  }
+  if (a->cacheChapters != b->cacheChapters) {
+    logError("Cache size (%u) does not match (%u)",
+             a->cacheChapters, b->cacheChapters);
+    result = false;
+  }
+  if (a->masterIndexMeanDelta != b->masterIndexMeanDelta) {
+    logError("Master index mean delta (%u) does not match (%u)",
+             a->masterIndexMeanDelta, b->masterIndexMeanDelta);
+    result = false;
+  }
+  if (a->bytesPerPage != b->bytesPerPage) {
+    logError("Bytes per page value (%u) does not match (%u)",
+             a->bytesPerPage, b->bytesPerPage);
+    result = false;
+  }
+  if (a->sparseSampleRate != b->sparseSampleRate) {
+    logError("Sparse sample rate (%u) does not match (%u)",
+             a->sparseSampleRate, b->sparseSampleRate);
+    result = false;
+  }
+  if (a->nonce != b->nonce) {
+    logError("Nonce (%llu) does not match (%llu)",
+             a->nonce, b->nonce);
+    result = false;
+  }
+  return result;
+}
+
+/**********************************************************************/
+void logUdsConfiguration(UdsConfiguration conf)
+{
+  logDebug("Configuration:");
+  logDebug(" Record pages per chapter: %10u", conf->recordPagesPerChapter);
+  logDebug(" Chapters per volume: %10u", conf->chaptersPerVolume);
+  logDebug(" Sparse chapters per volume: %10u", conf->sparseChaptersPerVolume);
+  logDebug(" Cache size 
(chapters): %10u", conf->cacheChapters); + logDebug(" Master index mean delta: %10u", conf->masterIndexMeanDelta); + logDebug(" Bytes per page: %10u", conf->bytesPerPage); + logDebug(" Sparse sample rate: %10u", conf->sparseSampleRate); + logDebug(" Nonce: %llu", conf->nonce); +} diff --git a/uds/config.h b/uds/config.h new file mode 100644 index 0000000..f31efab --- /dev/null +++ b/uds/config.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/config.h#2 $ + */ + +#ifndef CONFIG_H +#define CONFIG_H + +#include "bufferedReader.h" +#include "bufferedWriter.h" +#include "geometry.h" +#include "uds.h" + +enum { + DEFAULT_MASTER_INDEX_MEAN_DELTA = 4096, + DEFAULT_CACHE_CHAPTERS = 7, + DEFAULT_SPARSE_SAMPLE_RATE = 0 +}; + +/** + * Data that are used for configuring a new index. + **/ +struct udsConfiguration { + /** Smaller (16), Small (64) or large (256) indices */ + unsigned int recordPagesPerChapter; + /** Total number of chapters per volume */ + unsigned int chaptersPerVolume; + /** Number of sparse chapters per volume */ + unsigned int sparseChaptersPerVolume; + /** Size of the page cache, in chapters */ + unsigned int cacheChapters; + /** Frequency with which to checkpoint */ + // XXX the checkpointFrequency is not used - it is now a runtime parameter + unsigned int checkpointFrequency; + /** The master index mean delta to use */ + unsigned int masterIndexMeanDelta; + /** Size of a page, used for both record pages and index pages */ + unsigned int bytesPerPage; + /** Sampling rate for sparse indexing */ + unsigned int sparseSampleRate; + /** Index Owner's nonce */ + UdsNonce nonce; +}; + +/** + * Data that are used for a 6.01 index. + **/ +struct udsConfiguration6_01 { + /** Smaller (16), Small (64) or large (256) indices */ + unsigned int recordPagesPerChapter; + /** Total number of chapters per volume */ + unsigned int chaptersPerVolume; + /** Number of sparse chapters per volume */ + unsigned int sparseChaptersPerVolume; + /** Size of the page cache, in chapters */ + unsigned int cacheChapters; + /** Frequency with which to checkpoint */ + unsigned int checkpointFrequency; + /** The master index mean delta to use */ + unsigned int masterIndexMeanDelta; + /** Size of a page, used for both record pages and index pages */ + unsigned int bytesPerPage; + /** Sampling rate for sparse indexing */ + unsigned int sparseSampleRate; +}; + +typedef struct indexLocation { + char *host; + char *port; + char *directory; +} IndexLocation; + +/** + * A set of configuration parameters for the indexer. + **/ +typedef struct configuration Configuration; + +/** + * Construct a new indexer configuration. 
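+ *
+ * A rough usage sketch (the UdsConfiguration would normally come from the
+ * public configuration calls in uds.h; error checking omitted):
+ *
+ *   Configuration *config = NULL;
+ *   int result = makeConfiguration(udsConf, &config);
+ *   if (result == UDS_SUCCESS) {
+ *     ...
+ *     freeConfiguration(config);
+ *   }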
+ * + * @param conf UdsConfiguration to use + * @param configPtr The new index configuration + * + * @return UDS_SUCCESS or an error code + **/ +int makeConfiguration(UdsConfiguration conf, + Configuration **configPtr) + __attribute__((warn_unused_result)); + +/** + * Clean up the configuration struct. + **/ +void freeConfiguration(Configuration *config); + +/** + * Read the index configuration from stable storage. + * + * @param reader A buffered reader. + * @param config The index configuration to overwrite. + * + * @return UDS_SUCCESS or an error code. + **/ +int readConfigContents(BufferedReader *reader, + UdsConfiguration config) + __attribute__((warn_unused_result)); + +/** + * Write the index configuration information to stable storage. + * + * @param writer A buffered writer. + * @param config The index configuration. + * + * @return UDS_SUCCESS or an error code. + **/ +int writeConfigContents(BufferedWriter *writer, + UdsConfiguration config) + __attribute__((warn_unused_result)); + +/** + * Free the memory used by an IndexLocation. + * + * @param loc index location to free + **/ +void freeIndexLocation(IndexLocation *loc); + +/** + * Compare two configurations for equality. + * + * @param a The first configuration to compare + * @param b The second configuration to compare + * + * @return true iff they are equal + **/ +bool areUdsConfigurationsEqual(UdsConfiguration a, UdsConfiguration b) + __attribute__((warn_unused_result)); + +/** + * Log a user configuration. + * + * @param conf The configuration + **/ +void logUdsConfiguration(UdsConfiguration conf); + +#endif /* CONFIG_H */ diff --git a/uds/cpu.h b/uds/cpu.h new file mode 100644 index 0000000..9314985 --- /dev/null +++ b/uds/cpu.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/cpu.h#1 $ + */ + +#ifndef CPU_H +#define CPU_H + +#include "compiler.h" +#include "typeDefs.h" + +/** + * The number of bytes in a CPU cache line. In the future, we'll probably need + * to move this to a processor-specific file or discover it at compilation + * time (or runtime, if sufficiently heterogeneous), but this will do for now. + * (Must be a \#define since enums are not proper compile-time constants.) + **/ +#ifdef __PPC__ +// N.B.: Some PPC processors have smaller cache lines. +#define CACHE_LINE_BYTES 128 +#elif defined(__s390x__) +#define CACHE_LINE_BYTES 256 +#elif defined(__x86_64__) || defined(__aarch64__) +#define CACHE_LINE_BYTES 64 +#else +#error "unknown cache line size" +#endif + +/** + * Minimize cache-miss latency by moving data into a CPU cache before it is + * accessed. 
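+ *
+ * As a rough illustration, a scan loop might warm the line for an entry a
+ * few iterations ahead of the one it is about to read (illustrative names):
+ *
+ *   prefetchAddress(&entries[i + 4], false);
+ *   processEntry(&entries[i]);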
+ *
+ * @param address the address to fetch (may be invalid)
+ * @param forWrite must be constant at compile time--false if
+ *                 for reading, true if for writing
+ **/
+static INLINE void prefetchAddress(const void *address, bool forWrite)
+{
+  // forWrite won't be a constant if we are compiled with optimization
+  // turned off, in which case prefetching really doesn't matter.
+  if (__builtin_constant_p(forWrite)) {
+    __builtin_prefetch(address, forWrite);
+  }
+}
+
+/**
+ * Minimize cache-miss latency by moving a range of addresses into a
+ * CPU cache before they are accessed.
+ *
+ * @param start the starting address to fetch (may be invalid)
+ * @param size the number of bytes in the address range
+ * @param forWrite must be constant at compile time--false if
+ *                 for reading, true if for writing
+ **/
+static INLINE void prefetchRange(const void *start,
+                                 unsigned int size,
+                                 bool forWrite)
+{
+  // Count the number of cache lines to fetch, allowing for the address range
+  // to span an extra cache line boundary due to address alignment.
+  const char *address = (const char *) start;
+  unsigned int offset = ((uintptr_t) address % CACHE_LINE_BYTES);
+  size += offset;
+
+  unsigned int cacheLines = (1 + (size / CACHE_LINE_BYTES));
+  while (cacheLines-- > 0) {
+    prefetchAddress(address, forWrite);
+    address += CACHE_LINE_BYTES;
+  }
+}
+
+#endif /* CPU_H */
diff --git a/uds/deltaIndex.c b/uds/deltaIndex.c
new file mode 100644
index 0000000..0c43e9b
--- /dev/null
+++ b/uds/deltaIndex.c
@@ -0,0 +1,1707 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/deltaIndex.c#7 $
+ */
+#include "deltaIndex.h"
+
+#include "bits.h"
+#include "buffer.h"
+#include "compiler.h"
+#include "cpu.h"
+#include "errors.h"
+#include "logger.h"
+#include "memoryAlloc.h"
+#include "permassert.h"
+#include "stringUtils.h"
+#include "typeDefs.h"
+#include "uds.h"
+#include "zone.h"
+
+/*
+ * A delta index is a key-value store, where each entry maps an address
+ * (the key) to a payload (the value). The entries are sorted by address,
+ * and only the delta between successive addresses is stored in the entry.
+ * The addresses are assumed to be uniformly distributed, and the deltas are
+ * therefore exponentially distributed.
+ *
+ * The entries could be stored in a single DeltaList, but for efficiency we
+ * use multiple DeltaLists. These lists are stored in a single chunk of
+ * memory managed by the DeltaMemory module. The DeltaMemory module can
+ * move the data around in memory, so we never keep any byte pointers into
+ * DeltaList memory. We only keep offsets into the memory.
+ *
+ * The delta lists are stored as bit streams. These bit streams are stored
+ * in little endian order, and all offsets into DeltaMemory are bit
+ * offsets.
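+ *
+ * As a small illustration of the delta encoding itself: a delta list
+ * holding the addresses 1000, 1010, and 1017 stores (in effect) the deltas
+ * 1000, 10, and 7, so the cost of an entry depends on the gap from its
+ * predecessor rather than on the absolute address.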
+ * + * All entries are stored as a fixed length payload (the value) followed by a + * variable length key (the delta). Always strictly in little endian order. + * + * A collision entry is used when two block names have the same delta list + * address. A collision entry is encoded with DELTA==0, and has 256 + * extension bits containing the full block name. + * + * There is a special exception to be noted. The DELTA==0 encoding usually + * indicates a collision with the preceding entry. But for the first entry + * in any delta list there is no preceding entry, so the DELTA==0 encoding + * at the beginning of a delta list indicates a normal entry. + * + * The Huffman code is driven by 3 parameters: + * + * MINBITS This is the number of bits in the smallest code + * + * BASE This is the number of values coded using a code of length MINBITS + * + * INCR This is the number of values coded by using one additional bit. + * + * These parameters are related by: + * + * BASE + INCR == 1 << MINBITS + * + * When we create an index, we need to know the mean delta. From the mean + * delta, we compute these three parameters. The math for the Huffman code + * of an exponential distribution says that we compute: + * + * INCR = log(2) * MEAN_DELTA + * + * Then we find the smallest MINBITS so that + * + * 1 << MINBITS > INCR + * + * And then: + * + * BASE = (1 << MINBITS) - INCR + * + * Now we need a code such that + * + * - The first BASE values code using MINBITS bits + * - The next INCR values code using MINBITS+1 bits. + * - The next INCR values code using MINBITS+2 bits. + * - The next INCR values code using MINBITS+3 bits. + * - (and so on). + * + * ENCODE(DELTA): + * + * if (DELTA < BASE) { + * put DELTA in MINBITS bits; + * } else { + * T1 = (DELTA - BASE) % INCR + BASE; + * T2 = (DELTA - BASE) / INCR; + * put T1 in MINBITS bits; + * put 0 in T2 bits; + * put 1 in 1 bit; + * } + * + * DECODE(BIT_STREAM): + * + * T1 = next MINBITS bits of stream; + * if (T1 < BASE) { + * DELTA = T1; + * } else { + * Scan bits in the stream until reading a 1, + * setting T2 to the number of 0 bits read; + * DELTA = T2 * INCR + T1; + * } + * + * The bit field utilities that we use on the delta lists assume that it is + * possible to read a few bytes beyond the end of the bit field. So we + * make sure to allocates some extra bytes at the end of memory containing + * the delta lists. Look for POST_FIELD_GUARD_BYTES to find the code + * related to this. + * + * And note that the decode bit stream code includes a step that skips over + * 0 bits until the first 1 bit is found. A corrupted delta list could + * cause this step to run off the end of the delta list memory. As an + * extra protection against this happening, the guard bytes at the end + * should be set to all ones. + */ + +/** + * Constants and structures for the saved delta index. "DI" is for + * deltaIndex, and -##### is a number to increment when the format of the + * data changes. + **/ +enum { MAGIC_SIZE = 8 }; +static const char MAGIC_DI_START[] = "DI-00002"; + +struct di_header { + char magic[MAGIC_SIZE]; // MAGIC_DI_START + uint32_t zoneNumber; + uint32_t numZones; + uint32_t firstList; + uint32_t numLists; + uint64_t recordCount; + uint64_t collisionCount; +}; + +//********************************************************************** +// Methods for dealing with mutable delta list headers +//********************************************************************** + +/** + * Move the start of the delta list bit stream without moving the end. 
+ * + * @param deltaList The delta list header + * @param increment The change in the start of the delta list + **/ +static INLINE void moveDeltaListStart(DeltaList *deltaList, int increment) +{ + deltaList->startOffset += increment; + deltaList->size -= increment; +} + +/** + * Move the end of the delta list bit stream without moving the start. + * + * @param deltaList The delta list header + * @param increment The change in the end of the delta list + **/ +static INLINE void moveDeltaListEnd(DeltaList *deltaList, int increment) +{ + deltaList->size += increment; +} + +//********************************************************************** +// Methods for dealing with immutable delta list headers packed +//********************************************************************** + +// Header data used for immutable delta index pages. These data are +// followed by the delta list offset table. +typedef struct __attribute__((packed)) deltaPageHeader { + uint64_t nonce; // Externally-defined nonce + uint64_t virtualChapterNumber; // The virtual chapter number + uint16_t firstList; // Index of the first delta list on the page + uint16_t numLists; // Number of delta lists on the page +} DeltaPageHeader; + +// Immutable delta lists are packed into pages containing a header that +// encodes the delta list information into 19 bits per list (64KB bit offset) + +enum { IMMUTABLE_HEADER_SIZE = 19 }; + +/** + * Get the bit offset to the immutable delta list header + * + * @param listNumber The delta list number + * + * @return the offset of immutable delta list header + **/ +static INLINE unsigned int getImmutableHeaderOffset(unsigned int listNumber) +{ + return (sizeof(DeltaPageHeader) * CHAR_BIT + + listNumber * IMMUTABLE_HEADER_SIZE); +} + +/** + * Get the bit offset to the start of the immutable delta list bit stream + * + * @param memory The memory page containing the delta lists + * @param listNumber The delta list number + * + * @return the start of the delta list + **/ +static INLINE unsigned int getImmutableStart(const byte *memory, + unsigned int listNumber) +{ + return getField(memory, getImmutableHeaderOffset(listNumber), + IMMUTABLE_HEADER_SIZE); +} + +/** + * Set the bit offset to the start of the immutable delta list bit stream + * + * @param memory The memory page containing the delta lists + * @param listNumber The delta list number + * @param startOffset The start of the delta list + **/ +static INLINE void setImmutableStart(byte *memory, unsigned int listNumber, + unsigned int startOffset) +{ + setField(startOffset, memory, getImmutableHeaderOffset(listNumber), + IMMUTABLE_HEADER_SIZE); +} + +//********************************************************************** +// Methods for dealing with Delta List Entries +//********************************************************************** + +/** + * Decode a delta index entry delta value. The DeltaIndexEntry basically + * describes the previous list entry, and has had its offset field changed to + * point to the subsequent entry. We decode the bit stream and update the + * DeltaListEntry to describe the entry. 
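+ *
+ * As a rough worked example, assume a mean delta of 4096, for which the
+ * math in the file comment gives approximately minBits = 12,
+ * incrKeys = 2839, and minKeys = 1257. Reading t1 = 2161 from the first 12
+ * key bits (>= minKeys), then one 0 bit before the terminating 1 bit,
+ * yields delta = 1 * 2839 + 2161 = 5000.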
+ * + * @param deltaEntry The delta index entry + **/ +static INLINE void decodeDelta(DeltaIndexEntry *deltaEntry) +{ + const DeltaMemory *deltaZone = deltaEntry->deltaZone; + const byte *memory = deltaZone->memory; + uint64_t deltaOffset + = getDeltaEntryOffset(deltaEntry) + deltaEntry->valueBits; + const byte *addr = memory + deltaOffset / CHAR_BIT; + int offset = deltaOffset % CHAR_BIT; + uint32_t data = getUInt32LE(addr) >> offset; + addr += sizeof(uint32_t); + int keyBits = deltaZone->minBits; + unsigned int delta = data & ((1 << keyBits) - 1); + if (delta >= deltaZone->minKeys) { + data >>= keyBits; + if (data == 0) { + keyBits = sizeof(uint32_t) * CHAR_BIT - offset; + while ((data = getUInt32LE(addr)) == 0) { + addr += sizeof(uint32_t); + keyBits += sizeof(uint32_t) * CHAR_BIT; + } + } + keyBits += ffs(data); + delta += (keyBits - deltaZone->minBits - 1) * deltaZone->incrKeys; + } + deltaEntry->delta = delta; + deltaEntry->key += delta; + + // Check for a collision, a delta of zero not at the start of the list. + if (unlikely((delta == 0) && (deltaEntry->offset > 0))) { + deltaEntry->isCollision = true; + // The small duplication of this math in the two arms of this if statement + // makes a tiny but measurable difference in performance. + deltaEntry->entryBits = deltaEntry->valueBits + keyBits + COLLISION_BITS; + } else { + deltaEntry->isCollision = false; + deltaEntry->entryBits = deltaEntry->valueBits + keyBits; + } +} + +/** + * Delete bits from a delta list at the offset of the specified delta index + * entry. + * + * @param deltaEntry The delta index entry + * @param size The number of bits to delete + **/ +static void deleteBits(const DeltaIndexEntry *deltaEntry, int size) +{ + DeltaList *deltaList = deltaEntry->deltaList; + byte *memory = deltaEntry->deltaZone->memory; + // Compute how many bits are retained before and after the deleted bits + uint32_t totalSize = getDeltaListSize(deltaList); + uint32_t beforeSize = deltaEntry->offset; + uint32_t afterSize = totalSize - deltaEntry->offset - size; + + // Determine whether to add to the available space either before or after + // the delta list. We prefer to move the least amount of data. If it is + // exactly the same, try to add to the smaller amount of free space. + bool beforeFlag; + if (beforeSize < afterSize) { + beforeFlag = true; + } else if (afterSize < beforeSize) { + beforeFlag = false; + } else { + uint64_t freeBefore + = getDeltaListStart(&deltaList[0]) - getDeltaListEnd(&deltaList[-1]); + uint64_t freeAfter + = getDeltaListStart(&deltaList[1]) - getDeltaListEnd(&deltaList[ 0]); + beforeFlag = freeBefore < freeAfter; + } + + uint64_t source, destination; + uint32_t count; + if (beforeFlag) { + source = getDeltaListStart(deltaList); + destination = source + size; + moveDeltaListStart(deltaList, size); + count = beforeSize; + } else { + moveDeltaListEnd(deltaList, -size); + destination = getDeltaListStart(deltaList) + deltaEntry->offset; + source = destination + size; + count = afterSize; + } + moveBits(memory, source, memory, destination, count); +} + +/** + * Get the offset of the collision field in a DeltaIndexEntry + * + * @param entry The delta index record + * + * @return the offset of the start of the collision name + **/ +static INLINE uint64_t getCollisionOffset(const DeltaIndexEntry *entry) +{ + return (getDeltaEntryOffset(entry) + entry->entryBits - COLLISION_BITS); +} + +/** + * Encode a delta index entry delta. 
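+ *
+ * Continuing the illustration above (meanDelta = 4096, so roughly
+ * minBits = 12, incrKeys = 2839, minKeys = 1257): a delta of 800 is below
+ * minKeys and is written directly in 12 bits, while a delta of 5000 is
+ * written as t1 = (5000 - 1257) % 2839 + 1257 = 2161 in 12 bits, followed
+ * by t2 = 1 zero bit and a terminating 1 bit, for 14 key bits in total.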
+ * + * @param deltaEntry The delta index entry + **/ +static void encodeDelta(const DeltaIndexEntry *deltaEntry) +{ + const DeltaMemory *deltaZone = deltaEntry->deltaZone; + byte *memory = deltaZone->memory; + uint64_t offset = getDeltaEntryOffset(deltaEntry) + deltaEntry->valueBits; + if (deltaEntry->delta < deltaZone->minKeys) { + setField(deltaEntry->delta, memory, offset, deltaZone->minBits); + return; + } + unsigned int temp = deltaEntry->delta - deltaZone->minKeys; + unsigned int t1 = (temp % deltaZone->incrKeys) + deltaZone->minKeys; + unsigned int t2 = temp / deltaZone->incrKeys; + setField(t1, memory, offset, deltaZone->minBits); + setZero(memory, offset + deltaZone->minBits, t2); + setOne(memory, offset + deltaZone->minBits + t2, 1); +} + +/** + * Encode a delta index entry. + * + * @param deltaEntry The delta index entry + * @param value The value associated with the entry + * @param name For collision entries, the 256 bit full name. + **/ +static void encodeEntry(const DeltaIndexEntry *deltaEntry, unsigned int value, + const byte *name) +{ + byte *memory = deltaEntry->deltaZone->memory; + uint64_t offset = getDeltaEntryOffset(deltaEntry); + setField(value, memory, offset, deltaEntry->valueBits); + encodeDelta(deltaEntry); + if (name != NULL) { + setBytes(memory, getCollisionOffset(deltaEntry), name, COLLISION_BYTES); + } +} + +/** + * Insert bits into a delta list at the offset of the specified delta index + * entry. + * + * @param deltaEntry The delta index entry + * @param size The number of bits to insert + * + * @return UDS_SUCCESS or an error code + **/ +static int insertBits(DeltaIndexEntry *deltaEntry, int size) +{ + DeltaMemory *deltaZone = deltaEntry->deltaZone; + DeltaList *deltaList = deltaEntry->deltaList; + // Compute how many bits are in use before and after the inserted bits + uint32_t totalSize = getDeltaListSize(deltaList); + uint32_t beforeSize = deltaEntry->offset; + uint32_t afterSize = totalSize - deltaEntry->offset; + if ((unsigned int) (totalSize + size) > UINT16_MAX) { + deltaEntry->listOverflow = true; + deltaZone->overflowCount++; + return UDS_OVERFLOW; + } + + // Compute how many bits are available before and after the delta list + uint64_t freeBefore + = getDeltaListStart(&deltaList[0]) - getDeltaListEnd(&deltaList[-1]); + uint64_t freeAfter + = getDeltaListStart(&deltaList[1]) - getDeltaListEnd(&deltaList[ 0]); + + bool beforeFlag; + if (((unsigned int) size <= freeBefore) + && ((unsigned int) size <= freeAfter)) { + // We have enough space to use either before or after the list. Prefer + // to move the least amount of data. If it is exactly the same, try to + // take from the larger amount of free space. + if (beforeSize < afterSize) { + beforeFlag = true; + } else if (afterSize < beforeSize) { + beforeFlag = false; + } else { + beforeFlag = freeBefore > freeAfter; + } + } else if ((unsigned int) size <= freeBefore) { + // There is space before but not after + beforeFlag = true; + } else if ((unsigned int) size <= freeAfter) { + // There is space after but not before + beforeFlag = false; + } else { + // Neither of the surrounding spaces is large enough for this request, + // Extend and/or rebalance the delta list memory choosing to move the + // least amount of data. 
+ unsigned int growingIndex = deltaEntry->listNumber + 1; + beforeFlag = beforeSize < afterSize; + if (!beforeFlag) { + growingIndex++; + } + int result = extendDeltaMemory(deltaZone, growingIndex, + (size + CHAR_BIT - 1) / CHAR_BIT, true); + if (result != UDS_SUCCESS) { + return result; + } + } + + uint64_t source, destination; + uint32_t count; + if (beforeFlag) { + source = getDeltaListStart(deltaList); + destination = source - size; + moveDeltaListStart(deltaList, -size); + count = beforeSize; + } else { + moveDeltaListEnd(deltaList, size); + source = getDeltaListStart(deltaList) + deltaEntry->offset; + destination = source + size; + count = afterSize; + } + byte *memory = deltaZone->memory; + moveBits(memory, source, memory, destination, count); + return UDS_SUCCESS; +} + +/** + * Get the amount of memory to allocate for each zone + * + * @param numZones The number of zones in the index + * @param memorySize The number of bytes in memory for the index + * + * @return the number of bytes to allocate for a single zone + **/ +static INLINE size_t getZoneMemorySize(unsigned int numZones, + size_t memorySize) +{ + size_t zoneSize = memorySize / numZones; + // Round the size up so that each zone is a multiple of 64K in size. + enum { ALLOC_BOUNDARY = 64 * KILOBYTE }; + return (zoneSize + ALLOC_BOUNDARY - 1) & -ALLOC_BOUNDARY; +} + +/** + * Validate delta index parameters + * + * @param meanDelta The mean delta value + * @param numPayloadBits The number of bits in the payload or value + **/ +static bool invalidParameters(unsigned int meanDelta, + unsigned int numPayloadBits) +{ + const unsigned int minDelta = 10; + const unsigned int maxDelta = 1 << MAX_FIELD_BITS; + if ((meanDelta < minDelta) || (meanDelta > maxDelta)) { + logWarning("error initializing delta index: " + "meanDelta (%u) is not in the range %u to %u", + meanDelta, minDelta, maxDelta); + return true; + } + if (numPayloadBits > MAX_FIELD_BITS) { + logWarning("error initializing delta index: Too many payload bits (%u)", + numPayloadBits); + return true; + } + return false; +} + +/** + * Set a delta index entry to be a collision + * + * @param deltaEntry The delta index entry + **/ +static void setCollision(DeltaIndexEntry *deltaEntry) +{ + deltaEntry->isCollision = true; + deltaEntry->entryBits += COLLISION_BITS; +} + +/** + * Set the delta in a delta index entry. 
+ * + * @param deltaEntry The delta index entry + * @param delta The new delta + **/ +static void setDelta(DeltaIndexEntry *deltaEntry, unsigned int delta) +{ + const DeltaMemory *deltaZone = deltaEntry->deltaZone; + deltaEntry->delta = delta; + int keyBits = (deltaZone->minBits + + ((deltaZone->incrKeys - deltaZone->minKeys + delta) + / deltaZone->incrKeys)); + deltaEntry->entryBits = deltaEntry->valueBits + keyBits; +} + +//********************************************************************** +// External functions declared in deltaIndex.h +//********************************************************************** + +int initializeDeltaIndex(DeltaIndex *deltaIndex, unsigned int numZones, + unsigned int numLists, unsigned int meanDelta, + unsigned int numPayloadBits, size_t memorySize) +{ + size_t memSize = getZoneMemorySize(numZones, memorySize); + if (invalidParameters(meanDelta, numPayloadBits)) { + return UDS_INVALID_ARGUMENT; + } + + int result = ALLOCATE(numZones, DeltaMemory, "Delta Index Zones", + &deltaIndex->deltaZones); + if (result != UDS_SUCCESS) { + return result; + } + + deltaIndex->numZones = numZones; + deltaIndex->numLists = numLists; + deltaIndex->listsPerZone = (numLists + numZones - 1) / numZones; + deltaIndex->isMutable = true; + deltaIndex->tag = 'm'; + + unsigned int z; + for (z = 0; z < numZones; z++) { + unsigned int firstListInZone = z * deltaIndex->listsPerZone; + unsigned int numListsInZone = deltaIndex->listsPerZone; + if (z == numZones - 1) { + /* + * The last zone gets fewer lists if numZones doesn't evenly divide + * numLists. We'll have an underflow if the assertion below doesn't + * hold. (And it turns out that the assertion is equivalent to + * numZones <= 1 + (numLists / numZones) + (numLists % numZones) + * in the case that numZones doesn't evenly divide numlists. + * If numLists >= numZones * numZones, then the above inequality + * will always hold.) + */ + if (deltaIndex->numLists <= firstListInZone) { + uninitializeDeltaIndex(deltaIndex); + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "%u delta-lists not enough for %u zones", + numLists, numZones); + } + numListsInZone = deltaIndex->numLists - firstListInZone; + } + int result = initializeDeltaMemory(&deltaIndex->deltaZones[z], memSize, + firstListInZone, numListsInZone, + meanDelta, numPayloadBits); + if (result != UDS_SUCCESS) { + uninitializeDeltaIndex(deltaIndex); + return result; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +static bool verifyDeltaIndexPage(uint64_t nonce, + uint16_t numLists, + uint64_t expectedNonce, + byte *memory, + size_t memSize) +{ + // Verify the nonce. A mismatch here happens in normal operation when we are + // doing a rebuild but haven't written the entire volume once. + if (nonce != expectedNonce) { + return false; + } + + // Verify that the number of delta lists can fit in the page. + if (numLists > + (memSize - sizeof(DeltaPageHeader)) * CHAR_BIT / IMMUTABLE_HEADER_SIZE) { + return false; + } + + // Verify that the first delta list is immediately after the last delta list + // header. + if (getImmutableStart(memory, 0) != getImmutableHeaderOffset(numLists + 1)) { + return false; + } + + // Verify that the lists are in the correct order. + unsigned int i; + for (i = 0; i < numLists; i++) { + if (getImmutableStart(memory, i) > getImmutableStart(memory, i + 1)) { + return false; + } + } + + // Verify that the last list ends on the page, and that there is room for the + // post-field guard bits. 
+ if (getImmutableStart(memory, numLists) + > (memSize - POST_FIELD_GUARD_BYTES) * CHAR_BIT) { + return false; + } + + // Verify that the guard bytes are correctly set to all ones. + for (i = 0; i < POST_FIELD_GUARD_BYTES; i++) { + byte guardByte = memory[memSize - POST_FIELD_GUARD_BYTES + i]; + if (guardByte != (byte) ~0) { + return false; + } + } + + // All verifications passed. + return true; +} + +/**********************************************************************/ +int initializeDeltaIndexPage(DeltaIndexPage *deltaIndexPage, + uint64_t expectedNonce, + unsigned int meanDelta, + unsigned int numPayloadBits, + byte *memory, + size_t memSize) +{ + const DeltaPageHeader *header = (const DeltaPageHeader *) memory; + + if (invalidParameters(meanDelta, numPayloadBits)) { + return UDS_INVALID_ARGUMENT; + } + + // First assume that the header is little endian + uint64_t nonce = getUInt64LE((const byte *) &header->nonce); + uint64_t vcn = getUInt64LE((const byte *) &header->virtualChapterNumber); + uint16_t firstList = getUInt16LE((const byte *) &header->firstList); + uint16_t numLists = getUInt16LE((const byte *) &header->numLists); + if (!verifyDeltaIndexPage(nonce, numLists, expectedNonce, memory, memSize)) { + // That failed, so try big endian + nonce = getUInt64BE((const byte *) &header->nonce); + vcn = getUInt64BE((const byte *) &header->virtualChapterNumber); + firstList = getUInt16BE((const byte *) &header->firstList); + numLists = getUInt16BE((const byte *) &header->numLists); + if (!verifyDeltaIndexPage(nonce, numLists, expectedNonce, memory, + memSize)) { + // Also failed. Do not log this as an error. It happens in normal + // operation when we are doing a rebuild but haven't written the entire + // volume once. + return UDS_CORRUPT_COMPONENT; + } + } + + deltaIndexPage->deltaIndex.deltaZones = &deltaIndexPage->deltaMemory; + deltaIndexPage->deltaIndex.numZones = 1; + deltaIndexPage->deltaIndex.numLists = numLists; + deltaIndexPage->deltaIndex.listsPerZone = numLists; + deltaIndexPage->deltaIndex.isMutable = false; + deltaIndexPage->deltaIndex.tag = 'p'; + deltaIndexPage->virtualChapterNumber = vcn; + deltaIndexPage->lowestListNumber = firstList; + deltaIndexPage->highestListNumber = firstList + numLists - 1; + + initializeDeltaMemoryPage(&deltaIndexPage->deltaMemory, (byte *) memory, + memSize, numLists, meanDelta, numPayloadBits); + return UDS_SUCCESS; +} + +/**********************************************************************/ +void uninitializeDeltaIndex(DeltaIndex *deltaIndex) +{ + if (deltaIndex != NULL) { + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + uninitializeDeltaMemory(&deltaIndex->deltaZones[z]); + } + FREE(deltaIndex->deltaZones); + memset(deltaIndex, 0, sizeof(DeltaIndex)); + } +} + +/**********************************************************************/ +void emptyDeltaIndex(const DeltaIndex *deltaIndex) +{ + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + emptyDeltaLists(&deltaIndex->deltaZones[z]); + } +} + +/**********************************************************************/ +void emptyDeltaIndexZone(const DeltaIndex *deltaIndex, unsigned int zoneNumber) +{ + emptyDeltaLists(&deltaIndex->deltaZones[zoneNumber]); +} + +/**********************************************************************/ +int packDeltaIndexPage(const DeltaIndex *deltaIndex, + uint64_t headerNonce, + bool headerNativeEndian, + byte *memory, + size_t memSize, + uint64_t virtualChapterNumber, + unsigned int firstList, + unsigned int *numLists) +{ + if 
(!deltaIndex->isMutable) {
+    return logErrorWithStringError(UDS_BAD_STATE,
+                                   "Cannot pack an immutable index");
+  }
+  if (deltaIndex->numZones != 1) {
+    return logErrorWithStringError(UDS_BAD_STATE,
+                                   "Cannot pack a delta index page when the"
+                                   " index has %u zones",
+                                   deltaIndex->numZones);
+  }
+  if (firstList > deltaIndex->numLists) {
+    return logErrorWithStringError(UDS_BAD_STATE,
+                                   "Cannot pack a delta index page when the"
+                                   " first list (%u) is larger than the number"
+                                   " of lists (%u)",
+                                   firstList, deltaIndex->numLists);
+  }
+
+  const DeltaMemory *deltaZone = &deltaIndex->deltaZones[0];
+  DeltaList *deltaLists = &deltaZone->deltaLists[firstList + 1];
+  unsigned int maxLists = deltaIndex->numLists - firstList;
+
+  // Compute how many lists will fit on the page
+  int numBits = memSize * CHAR_BIT;
+  // Subtract the size of the fixed header and 1 delta list offset
+  numBits -= getImmutableHeaderOffset(1);
+  // Subtract the guard bytes of memory, which allow us to freely read a
+  // short distance past the end of any byte we are interested in.
+  numBits -= POST_FIELD_GUARD_BYTES * CHAR_BIT;
+  if (numBits < IMMUTABLE_HEADER_SIZE) {
+    // This page is too small to contain even one empty delta list
+    return logErrorWithStringError(UDS_OVERFLOW,
+                                   "Chapter Index Page of %zu bytes is too"
+                                   " small",
+                                   memSize);
+  }
+
+  unsigned int nLists = 0;
+  while (nLists < maxLists) {
+    // Each list requires 1 delta list offset and the list data
+    int bits = IMMUTABLE_HEADER_SIZE + getDeltaListSize(&deltaLists[nLists]);
+    if (bits > numBits) {
+      break;
+    }
+    nLists++;
+    numBits -= bits;
+  }
+  *numLists = nLists;
+
+  // Construct the page header
+  DeltaPageHeader *header = (DeltaPageHeader *) memory;
+  if (headerNativeEndian) {
+    header->nonce = headerNonce;
+    header->virtualChapterNumber = virtualChapterNumber;
+    header->firstList = firstList;
+    header->numLists = nLists;
+  } else {
+    storeUInt64LE((byte *) &header->nonce, headerNonce);
+    storeUInt64LE((byte *) &header->virtualChapterNumber,
+                  virtualChapterNumber);
+    storeUInt16LE((byte *) &header->firstList, firstList);
+    storeUInt16LE((byte *) &header->numLists, nLists);
+  }
+
+  // Construct the delta list offset table, making sure that the memory
+  // page is large enough.
+  unsigned int offset = getImmutableHeaderOffset(nLists + 1);
+  setImmutableStart(memory, 0, offset);
+  unsigned int i;
+  for (i = 0; i < nLists; i++) {
+    offset += getDeltaListSize(&deltaLists[i]);
+    setImmutableStart(memory, i + 1, offset);
+  }
+
+  // Copy the delta list data onto the memory page
+  for (i = 0; i < nLists; i++) {
+    DeltaList *deltaList = &deltaLists[i];
+    moveBits(deltaZone->memory, getDeltaListStart(deltaList), memory,
+             getImmutableStart(memory, i), getDeltaListSize(deltaList));
+  }
+
+  // Set all the bits in the guard bytes. Do not use the bit field
+  // utilities.
+ memset(memory + memSize - POST_FIELD_GUARD_BYTES, ~0, + POST_FIELD_GUARD_BYTES); + return UDS_SUCCESS; +} + + +/**********************************************************************/ +void setDeltaIndexTag(DeltaIndex *deltaIndex, byte tag) +{ + deltaIndex->tag = tag; + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + deltaIndex->deltaZones[z].tag = tag; + } +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeDeltaIndexHeader(Buffer *buffer, struct di_header *header) +{ + int result = getBytesFromBuffer(buffer, MAGIC_SIZE, &header->magic); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->zoneNumber); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->numZones); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->firstList); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->numLists); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &header->recordCount); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &header->collisionCount); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer) - contentLength(buffer), + bufferLength(buffer)); + return result; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int readDeltaIndexHeader(BufferedReader *reader, + struct di_header *header) +{ + Buffer *buffer; + + int result = makeBuffer(sizeof(*header), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logWarningWithStringError(result, + "failed to read delta index header"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = decodeDeltaIndexHeader(buffer, header); + freeBuffer(&buffer); + return result; +} + +/**********************************************************************/ +int startRestoringDeltaIndex(const DeltaIndex *deltaIndex, + BufferedReader **bufferedReaders, + int numReaders) +{ + if (!deltaIndex->isMutable) { + return logErrorWithStringError(UDS_BAD_STATE, + "Cannot restore to an immutable index"); + } + if (numReaders <= 0) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "No delta index files"); + } + + unsigned int numZones = numReaders; + if (numZones > MAX_ZONES) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "zone count %u must not exceed MAX_ZONES", + numZones); + } + + unsigned long recordCount = 0; + unsigned long collisionCount = 0; + unsigned int firstList[MAX_ZONES]; + unsigned int numLists[MAX_ZONES]; + BufferedReader *reader[MAX_ZONES]; + bool zoneFlags[MAX_ZONES] = { false, }; + + // Read the header from each file, and make sure we have a matching set + unsigned int z; + for (z = 0; z < numZones; z++) { + struct di_header header; + int result = readDeltaIndexHeader(bufferedReaders[z], &header); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to read delta index header"); + } + if 
(memcmp(header.magic, MAGIC_DI_START, MAGIC_SIZE) != 0) {
+      return logWarningWithStringError(UDS_CORRUPT_COMPONENT,
+                                       "delta index file has bad magic"
+                                       " number");
+    }
+    if (numZones != header.numZones) {
+      return logWarningWithStringError(UDS_CORRUPT_COMPONENT,
+                                       "delta index files contain mismatched"
+                                       " zone counts (%u,%u)",
+                                       numZones, header.numZones);
+    }
+    if (header.zoneNumber >= numZones) {
+      return logWarningWithStringError(UDS_CORRUPT_COMPONENT,
+                                       "delta index file contains zone %u of"
+                                       " %u zones",
+                                       header.zoneNumber, numZones);
+    }
+    if (zoneFlags[header.zoneNumber]) {
+      return logWarningWithStringError(UDS_CORRUPT_COMPONENT,
+                                       "delta index files contain zone %u"
+                                       " twice",
+                                       header.zoneNumber);
+    }
+    reader[header.zoneNumber] = bufferedReaders[z];
+    firstList[header.zoneNumber] = header.firstList;
+    numLists[header.zoneNumber] = header.numLists;
+    zoneFlags[header.zoneNumber] = true;
+    recordCount += header.recordCount;
+    collisionCount += header.collisionCount;
+  }
+  unsigned int listNext = 0;
+  for (z = 0; z < numZones; z++) {
+    if (firstList[z] != listNext) {
+      return logWarningWithStringError(UDS_CORRUPT_COMPONENT,
+                                       "delta index file for zone %u starts"
+                                       " with list %u instead of list %u",
+                                       z, firstList[z], listNext);
+    }
+    listNext += numLists[z];
+  }
+  if (listNext != deltaIndex->numLists) {
+    return logWarningWithStringError(UDS_CORRUPT_COMPONENT,
+                                     "delta index files contain %u delta lists"
+                                     " instead of %u delta lists",
+                                     listNext, deltaIndex->numLists);
+  }
+  if (collisionCount > recordCount) {
+    return logWarningWithStringError(UDS_CORRUPT_COMPONENT,
+                                     "delta index files contain %ld collisions"
+                                     " and %ld records",
+                                     collisionCount, recordCount);
+  }
+
+  emptyDeltaIndex(deltaIndex);
+  deltaIndex->deltaZones[0].recordCount = recordCount;
+  deltaIndex->deltaZones[0].collisionCount = collisionCount;
+
+  // Read the delta list sizes from the files, and distribute each of them
+  // to the proper zone
+  for (z = 0; z < numZones; z++) {
+    unsigned int i;
+    for (i = 0; i < numLists[z]; i++) {
+      byte deltaListSizeData[sizeof(uint16_t)];
+      int result = readFromBufferedReader(reader[z], deltaListSizeData,
+                                          sizeof(deltaListSizeData));
+      if (result != UDS_SUCCESS) {
+        return logWarningWithStringError(result,
+                                         "failed to read delta index size");
+      }
+      uint16_t deltaListSize = getUInt16LE(deltaListSizeData);
+      unsigned int listNumber = firstList[z] + i;
+      unsigned int zoneNumber = getDeltaIndexZone(deltaIndex, listNumber);
+      const DeltaMemory *deltaZone = &deltaIndex->deltaZones[zoneNumber];
+      listNumber -= deltaZone->firstList;
+      deltaZone->deltaLists[listNumber + 1].size = deltaListSize;
+    }
+  }
+
+  // Prepare each zone to start receiving the delta list data
+  for (z = 0; z < deltaIndex->numZones; z++) {
+    int result = startRestoringDeltaMemory(&deltaIndex->deltaZones[z]);
+    if (result != UDS_SUCCESS) {
+      return result;
+    }
+  }
+  return UDS_SUCCESS;
+}
+
+/**********************************************************************/
+bool isRestoringDeltaIndexDone(const DeltaIndex *deltaIndex)
+{
+  unsigned int z;
+  for (z = 0; z < deltaIndex->numZones; z++) {
+    if (!areDeltaMemoryTransfersDone(&deltaIndex->deltaZones[z])) {
+      return false;
+    }
+  }
+  return true;
+}
+
+/**********************************************************************/
+int restoreDeltaListToDeltaIndex(const DeltaIndex *deltaIndex,
+                                 const DeltaListSaveInfo *dlsi,
+                                 const byte data[DELTA_LIST_MAX_BYTE_COUNT])
+{
+  // Make sure the data are intended for this delta list.
Do not + // log an error, as this may be valid data for another delta index. + if (dlsi->tag != deltaIndex->tag) { + return UDS_CORRUPT_COMPONENT; + } + + if (dlsi->index >= deltaIndex->numLists) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "invalid delta list number %u of %u", + dlsi->index, deltaIndex->numLists); + } + + unsigned int zoneNumber = getDeltaIndexZone(deltaIndex, dlsi->index); + return restoreDeltaList(&deltaIndex->deltaZones[zoneNumber], dlsi, data); +} + +/**********************************************************************/ +void abortRestoringDeltaIndex(const DeltaIndex *deltaIndex) +{ + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + abortRestoringDeltaMemory(&deltaIndex->deltaZones[z]); + } +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int encodeDeltaIndexHeader(Buffer *buffer, struct di_header *header) +{ + int result = putBytes(buffer, MAGIC_SIZE, MAGIC_DI_START); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->zoneNumber); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->numZones); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->firstList); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->numLists); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->recordCount); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->collisionCount); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(*header), + "%zu bytes encoded of %zu expected", + contentLength(buffer), sizeof(*header)); + + return result; +} + +/**********************************************************************/ +int startSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter) +{ + DeltaMemory *deltaZone = &deltaIndex->deltaZones[zoneNumber]; + struct di_header header; + memcpy(header.magic, MAGIC_DI_START, MAGIC_SIZE); + header.zoneNumber = zoneNumber; + header.numZones = deltaIndex->numZones; + header.firstList = deltaZone->firstList; + header.numLists = deltaZone->numLists; + header.recordCount = deltaZone->recordCount; + header.collisionCount = deltaZone->collisionCount; + + Buffer *buffer; + int result = makeBuffer(sizeof(struct di_header), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = encodeDeltaIndexHeader(buffer, &header); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(bufferedWriter, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write delta index header"); + } + + unsigned int i; + for (i = 0; i < deltaZone->numLists; i++) { + uint16_t deltaListSize = getDeltaListSize(&deltaZone->deltaLists[i + 1]); + byte data[2]; + storeUInt16LE(data, deltaListSize); + result = writeToBufferedWriter(bufferedWriter, data, sizeof(data)); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write delta list size"); + } + } + + startSavingDeltaMemory(deltaZone, bufferedWriter); + return UDS_SUCCESS; +} + 
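+/*
+ * Editorial note (not part of the original documentation): saving is split
+ * across startSavingDeltaIndex() above and the isSavingDeltaIndexDone(),
+ * finishSavingDeltaIndex(), and abortSavingDeltaIndex() functions below.
+ * Assuming the caller has already opened one BufferedWriter per zone, a
+ * rough sketch of driving a single zone might look like this (error
+ * handling abbreviated; deltaIndex, zoneNumber, and writer are placeholder
+ * names for caller state):
+ *
+ *   int result = startSavingDeltaIndex(deltaIndex, zoneNumber, writer);
+ *   if (result == UDS_SUCCESS) {
+ *     while (!isSavingDeltaIndexDone(deltaIndex, zoneNumber)) {
+ *       // wait for asynchronous delta list transfers to complete
+ *     }
+ *     result = finishSavingDeltaIndex(deltaIndex, zoneNumber);
+ *   }
+ *   if (result != UDS_SUCCESS) {
+ *     (void) abortSavingDeltaIndex(deltaIndex, zoneNumber);
+ *   }
+ */
+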
+/**********************************************************************/ +bool isSavingDeltaIndexDone(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + return areDeltaMemoryTransfersDone(&deltaIndex->deltaZones[zoneNumber]); +} + +/**********************************************************************/ +int finishSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + return finishSavingDeltaMemory(&deltaIndex->deltaZones[zoneNumber]); +} + +/**********************************************************************/ +int abortSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + abortSavingDeltaMemory(&deltaIndex->deltaZones[zoneNumber]); + return UDS_SUCCESS; +} + +/**********************************************************************/ +size_t computeDeltaIndexSaveBytes(unsigned int numLists, size_t memorySize) +{ + // The exact amount of memory used depends upon the number of zones. + // Compute the maximum potential memory size. + size_t maxMemSize = memorySize; + unsigned int numZones; + for (numZones = 1; numZones <= MAX_ZONES; numZones++) { + size_t memSize = getZoneMemorySize(numZones, memorySize); + if (memSize > maxMemSize) { + maxMemSize = memSize; + } + } + // Saving a delta index requires a header ... + return (sizeof(struct di_header) + // ... plus a DeltaListSaveInfo per delta list + // plus an extra byte per delta list ... + + numLists * (sizeof(DeltaListSaveInfo) + 1) + // ... plus the delta list memory + + maxMemSize); +} + +/**********************************************************************/ +int validateDeltaIndex(const DeltaIndex *deltaIndex) +{ + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + int result = validateDeltaLists(&deltaIndex->deltaZones[z]); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int assertNotAtEnd(const DeltaIndexEntry *deltaEntry, int errorCode) +{ + return ASSERT_WITH_ERROR_CODE(!deltaEntry->atEnd, errorCode, + "operation is invalid because the list entry " + "is at the end of the delta list"); +} + +/**********************************************************************/ +static void prefetchDeltaList(const DeltaMemory *deltaZone, + const DeltaList *deltaList) +{ + const byte *memory = deltaZone->memory; + const byte *addr = &memory[getDeltaListStart(deltaList) / CHAR_BIT]; + unsigned int size = getDeltaListSize(deltaList) / CHAR_BIT; + prefetchRange(addr, size, false); +} + +/**********************************************************************/ +int startDeltaIndexSearch(const DeltaIndex *deltaIndex, + unsigned int listNumber, unsigned int key, + bool readOnly, DeltaIndexEntry *deltaEntry) +{ + int result + = ASSERT_WITH_ERROR_CODE((listNumber < deltaIndex->numLists), + UDS_CORRUPT_DATA, + "Delta list number (%u) is out of range (%u)", + listNumber, deltaIndex->numLists); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int zoneNumber = getDeltaIndexZone(deltaIndex, listNumber); + DeltaMemory *deltaZone = &deltaIndex->deltaZones[zoneNumber]; + listNumber -= deltaZone->firstList; + result = ASSERT_WITH_ERROR_CODE((listNumber < deltaZone->numLists), + UDS_CORRUPT_DATA, + "Delta list number (%u)" + " is out of range (%u) for zone (%u)", + listNumber, deltaZone->numLists, zoneNumber); + if (result != UDS_SUCCESS) { + return result; + } + + DeltaList *deltaList; + if (deltaIndex->isMutable) { + deltaList = 
&deltaZone->deltaLists[listNumber + 1]; + if (!readOnly) { + // Here is the lazy writing of the index for a checkpoint + lazyFlushDeltaList(deltaZone, listNumber); + } + } else { + // Translate the immutable delta list header into a temporary full + // delta list header + deltaList = &deltaEntry->tempDeltaList; + deltaList->startOffset = getImmutableStart(deltaZone->memory, listNumber); + unsigned int endOffset = getImmutableStart(deltaZone->memory, + listNumber + 1); + deltaList->size = endOffset - deltaList->startOffset; + deltaList->saveKey = 0; + deltaList->saveOffset = 0; + } + + if (key > deltaList->saveKey) { + deltaEntry->key = deltaList->saveKey; + deltaEntry->offset = deltaList->saveOffset; + } else { + deltaEntry->key = 0; + deltaEntry->offset = 0; + if (key == 0) { + // This usually means we're about to walk the entire delta list, so get + // all of it into the CPU cache. + prefetchDeltaList(deltaZone, deltaList); + } + } + + deltaEntry->atEnd = false; + deltaEntry->deltaZone = deltaZone; + deltaEntry->deltaList = deltaList; + deltaEntry->entryBits = 0; + deltaEntry->isCollision = false; + deltaEntry->listNumber = listNumber; + deltaEntry->listOverflow = false; + deltaEntry->valueBits = deltaZone->valueBits; + return UDS_SUCCESS; +} + +/**********************************************************************/ +__attribute__((__noinline__)) +int nextDeltaIndexEntry(DeltaIndexEntry *deltaEntry) +{ + int result = assertNotAtEnd(deltaEntry, UDS_BAD_STATE); + if (result != UDS_SUCCESS) { + return result; + } + + const DeltaList *deltaList = deltaEntry->deltaList; + deltaEntry->offset += deltaEntry->entryBits; + unsigned int size = getDeltaListSize(deltaList); + if (unlikely(deltaEntry->offset >= size)) { + deltaEntry->atEnd = true; + deltaEntry->delta = 0; + deltaEntry->isCollision = false; + return ASSERT_WITH_ERROR_CODE((deltaEntry->offset == size), + UDS_CORRUPT_DATA, + "next offset past end of delta list"); + } + + decodeDelta(deltaEntry); + + unsigned int nextOffset = deltaEntry->offset + deltaEntry->entryBits; + if (nextOffset > size) { + // This is not an assertion because validateChapterIndexPage() wants to + // handle this error. 
+ logWarning("Decoded past the end of the delta list"); + return UDS_CORRUPT_DATA; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int rememberDeltaIndexOffset(const DeltaIndexEntry *deltaEntry) +{ + int result = ASSERT(!deltaEntry->isCollision, "entry is not a collision"); + if (result != UDS_SUCCESS) { + return result; + } + + DeltaList *deltaList = deltaEntry->deltaList; + deltaList->saveKey = deltaEntry->key - deltaEntry->delta; + deltaList->saveOffset = deltaEntry->offset; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getDeltaIndexEntry(const DeltaIndex *deltaIndex, unsigned int listNumber, + unsigned int key, const byte *name, bool readOnly, + DeltaIndexEntry *deltaEntry) +{ + int result = startDeltaIndexSearch(deltaIndex, listNumber, key, readOnly, + deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + do { + result = nextDeltaIndexEntry(deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + } while (!deltaEntry->atEnd && (key > deltaEntry->key)); + + result = rememberDeltaIndexOffset(deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + + if (!deltaEntry->atEnd && (key == deltaEntry->key)) { + DeltaIndexEntry collisionEntry; + collisionEntry = *deltaEntry; + for (;;) { + result = nextDeltaIndexEntry(&collisionEntry); + if (result != UDS_SUCCESS) { + return result; + } + if (collisionEntry.atEnd || !collisionEntry.isCollision) { + break; + } + byte collisionName[COLLISION_BYTES]; + getBytes(deltaEntry->deltaZone->memory, + getCollisionOffset(&collisionEntry), collisionName, + COLLISION_BYTES); + if (memcmp(collisionName, name, COLLISION_BYTES) == 0) { + *deltaEntry = collisionEntry; + break; + } + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getDeltaEntryCollision(const DeltaIndexEntry *deltaEntry, byte *name) +{ + int result = assertNotAtEnd(deltaEntry, UDS_BAD_STATE); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_WITH_ERROR_CODE(deltaEntry->isCollision, UDS_BAD_STATE, + "Cannot get full block name from a" + " non-collision delta index entry"); + if (result != UDS_SUCCESS) { + return result; + } + + getBytes(deltaEntry->deltaZone->memory, getCollisionOffset(deltaEntry), + name, COLLISION_BYTES); + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int assertMutableEntry(const DeltaIndexEntry *deltaEntry) +{ + return ASSERT_WITH_ERROR_CODE(deltaEntry->deltaList + != &deltaEntry->tempDeltaList, + UDS_BAD_STATE, + "delta index is mutable"); +} + +/**********************************************************************/ +int setDeltaEntryValue(const DeltaIndexEntry *deltaEntry, unsigned int value) +{ + int result = assertMutableEntry(deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + result = assertNotAtEnd(deltaEntry, UDS_BAD_STATE); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT_WITH_ERROR_CODE(((value & ((1 << deltaEntry->valueBits) - 1)) + == value), UDS_INVALID_ARGUMENT, + "Value (%u) being set in a delta index is " + "too large (must fit in %u bits)", + value, deltaEntry->valueBits); + if (result != UDS_SUCCESS) { + return result; + } + + setField(value, deltaEntry->deltaZone->memory, + getDeltaEntryOffset(deltaEntry), deltaEntry->valueBits); + return UDS_SUCCESS; +} + +/**********************************************************************/ 
+int putDeltaIndexEntry(DeltaIndexEntry *deltaEntry, unsigned int key,
+                       unsigned int value, const byte *name)
+{
+  int result = assertMutableEntry(deltaEntry);
+  if (result != UDS_SUCCESS) {
+    return result;
+  }
+  if (deltaEntry->isCollision) {
+    /*
+     * The caller wants us to insert a collision entry onto a collision
+     * entry. This happens when we find a collision and attempt to add the
+     * name again to the index. This is normally a fatal error unless we
+     * are replaying a closed chapter while we are rebuilding a master
+     * index.
+     */
+    return UDS_DUPLICATE_NAME;
+  }
+
+  if (deltaEntry->offset < deltaEntry->deltaList->saveOffset) {
+    // The saved entry offset is after the new entry and will no longer be
+    // valid, so replace it with the insertion point.
+    result = rememberDeltaIndexOffset(deltaEntry);
+    if (result != UDS_SUCCESS) {
+      return result;
+    }
+  }
+
+  if (name != NULL) {
+    // We are inserting a collision entry which is placed after this entry
+    result = assertNotAtEnd(deltaEntry, UDS_BAD_STATE);
+    if (result != UDS_SUCCESS) {
+      return result;
+    }
+    result = ASSERT((key == deltaEntry->key),
+                    "incorrect key for collision entry");
+    if (result != UDS_SUCCESS) {
+      return result;
+    }
+
+    deltaEntry->offset += deltaEntry->entryBits;
+    setDelta(deltaEntry, 0);
+    setCollision(deltaEntry);
+    result = insertBits(deltaEntry, deltaEntry->entryBits);
+  } else if (deltaEntry->atEnd) {
+    // We are inserting a new entry at the end of the delta list
+    result = ASSERT((key >= deltaEntry->key), "key past end of list");
+    if (result != UDS_SUCCESS) {
+      return result;
+    }
+
+    setDelta(deltaEntry, key - deltaEntry->key);
+    deltaEntry->key = key;
+    deltaEntry->atEnd = false;
+    result = insertBits(deltaEntry, deltaEntry->entryBits);
+  } else {
+    // We are inserting a new entry which requires the delta in the
+    // following entry to be updated.
+    result = ASSERT((key < deltaEntry->key), "key precedes following entry");
+    if (result != UDS_SUCCESS) {
+      return result;
+    }
+    result = ASSERT((key >= deltaEntry->key - deltaEntry->delta),
+                    "key affects the following entry's delta");
+    if (result != UDS_SUCCESS) {
+      return result;
+    }
+
+    int oldEntrySize = deltaEntry->entryBits;
+    DeltaIndexEntry nextEntry = *deltaEntry;
+    unsigned int nextValue = getDeltaEntryValue(&nextEntry);
+    setDelta(deltaEntry, key - (deltaEntry->key - deltaEntry->delta));
+    deltaEntry->key = key;
+    setDelta(&nextEntry, nextEntry.key - key);
+    nextEntry.offset += deltaEntry->entryBits;
+    // The 2 new entries are always bigger than the 1 entry we are replacing
+    int additionalSize
+      = deltaEntry->entryBits + nextEntry.entryBits - oldEntrySize;
+    result = insertBits(deltaEntry, additionalSize);
+    if (result != UDS_SUCCESS) {
+      return result;
+    }
+    encodeEntry(&nextEntry, nextValue, NULL);
+  }
+  if (result != UDS_SUCCESS) {
+    return result;
+  }
+  encodeEntry(deltaEntry, value, name);
+
+  DeltaMemory *deltaZone = deltaEntry->deltaZone;
+  deltaZone->recordCount++;
+  deltaZone->collisionCount += deltaEntry->isCollision ?
1 : 0; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int removeDeltaIndexEntry(DeltaIndexEntry *deltaEntry) +{ + int result = assertMutableEntry(deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + + DeltaIndexEntry nextEntry = *deltaEntry; + result = nextDeltaIndexEntry(&nextEntry); + if (result != UDS_SUCCESS) { + return result; + } + + DeltaMemory *deltaZone = deltaEntry->deltaZone; + + if (deltaEntry->isCollision) { + // This is a collision entry, so just remove it + deleteBits(deltaEntry, deltaEntry->entryBits); + nextEntry.offset = deltaEntry->offset; + deltaZone->collisionCount -= 1; + } else if (nextEntry.atEnd) { + // This entry is at the end of the list, so just remove it + deleteBits(deltaEntry, deltaEntry->entryBits); + nextEntry.key -= deltaEntry->delta; + nextEntry.offset = deltaEntry->offset; + } else { + // The delta in the next entry needs to be updated. + unsigned int nextValue = getDeltaEntryValue(&nextEntry); + int oldSize = deltaEntry->entryBits + nextEntry.entryBits; + if (nextEntry.isCollision) { + // The next record is a collision. It needs to be rewritten as a + // non-collision with a larger delta. + nextEntry.isCollision = false; + deltaZone->collisionCount -= 1; + } + setDelta(&nextEntry, deltaEntry->delta + nextEntry.delta); + nextEntry.offset = deltaEntry->offset; + // The 1 new entry is always smaller than the 2 entries we are replacing + deleteBits(deltaEntry, oldSize - nextEntry.entryBits); + encodeEntry(&nextEntry, nextValue, NULL); + } + deltaZone->recordCount--; + deltaZone->discardCount++; + *deltaEntry = nextEntry; + + DeltaList *deltaList = deltaEntry->deltaList; + if (deltaEntry->offset < deltaList->saveOffset) { + // The saved entry offset is after the entry we just removed and it + // will no longer be valid. We must force the next search to start at + // the beginning. 
+ deltaList->saveKey = 0; + deltaList->saveOffset = 0; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +unsigned int getDeltaIndexZoneFirstList(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + return deltaIndex->deltaZones[zoneNumber].firstList; +} + +/**********************************************************************/ +unsigned int getDeltaIndexZoneNumLists(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + return deltaIndex->deltaZones[zoneNumber].numLists; +} + +/**********************************************************************/ +uint64_t getDeltaIndexZoneDlistBitsUsed(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + uint64_t bitCount = 0; + const DeltaMemory *deltaZone = &deltaIndex->deltaZones[zoneNumber]; + unsigned int i; + for (i = 0; i < deltaZone->numLists; i++) { + bitCount += getDeltaListSize(&deltaZone->deltaLists[i + 1]); + } + return bitCount; +} + +/**********************************************************************/ +uint64_t getDeltaIndexDlistBitsUsed(const DeltaIndex *deltaIndex) +{ + uint64_t bitCount = 0; + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + bitCount += getDeltaIndexZoneDlistBitsUsed(deltaIndex, z); + } + return bitCount; +} + +/**********************************************************************/ +uint64_t getDeltaIndexDlistBitsAllocated(const DeltaIndex *deltaIndex) +{ + uint64_t byteCount = 0; + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + const DeltaMemory *deltaZone = &deltaIndex->deltaZones[z]; + byteCount += deltaZone->size; + } + return byteCount * CHAR_BIT; +} + +/**********************************************************************/ +void getDeltaIndexStats(const DeltaIndex *deltaIndex, DeltaIndexStats *stats) +{ + memset(stats, 0, sizeof(DeltaIndexStats)); + stats->memoryAllocated = deltaIndex->numZones * sizeof(DeltaMemory); + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + const DeltaMemory *deltaZone = &deltaIndex->deltaZones[z]; + stats->memoryAllocated += getDeltaMemoryAllocated(deltaZone); + stats->rebalanceTime += deltaZone->rebalanceTime; + stats->rebalanceCount += deltaZone->rebalanceCount; + stats->recordCount += deltaZone->recordCount; + stats->collisionCount += deltaZone->collisionCount; + stats->discardCount += deltaZone->discardCount; + stats->overflowCount += deltaZone->overflowCount; + stats->numLists += deltaZone->numLists; + } +} + +/**********************************************************************/ +unsigned int getDeltaIndexPageCount(unsigned int numEntries, + unsigned int numLists, + unsigned int meanDelta, + unsigned int numPayloadBits, + size_t bytesPerPage) +{ + // Compute the number of bits needed for all the entries + size_t bitsPerIndex + = getDeltaMemorySize(numEntries, meanDelta, numPayloadBits); + // Compute the number of bits needed for a single delta list + unsigned int bitsPerDeltaList = bitsPerIndex / numLists; + // Adjust the bits per index, adding the immutable delta list headers + bitsPerIndex += numLists * IMMUTABLE_HEADER_SIZE; + // Compute the number of usable bits on an immutable index page + unsigned int bitsPerPage + = (bytesPerPage - sizeof(DeltaPageHeader)) * CHAR_BIT; + // Adjust the bits per page, taking away one immutable delta list header + // and one delta list representing internal fragmentation + bitsPerPage -= IMMUTABLE_HEADER_SIZE + bitsPerDeltaList; + // Now compute the number of pages needed + return (bitsPerIndex + 
bitsPerPage - 1) / bitsPerPage; +} + +/**********************************************************************/ +void logDeltaIndexEntry(DeltaIndexEntry *deltaEntry) +{ + logRatelimit(logInfo, "List 0x%X Key 0x%X Offset 0x%X%s%s ListSize 0x%X%s", + deltaEntry->listNumber, deltaEntry->key, deltaEntry->offset, + deltaEntry->atEnd ? " end" : "", + deltaEntry->isCollision ? " collision" : "", + getDeltaListSize(deltaEntry->deltaList), + deltaEntry->listOverflow ? " overflow" : ""); + deltaEntry->listOverflow = false; +} diff --git a/uds/deltaIndex.h b/uds/deltaIndex.h new file mode 100644 index 0000000..af2d762 --- /dev/null +++ b/uds/deltaIndex.h @@ -0,0 +1,595 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/deltaIndex.h#4 $ + */ + +#ifndef DELTAINDEX_H +#define DELTAINDEX_H 1 + +#include "compiler.h" +#include "deltaMemory.h" + +enum { + // the number of extra bytes and bits needed to store a collision entry + COLLISION_BYTES = UDS_CHUNK_NAME_SIZE, + COLLISION_BITS = COLLISION_BYTES * CHAR_BIT +}; + +typedef struct deltaIndex { + DeltaMemory *deltaZones; // The zones + unsigned int numZones; // The number of zones + unsigned int numLists; // The number of delta lists + unsigned int listsPerZone; // Lists per zone (last zone can be smaller) + bool isMutable; // True if this index is mutable + byte tag; // Tag belonging to this delta index +} DeltaIndex; + +/* + * A DeltaIndexPage describes a single page of a chapter index. The deltaIndex + * field allows the page to be treated as an immutable DeltaIndex. We use the + * deltaMemory field to treat the chapter index page as a single zone index, + * and without the need to do an additional memory allocation. + */ + +typedef struct deltaIndexPage { + DeltaIndex deltaIndex; + // These values are loaded from the DeltaPageHeader + unsigned int lowestListNumber; + unsigned int highestListNumber; + uint64_t virtualChapterNumber; + // This structure describes the single zone of a delta index page. + DeltaMemory deltaMemory; +} DeltaIndexPage; + +/* + * Notes on the DeltaIndexEntries: + * + * The fields documented as "public" can be read by any code that uses a + * DeltaIndex. The fields documented as "private" carry information + * between DeltaIndex method calls and should not be used outside the + * DeltaIndex module. + * + * (1) The DeltaIndexEntry is used like an iterator when searching a delta + * list. + * + * (2) And it is also the result of a successful search and can be used to + * refer to the element found by the search. + * + * (3) And it is also the result of an unsuccessful search and can be used + * to refer to the insertion point for a new record. 
+ *
+ * (4) If atEnd==true, the DeltaIndexEntry can only be used as the insertion
+ *     point for a new record at the end of the list.
+ *
+ * (5) If atEnd==false and isCollision==true, the DeltaIndexEntry fields
+ *     refer to a collision entry in the list, and the DeltaIndexEntry can
+ *     be used as a reference to this entry.
+ *
+ * (6) If atEnd==false and isCollision==false, the DeltaIndexEntry fields
+ *     refer to a non-collision entry in the list. Such DeltaIndexEntries
+ *     can be used as a reference to a found entry, or an insertion point
+ *     for a non-collision entry before this entry, or an insertion point
+ *     for a collision entry that collides with this entry.
+ */
+
+typedef struct deltaIndexEntry {
+  // Public fields
+  unsigned int key;          // The key for this entry
+  bool atEnd;                // We are after the last entry in the list
+  bool isCollision;          // This record is a collision
+  // Private fields (but DeltaIndex_t1 cheats and looks at them)
+  bool listOverflow;         // This delta list overflowed
+  unsigned short valueBits;  // The number of bits used for the value
+  unsigned short entryBits;  // The number of bits used for the entire entry
+  DeltaMemory *deltaZone;    // The delta index zone
+  DeltaList *deltaList;      // The delta list containing the entry
+  unsigned int listNumber;   // The delta list number
+  uint32_t offset;           // Bit offset of this entry within the list
+  unsigned int delta;        // The delta between this and previous entry
+  DeltaList tempDeltaList;   // Temporary delta list for immutable indices
+} DeltaIndexEntry;
+
+typedef struct {
+  size_t memoryAllocated;  // Number of bytes allocated
+  RelTime rebalanceTime;   // The time spent rebalancing
+  int rebalanceCount;      // Number of memory rebalances
+  long recordCount;        // The number of records in the index
+  long collisionCount;     // The number of collision records
+  long discardCount;       // The number of records removed
+  long overflowCount;      // The number of UDS_OVERFLOWs detected
+  unsigned int numLists;   // The number of delta lists
+} DeltaIndexStats;
+
+/**
+ * Initialize a delta index.
+ *
+ * @param deltaIndex      The delta index to initialize
+ * @param numZones        The number of zones in the index
+ * @param numLists        The number of delta lists in the index
+ * @param meanDelta       The mean delta value
+ * @param numPayloadBits  The number of bits in the payload or value
+ * @param memorySize      The number of bytes in memory for the index
+ *
+ * @return error code or UDS_SUCCESS
+ **/
+int initializeDeltaIndex(DeltaIndex *deltaIndex, unsigned int numZones,
+                         unsigned int numLists, unsigned int meanDelta,
+                         unsigned int numPayloadBits, size_t memorySize)
+  __attribute__((warn_unused_result));
+
+/**
+ * Initialize an immutable delta index page.
+ *
+ * @param deltaIndexPage  The delta index page to initialize
+ * @param expectedNonce   If non-zero, the expected nonce.
+ * @param meanDelta       The mean delta value
+ * @param numPayloadBits  The number of bits in the payload or value
+ * @param memory          The memory page
+ * @param memSize         The size of the memory page
+ *
+ * @return error code or UDS_SUCCESS
+ **/
+int initializeDeltaIndexPage(DeltaIndexPage *deltaIndexPage,
+                             uint64_t expectedNonce,
+                             unsigned int meanDelta,
+                             unsigned int numPayloadBits,
+                             byte *memory,
+                             size_t memSize)
+  __attribute__((warn_unused_result));
+
+/**
+ * Uninitialize a delta index.
+ *
+ * @param deltaIndex  The delta index to uninitialize
+ **/
+void uninitializeDeltaIndex(DeltaIndex *deltaIndex);
+
+/**
+ * Empty the delta index.
+ *
+ * @param deltaIndex  The delta index being emptied.
+ **/ +void emptyDeltaIndex(const DeltaIndex *deltaIndex); + +/** + * Empty a zone of the delta index. + * + * @param deltaIndex The delta index + * @param zoneNumber The zone being emptied + **/ +void emptyDeltaIndexZone(const DeltaIndex *deltaIndex, + unsigned int zoneNumber); + +/** + * Pack delta lists from a mutable delta index into an immutable delta index + * page. A range of delta lists (starting with a specified list index) is + * copied from the mutable delta index into a memory page used in the immutable + * index. The number of lists copied onto the page is returned to the caller. + * + * @param deltaIndex The delta index being converted + * @param headerNonce The header nonce to store + * @param headerNativeEndian If true, write native endian header + * @param memory The memory page to use + * @param memSize The size of the memory page + * @param virtualChapterNumber The virtual chapter number + * @param firstList The first delta list number to be copied + * @param numLists The number of delta lists that were copied + * + * @return error code or UDS_SUCCESS. On UDS_SUCCESS, the numLists + * argument contains the number of lists copied. + **/ +int packDeltaIndexPage(const DeltaIndex *deltaIndex, + uint64_t headerNonce, + bool headerNativeEndian, + byte *memory, + size_t memSize, + uint64_t virtualChapterNumber, + unsigned int firstList, + unsigned int *numLists) + __attribute__((warn_unused_result)); + + +/** + * Set the tag value used when saving and/or restoring a delta index. + * + * @param deltaIndex The delta index + * @param tag The tag value + **/ +void setDeltaIndexTag(DeltaIndex *deltaIndex, byte tag); + +/** + * Start restoring a delta index from an input stream. + * + * @param deltaIndex The delta index to read into + * @param bufferedReaders The buffered readers to read the delta index from + * @param numReaders The number of buffered readers + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int startRestoringDeltaIndex(const DeltaIndex *deltaIndex, + BufferedReader **bufferedReaders, int numReaders) + __attribute__((warn_unused_result)); + +/** + * Have all the data been read while restoring a delta index from an + * input stream? + * + * @param deltaIndex The delta index + * + * @return true if all the data are read + **/ +bool isRestoringDeltaIndexDone(const DeltaIndex *deltaIndex); + +/** + * Restore a saved delta list + * + * @param deltaIndex The delta index + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * + * @return error code or UDS_SUCCESS + **/ +int restoreDeltaListToDeltaIndex(const DeltaIndex *deltaIndex, + const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) + __attribute__((warn_unused_result)); + +/** + * Abort restoring a delta index from an input stream. + * + * @param deltaIndex The delta index + **/ +void abortRestoringDeltaIndex(const DeltaIndex *deltaIndex); + +/** + * Start saving a delta index zone to a buffered output stream. + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * @param bufferedWriter The index state component being written + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int startSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter) + __attribute__((warn_unused_result)); + +/** + * Have all the data been written while saving a delta index zone to an + * output stream? 
If the answer is yes, it is still necessary to call + * finishSavingDeltaIndex(), which will return quickly. + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return true if all the data are written + **/ +bool isSavingDeltaIndexDone(const DeltaIndex *deltaIndex, + unsigned int zoneNumber); + +/** + * Finish saving a delta index zone to an output stream. Force the writing + * of all of the remaining data. If an error occurred asynchronously + * during the save operation, it will be returned here. + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int finishSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Abort saving a delta index zone to an output stream. If an error + * occurred asynchronously during the save operation, it will be dropped. + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int abortSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Compute the number of bytes required to save a delta index + * + * @param numLists The number of delta lists in the index + * @param memorySize The number of bytes in memory for the index + * + * @return numBytes The number of bytes required to save the master index + **/ +size_t computeDeltaIndexSaveBytes(unsigned int numLists, size_t memorySize) + __attribute__((warn_unused_result)); + +/** + * Validate the delta index + * + * @param deltaIndex The delta index + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int validateDeltaIndex(const DeltaIndex *deltaIndex) + __attribute__((warn_unused_result)); + +/** + * Prepare to search for an entry in the specified delta list. + * + *

This is always the first routine to be called when dealing with delta + * index entries. It is always followed by calls to nextDeltaIndexEntry to + * iterate through a delta list. The fields of the DeltaIndexEntry argument + * will be set up for iteration, but will not contain an entry from the list. + * + * @param deltaIndex The delta index to search + * @param listNumber The delta list number + * @param key First delta list key that the caller is interested in + * @param readOnly True if this is a read-only operation + * @param iterator The index entry being used to search through the list + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int startDeltaIndexSearch(const DeltaIndex *deltaIndex, + unsigned int listNumber, unsigned int key, + bool readOnly, DeltaIndexEntry *iterator) + __attribute__((warn_unused_result)); + +/** + * Find the next entry in the specified delta list + * + * @param deltaEntry Info about an entry, which is updated to describe the + * following entry + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int nextDeltaIndexEntry(DeltaIndexEntry *deltaEntry) + __attribute__((warn_unused_result)); + +/** + * Remember the position of a delta index entry, so that we can use it when + * starting the next search. + * + * @param deltaEntry Info about an entry found during a search. This should + * be the first entry that matches the key exactly (i.e. + * not a collision entry), or the first entry with a key + * greater than the entry sought for. + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int rememberDeltaIndexOffset(const DeltaIndexEntry *deltaEntry) + __attribute__((warn_unused_result)); + +/** + * Find the delta index entry, or the insertion point for a delta index + * entry. + * + * @param deltaIndex The delta index to search + * @param listNumber The delta list number + * @param key The key field being looked for + * @param name The 256 bit full name + * @param readOnly True if this is a read-only index search + * @param deltaEntry Updated to describe the entry being looked for + * + * @return UDS_SUCCESS or an error code + **/ +int getDeltaIndexEntry(const DeltaIndex *deltaIndex, unsigned int listNumber, + unsigned int key, const byte *name, bool readOnly, + DeltaIndexEntry *deltaEntry) + __attribute__((warn_unused_result)); + +/** + * Get the full name from a collision DeltaIndexEntry + * + * @param deltaEntry The delta index record + * @param name The 256 bit full name + * + * @return UDS_SUCCESS or an error code + **/ +int getDeltaEntryCollision(const DeltaIndexEntry *deltaEntry, byte *name) + __attribute__((warn_unused_result)); + +/** + * Get the bit offset into delta memory of a delta index entry. + * + * @param deltaEntry The delta index entry + * + * @return the bit offset into delta memory + **/ +static INLINE uint64_t getDeltaEntryOffset(const DeltaIndexEntry *deltaEntry) +{ + return getDeltaListStart(deltaEntry->deltaList) + deltaEntry->offset; +} + +/** + * Get the number of bits used to encode the entry key (the delta). + * + * @param entry The delta index record + * + * @return the number of bits used to encode the key + **/ +static INLINE unsigned int getDeltaEntryKeyBits(const DeltaIndexEntry *entry) +{ + /* + * Derive keyBits by subtracting the sizes of the other two fields from the + * total. We don't actually use this for encoding/decoding, so it doesn't + * need to be super-fast. We save time where it matters by not storing it. 
+ */ + return (entry->entryBits - entry->valueBits + - (entry->isCollision ? COLLISION_BITS : 0)); +} + +/** + * Get the value field of the DeltaIndexEntry + * + * @param deltaEntry The delta index record + * + * @return the value + **/ +static INLINE unsigned int getDeltaEntryValue(const DeltaIndexEntry *deltaEntry) +{ + return getField(deltaEntry->deltaZone->memory, + getDeltaEntryOffset(deltaEntry), deltaEntry->valueBits); +} + +/** + * Set the value field of the DeltaIndexEntry + * + * @param deltaEntry The delta index record + * @param value The new value + * + * @return UDS_SUCCESS or an error code + **/ +int setDeltaEntryValue(const DeltaIndexEntry *deltaEntry, unsigned int value) + __attribute__((warn_unused_result)); + +/** + * Create a new entry in the delta index + * + * @param deltaEntry The delta index entry that indicates the insertion point + * for the new record. For a collision entry, this is the + * non-collision entry that the new entry collides with. + * For a non-collision entry, this new entry is inserted + * before the specified entry. + * @param key The key field + * @param value The value field + * @param name For collision entries, the 256 bit full name; + * Otherwise null + * + * @return UDS_SUCCESS or an error code + **/ +int putDeltaIndexEntry(DeltaIndexEntry *deltaEntry, unsigned int key, + unsigned int value, const byte *name) + __attribute__((warn_unused_result)); + +/** + * Remove an existing delta index entry, and advance to the next entry in + * the delta list. + * + * @param deltaEntry On call the delta index record to remove. After + * returning, the following entry in the delta list. + * + * @return UDS_SUCCESS or an error code + **/ +int removeDeltaIndexEntry(DeltaIndexEntry *deltaEntry) + __attribute__((warn_unused_result)); + +/** + * Map a delta list number to a delta zone number + * + * @param deltaIndex The delta index + * @param listNumber The delta list number + * + * @return the zone number containing the delta list + **/ +static INLINE unsigned int getDeltaIndexZone(const DeltaIndex *deltaIndex, + unsigned int listNumber) +{ + return listNumber / deltaIndex->listsPerZone; +} + +/** + * Get the first delta list number in a zone + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return the first delta list index in the zone + **/ +unsigned int getDeltaIndexZoneFirstList(const DeltaIndex *deltaIndex, + unsigned int zoneNumber); + +/** + * Get the number of delta lists in a zone + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return the number of delta lists in the zone + **/ +unsigned int getDeltaIndexZoneNumLists(const DeltaIndex *deltaIndex, + unsigned int zoneNumber); + +/** + * Get the number of bytes used for master index entries in a zone + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return The number of bits in use + **/ +uint64_t getDeltaIndexZoneDlistBitsUsed(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Get the number of bytes used for master index entries. + * + * @param deltaIndex The delta index + * + * @return The number of bits in use + **/ +uint64_t getDeltaIndexDlistBitsUsed(const DeltaIndex *deltaIndex) + __attribute__((warn_unused_result)); + +/** + * Get the number of bytes allocated for master index entries. 
+ * + * @param deltaIndex The delta index + * + * @return The number of bits allocated + **/ +uint64_t getDeltaIndexDlistBitsAllocated(const DeltaIndex *deltaIndex) + __attribute__((warn_unused_result)); + +/** + * Get the delta index statistics. + * + * @param deltaIndex The delta index + * @param stats The statistics + **/ +void getDeltaIndexStats(const DeltaIndex *deltaIndex, DeltaIndexStats *stats); + +/** + * Get the number of pages needed for an immutable delta index. + * + * @param numEntries The number of entries in the index + * @param numLists The number of delta lists + * @param meanDelta The mean delta value + * @param numPayloadBits The number of bits in the payload or value + * @param bytesPerPage The number of bytes in a page + * + * @return the number of pages needed for the index + **/ +unsigned int getDeltaIndexPageCount(unsigned int numEntries, + unsigned int numLists, + unsigned int meanDelta, + unsigned int numPayloadBits, + size_t bytesPerPage); + +/** + * Log a delta index entry, and any error conditions related to the entry. + * + * @param deltaEntry The delta index entry. + **/ +void logDeltaIndexEntry(DeltaIndexEntry *deltaEntry); + +#endif /* DELTAINDEX_H */ diff --git a/uds/deltaMemory.c b/uds/deltaMemory.c new file mode 100644 index 0000000..2b30714 --- /dev/null +++ b/uds/deltaMemory.c @@ -0,0 +1,720 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/deltaMemory.c#3 $ + */ +#include "deltaMemory.h" + +#include "bits.h" +#include "buffer.h" +#include "compiler.h" +#include "errors.h" +#include "hashUtils.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "timeUtils.h" +#include "typeDefs.h" +#include "uds.h" + +/* + * The DeltaMemory structure manages the memory that stores delta lists. + * + * The "mutable" form of DeltaMemory is used for the master index and for + * an open chapter index. The "immutable" form of DeltaMemory is used for + * regular chapter indices. 
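+ *
+ * A mutable DeltaMemory allocates and owns its deltaLists, tempOffsets,
+ * and flags arrays; an immutable DeltaMemory wraps a single packed page
+ * and leaves those pointers NULL (see isMutable() in deltaMemory.h).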
+ */ + +// This is the number of guard bits that are needed in the tail guard list +enum { GUARD_BITS = POST_FIELD_GUARD_BYTES * CHAR_BIT }; + +/** + * Get the offset of the first byte that a delta list bit stream resides in + * + * @param deltaList The delta list + * + * @return the number byte offset + **/ +static INLINE uint64_t getDeltaListByteStart(const DeltaList *deltaList) +{ + return getDeltaListStart(deltaList) / CHAR_BIT; +} + +/** + * Get the actual number of bytes that a delta list bit stream resides in + * + * @param deltaList The delta list + * + * @return the number of bytes + **/ +static INLINE uint16_t getDeltaListByteSize(const DeltaList *deltaList) +{ + uint16_t startBitOffset = getDeltaListStart(deltaList) % CHAR_BIT; + uint16_t bitSize = getDeltaListSize(deltaList); + return ((unsigned int) startBitOffset + bitSize + CHAR_BIT - 1) / CHAR_BIT; +} + +/** + * Get the number of bytes in the delta lists headers. + * + * @param numLists The number of delta lists + * + * @return the number of bytes in the delta lists headers + **/ +static INLINE size_t getSizeOfDeltaLists(unsigned int numLists) +{ + return (numLists + 2) * sizeof(DeltaList); +} + +/** + * Get the size of the flags array (in bytes) + * + * @param numLists The number of delta lists + * + * @return the number of bytes for an array that has one bit per delta + * list, plus the necessary guard bytes. + **/ +static INLINE size_t getSizeOfFlags(unsigned int numLists) +{ + return (numLists + CHAR_BIT - 1) / CHAR_BIT + POST_FIELD_GUARD_BYTES; +} + +/** + * Get the number of bytes of scratch memory for the delta lists. + * + * @param numLists The number of delta lists + * + * @return the number of bytes of scratch memory for the delta lists + **/ +static INLINE size_t getSizeOfTempOffsets(unsigned int numLists) +{ + return (numLists + 2) * sizeof(uint64_t); +} + +/**********************************************************************/ + +/** + * Clear the transfers flags. + * + * @param deltaMemory The delta memory + **/ +static void clearTransferFlags(DeltaMemory *deltaMemory) +{ + memset(deltaMemory->flags, 0, getSizeOfFlags(deltaMemory->numLists)); + deltaMemory->numTransfers = 0; + deltaMemory->transferStatus = UDS_SUCCESS; +} + +/**********************************************************************/ + +/** + * Set the transfer flags for delta lists that are not empty, and count how + * many there are. + * + * @param deltaMemory The delta memory + **/ +static void flagNonEmptyDeltaLists(DeltaMemory *deltaMemory) +{ + clearTransferFlags(deltaMemory); + unsigned int i; + for (i = 0; i < deltaMemory->numLists; i++) { + if (getDeltaListSize(&deltaMemory->deltaLists[i + 1]) > 0) { + setOne(deltaMemory->flags, i, 1); + deltaMemory->numTransfers++; + } + } +} + +/**********************************************************************/ +void emptyDeltaLists(DeltaMemory *deltaMemory) +{ + // Zero all the delta list headers + DeltaList *deltaLists = deltaMemory->deltaLists; + memset(deltaLists, 0, getSizeOfDeltaLists(deltaMemory->numLists)); + + /* + * Initialize delta lists to be empty. We keep 2 extra delta list + * descriptors, one before the first real entry and one after so that we + * don't need to bounds check the array access when calculating + * preceeding and following gap sizes. + * + * Because the delta list headers were zeroed, the head guard list is + * already at offset zero and size zero. 
+ * + * The end guard list contains guard bytes so that the bit field + * utilities can safely read past the end of any byte we are interested + * in. + */ + uint64_t numBits = (uint64_t) deltaMemory->size * CHAR_BIT; + deltaLists[deltaMemory->numLists + 1].startOffset = numBits - GUARD_BITS; + deltaLists[deltaMemory->numLists + 1].size = GUARD_BITS; + + // Set all the bits in the end guard list. Do not use the bit field + // utilities. + memset(deltaMemory->memory + deltaMemory->size - POST_FIELD_GUARD_BYTES, + ~0, POST_FIELD_GUARD_BYTES); + + // Evenly space out the real delta lists. The sizes are already zero, so + // we just need to set the starting offsets. + uint64_t spacing = (numBits - GUARD_BITS) / deltaMemory->numLists; + uint64_t offset = spacing / 2; + unsigned int i; + for (i = 1; i <= deltaMemory->numLists; i++) { + deltaLists[i].startOffset = offset; + offset += spacing; + } + + // Update the statistics + deltaMemory->discardCount += deltaMemory->recordCount; + deltaMemory->recordCount = 0; + deltaMemory->collisionCount = 0; +} + +/**********************************************************************/ +/** + * Compute the Huffman coding parameters for the given mean delta + * + * @param meanDelta The mean delta value + * @param minBits The number of bits in the minimal key code + * @param minKeys The number of keys used in a minimal code + * @param incrKeys The number of keys used for another code bit + **/ +static void computeCodingConstants(unsigned int meanDelta, + unsigned short *minBits, + unsigned int *minKeys, + unsigned int *incrKeys) +{ + // We want to compute the rounded value of log(2) * meanDelta. Since we + // cannot always use floating point, use a really good integer approximation. + *incrKeys = (836158UL * meanDelta + 603160UL) / 1206321UL; + *minBits = computeBits(*incrKeys + 1); + *minKeys = (1 << *minBits) - *incrKeys; +} + +/**********************************************************************/ +/** + * Rebalance a range of delta lists within memory. + * + * @param deltaMemory A delta memory structure + * @param first The first delta list index + * @param last The last delta list index + **/ +static void rebalanceDeltaMemory(const DeltaMemory *deltaMemory, + unsigned int first, unsigned int last) +{ + if (first == last) { + DeltaList *deltaList = &deltaMemory->deltaLists[first]; + uint64_t newStart = deltaMemory->tempOffsets[first]; + // We need to move only one list, and we know it is safe to do so + if (getDeltaListStart(deltaList) != newStart) { + // Compute the first source byte + uint64_t source = getDeltaListByteStart(deltaList); + // Update the delta list location + deltaList->startOffset = newStart; + // Now use the same computation to locate the first destination byte + uint64_t destination = getDeltaListByteStart(deltaList); + memmove(deltaMemory->memory + destination, deltaMemory->memory + source, + getDeltaListByteSize(deltaList)); + } + } else { + // There is more than one list. Divide the problem in half, and use + // recursive calls to process each half. Note that after this + // computation, first <= middle, and middle < last. + unsigned int middle = (first + last) / 2; + const DeltaList *deltaList = &deltaMemory->deltaLists[middle]; + uint64_t newStart = deltaMemory->tempOffsets[middle]; + // The direction that our middle list is moving determines which half + // of the problem must be processed first. 
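+    // If the middle list is moving toward higher addresses, the lists above
+    // it are relocated first so that the bytes it moves into have already
+    // been vacated; otherwise the lower half is relocated first.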
+ if (newStart > getDeltaListStart(deltaList)) { + rebalanceDeltaMemory(deltaMemory, middle + 1, last); + rebalanceDeltaMemory(deltaMemory, first, middle); + } else { + rebalanceDeltaMemory(deltaMemory, first, middle); + rebalanceDeltaMemory(deltaMemory, middle + 1, last); + } + } +} + +/**********************************************************************/ +int initializeDeltaMemory(DeltaMemory *deltaMemory, size_t size, + unsigned int firstList, unsigned int numLists, + unsigned int meanDelta, unsigned int numPayloadBits) +{ + if (numLists == 0) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot initialize delta memory with 0 " + "delta lists"); + } + byte *memory = NULL; + int result = ALLOCATE(size, byte, "delta list", &memory); + if (result != UDS_SUCCESS) { + return result; + } + uint64_t *tempOffsets = NULL; + result = ALLOCATE(numLists + 2, uint64_t, "delta list temp", + &tempOffsets); + if (result != UDS_SUCCESS) { + FREE(memory); + return result; + } + byte *flags = NULL; + result = ALLOCATE(getSizeOfFlags(numLists), byte, "delta list flags", + &flags); + if (result != UDS_SUCCESS) { + FREE(memory); + FREE(tempOffsets); + return result; + } + + computeCodingConstants(meanDelta, &deltaMemory->minBits, + &deltaMemory->minKeys, &deltaMemory->incrKeys); + deltaMemory->valueBits = numPayloadBits; + deltaMemory->memory = memory; + deltaMemory->deltaLists = NULL; + deltaMemory->tempOffsets = tempOffsets; + deltaMemory->flags = flags; + deltaMemory->bufferedWriter = NULL; + deltaMemory->size = size; + deltaMemory->rebalanceTime = 0; + deltaMemory->rebalanceCount = 0; + deltaMemory->recordCount = 0; + deltaMemory->collisionCount = 0; + deltaMemory->discardCount = 0; + deltaMemory->overflowCount = 0; + deltaMemory->firstList = firstList; + deltaMemory->numLists = numLists; + deltaMemory->numTransfers = 0; + deltaMemory->transferStatus = UDS_SUCCESS; + deltaMemory->tag = 'm'; + + // Allocate the delta lists. 
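+  // The two extra descriptors are the head and tail guard lists that
+  // emptyDeltaLists() places before the first and after the last real list.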
+ result = ALLOCATE(deltaMemory->numLists + 2, DeltaList, + "delta lists", &deltaMemory->deltaLists); + if (result != UDS_SUCCESS) { + uninitializeDeltaMemory(deltaMemory); + return result; + } + + emptyDeltaLists(deltaMemory); + return UDS_SUCCESS; +} + +/**********************************************************************/ +void uninitializeDeltaMemory(DeltaMemory *deltaMemory) +{ + FREE(deltaMemory->flags); + deltaMemory->flags = NULL; + FREE(deltaMemory->tempOffsets); + deltaMemory->tempOffsets = NULL; + FREE(deltaMemory->deltaLists); + deltaMemory->deltaLists = NULL; + FREE(deltaMemory->memory); + deltaMemory->memory = NULL; +} + +/**********************************************************************/ +void initializeDeltaMemoryPage(DeltaMemory *deltaMemory, byte *memory, + size_t size, unsigned int numLists, + unsigned int meanDelta, + unsigned int numPayloadBits) +{ + computeCodingConstants(meanDelta, &deltaMemory->minBits, + &deltaMemory->minKeys, &deltaMemory->incrKeys); + deltaMemory->valueBits = numPayloadBits; + deltaMemory->memory = memory; + deltaMemory->deltaLists = NULL; + deltaMemory->tempOffsets = NULL; + deltaMemory->flags = NULL; + deltaMemory->bufferedWriter = NULL; + deltaMemory->size = size; + deltaMemory->rebalanceTime = 0; + deltaMemory->rebalanceCount = 0; + deltaMemory->recordCount = 0; + deltaMemory->collisionCount = 0; + deltaMemory->discardCount = 0; + deltaMemory->overflowCount = 0; + deltaMemory->firstList = 0; + deltaMemory->numLists = numLists; + deltaMemory->numTransfers = 0; + deltaMemory->transferStatus = UDS_SUCCESS; + deltaMemory->tag = 'p'; +} + +/**********************************************************************/ +bool areDeltaMemoryTransfersDone(const DeltaMemory *deltaMemory) +{ + return deltaMemory->numTransfers == 0; +} + +/**********************************************************************/ +int startRestoringDeltaMemory(DeltaMemory *deltaMemory) +{ + // Extend and balance memory to receive the delta lists + int result = extendDeltaMemory(deltaMemory, 0, 0, false); + if (result != UDS_SUCCESS) { + return UDS_SUCCESS; + } + + // The tail guard list needs to be set to ones + DeltaList *deltaList = &deltaMemory->deltaLists[deltaMemory->numLists + 1]; + setOne(deltaMemory->memory, getDeltaListStart(deltaList), + getDeltaListSize(deltaList)); + + flagNonEmptyDeltaLists(deltaMemory); + return UDS_SUCCESS; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int readDeltaListSaveInfo(BufferedReader *reader, + DeltaListSaveInfo *dlsi) +{ + byte buffer[sizeof(DeltaListSaveInfo)]; + int result = readFromBufferedReader(reader, buffer, sizeof(buffer)); + if (result != UDS_SUCCESS) { + return result; + } + dlsi->tag = buffer[0]; + dlsi->bitOffset = buffer[1]; + dlsi->byteCount = getUInt16LE(&buffer[2]); + dlsi->index = getUInt32LE(&buffer[4]); + return result; +} + +/**********************************************************************/ +int readSavedDeltaList(DeltaListSaveInfo *dlsi, + byte data[DELTA_LIST_MAX_BYTE_COUNT], + BufferedReader *bufferedReader) +{ + int result = readDeltaListSaveInfo(bufferedReader, dlsi); + if (result == UDS_END_OF_FILE) { + return UDS_END_OF_FILE; + } + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, "failed to read delta list data"); + } + if ((dlsi->bitOffset >= CHAR_BIT) + || (dlsi->byteCount > DELTA_LIST_MAX_BYTE_COUNT)) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "corrupt delta list data"); + } + if 
(dlsi->tag == 'z') { + return UDS_END_OF_FILE; + } + result = readFromBufferedReader(bufferedReader, data, dlsi->byteCount); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, "failed to read delta list data"); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int restoreDeltaList(DeltaMemory *deltaMemory, const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) +{ + unsigned int listNumber = dlsi->index - deltaMemory->firstList; + if (listNumber >= deltaMemory->numLists) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "invalid delta list number %u not in" + " range [%u,%u)", + dlsi->index, deltaMemory->firstList, + deltaMemory->firstList + + deltaMemory->numLists); + } + + if (getField(deltaMemory->flags, listNumber, 1) == 0) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected delta list number %u", + dlsi->index); + } + + DeltaList *deltaList = &deltaMemory->deltaLists[listNumber + 1]; + uint16_t bitSize = getDeltaListSize(deltaList); + unsigned int byteCount + = ((unsigned int) dlsi->bitOffset + bitSize + CHAR_BIT - 1) / CHAR_BIT; + if (dlsi->byteCount != byteCount) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected delta list size %u != %u", + dlsi->byteCount, byteCount); + } + + moveBits(data, dlsi->bitOffset, deltaMemory->memory, + getDeltaListStart(deltaList), bitSize); + setZero(deltaMemory->flags, listNumber, 1); + deltaMemory->numTransfers--; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void abortRestoringDeltaMemory(DeltaMemory *deltaMemory) +{ + clearTransferFlags(deltaMemory); + emptyDeltaLists(deltaMemory); +} + +/**********************************************************************/ +void startSavingDeltaMemory(DeltaMemory *deltaMemory, + BufferedWriter *bufferedWriter) +{ + flagNonEmptyDeltaLists(deltaMemory); + deltaMemory->bufferedWriter = bufferedWriter; +} + +/**********************************************************************/ +int finishSavingDeltaMemory(DeltaMemory *deltaMemory) +{ + unsigned int i; + for (i = 0; + !areDeltaMemoryTransfersDone(deltaMemory) + && (i < deltaMemory->numLists); + i++) { + lazyFlushDeltaList(deltaMemory, i); + } + if (deltaMemory->numTransfers > 0) { + deltaMemory->transferStatus + = logWarningWithStringError(UDS_CORRUPT_DATA, + "Not all delta lists written"); + } + deltaMemory->bufferedWriter = NULL; + return deltaMemory->transferStatus; +} + +/**********************************************************************/ +void abortSavingDeltaMemory(DeltaMemory *deltaMemory) +{ + clearTransferFlags(deltaMemory); + deltaMemory->bufferedWriter = NULL; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int writeDeltaListSaveInfo(BufferedWriter *bufferedWriter, + DeltaListSaveInfo *dlsi) +{ + byte buffer[sizeof(DeltaListSaveInfo)]; + buffer[0] = dlsi->tag; + buffer[1] = dlsi->bitOffset; + storeUInt16LE(&buffer[2], dlsi->byteCount); + storeUInt32LE(&buffer[4], dlsi->index); + return writeToBufferedWriter(bufferedWriter, buffer, sizeof(buffer)); +} + +/**********************************************************************/ +void flushDeltaList(DeltaMemory *deltaMemory, unsigned int flushIndex) +{ + ASSERT_LOG_ONLY((getField(deltaMemory->flags, flushIndex, 1) != 0), + "flush bit is set"); + setZero(deltaMemory->flags, flushIndex, 1); + deltaMemory->numTransfers--; 
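+  // Each flushed list is written as an 8-byte DeltaListSaveInfo header
+  // (tag, bit offset, little-endian byte count and list number) followed
+  // by the list's raw bytes; see writeDeltaListSaveInfo() above.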
+ + DeltaList *deltaList = &deltaMemory->deltaLists[flushIndex + 1]; + DeltaListSaveInfo dlsi; + dlsi.tag = deltaMemory->tag; + dlsi.bitOffset = getDeltaListStart(deltaList) % CHAR_BIT; + dlsi.byteCount = getDeltaListByteSize(deltaList); + dlsi.index = deltaMemory->firstList + flushIndex; + + int result = writeDeltaListSaveInfo(deltaMemory->bufferedWriter, &dlsi); + if (result != UDS_SUCCESS) { + if (deltaMemory->transferStatus == UDS_SUCCESS) { + logWarningWithStringError(result, "failed to write delta list memory"); + deltaMemory->transferStatus = result; + } + } + result = writeToBufferedWriter(deltaMemory->bufferedWriter, + deltaMemory->memory + + getDeltaListByteStart(deltaList), + dlsi.byteCount); + if (result != UDS_SUCCESS) { + if (deltaMemory->transferStatus == UDS_SUCCESS) { + logWarningWithStringError(result, "failed to write delta list memory"); + deltaMemory->transferStatus = result; + } + } +} + +/**********************************************************************/ +int writeGuardDeltaList(BufferedWriter *bufferedWriter) +{ + DeltaListSaveInfo dlsi; + dlsi.tag = 'z'; + dlsi.bitOffset = 0; + dlsi.byteCount = 0; + dlsi.index = 0; + int result = writeToBufferedWriter(bufferedWriter, (const byte *) &dlsi, + sizeof(DeltaListSaveInfo)); + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "failed to write guard delta list"); + } + return result; +} + +/**********************************************************************/ +int extendDeltaMemory(DeltaMemory *deltaMemory, unsigned int growingIndex, + size_t growingSize, bool doCopy) +{ + if (!isMutable(deltaMemory)) { + return logErrorWithStringError(UDS_BAD_STATE, + "Attempt to read into an immutable delta" + " list memory"); + } + + AbsTime startTime = currentTime(CLOCK_MONOTONIC); + + // Calculate the amount of space that is in use. Include the space that + // has a planned use. + DeltaList *deltaLists = deltaMemory->deltaLists; + size_t usedSpace = growingSize; + unsigned int i; + for (i = 0; i <= deltaMemory->numLists + 1; i++) { + usedSpace += getDeltaListByteSize(&deltaLists[i]); + } + + if (deltaMemory->size < usedSpace) { + return UDS_OVERFLOW; + } + + // Compute the new offsets of the delta lists + size_t spacing = (deltaMemory->size - usedSpace) / deltaMemory->numLists; + deltaMemory->tempOffsets[0] = 0; + for (i = 0; i <= deltaMemory->numLists; i++) { + deltaMemory->tempOffsets[i + 1] = (deltaMemory->tempOffsets[i] + + getDeltaListByteSize(&deltaLists[i]) + + spacing); + deltaMemory->tempOffsets[i] *= CHAR_BIT; + deltaMemory->tempOffsets[i] + += getDeltaListStart(&deltaLists[i]) % CHAR_BIT; + if (i == 0) { + deltaMemory->tempOffsets[i + 1] -= spacing / 2; + } + if (i + 1 == growingIndex) { + deltaMemory->tempOffsets[i + 1] += growingSize; + } + } + deltaMemory->tempOffsets[deltaMemory->numLists + 1] + = (deltaMemory->size * CHAR_BIT + - getDeltaListSize(&deltaLists[deltaMemory->numLists + 1])); + // When we rebalance the delta list, we will include the end guard list + // in the rebalancing. It contains the end guard data, which must be + // copied. 
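+  // Note that each new offset is computed in bytes and then converted to a
+  // bit offset that preserves the list's original bit position within its
+  // first byte, so rebalanceDeltaMemory() can move lists with whole-byte
+  // memmove() calls.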
+ if (doCopy) { + rebalanceDeltaMemory(deltaMemory, 1, deltaMemory->numLists + 1); + AbsTime endTime = currentTime(CLOCK_MONOTONIC); + deltaMemory->rebalanceCount++; + deltaMemory->rebalanceTime += timeDifference(endTime, startTime); + } else { + for (i = 1; i <= deltaMemory->numLists + 1; i++) { + deltaLists[i].startOffset = deltaMemory->tempOffsets[i]; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int validateDeltaLists(const DeltaMemory *deltaMemory) +{ + // Validate the delta index fields set by restoring a delta index + if (deltaMemory->collisionCount > deltaMemory->recordCount) { + return logWarningWithStringError(UDS_BAD_STATE, + "delta index contains more collisions" + " (%ld) than records (%ld)", + deltaMemory->collisionCount, + deltaMemory->recordCount); + } + + // Validate the delta lists + DeltaList *deltaLists = deltaMemory->deltaLists; + if (getDeltaListStart(&deltaLists[0]) != 0) { + return logWarningWithStringError(UDS_BAD_STATE, + "the head guard delta list does not start" + " at 0: %llu", + getDeltaListStart(&deltaLists[0])); + } + uint64_t numBits = getDeltaListEnd(&deltaLists[deltaMemory->numLists + 1]); + if (numBits != deltaMemory->size * CHAR_BIT) { + return logWarningWithStringError(UDS_BAD_STATE, + "the tail guard delta list does not end " + "at end of allocated memory: %" PRIu64 + " != %zd", + numBits, deltaMemory->size * CHAR_BIT); + } + int numGuardBits = getDeltaListSize(&deltaLists[deltaMemory->numLists + 1]); + if (numGuardBits < GUARD_BITS) { + return logWarningWithStringError(UDS_BAD_STATE, + "the tail guard delta list does not " + "contain sufficient guard bits: %d < %d", + numGuardBits, GUARD_BITS); + } + unsigned int i; + for (i = 0; i <= deltaMemory->numLists + 1; i++) { + if (getDeltaListStart(&deltaLists[i]) > getDeltaListEnd(&deltaLists[i])) { + return logWarningWithStringError(UDS_BAD_STATE, + "invalid delta list %u: [%" PRIu64 + ", %llu)", + i, + getDeltaListStart(&deltaLists[i]), + getDeltaListEnd(&deltaLists[i])); + } + if (i > deltaMemory->numLists) { + // The rest of the checks do not apply to the tail guard list + continue; + } + if (getDeltaListEnd(&deltaLists[i]) + > getDeltaListStart(&deltaLists[i + 1])) { + return logWarningWithStringError(UDS_BAD_STATE, + "delta lists %u and %u overlap: %" + PRIu64 " > %llu", + i, i + 1, + getDeltaListEnd(&deltaLists[i]), + getDeltaListStart(&deltaLists[i + 1])); + } + if (i == 0) { + // The rest of the checks do not apply to the head guard list + continue; + } + if (deltaLists[i].saveOffset > getDeltaListSize(&deltaLists[i])) { + return logWarningWithStringError(UDS_BAD_STATE, + "delta lists %u saved offset is larger" + " than the list: %u > %u", + i, deltaLists[i].saveOffset, + getDeltaListSize(&deltaLists[i])); + } + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +size_t getDeltaMemoryAllocated(const DeltaMemory *deltaMemory) +{ + return (deltaMemory->size + + getSizeOfDeltaLists(deltaMemory->numLists) + + getSizeOfFlags(deltaMemory->numLists) + + getSizeOfTempOffsets(deltaMemory->numLists)); +} + +/**********************************************************************/ +size_t getDeltaMemorySize(unsigned long numEntries, unsigned int meanDelta, + unsigned int numPayloadBits) +{ + unsigned short minBits; + unsigned int incrKeys, minKeys; + computeCodingConstants(meanDelta, &minBits, &minKeys, &incrKeys); + // On average, each delta is encoded into about minBits+1.5 bits. 
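+  // The estimate below reflects that: numPayloadBits + minBits + 1 bits per
+  // entry, plus an extra numEntries / 2 bits to supply the remaining half
+  // bit per entry.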
+ return (numEntries * (numPayloadBits + minBits + 1) + numEntries / 2); +} diff --git a/uds/deltaMemory.h b/uds/deltaMemory.h new file mode 100644 index 0000000..1ffb3fd --- /dev/null +++ b/uds/deltaMemory.h @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/deltaMemory.h#1 $ + */ + +#ifndef DELTAMEMORY_H +#define DELTAMEMORY_H 1 + +#include "bits.h" +#include "bufferedReader.h" +#include "bufferedWriter.h" +#include "compiler.h" +#include "cpu.h" +#include "timeUtils.h" + +/* + * We encode the delta list information into 16 bytes per list. + * + * Because the master index has 1 million delta lists, each byte of header + * information ends up costing us 1MB. We have an incentive to keep the + * size down. + * + * The master index delta list memory is currently about 780MB in size, + * which is more than 6 gigabits. Therefore we need at least 33 bits to + * address the master index memory and we use the uint64_t type. + * + * The master index delta lists have 256 entries of about 24 bits each, + * which is 6K bits. The index needs 13 bits to represent the size of a + * delta list and we use the uint16_t type. + */ + +typedef struct deltaList { + uint64_t startOffset; // The offset of the delta list start within memory + uint16_t size; // The number of bits in the delta list + uint16_t saveOffset; // Where the last search "found" the key + unsigned int saveKey; // The key for the record just before saveOffset. 
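+  // 8 + 2 + 2 + 4 bytes = the 16 bytes per list cited above (assuming a
+  // 4-byte unsigned int and no structure padding).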
+} DeltaList; + +typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) deltaMemory { + byte *memory; // The delta list memory + DeltaList *deltaLists; // The delta list headers + uint64_t *tempOffsets; // Temporary starts of delta lists + byte *flags; // Transfer flags + BufferedWriter *bufferedWriter; // Buffered writer for saving an index + size_t size; // The size of delta list memory + RelTime rebalanceTime; // The time spent rebalancing + int rebalanceCount; // Number of memory rebalances + unsigned short valueBits; // The number of bits of value + unsigned short minBits; // The number of bits in the minimal key code + unsigned int minKeys; // The number of keys used in a minimal code + unsigned int incrKeys; // The number of keys used for another code bit + long recordCount; // The number of records in the index + long collisionCount; // The number of collision records + long discardCount; // The number of records removed + long overflowCount; // The number of UDS_OVERFLOWs detected + unsigned int firstList; // The index of the first delta list + unsigned int numLists; // The number of delta lists + unsigned int numTransfers; // Number of transfer flags that are set + int transferStatus; // Status of the transfers in progress + byte tag; // Tag belonging to this delta index +} DeltaMemory; + +typedef struct deltaListSaveInfo { + uint8_t tag; // Tag identifying which delta index this list is in + uint8_t bitOffset; // Bit offset of the start of the list data + uint16_t byteCount; // Number of bytes of list data + uint32_t index; // The delta list number within the delta index +} DeltaListSaveInfo; + +// The maximum size of a single delta list (in bytes). We add guard bytes +// to this because such a buffer can be used with moveBits. +enum { DELTA_LIST_MAX_BYTE_COUNT = ((UINT16_MAX + CHAR_BIT) / CHAR_BIT + + POST_FIELD_GUARD_BYTES) }; + +/** + * Initialize delta list memory. + * + * @param deltaMemory A delta memory structure + * @param size The initial size of the memory array + * @param firstList The index of the first delta list + * @param numLists The number of delta lists + * @param meanDelta The mean delta + * @param numPayloadBits The number of payload bits + * + * @return error code or UDS_SUCCESS + **/ +int initializeDeltaMemory(DeltaMemory *deltaMemory, size_t size, + unsigned int firstList, unsigned int numLists, + unsigned int meanDelta, unsigned int numPayloadBits) + __attribute__((warn_unused_result)); + +/** + * Uninitialize delta list memory. + * + * @param deltaMemory A delta memory structure + **/ +void uninitializeDeltaMemory(DeltaMemory *deltaMemory); + +/** + * Initialize delta list memory to refer to a cached page. + * + * @param deltaMemory A delta memory structure + * @param memory The memory page + * @param size The size of the memory page + * @param numLists The number of delta lists + * @param meanDelta The mean delta + * @param numPayloadBits The number of payload bits + **/ +void initializeDeltaMemoryPage(DeltaMemory *deltaMemory, byte *memory, + size_t size, unsigned int numLists, + unsigned int meanDelta, + unsigned int numPayloadBits); + +/** + * Empty the delta lists. + * + * @param deltaMemory The delta memory + **/ +void emptyDeltaLists(DeltaMemory *deltaMemory); + +/** + * Is there a delta list memory save or restore in progress? 
+ * + * @param deltaMemory A delta memory structure + * + * @return true if there are no delta lists that need to be saved or + * restored + **/ +bool areDeltaMemoryTransfersDone(const DeltaMemory *deltaMemory); + +/** + * Start restoring delta list memory from a file descriptor + * + * @param deltaMemory A delta memory structure + * + * @return error code or UDS_SUCCESS + **/ +int startRestoringDeltaMemory(DeltaMemory *deltaMemory) + __attribute__((warn_unused_result)); + +/** + * Read a saved delta list from a file descriptor + * + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * @param bufferedReader The buffered reader to read the delta list from + * + * @return error code or UDS_SUCCESS + * or UDS_END_OF_FILE at end of the data stream + **/ +int readSavedDeltaList(DeltaListSaveInfo *dlsi, + byte data[DELTA_LIST_MAX_BYTE_COUNT], + BufferedReader *bufferedReader) + __attribute__((warn_unused_result)); + +/** + * Restore a saved delta list + * + * @param deltaMemory A delta memory structure + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * + * @return error code or UDS_SUCCESS + **/ +int restoreDeltaList(DeltaMemory *deltaMemory, const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) + __attribute__((warn_unused_result)); + +/** + * Abort restoring delta list memory from an input stream. + * + * @param deltaMemory A delta memory structure + **/ +void abortRestoringDeltaMemory(DeltaMemory *deltaMemory); + +/** + * Start saving delta list memory to a buffered output stream + * + * @param deltaMemory A delta memory structure + * @param bufferedWriter The index state component being written + **/ +void startSavingDeltaMemory(DeltaMemory *deltaMemory, + BufferedWriter *bufferedWriter); + +/** + * Finish saving delta list memory to an output stream. Force the writing + * of all of the remaining data. If an error occurred asynchronously + * during the save operation, it will be returned here. + * + * @param deltaMemory A delta memory structure + * + * @return error code or UDS_SUCCESS + **/ +int finishSavingDeltaMemory(DeltaMemory *deltaMemory) + __attribute__((warn_unused_result)); + +/** + * Abort saving delta list memory to an output stream. If an error + * occurred asynchronously during the save operation, it will be dropped. + * + * @param deltaMemory A delta memory structure + **/ +void abortSavingDeltaMemory(DeltaMemory *deltaMemory); + +/** + * Flush a delta list to an output stream + * + * @param deltaMemory A delta memory structure + * @param flushIndex Index of the delta list that may need to be flushed. + **/ +void flushDeltaList(DeltaMemory *deltaMemory, unsigned int flushIndex); + +/** + * Write a guard delta list to mark the end of the saved data + * + * @param bufferedWriter The buffered writer to write the guard delta list to + * + * @return error code or UDS_SUCCESS + **/ +int writeGuardDeltaList(BufferedWriter *bufferedWriter) + __attribute__((warn_unused_result)); + +/** + * Extend the memory used by the delta lists and rebalance the lists in the + * new chunk. + * + *

The delta memory contains N delta lists, which are guarded by two + * empty delta lists. The valid delta lists are numbered 1 to N, and the + * guards are numbered 0 and (N+1). + * + *
When the delta lista are bit streams, it is possible that the tail + * of list J and the head of list (J+1) are in the same byte. In this case + * oldOffsets[j]+sizes[j]==oldOffset[j]-1. We handle this correctly. + * + * @param deltaMemory A delta memory structure + * @param growingIndex Index of the delta list that needs additional space + * left before it (from 1 to N+1). + * @param growingSize Number of additional bytes needed before growingIndex + * @param doCopy True to copy the data, False to just balance the space + * + * @return UDS_SUCCESS or an error code + **/ +int extendDeltaMemory(DeltaMemory *deltaMemory, unsigned int growingIndex, + size_t growingSize, bool doCopy) + __attribute__((warn_unused_result)); + +/** + * Validate the delta list headers. + * + * @param deltaMemory A delta memory structure + * + * @return UDS_SUCCESS or an error code + **/ +int validateDeltaLists(const DeltaMemory *deltaMemory) + __attribute__((warn_unused_result)); + +/** + * Get the number of bytes allocated for delta index entries and any + * associated overhead. + * + * @param deltaMemory A delta memory structure + * + * @return The number of bytes allocated + **/ +size_t getDeltaMemoryAllocated(const DeltaMemory *deltaMemory); + +/** + * Get the expected number of bits used in a delta index + * + * @param numEntries The number of index entries + * @param meanDelta The mean delta value + * @param numPayloadBits The number of bits in the payload or value + * + * @return The expected size of a delta index in bits + **/ +size_t getDeltaMemorySize(unsigned long numEntries, unsigned int meanDelta, + unsigned int numPayloadBits) + __attribute__((warn_unused_result)); + +/** + * Get the bit offset to the start of the delta list bit stream + * + * @param deltaList The delta list header + * + * @return the start of the delta list + **/ +static INLINE uint64_t getDeltaListStart(const DeltaList *deltaList) +{ + return deltaList->startOffset; +} + +/** + * Get the number of bits in a delta list bit stream + * + * @param deltaList The delta list header + * + * @return the size of the delta list + **/ +static INLINE uint16_t getDeltaListSize(const DeltaList *deltaList) +{ + return deltaList->size; +} + +/** + * Get the bit offset to the end of the delta list bit stream + * + * @param deltaList The delta list header + * + * @return the end of the delta list + **/ +static INLINE uint64_t getDeltaListEnd(const DeltaList *deltaList) +{ + return getDeltaListStart(deltaList) + getDeltaListSize(deltaList); +} + +/** + * Identify mutable vs. immutable delta memory + * + * Mutable delta memory contains delta lists that can be modified, and is + * initialized using initializeDeltaMemory(). + * + * Immutable delta memory contains packed delta lists, cannot be modified, + * and is initialized using initializeDeltaMemoryPage(). + * + * For mutable delta memory, all of the following expressions are true. + * And for immutable delta memory, all of the following expressions are + * false. + * deltaLists != NULL + * tempOffsets != NULL + * flags != NULL + * + * @param deltaMemory A delta memory structure + * + * @return true if the delta memory is mutable + **/ +static INLINE bool isMutable(const DeltaMemory *deltaMemory) +{ + return deltaMemory->deltaLists != NULL; +} + +/** + * Lazily flush a delta list to an output stream + * + * @param deltaMemory A delta memory structure + * @param flushIndex Index of the delta list that may need to be flushed. 
+ **/ +static INLINE void lazyFlushDeltaList(DeltaMemory *deltaMemory, + unsigned int flushIndex) +{ + if (getField(deltaMemory->flags, flushIndex, 1) != 0) { + flushDeltaList(deltaMemory, flushIndex); + } +} +#endif /* DELTAMEMORY_H */ diff --git a/uds/errors.c b/uds/errors.c new file mode 100644 index 0000000..5aab19e --- /dev/null +++ b/uds/errors.c @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/errors.c#11 $ + */ + +#include "errors.h" + +#include "common.h" +#include "permassert.h" +#include "stringUtils.h" + +#ifdef __KERNEL__ +#include +#endif + +static const struct errorInfo successful = { "UDS_SUCCESS", "Success" }; + +#ifdef __KERNEL__ +static const char *const messageTable[] = { + [EPERM] = "Operation not permitted", + [ENOENT] = "No such file or directory", + [ESRCH] = "No such process", + [EINTR] = "Interrupted system call", + [EIO] = "Input/output error", + [ENXIO] = "No such device or address", + [E2BIG] = "Argument list too long", + [ENOEXEC] = "Exec format error", + [EBADF] = "Bad file descriptor", + [ECHILD] = "No child processes", + [EAGAIN] = "Resource temporarily unavailable", + [ENOMEM] = "Cannot allocate memory", + [EACCES] = "Permission denied", + [EFAULT] = "Bad address", + [ENOTBLK] = "Block device required", + [EBUSY] = "Device or resource busy", + [EEXIST] = "File exists", + [EXDEV] = "Invalid cross-device link", + [ENODEV] = "No such device", + [ENOTDIR] = "Not a directory", + [EISDIR] = "Is a directory", + [EINVAL] = "Invalid argument", + [ENFILE] = "Too many open files in system", + [EMFILE] = "Too many open files", + [ENOTTY] = "Inappropriate ioctl for device", + [ETXTBSY] = "Text file busy", + [EFBIG] = "File too large", + [ENOSPC] = "No space left on device", + [ESPIPE] = "Illegal seek", + [EROFS] = "Read-only file system", + [EMLINK] = "Too many links", + [EPIPE] = "Broken pipe", + [EDOM] = "Numerical argument out of domain", + [ERANGE] = "Numerical result out of range" +}; +#endif + +static const struct errorInfo errorList[] = { + { "UDS_UNINITIALIZED", "UDS library is not initialized" }, + { "UDS_SHUTTINGDOWN", "UDS library is shutting down" }, + { "UDS_EMODULE_LOAD", "Could not load modules" }, + { "UDS_ENOTHREADS", "Could not create a new thread" }, + { "UDS_NOCONTEXT", "Could not find the requested library context" }, + { "UDS_DISABLED", "UDS library context is disabled" }, + { "UDS_CORRUPT_COMPONENT", "Corrupt saved component" }, + { "UDS_UNKNOWN_ERROR", "Unknown error" }, + { "UDS_UNUSED_CODE_8", "Unused error code 8" }, + { "UDS_UNUSED_CODE_9", "Unused error code 9" }, + { "UDS_UNSUPPORTED_VERSION", "Unsupported version" }, + { "UDS_NO_INDEXSESSION", "Index session not known" }, + { "UDS_CORRUPT_DATA", "Index data in memory is corrupt" }, + { "UDS_SHORT_READ", 
"Could not read requested number of bytes" }, + { "UDS_UNUSED_CODE_14", "Unused error code 14" }, + { "UDS_RESOURCE_LIMIT_EXCEEDED", "Internal resource limits exceeded" }, + { "UDS_VOLUME_OVERFLOW", "Memory overflow due to storage failure" }, + { "UDS_UNUSED_CODE_17", "Unused error code 17" }, + { "UDS_UNUSED_CODE_18", "Unused error code 18" }, + { "UDS_UNUSED_CODE_19", "Unused error code 19" }, + { "UDS_CONF_PTR_REQUIRED", "A configuration pointer is required" }, + { "UDS_INDEX_STATS_PTR_REQUIRED", "An index stats pointer is required" }, + { "UDS_CONTEXT_STATS_PTR_REQUIRED", "A context stats pointer is required" }, + { "UDS_UNUSED_CODE_23", "Unused error code 23" }, + { "UDS_UNUSED_CODE_24", "Unused error code 24" }, + { "UDS_UNUSED_CODE_25", "Unused error code 25" }, + { "UDS_UNUSED_CODE_26", "Unused error code 26" }, + { "UDS_UNUSED_CODE_27", "Unused error code 27" }, + { "UDS_INVALID_MEMORY_SIZE", + "Configured memory too small or unsupported size" }, + { "UDS_UNUSED_CODE_29", "Unused error code 29" }, + { "UDS_INDEX_NAME_REQUIRED", "An index name is required" }, + { "UDS_CONF_REQUIRED", "A configuration is required" }, + { "UDS_UNUSED_CODE_32", "Unused error code 32" }, + { "UDS_UNUSED_CODE_33", "Unused error code 33" }, + { "UDS_UNUSED_CODE_34", "Unused error code 34" }, + { "UDS_UNUSED_CODE_35", "Unused error code 35" }, + { "UDS_UNUSED_CODE_36", "Unused error code 36" }, + { "UDS_NO_INDEX", "No index found" }, + { "UDS_BAD_CHECKPOINT_FREQUENCY", "Checkpoint frequency out of range" }, + { "UDS_WRONG_INDEX_CONFIG", "Wrong type of index configuration" }, + { "UDS_UNUSED_CODE_40", "Unused error code 40" }, + { "UDS_UNUSED_CODE_41", "Unused error code 41" }, + { "UDS_UNUSED_CODE_42", "Unused error code 42" }, + { "UDS_UNUSED_CODE_43", "Unused error code 43" }, + { "UDS_END_OF_FILE", "Unexpected end of file" }, + { "UDS_INDEX_NOT_SAVED_CLEANLY", "Index not saved cleanly" }, + { "UDS_UNUSED_CODE_46", "Unused error code 46" }, + { "UDS_INSUFFICIENT_INDEX_SPACE", "Insufficient index space" }, + { "UDS_UNUSED_CODE_48", "Unused error code 48" }, + { "UDS_UNUSED_CODE_49", "Unused error code 49" }, + { "UDS_SUSPENDED", "Index suspended"}, + { "UDS_UNUSED_CODE_51", "Unused error code 51" }, + { "UDS_INDEXSESSION_IN_USE", "Index session in use"}, + { "UDS_CALLBACK_REQUIRED", "A callback function is required"}, + { "UDS_INVALID_OPERATION_TYPE", "Invalid type of request operation"}, +}; + +static const struct errorInfo internalErrorList[] = { + { "UDS_INTERNAL_UNUSED_0", "Unused internal error 0" }, + { "UDS_OVERFLOW", "Index overflow" }, + { "UDS_INTERNAL_UNUSED_2", "Unused internal error 2" }, + { "UDS_INVALID_ARGUMENT", "Invalid argument passed to internal routine" }, + { "UDS_BAD_STATE", "UDS data structures are in an invalid state" }, + { "UDS_DUPLICATE_NAME", + "Attempt to enter the same name into a delta index twice" }, + { "UDS_UNEXPECTED_RESULT", "Unexpected result from internal routine" }, + { "UDS_INJECTED_ERROR", "Injected error" }, + { "UDS_ASSERTION_FAILED", "Assertion failed" }, + { "UDS_INTERNAL_UNUSED_9", "Unused internal error 9" }, + { "UDS_QUEUED", "Request queued" }, + { "UDS_INTERNAL_UNUSED_11", "Unused internal error 11" }, + { "UDS_INTERNAL_UNUSED_12", "Unused internal error 12" }, + { "UDS_BUFFER_ERROR", "Buffer error" }, + { "UDS_INTERNAL_UNUSED_14", "Unused internal error 14" }, + { "UDS_INTERNAL_UNUSED_15", "Unused internal error 15" }, + { "UDS_NO_DIRECTORY", "Expected directory is missing" }, + { "UDS_CHECKPOINT_INCOMPLETE", "Checkpoint not completed" }, + { 
"UDS_INTERNAL_UNUSED_18", "Unused internal error 18" }, + { "UDS_INTERNAL_UNUSED_19", "Unused internal error 19" }, + { "UDS_ALREADY_REGISTERED", "Error range already registered" }, + { "UDS_BAD_IO_DIRECTION", "Bad I/O direction" }, + { "UDS_INCORRECT_ALIGNMENT", "Offset not at block alignment" }, + { "UDS_OUT_OF_RANGE", "Cannot access data outside specified limits" }, +}; + +typedef struct errorBlock { + const char *name; + int base; + int last; + int max; + const ErrorInfo *infos; +} ErrorBlock; + +enum { + MAX_ERROR_BLOCKS = 6 // needed for testing +}; + +static struct errorInformation { + int allocated; + int count; + ErrorBlock blocks[MAX_ERROR_BLOCKS]; +} registeredErrors = { + .allocated = MAX_ERROR_BLOCKS, + .count = 2, + .blocks = { + { + .name = "UDS Error", + .base = UDS_ERROR_CODE_BASE, + .last = UDS_ERROR_CODE_LAST, + .max = UDS_ERROR_CODE_BLOCK_END, + .infos = errorList, + }, + { + .name = "UDS Internal Error", + .base = UDS_INTERNAL_ERROR_CODE_BASE, + .last = UDS_INTERNAL_ERROR_CODE_LAST, + .max = UDS_INTERNAL_ERROR_CODE_BLOCK_END, + .infos = internalErrorList, + } + } +}; + +/** + * Fetch the error info (if any) for the error number. + * + * @param errnum the error number + * @param infoPtr the place to store the info for this error (if known), + * otherwise set to NULL + * + * @return the name of the error block (if known), NULL othersise + **/ +static const char *getErrorInfo(int errnum, const ErrorInfo **infoPtr) +{ + + if (errnum == UDS_SUCCESS) { + if (infoPtr != NULL) { + *infoPtr = &successful; + } + return NULL; + } + + ErrorBlock *block; + for (block = registeredErrors.blocks; + block < registeredErrors.blocks + registeredErrors.count; + ++block) { + if ((errnum >= block->base) && (errnum < block->last)) { + if (infoPtr != NULL) { + *infoPtr = block->infos + (errnum - block->base); + } + return block->name; + } else if ((errnum >= block->last) && (errnum < block->max)) { + if (infoPtr != NULL) { + *infoPtr = NULL; + } + return block->name; + } + } + if (infoPtr != NULL) { + *infoPtr = NULL; + } + return NULL; +} + +/** + * Return string describing a system error message + * + * @param errnum System error number + * @param buf Buffer that can be used to contain the return value + * @param buflen Length of the buffer + * + * @return The error string, which may be a string constant or may be + * returned in the buf argument + **/ +#ifdef __KERNEL__ +static const char *systemStringError(int errnum, char *buf, size_t buflen) +{ + const char *errorString = NULL; + if ((errnum > 0) && (errnum < COUNT_OF(messageTable))) { + errorString = messageTable[errnum]; + } + + size_t len = ((errorString == NULL) + ? 
snprintf(buf, buflen, "Unknown error %d", errnum) + : snprintf(buf, buflen, "%s", errorString)); + if (len < buflen) { + return buf; + } + + buf[0] = '\0'; + return "System error"; +} +#else +static INLINE const char *systemStringError(int errnum, char *buf, + size_t buflen) +{ + return strerror_r(errnum, buf, buflen); +} +#endif + +/*****************************************************************************/ +const char *stringError(int errnum, char *buf, size_t buflen) +{ + if (buf == NULL) { + return NULL; + } + + char *buffer = buf; + char *bufEnd = buf + buflen; + + if (isUnrecoverable(errnum)) { + buffer = appendToBuffer(buffer, bufEnd, "Unrecoverable error: "); + errnum = sansUnrecoverable(errnum); + } + + const ErrorInfo *info = NULL; + const char *blockName = getErrorInfo(errnum, &info); + + if (blockName != NULL) { + if (info != NULL) { + buffer = appendToBuffer(buffer, bufEnd, + "%s: %s", blockName, info->message); + } else { + buffer = appendToBuffer(buffer, bufEnd, + "Unknown %s %d", blockName, errnum); + } + } else if (info != NULL) { + buffer = appendToBuffer(buffer, bufEnd, "%s", info->message); + } else { + const char *tmp = systemStringError(errnum, buffer, bufEnd - buffer); + if (tmp != buffer) { + buffer = appendToBuffer(buffer, bufEnd, "%s", tmp); + } else { + buffer += strlen(tmp); + } + } + return buf; +} + +/*****************************************************************************/ +const char *stringErrorName(int errnum, char *buf, size_t buflen) +{ + errnum = sansUnrecoverable(errnum); + + char *buffer = buf; + char *bufEnd = buf + buflen; + + const ErrorInfo *info = NULL; + const char *blockName = getErrorInfo(errnum, &info); + + if (blockName != NULL) { + if (info != NULL) { + buffer = appendToBuffer(buffer, bufEnd, "%s", info->name); + } else { + buffer = appendToBuffer(buffer, bufEnd, "%s %d", blockName, errnum); + } + } else if (info != NULL) { + buffer = appendToBuffer(buffer, bufEnd, "%s", info->name); + } else { + const char *tmp = systemStringError(errnum, buffer, bufEnd - buffer); + if (tmp != buffer) { + buffer = appendToBuffer(buffer, bufEnd, "%s", tmp); + } else { + buffer += strlen(tmp); + } + } + return buf; +} + +/*****************************************************************************/ +int registerErrorBlock(const char *blockName, + int firstError, + int lastReservedError, + const ErrorInfo *infos, + size_t infoSize) +{ + int result = ASSERT(firstError < lastReservedError, + "bad error block range"); + if (result != UDS_SUCCESS) { + return result; + } + + if (registeredErrors.count == registeredErrors.allocated) { + // could reallocate and grow, but should never happen + return UDS_OVERFLOW; + } + + ErrorBlock *block; + for (block = registeredErrors.blocks; + block < registeredErrors.blocks + registeredErrors.count; + ++block) { + if (strcmp(blockName, block->name) == 0) { + return UDS_DUPLICATE_NAME; + } + // check for overlap in error ranges + if ((firstError < block->max) && (lastReservedError > block->base)) { + return UDS_ALREADY_REGISTERED; + } + } + + registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { + .name = blockName, + .base = firstError, + .last = firstError + (infoSize / sizeof(ErrorInfo)), + .max = lastReservedError, + .infos = infos + }; + + return UDS_SUCCESS; +} diff --git a/uds/errors.h b/uds/errors.h new file mode 100644 index 0000000..faccd5a --- /dev/null +++ b/uds/errors.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/errors.h#4 $ + */ + +#ifndef ERRORS_H +#define ERRORS_H + +#include "compiler.h" +#include "typeDefs.h" +#include "uds-error.h" + +enum udsInternalErrorCodes { + /** Used as a base value for reporting internal errors */ + UDS_INTERNAL_ERROR_CODE_BASE = 66560, + /** Unused */ + UDS_INTERNAL_UNUSED_0 = UDS_INTERNAL_ERROR_CODE_BASE + 0, + /** Index overflow */ + UDS_OVERFLOW = UDS_INTERNAL_ERROR_CODE_BASE + 1, + /** Unused */ + UDS_INTERNAL_UNUSED_2 = UDS_INTERNAL_ERROR_CODE_BASE + 2, + /** Invalid argument passed to internal routine */ + UDS_INVALID_ARGUMENT = UDS_INTERNAL_ERROR_CODE_BASE + 3, + /** UDS data structures are in an invalid state */ + UDS_BAD_STATE = UDS_INTERNAL_ERROR_CODE_BASE + 4, + /** Attempt to enter the same name into an internal structure twice */ + UDS_DUPLICATE_NAME = UDS_INTERNAL_ERROR_CODE_BASE + 5, + /** An internal protocol violation between system components */ + UDS_UNEXPECTED_RESULT = UDS_INTERNAL_ERROR_CODE_BASE + 6, + /** An error created by test case processing */ + UDS_INJECTED_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 7, + /** An assertion failed */ + UDS_ASSERTION_FAILED = UDS_INTERNAL_ERROR_CODE_BASE + 8, + /** Unused */ + UDS_INTERNAL_UNUSED_9 = UDS_INTERNAL_ERROR_CODE_BASE + 9, + /** Not an actual error, but reporting that the result will be delayed */ + UDS_QUEUED = UDS_INTERNAL_ERROR_CODE_BASE + 10, + /** Unused */ + UDS_INTERNAL_UNUSED_11 = UDS_INTERNAL_ERROR_CODE_BASE + 11, + /** Unused */ + UDS_INTERNAL_UNUSED_12 = UDS_INTERNAL_ERROR_CODE_BASE + 12, + /** A problem has occured with a Buffer */ + UDS_BUFFER_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 13, + /** Unused */ + UDS_INTERNAL_UNUSED_14 = UDS_INTERNAL_ERROR_CODE_BASE + 14, + /** Unused */ + UDS_INTERNAL_UNUSED_15 = UDS_INTERNAL_ERROR_CODE_BASE + 15, + /** No directory was found where one was expected */ + UDS_NO_DIRECTORY = UDS_INTERNAL_ERROR_CODE_BASE + 16, + /** Checkpoint not completed */ + UDS_CHECKPOINT_INCOMPLETE = UDS_INTERNAL_ERROR_CODE_BASE + 17, + /** Unused */ + UDS_INTERNAL_UNUSED_18 = UDS_INTERNAL_ERROR_CODE_BASE + 18, + /** Unused */ + UDS_INTERNAL_UNUSED_19 = UDS_INTERNAL_ERROR_CODE_BASE + 19, + /** This error range has already been registered */ + UDS_ALREADY_REGISTERED = UDS_INTERNAL_ERROR_CODE_BASE + 20, + /** Either read-only or write-only */ + UDS_BAD_IO_DIRECTION = UDS_INTERNAL_ERROR_CODE_BASE + 21, + /** Cannot do I/O at this offset */ + UDS_INCORRECT_ALIGNMENT = UDS_INTERNAL_ERROR_CODE_BASE + 22, + /** Attempt to read or write data outside the bounds established for it */ + UDS_OUT_OF_RANGE = UDS_INTERNAL_ERROR_CODE_BASE + 23, + /** One more than the last UDS_INTERNAL error code */ + UDS_INTERNAL_ERROR_CODE_LAST, + /** One more than the last error this block will ever use */ + 
UDS_INTERNAL_ERROR_CODE_BLOCK_END = UDS_INTERNAL_ERROR_CODE_BASE + 440 +}; + +enum { + ERRBUF_SIZE = 128 // default size for buffer passed to stringError +}; + +// Error attributes - or into top half of error code +enum { UDS_UNRECOVERABLE = (1 << 17) }; + +const char *stringError(int errnum, char *buf, size_t buflen); +const char *stringErrorName(int errnum, char *buf, size_t buflen); + +/* + * Identify that an result code is a successful result. + * + * @param result A result code + * + * @return true if the result represents a success. + */ +__attribute__((warn_unused_result)) +static INLINE bool isSuccessful(int result) +{ + return (result == UDS_SUCCESS) || (result == UDS_QUEUED); +} + +/* + * Identify that an result code has been marked unrecoverable. + * + * @param result A result code + * + * @return true if the result has been marked unrecoverable. + */ +__attribute__((warn_unused_result)) +static INLINE bool isUnrecoverable(int result) +{ + return (result & UDS_UNRECOVERABLE) != 0; +} + +/* + * Mark a result code as unrecoverable. + * + * @param result A result code + * + * @return the result code with the unrecoverable marker added + */ +__attribute__((warn_unused_result)) +static INLINE int makeUnrecoverable(int result) +{ + return isSuccessful(result) ? result : (result | UDS_UNRECOVERABLE); +} + +/* + * Remove the unrecoverable marker from a result code. + * + * @param result A result code + * + * @return the result code with the unrecoverable marker removed + */ +__attribute__((warn_unused_result)) +static INLINE int sansUnrecoverable(int result) +{ + return result & ~UDS_UNRECOVERABLE; +} + +typedef struct errorInfo { + const char *name; + const char *message; +} ErrorInfo; + +/** + * Register an error code block for stringError and stringErrorName. + * + * @param blockName the name of the block of error codes + * @param firstError the first error code in the block + * @param lastReservedError one past the highest possible error in the bloc + * @param infos a pointer to the error info array for the block + * @param infoSize the size of the error info array, which + * determines the last actual error for which + * information is available + * + * @return a success or error code, particularly UDS_DUPLICATE_NAME if the + * block name is already present, or UDS_ALREADY_REGISTERED if a + * block with the specified error code is present + **/ +int registerErrorBlock(const char *blockName, + int firstError, + int lastReservedError, + const ErrorInfo *infos, + size_t infoSize); + +/** + * Return the first error between result1 and result2. + * + * @param result1 A success or error code. + * @param result2 A success or error code. + * + * @return result1 if that is an error, else result2 + **/ +static INLINE int firstError(int result1, int result2) +{ + return result1 == UDS_SUCCESS ? result2 : result1; +} + +#endif /* ERRORS_H */ diff --git a/uds/geometry.c b/uds/geometry.c new file mode 100644 index 0000000..6d8cfa6 --- /dev/null +++ b/uds/geometry.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/geometry.c#3 $ + */ + +#include "geometry.h" + +#include "deltaIndex.h" +#include "errors.h" +#include "hashUtils.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "uds.h" + +/**********************************************************************/ +static int initializeGeometry(Geometry *geometry, + size_t bytesPerPage, + unsigned int recordPagesPerChapter, + unsigned int chaptersPerVolume, + unsigned int sparseChaptersPerVolume) +{ + int result = ASSERT_WITH_ERROR_CODE(bytesPerPage >= BYTES_PER_RECORD, + UDS_BAD_STATE, + "page is smaller than a record: %zu", + bytesPerPage); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT_WITH_ERROR_CODE(chaptersPerVolume > sparseChaptersPerVolume, + UDS_INVALID_ARGUMENT, + "sparse chapters per volume (%u) must be less" + " than chapters per volume (%u)", + sparseChaptersPerVolume, + chaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + + geometry->bytesPerPage = bytesPerPage; + geometry->recordPagesPerChapter = recordPagesPerChapter; + geometry->chaptersPerVolume = chaptersPerVolume; + geometry->sparseChaptersPerVolume = sparseChaptersPerVolume; + geometry->denseChaptersPerVolume = + chaptersPerVolume - sparseChaptersPerVolume; + + // Calculate the number of records in a page, chapter, and volume. + geometry->recordsPerPage = bytesPerPage / BYTES_PER_RECORD; + geometry->recordsPerChapter + = geometry->recordsPerPage * recordPagesPerChapter; + geometry->recordsPerVolume + = (unsigned long) geometry->recordsPerChapter * chaptersPerVolume; + geometry->openChapterLoadRatio = DEFAULT_OPEN_CHAPTER_LOAD_RATIO; + + // Initialize values for delta chapter indexes. + geometry->chapterMeanDelta = 1 << DEFAULT_CHAPTER_MEAN_DELTA_BITS; + geometry->chapterPayloadBits = computeBits(recordPagesPerChapter - 1); + // We want 1 delta list for every 64 records in the chapter. The "| 077" + // ensures that the chapterDeltaListBits computation does not underflow. + geometry->chapterDeltaListBits + = computeBits((geometry->recordsPerChapter - 1) | 077) - 6; + geometry->deltaListsPerChapter = 1 << geometry->chapterDeltaListBits; + // We need enough address bits to achieve the desired mean delta. + geometry->chapterAddressBits + = (DEFAULT_CHAPTER_MEAN_DELTA_BITS - geometry->chapterDeltaListBits + + computeBits(geometry->recordsPerChapter - 1)); + // Let the delta index code determine how many pages are needed for the index + geometry->indexPagesPerChapter + = getDeltaIndexPageCount(geometry->recordsPerChapter, + geometry->deltaListsPerChapter, + geometry->chapterMeanDelta, + geometry->chapterPayloadBits, + bytesPerPage); + + // Now that we have the size of a chapter index, we can calculate the + // space used by chapters and volumes. 
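For the default layout declared in geometry.h (1024 records per page, 256 record pages per chapter, 1024 chapters, mean delta 2^16), these formulas come out to round numbers: 2^18 records per chapter, 2^28 records per volume, 12 delta-list bits (one list per 64 records), 8 payload bits, and 22 address bits. A standalone sketch that rechecks the arithmetic with a local copy of the bit-counting loop (the real computeBits() lives in hashUtils.c):

#include <assert.h>

/* Local stand-in for computeBits() from hashUtils.c. */
static unsigned int bitsFor(unsigned int maxValue)
{
  unsigned int bits = 0;
  while (maxValue > 0) {
    maxValue >>= 1;
    bits++;
  }
  return bits;
}

int main(void)
{
  /* Default parameters, per the enums in geometry.h. */
  unsigned int recordsPerPage        = 1024; /* DEFAULT_BYTES_PER_PAGE / BYTES_PER_RECORD */
  unsigned int recordPagesPerChapter = 256;
  unsigned int chaptersPerVolume     = 1024;

  unsigned int recordsPerChapter = recordsPerPage * recordPagesPerChapter;
  assert(recordsPerChapter == (1 << 18));
  assert((unsigned long) recordsPerChapter * chaptersPerVolume == (1UL << 28));

  /* One delta list per 64 records: computeBits(2^18 - 1) - 6 == 12. */
  unsigned int chapterDeltaListBits = bitsFor((recordsPerChapter - 1) | 077) - 6;
  assert(chapterDeltaListBits == 12);
  assert((1 << chapterDeltaListBits) == 4096);

  /* The payload is a record page number: computeBits(255) == 8. */
  assert(bitsFor(recordPagesPerChapter - 1) == 8);

  /* Address bits sized to reach the 2^16 mean delta: 16 - 12 + 18 == 22. */
  unsigned int chapterAddressBits
    = 16 - chapterDeltaListBits + bitsFor(recordsPerChapter - 1);
  assert(chapterAddressBits == 22);
  return 0;
}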
+ geometry->pagesPerChapter + = geometry->indexPagesPerChapter + recordPagesPerChapter; + geometry->pagesPerVolume = geometry->pagesPerChapter * chaptersPerVolume; + geometry->headerPagesPerVolume = 1; + geometry->bytesPerVolume = bytesPerPage * + (geometry->pagesPerVolume + geometry->headerPagesPerVolume); + geometry->bytesPerChapter = bytesPerPage * geometry->pagesPerChapter; + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int makeGeometry(size_t bytesPerPage, + unsigned int recordPagesPerChapter, + unsigned int chaptersPerVolume, + unsigned int sparseChaptersPerVolume, + Geometry **geometryPtr) +{ + Geometry *geometry; + int result = ALLOCATE(1, Geometry, "geometry", &geometry); + if (result != UDS_SUCCESS) { + return result; + } + result = initializeGeometry(geometry, bytesPerPage, recordPagesPerChapter, + chaptersPerVolume, sparseChaptersPerVolume); + if (result != UDS_SUCCESS) { + freeGeometry(geometry); + return result; + } + + *geometryPtr = geometry; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int copyGeometry(Geometry *source, Geometry **geometryPtr) +{ + return makeGeometry(source->bytesPerPage, + source->recordPagesPerChapter, + source->chaptersPerVolume, + source->sparseChaptersPerVolume, + geometryPtr); +} + +/**********************************************************************/ +void freeGeometry(Geometry *geometry) +{ + FREE(geometry); +} + +/**********************************************************************/ +uint64_t mapToVirtualChapterNumber(Geometry *geometry, + uint64_t newestVirtualChapter, + unsigned int physicalChapter) +{ + unsigned int newestPhysicalChapter + = mapToPhysicalChapter(geometry, newestVirtualChapter); + uint64_t virtualChapter + = newestVirtualChapter - newestPhysicalChapter + physicalChapter; + if (physicalChapter > newestPhysicalChapter) { + virtualChapter -= geometry->chaptersPerVolume; + } + return virtualChapter; +} + +/**********************************************************************/ +bool hasSparseChapters(const Geometry *geometry, + uint64_t oldestVirtualChapter, + uint64_t newestVirtualChapter) +{ + return (isSparse(geometry) + && ((newestVirtualChapter - oldestVirtualChapter + 1) + > geometry->denseChaptersPerVolume)); +} + +/**********************************************************************/ +bool isChapterSparse(const Geometry *geometry, + uint64_t oldestVirtualChapter, + uint64_t newestVirtualChapter, + uint64_t virtualChapterNumber) +{ + return (hasSparseChapters(geometry, oldestVirtualChapter, + newestVirtualChapter) + && ((virtualChapterNumber + geometry->denseChaptersPerVolume) + <= newestVirtualChapter)); +} + +/**********************************************************************/ +bool areSamePhysicalChapter(const Geometry *geometry, + uint64_t chapter1, + uint64_t chapter2) +{ + return ((chapter1 % geometry->chaptersPerVolume) + == (chapter2 % geometry->chaptersPerVolume)); +} diff --git a/uds/geometry.h b/uds/geometry.h new file mode 100644 index 0000000..47f771d --- /dev/null +++ b/uds/geometry.h @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
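A caller-side sketch of this small API, using the default constants from geometry.h; the helper name and the specific chapter numbers are only illustrative. The geometry is released with freeGeometry() when the caller is done with it.

#include "geometry.h"

static int buildDefaultGeometry(Geometry **geometryPtr)
{
  /* 1024 chapters of 256 record pages each, no sparse chapters. */
  int result = makeGeometry(DEFAULT_BYTES_PER_PAGE,
                            DEFAULT_RECORD_PAGES_PER_CHAPTER,
                            DEFAULT_CHAPTERS_PER_VOLUME,
                            DEFAULT_SPARSE_CHAPTERS_PER_VOLUME,
                            geometryPtr);
  if (result != UDS_SUCCESS) {
    return result;
  }

  /* Virtual chapter numbers grow without bound; physical slots wrap. */
  Geometry *geometry = *geometryPtr;
  unsigned int physicalChapter = mapToPhysicalChapter(geometry, 1030);
  /* physicalChapter == 1030 % 1024 == 6 */
  uint64_t virtualChapter
    = mapToVirtualChapterNumber(geometry, 1030, physicalChapter);
  /* virtualChapter == 1030, since chapter 6 is the newest physical slot. */
  (void) virtualChapter;
  return UDS_SUCCESS;
}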
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/geometry.h#3 $ + */ + +#ifndef GEOMETRY_H +#define GEOMETRY_H 1 + +#include "compiler.h" +#include "typeDefs.h" +#include "uds.h" +#include "uds-block.h" + +/** + * Geometry defines constants and a record that parameterize the layout of an + * Albireo index volume. + * + *
An index volume is divided into a fixed number of fixed-size + * chapters, each consisting of a fixed number of fixed-size + * pages. The volume layout is defined by two assumptions and four + * parameters. The assumptions (constants) are that index records are + * 64 bytes (32-byte block name plus 32-byte metadata) and that open + * chapter index hash slots are one byte long. The four parameters are + * the number of bytes in a page, the number of chapters in a volume, + * the number of record pages in a chapter, and the number of chapters + * that are sparse. From these parameters, we derive the rest of the + * layout and derived properties, ranging from the number of pages in + * a chapter to the number of records in the volume. + * + *
The default geometry is 64 KByte pages, 1024 chapters, 256 + * record pages in a chapter, and zero sparse chapters. This will + * allow us to store 2^28 entries (indexing 1TB of 4K blocks) in an + * approximately 16.5 MByte volume using fourteen index pages in each + * chapter. + **/ +typedef struct geometry { + /** Length of a page in a chapter, in bytes */ + size_t bytesPerPage; + /** Number of record pages in a chapter */ + unsigned int recordPagesPerChapter; + /** Number of (total) chapters in a volume */ + unsigned int chaptersPerVolume; + /** Number of sparsely-indexed chapters in a volume */ + unsigned int sparseChaptersPerVolume; + /** Number of bits used to determine delta list numbers */ + unsigned int chapterDeltaListBits; + + // These are derived properties, expressed as fields for convenience. + /** Total number of pages in a volume, excluding header */ + unsigned int pagesPerVolume; + /** Total number of header pages per volume */ + unsigned int headerPagesPerVolume; + /** Total number of bytes in a volume, including header */ + size_t bytesPerVolume; + /** Total number of bytes in a chapter */ + size_t bytesPerChapter; + /** Number of pages in a chapter */ + unsigned int pagesPerChapter; + /** Number of index pages in a chapter index */ + unsigned int indexPagesPerChapter; + /** The minimum ratio of hash slots to records in an open chapter */ + unsigned int openChapterLoadRatio; + /** Number of records that fit on a page */ + unsigned int recordsPerPage; + /** Number of records that fit in a chapter */ + unsigned int recordsPerChapter; + /** Number of records that fit in a volume */ + uint64_t recordsPerVolume; + /** Number of deltaLists per chapter index */ + unsigned int deltaListsPerChapter; + /** Mean delta in chapter indexes */ + unsigned int chapterMeanDelta; + /** Number of bits needed for record page numbers */ + unsigned int chapterPayloadBits; + /** Number of bits used to compute addresses for chapter delta lists */ + unsigned int chapterAddressBits; + /** Number of densely-indexed chapters in a volume */ + unsigned int denseChaptersPerVolume; +} Geometry; + +enum { + /* The number of bytes in a record (name + metadata) */ + BYTES_PER_RECORD = (UDS_CHUNK_NAME_SIZE + UDS_MAX_BLOCK_DATA_SIZE), + + /* The default length of a page in a chapter, in bytes */ + DEFAULT_BYTES_PER_PAGE = 1024 * BYTES_PER_RECORD, + + /* The default maximum number of records per page */ + DEFAULT_RECORDS_PER_PAGE = DEFAULT_BYTES_PER_PAGE / BYTES_PER_RECORD, + + /** The default number of record pages in a chapter */ + DEFAULT_RECORD_PAGES_PER_CHAPTER = 256, + + /** The default number of record pages in a chapter for a small index */ + SMALL_RECORD_PAGES_PER_CHAPTER = 64, + + /** The default number of chapters in a volume */ + DEFAULT_CHAPTERS_PER_VOLUME = 1024, + + /** The default number of sparsely-indexed chapters in a volume */ + DEFAULT_SPARSE_CHAPTERS_PER_VOLUME = 0, + + /** The log2 of the default mean delta */ + DEFAULT_CHAPTER_MEAN_DELTA_BITS = 16, + + /** The log2 of the number of delta lists in a large chapter */ + DEFAULT_CHAPTER_DELTA_LIST_BITS = 12, + + /** The log2 of the number of delta lists in a small chapter */ + SMALL_CHAPTER_DELTA_LIST_BITS = 10, + + /** The default min ratio of slots to records in an open chapter */ + DEFAULT_OPEN_CHAPTER_LOAD_RATIO = 2, + + /** Checkpoint every n chapters written. 
Default is to not checkpoint */ + DEFAULT_CHECKPOINT_FREQUENCY = 0 +}; + +/** + * Allocate and initialize all fields of a volume geometry using the + * specified layout parameters. + * + * @param bytesPerPage The length of a page in a chapter, in bytes + * @param recordPagesPerChapter The number of pages in a chapter + * @param chaptersPerVolume The number of chapters in a volume + * @param sparseChaptersPerVolume The number of sparse chapters in a volume + * @param geometryPtr A pointer to hold the new geometry + * + * @return UDS_SUCCESS or an error code + **/ +int makeGeometry(size_t bytesPerPage, + unsigned int recordPagesPerChapter, + unsigned int chaptersPerVolume, + unsigned int sparseChaptersPerVolume, + Geometry **geometryPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate a new geometry and initialize it with the same parameters as an + * existing geometry. + * + * @param source The geometry record to copy + * @param geometryPtr A pointer to hold the new geometry + * + * @return UDS_SUCCESS or an error code + **/ +int copyGeometry(Geometry *source, + Geometry **geometryPtr) + __attribute__((warn_unused_result)); + +/** + * Clean up a geometry and its memory. + * + * @param geometry The geometry record to free + **/ +void freeGeometry(Geometry *geometry); + +/** + * Map a virtual chapter number to a physical chapter number + * + * @param geometry The geometry + * @param virtualChapter The virtual chapter number + * + * @return the corresponding physical chapter number + **/ +__attribute__((warn_unused_result)) +static INLINE unsigned int mapToPhysicalChapter(const Geometry *geometry, + uint64_t virtualChapter) +{ + return (virtualChapter % geometry->chaptersPerVolume); +} + +/** + * Convert a physical chapter number to its current virtual chapter number. + * + * @param geometry The geometry + * @param newestVirtualChapter The number of the newest virtual chapter + * @param physicalChapter The physical chapter number to convert + * + * @return The current virtual chapter number of the physical chapter + * in question + **/ +uint64_t mapToVirtualChapterNumber(Geometry *geometry, + uint64_t newestVirtualChapter, + unsigned int physicalChapter); + +/** + * Check whether this geometry is for a sparse index. + * + * @param geometry The geometry to check + * + * @return true if this geometry has sparse chapters + **/ +__attribute__((warn_unused_result)) +static INLINE bool isSparse(const Geometry *geometry) +{ + return (geometry->sparseChaptersPerVolume > 0); +} + +/** + * Check whether any sparse chapters have been filled. + * + * @param geometry The geometry of the index + * @param oldestVirtualChapter The number of the oldest chapter in the + * index + * @param newestVirtualChapter The number of the newest chapter in the + * index + * + * @return true if the index has filled at least one sparse chapter + **/ +bool hasSparseChapters(const Geometry *geometry, + uint64_t oldestVirtualChapter, + uint64_t newestVirtualChapter) + __attribute__((warn_unused_result)); + +/** + * Check whether a chapter is sparse or dense. 
+ * + * @param geometry The geometry of the index containing the chapter + * @param oldestVirtualChapter The number of the oldest chapter in the index + * @param newestVirtualChapter The number of the newest chapter in the index + * @param virtualChapterNumber The number of the chapter to check + * + * @return true if the chapter is sparse + **/ +bool isChapterSparse(const Geometry *geometry, + uint64_t oldestVirtualChapter, + uint64_t newestVirtualChapter, + uint64_t virtualChapterNumber) + __attribute__((warn_unused_result)); + +/** + * Check whether two virtual chapter numbers correspond to the same + * physical chapter. + * + * @param geometry The geometry of the index + * @param chapter1 The first chapter to compare + * @param chapter2 The second chapter to compare + * + * @return true if both chapters correspond to the same + * physical chapter + **/ +bool areSamePhysicalChapter(const Geometry *geometry, + uint64_t chapter1, + uint64_t chapter2) + __attribute__((warn_unused_result)); + +#endif /* GEOMETRY_H */ diff --git a/uds/hashUtils.c b/uds/hashUtils.c new file mode 100644 index 0000000..45b2c81 --- /dev/null +++ b/uds/hashUtils.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/hashUtils.c#2 $ + */ + +#include "hashUtils.h" + +#include "errors.h" +#include "logger.h" +#include "permassert.h" +#include "stringUtils.h" +#include "uds.h" + +/** + * Convert a byte string to the hex representation. + * + * @param data binary data to convert + * @param dataLen length of binary data + * @param hex target to write hex string into + * @param hexLen capacity of target string + * + * @return UDS_SUCCESS, + * or UDS_INVALID_ARGUMENT if hexLen + * is too short. 
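The hex helpers in this file expect a destination buffer with room for two characters per byte plus a terminating NUL; index.c sizes it as (2 * UDS_CHUNK_NAME_SIZE) + 1. A brief caller-side sketch following that same pattern (the printing wrapper is illustrative only):

#include <stdio.h>
#include <string.h>

#include "hashUtils.h"

/* Illustrative: render a chunk name for a log or error message. */
static void printChunkName(const UdsChunkName *name)
{
  char hexName[(2 * UDS_CHUNK_NAME_SIZE) + 1];
  if (chunkNameToHex(name, hexName, sizeof(hexName)) != UDS_SUCCESS) {
    strncpy(hexName, "", sizeof(hexName));
  }
  printf("chunk name: %s\n", hexName);
}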
+ **/ +static int dataToHex(const unsigned char *data, size_t dataLen, + char *hex, size_t hexLen) +{ + if (hexLen < 2 * dataLen + 1) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "hex data incorrect size"); + } + size_t i; + for (i = 0; i < dataLen; ++i) { + int rc = fixedSprintf(__func__, &hex[2 * i], hexLen - (2 * i), + UDS_INVALID_ARGUMENT, "%02X", data[i]); + + if (rc != UDS_SUCCESS) { + return rc; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int chunkNameToHex(const UdsChunkName *chunkName, + char *hexData, size_t hexDataLen) +{ + return dataToHex(chunkName->name, UDS_CHUNK_NAME_SIZE, + hexData, hexDataLen); +} + +/**********************************************************************/ +int chunkDataToHex(const UdsChunkData *chunkData, + char *hexData, size_t hexDataLen) +{ + return dataToHex(chunkData->data, UDS_MAX_BLOCK_DATA_SIZE, + hexData, hexDataLen); +} + +/**********************************************************************/ +unsigned int computeBits(unsigned int maxValue) +{ + // __builtin_clz() counts leading (high-order) zero bits, so if + // we ever need this to be fast, under GCC we can do: + // return ((maxValue == 0) ? 0 : (32 - __builtin_clz(maxValue))); + + unsigned int bits = 0; + while (maxValue > 0) { + maxValue >>= 1; + bits++; + } + return bits; +} + +/**********************************************************************/ +void hashUtilsCompileTimeAssertions(void) +{ + STATIC_ASSERT((UDS_CHUNK_NAME_SIZE % sizeof(uint64_t)) == 0); + STATIC_ASSERT(UDS_CHUNK_NAME_SIZE == 16); +} diff --git a/uds/hashUtils.h b/uds/hashUtils.h new file mode 100644 index 0000000..2d6d0a8 --- /dev/null +++ b/uds/hashUtils.h @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/hashUtils.h#1 $ + */ + +#ifndef HASH_UTILS_H +#define HASH_UTILS_H 1 + +#include "compiler.h" +#include "common.h" +#include "geometry.h" +#include "numeric.h" +#include "uds.h" + +// How various portions of a hash are apportioned. Size dependent. +enum { + MASTER_INDEX_BYTES_OFFSET = 0, // size 8 + CHAPTER_INDEX_BYTES_OFFSET = 8, // size 6 + SAMPLE_BYTES_OFFSET = 14, // size 2 + MASTER_INDEX_BYTES_COUNT = 8, + CHAPTER_INDEX_BYTES_COUNT = 6, + SAMPLE_BYTES_COUNT = 2, +}; + +/** + * Extract the portion of a block name used by the chapter index. 
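computeBits() returns the position of the highest set bit plus one, so computeBits(0) == 0, computeBits(255) == 8, and computeBits(256) == 9; geometry.c leans on this when sizing the delta-list, payload, and address fields. A few spot checks of that behavior:

#include <assert.h>

#include "hashUtils.h"

static void checkComputeBits(void)
{
  assert(computeBits(0) == 0);
  assert(computeBits(1) == 1);
  assert(computeBits(255) == 8);            /* highest record page number */
  assert(computeBits(256) == 9);
  assert(computeBits((1 << 18) - 1) == 18); /* records per default chapter, minus one */
}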
+ * + * @param name The block name + * + * @return The chapter index bytes + **/ +static INLINE uint64_t extractChapterIndexBytes(const UdsChunkName *name) +{ + // Get the high order 16 bits, then the low order 32 bits + uint64_t bytes + = (uint64_t) getUInt16BE(&name->name[CHAPTER_INDEX_BYTES_OFFSET]) << 32; + bytes |= getUInt32BE(&name->name[CHAPTER_INDEX_BYTES_OFFSET + 2]); + return bytes; +} + +/** + * Extract the portion of a block name used by the master index. + * + * @param name The block name + * + * @return The master index portion of the block name + **/ +static INLINE uint64_t extractMasterIndexBytes(const UdsChunkName *name) +{ + return getUInt64BE(&name->name[MASTER_INDEX_BYTES_OFFSET]); +} + +/** + * Extract the portion of a block name used for sparse sampling. + * + * @param name The block name + * + * @return The sparse sample portion of the block name + **/ +static INLINE uint32_t extractSamplingBytes(const UdsChunkName *name) +{ + return getUInt16BE(&name->name[SAMPLE_BYTES_OFFSET]); +} + +/** + * For a given block, find the chapter delta list to use + * + * @param name The block name to hash + * @param geometry The geometry to use + * + * @return The chapter delta list where we expect to find the given blockname + **/ +static INLINE unsigned int hashToChapterDeltaList(const UdsChunkName *name, + const Geometry *geometry) +{ + return (unsigned int) ((extractChapterIndexBytes(name) + >> geometry->chapterAddressBits) + & ((1 << geometry->chapterDeltaListBits) - 1)); +} + +/** + * For a given block, find the chapter delta address to use + * + * @param name The block name to hash + * @param geometry The geometry to use + * + * @return The chapter delta address to use + **/ +static INLINE unsigned int hashToChapterDeltaAddress(const UdsChunkName *name, + const Geometry *geometry) +{ + return (unsigned int) (extractChapterIndexBytes(name) + & ((1 << geometry->chapterAddressBits) - 1)); +} + +/** + * For a given block name, find the slot in the open chapter hash table + * where it is expected to reside. + * + * @param name The block name to hash + * @param slotCount The size of the hash table + * + * @return the record number in the index page where we expect to find + # the given blockname + **/ +static INLINE unsigned int nameToHashSlot(const UdsChunkName *name, + unsigned int slotCount) +{ + return (unsigned int) (extractChapterIndexBytes(name) % slotCount); +} + +/** + * Convert a chunk name to hex to make it more readable. + * + * @param chunkName The chunk name + * @param hexData The resulting hexdata from the given chunk name + * @param hexDataLen The capacity of hexData + * + * @return UDS_SUCCESS, + * or UDS_INVALID_ARGUMENT if hexDataLen + * is too short. + **/ +int chunkNameToHex(const UdsChunkName *chunkName, + char *hexData, + size_t hexDataLen) + __attribute__((warn_unused_result)); + +/** + * Convert chunk data to hex to make it more readable. + * + * @param chunkData The chunk data + * @param hexData The resulting hexdata from the given chunk data + * @param hexDataLen The capacity of hexData + * + * @return UDS_SUCCESS, + * or UDS_INVALID_ARGUMENT if hexDataLen + * is too short. + **/ +int chunkDataToHex(const UdsChunkData *chunkData, + char *hexData, + size_t hexDataLen) + __attribute__((warn_unused_result)); + +/** + * Compute the number of bits required to store a field with the given + * maximum value. 
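With the default geometry worked out earlier (22 address bits, 12 delta-list bits), the 48-bit chapter-index field splits into a low 22-bit delta address and a 12-bit delta-list selector; hashToChapterDeltaAddress() and hashToChapterDeltaList() are just those two masks. A small standalone check of the split, using hypothetical field values rather than a real block name:

#include <assert.h>
#include <stdint.h>

enum {
  EXAMPLE_ADDRESS_BITS    = 22, /* geometry->chapterAddressBits for the defaults */
  EXAMPLE_DELTA_LIST_BITS = 12, /* geometry->chapterDeltaListBits for the defaults */
};

static void checkChapterIndexSplit(void)
{
  uint64_t deltaList = 0xABC;   /* any 12-bit value */
  uint64_t address   = 0x12345; /* any 22-bit value */
  uint64_t chapterIndexBytes = (deltaList << EXAMPLE_ADDRESS_BITS) | address;

  /* Mirrors hashToChapterDeltaAddress(). */
  assert((chapterIndexBytes & ((1 << EXAMPLE_ADDRESS_BITS) - 1)) == address);
  /* Mirrors hashToChapterDeltaList(). */
  assert(((chapterIndexBytes >> EXAMPLE_ADDRESS_BITS)
          & ((1 << EXAMPLE_DELTA_LIST_BITS) - 1)) == deltaList);
}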
+ * + * @param maxValue The maximum value of the field + * + * @return the number of bits required + **/ +unsigned int computeBits(unsigned int maxValue) + __attribute__((warn_unused_result)); + +/** + * FOR TESTING. Set the portion of a block name used by the chapter index. + * + * @param name The block name + * @param value The value to store + **/ +static INLINE void setChapterIndexBytes(UdsChunkName *name, uint64_t value) +{ + // Store the high order bytes, then the low-order bytes + storeUInt16BE(&name->name[CHAPTER_INDEX_BYTES_OFFSET], + (uint16_t)(value >> 32)); + storeUInt32BE(&name->name[CHAPTER_INDEX_BYTES_OFFSET + 2], + (uint32_t)value); +} + +/** + * FOR TESTING. Set the bits used to find a chapter delta list + * + * @param name The block name + * @param geometry The geometry to use + * @param value The value to store + **/ +static INLINE void setChapterDeltaListBits(UdsChunkName *name, + const Geometry *geometry, + uint64_t value) +{ + uint64_t deltaAddress = hashToChapterDeltaAddress(name, geometry); + deltaAddress |= value << geometry->chapterAddressBits; + setChapterIndexBytes(name, deltaAddress); +} + +/** + * FOR TESTING. Set the portion of a block name used by the master index. + * + * @param name The block name + * @param val The value to store + **/ +static INLINE void setMasterIndexBytes(UdsChunkName *name, uint64_t val) +{ + storeUInt64BE(&name->name[MASTER_INDEX_BYTES_OFFSET], val); +} + +/** + * Set the portion of a block name used for sparse sampling. + * + * @param name The block name + * @param value The value to store + **/ +static INLINE void setSamplingBytes(UdsChunkName *name, uint32_t value) +{ + storeUInt16BE(&name->name[SAMPLE_BYTES_OFFSET], (uint16_t)value); +} + +/** + * Special function wrapper required for compile-time assertions. This + * function will fail to compile if UDS_CHUNK_NAME_SIZE is not an integer + * multiple of 8. + **/ +void hashUtilsCompileTimeAssertions(void); + +#endif /* HASH_UTILS_H */ diff --git a/uds/index.c b/uds/index.c new file mode 100644 index 0000000..a84d50f --- /dev/null +++ b/uds/index.c @@ -0,0 +1,908 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/index.c#15 $ + */ + +#include "index.h" + +#include "hashUtils.h" +#include "indexCheckpoint.h" +#include "indexInternals.h" +#include "logger.h" + +static const uint64_t NO_LAST_CHECKPOINT = UINT_MAX; + + +/** + * Replay an index which was loaded from a checkpoint. + * + * @param index The index to replay + * @param lastCheckpointChapter The number of the chapter where the + * last checkpoint was made + * + * @return UDS_SUCCESS or an error code. 
+ **/ +static int replayIndexFromCheckpoint(Index *index, + uint64_t lastCheckpointChapter) +{ + // Find the volume chapter boundaries + uint64_t lowestVCN, highestVCN; + bool isEmpty = false; + IndexLookupMode oldLookupMode = index->volume->lookupMode; + index->volume->lookupMode = LOOKUP_FOR_REBUILD; + int result = findVolumeChapterBoundaries(index->volume, &lowestVCN, + &highestVCN, &isEmpty); + index->volume->lookupMode = oldLookupMode; + if (result != UDS_SUCCESS) { + return logFatalWithStringError(result, + "cannot replay index: " + "unknown volume chapter boundaries"); + } + if (lowestVCN > highestVCN) { + logFatal("cannot replay index: no valid chapters exist"); + return UDS_CORRUPT_COMPONENT; + } + + if (isEmpty) { + // The volume is empty, so the index should also be empty + if (index->newestVirtualChapter != 0) { + logFatal("cannot replay index from empty volume"); + return UDS_CORRUPT_COMPONENT; + } + return UDS_SUCCESS; + } + + unsigned int chaptersPerVolume = index->volume->geometry->chaptersPerVolume; + index->oldestVirtualChapter = lowestVCN; + index->newestVirtualChapter = highestVCN + 1; + if (index->newestVirtualChapter == lowestVCN + chaptersPerVolume) { + // skip the chapter shadowed by the open chapter + index->oldestVirtualChapter++; + } + + uint64_t firstReplayChapter = lastCheckpointChapter; + if (firstReplayChapter < index->oldestVirtualChapter) { + firstReplayChapter = index->oldestVirtualChapter; + } + return replayVolume(index, firstReplayChapter); +} + +/**********************************************************************/ +static int loadIndex(Index *index, bool allowReplay) +{ + bool replayRequired = false; + + int result = loadIndexState(index->state, &replayRequired); + if (result != UDS_SUCCESS) { + return result; + } + + if (replayRequired && !allowReplay) { + return logErrorWithStringError( + UDS_INDEX_NOT_SAVED_CLEANLY, + "index not saved cleanly: open chapter missing"); + } + + uint64_t lastCheckpointChapter + = ((index->lastCheckpoint != NO_LAST_CHECKPOINT) + ? index->lastCheckpoint : 0); + + logInfo("loaded index from chapter %llu through chapter %llu", + index->oldestVirtualChapter, lastCheckpointChapter); + + if (replayRequired) { + result = replayIndexFromCheckpoint(index, lastCheckpointChapter); + if (result != UDS_SUCCESS) { + return result; + } + } + + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + setActiveChapters(index->zones[i]); + } + + index->loadedType = replayRequired ? 
LOAD_REPLAY : LOAD_LOAD; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int rebuildIndex(Index *index) +{ + // Find the volume chapter boundaries + uint64_t lowestVCN, highestVCN; + bool isEmpty = false; + IndexLookupMode oldLookupMode = index->volume->lookupMode; + index->volume->lookupMode = LOOKUP_FOR_REBUILD; + int result = findVolumeChapterBoundaries(index->volume, &lowestVCN, + &highestVCN, &isEmpty); + index->volume->lookupMode = oldLookupMode; + if (result != UDS_SUCCESS) { + return logFatalWithStringError(result, + "cannot rebuild index: " + "unknown volume chapter boundaries"); + } + if (lowestVCN > highestVCN) { + logFatal("cannot rebuild index: no valid chapters exist"); + return UDS_CORRUPT_COMPONENT; + } + + if (isEmpty) { + index->newestVirtualChapter = index->oldestVirtualChapter = 0; + } else { + unsigned int numChapters = index->volume->geometry->chaptersPerVolume; + index->newestVirtualChapter = highestVCN + 1; + index->oldestVirtualChapter = lowestVCN; + if (index->newestVirtualChapter + == (index->oldestVirtualChapter + numChapters)) { + // skip the chapter shadowed by the open chapter + index->oldestVirtualChapter++; + } + } + + if ((index->newestVirtualChapter - index->oldestVirtualChapter) > + index->volume->geometry->chaptersPerVolume) { + return logFatalWithStringError(UDS_CORRUPT_COMPONENT, + "cannot rebuild index: " + "volume chapter boundaries too large"); + } + + setMasterIndexOpenChapter(index->masterIndex, 0); + if (isEmpty) { + index->loadedType = LOAD_EMPTY; + return UDS_SUCCESS; + } + + result = replayVolume(index, index->oldestVirtualChapter); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + setActiveChapters(index->zones[i]); + } + + index->loadedType = LOAD_REBUILD; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int makeIndex(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + unsigned int zoneCount, + LoadType loadType, + IndexLoadContext *loadContext, + Index **newIndex) +{ + Index *index; + int result = allocateIndex(layout, config, userParams, zoneCount, loadType, + &index); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "could not allocate index"); + } + + index->loadContext = loadContext; + + uint64_t nonce = getVolumeNonce(layout); + result = makeMasterIndex(config, zoneCount, nonce, &index->masterIndex); + if (result != UDS_SUCCESS) { + freeIndex(index); + return logErrorWithStringError(result, "could not make master index"); + } + + result = addIndexStateComponent(index->state, MASTER_INDEX_INFO, NULL, + index->masterIndex); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + result = addIndexStateComponent(index->state, &INDEX_PAGE_MAP_INFO, + index->volume->indexPageMap, NULL); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + result = makeChapterWriter(index, getIndexVersion(layout), + &index->chapterWriter); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + if ((loadType == LOAD_LOAD) || (loadType == LOAD_REBUILD)) { + if (!index->existed) { + freeIndex(index); + return UDS_NO_INDEX; + } + result = loadIndex(index, loadType == LOAD_REBUILD); + switch (result) { + case UDS_SUCCESS: + break; + case ENOMEM: + // We should not try a rebuild for this error. 
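The boundary arithmetic shared by replayIndexFromCheckpoint() and rebuildIndex() is easiest to see with concrete numbers: with 1024 chapters per volume, a full volume whose on-disk chapters run from 4096 through 5119 gives a newest virtual chapter of 5120, and because 5120 == 4096 + 1024 the oldest chapter is advanced to 4097, since its physical slot is about to be shadowed by the open chapter. A small standalone check of that rule (the chapter numbers are hypothetical):

#include <assert.h>
#include <stdint.h>

static void checkChapterWindow(void)
{
  const unsigned int chaptersPerVolume = 1024;
  uint64_t lowestVCN  = 4096; /* oldest chapter found on disk */
  uint64_t highestVCN = 5119; /* newest chapter found on disk */

  uint64_t newestVirtualChapter = highestVCN + 1;
  uint64_t oldestVirtualChapter = lowestVCN;
  if (newestVirtualChapter == lowestVCN + chaptersPerVolume) {
    /* The open chapter will reuse the oldest chapter's physical slot. */
    oldestVirtualChapter++;
  }

  assert(newestVirtualChapter == 5120);
  assert(oldestVirtualChapter == 4097);
  assert(newestVirtualChapter - oldestVirtualChapter < chaptersPerVolume);
}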
+ logErrorWithStringError(result, "index could not be loaded"); + break; + default: + logErrorWithStringError(result, "index could not be loaded"); + if (loadType == LOAD_REBUILD) { + result = rebuildIndex(index); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "index could not be rebuilt"); + } + } + break; + } + } else { + index->loadedType = LOAD_CREATE; + discardIndexStateData(index->state); + } + + if (result != UDS_SUCCESS) { + freeIndex(index); + return logUnrecoverable(result, "fatal error in makeIndex"); + } + + if (index->loadContext != NULL) { + lockMutex(&index->loadContext->mutex); + index->loadContext->status = INDEX_READY; + // If we get here, suspend is meaningless, but notify any thread trying + // to suspend us so it doesn't hang. + broadcastCond(&index->loadContext->cond); + unlockMutex(&index->loadContext->mutex); + } + + index->hasSavedOpenChapter = index->loadedType == LOAD_LOAD; + *newIndex = index; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeIndex(Index *index) +{ + if (index == NULL) { + return; + } + freeChapterWriter(index->chapterWriter); + + if (index->masterIndex != NULL) { + freeMasterIndex(index->masterIndex); + } + releaseIndex(index); +} + +/**********************************************************************/ +int saveIndex(Index *index) +{ + waitForIdleChapterWriter(index->chapterWriter); + int result = finishCheckpointing(index); + if (result != UDS_SUCCESS) { + logInfo("save index failed"); + return result; + } + beginSave(index, false, index->newestVirtualChapter); + + result = saveIndexState(index->state); + if (result != UDS_SUCCESS) { + logInfo("save index failed"); + index->lastCheckpoint = index->prevCheckpoint; + } else { + index->hasSavedOpenChapter = true; + logInfo("finished save (vcn %llu)", index->lastCheckpoint); + } + return result; +} + +/** + * Get the zone for a request. + * + * @param index The index + * @param request The request + * + * @return The zone for the request + **/ +static IndexZone *getRequestZone(Index *index, Request *request) +{ + return index->zones[request->zoneNumber]; +} + +/** + * Search an index zone. This function is only correct for LRU. + * + * @param zone The index zone to query. + * @param request The request originating the query. + * + * @return UDS_SUCCESS or an error code + **/ +static int searchIndexZone(IndexZone *zone, Request *request) +{ + MasterIndexRecord record; + int result = getMasterIndexRecord(zone->index->masterIndex, + &request->chunkName, &record); + if (result != UDS_SUCCESS) { + return result; + } + + bool found = false; + if (record.isFound) { + result = getRecordFromZone(zone, request, &found, record.virtualChapter); + if (result != UDS_SUCCESS) { + return result; + } + if (found) { + request->location = computeIndexRegion(zone, record.virtualChapter); + } + } + + /* + * If a record has overflowed a chapter index in more than one chapter + * (or overflowed in one chapter and collided with an existing record), + * it will exist as a collision record in the master index, but we won't + * find it in the volume. This case needs special handling. 
+ */ + bool overflowRecord = (record.isFound && record.isCollision && !found); + uint64_t chapter = zone->newestVirtualChapter; + if (found || overflowRecord) { + if ((request->action == REQUEST_QUERY) + && (!request->update || overflowRecord)) { + /* This is a query without update, or with nothing to update */ + return UDS_SUCCESS; + } + + if (record.virtualChapter != chapter) { + /* + * Update the master index to reference the new chapter for the block. + * If the record had been deleted or dropped from the chapter index, it + * will be back. + */ + result = setMasterIndexRecordChapter(&record, chapter); + } else if (request->action != REQUEST_UPDATE) { + /* The record is already in the open chapter, so we're done */ + return UDS_SUCCESS; + } + } else { + // The record wasn't in the master index, so check whether the name + // is in a cached sparse chapter. + if (!isMasterIndexSample(zone->index->masterIndex, &request->chunkName) + && isSparse(zone->index->volume->geometry)) { + // Passing UINT64_MAX triggers a search of the entire sparse cache. + result = searchSparseCacheInZone(zone, request, UINT64_MAX, &found); + if (result != UDS_SUCCESS) { + return result; + } + + if (found) { + request->location = LOC_IN_SPARSE; + } + } + + if (request->action == REQUEST_QUERY) { + if (!found || !request->update) { + // This is a query without update or for a new record, so we're done. + return UDS_SUCCESS; + } + } + + /* + * Add a new entry to the master index referencing the open chapter. + * This needs to be done both for new records, and for records from + * cached sparse chapters. + */ + result = putMasterIndexRecord(&record, chapter); + } + + if (result == UDS_OVERFLOW) { + /* + * The master index encountered a delta list overflow. The condition + * was already logged. We will go on without adding the chunk to the + * open chapter. + */ + return UDS_SUCCESS; + } + + if (result != UDS_SUCCESS) { + return result; + } + + UdsChunkData *metadata; + if (!found || (request->action == REQUEST_UPDATE)) { + // This is a new record or we're updating an existing record. + metadata = &request->newMetadata; + } else { + // This is a duplicate, so move the record to the open chapter (for LRU). + metadata = &request->oldMetadata; + } + return putRecordInZone(zone, request, metadata); +} + +/**********************************************************************/ +static int removeFromIndexZone(IndexZone *zone, Request *request) +{ + MasterIndexRecord record; + int result = getMasterIndexRecord(zone->index->masterIndex, + &request->chunkName, &record); + if (result != UDS_SUCCESS) { + return result; + } + + if (!record.isFound) { + // The name does not exist in master index, so there is nothing to remove. + return UDS_SUCCESS; + } + + if (!record.isCollision) { + // Non-collision records are hints, so resolve the name in the chapter. + bool found; + int result = getRecordFromZone(zone, request, &found, + record.virtualChapter); + if (result != UDS_SUCCESS) { + return result; + } + + if (!found) { + // The name does not exist in the chapter, so there is nothing to remove. + return UDS_SUCCESS; + } + } + + request->location = computeIndexRegion(zone, record.virtualChapter); + + /* + * Delete the master index entry for the named record only. Note that a + * later search might later return stale advice if there is a colliding name + * in the same chapter, but it's a very rare case (1 in 2^21). 
+ */ + result = removeMasterIndexRecord(&record); + if (result != UDS_SUCCESS) { + return result; + } + + // If the record is in the open chapter, we must remove it or mark it + // deleted to avoid trouble if the record is added again later. + if (request->location == LOC_IN_OPEN_CHAPTER) { + bool hashExists = false; + removeFromOpenChapter(zone->openChapter, &request->chunkName, &hashExists); + result = ASSERT(hashExists, "removing record not found in open chapter"); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/** + * Simulate the creation of a sparse cache barrier message by the triage + * queue, and the later execution of that message in an index zone. + * + * If the index receiving the request is multi-zone or dense, this function + * does nothing. This simulation is an optimization for single-zone sparse + * indexes. It also supports unit testing of indexes without routers and + * queues. + * + * @param zone the index zone responsible for the index request + * @param request the index request about to be executed + * + * @return UDS_SUCCESS always + **/ +static int simulateIndexZoneBarrierMessage(IndexZone *zone, Request *request) +{ + // Do nothing unless this is a single-zone sparse index. + if ((zone->index->zoneCount > 1) + || !isSparse(zone->index->volume->geometry)) { + return UDS_SUCCESS; + } + + // Check if the index request is for a sampled name in a sparse chapter. + uint64_t sparseVirtualChapter = triageIndexRequest(zone->index, request); + if (sparseVirtualChapter == UINT64_MAX) { + // Not indexed, not a hook, or in a chapter that is still dense, which + // means there should be no change to the sparse chapter index cache. + return UDS_SUCCESS; + } + + /* + * The triage queue would have generated and enqueued a barrier message + * preceding this request, which we simulate by directly invoking the + * execution hook for an equivalent message. + */ + BarrierMessageData barrier = { .virtualChapter = sparseVirtualChapter }; + return executeSparseCacheBarrierMessage(zone, &barrier); +} + +/**********************************************************************/ +static int dispatchIndexZoneRequest(IndexZone *zone, Request *request) +{ + if (!request->requeued) { + // Single-zone sparse indexes don't have a triage queue to generate cache + // barrier requests, so see if we need to synthesize a barrier. + int result = simulateIndexZoneBarrierMessage(zone, request); + if (result != UDS_SUCCESS) { + return result; + } + } + + // Set the default location. It will be overwritten if we find the chunk. 
+ request->location = LOC_UNAVAILABLE; + + int result; + switch (request->action) { + case REQUEST_INDEX: + case REQUEST_UPDATE: + case REQUEST_QUERY: + result = makeUnrecoverable(searchIndexZone(zone, request)); + break; + + case REQUEST_DELETE: + result = makeUnrecoverable(removeFromIndexZone(zone, request)); + break; + + default: + result = logWarningWithStringError(UDS_INVALID_ARGUMENT, + "attempted to execute invalid action:" + " %d", + request->action); + break; + } + + return result; +} + +/**********************************************************************/ +int dispatchIndexRequest(Index *index, Request *request) +{ + return dispatchIndexZoneRequest(getRequestZone(index, request), request); +} + +/**********************************************************************/ +static int rebuildIndexPageMap(Index *index, uint64_t vcn) +{ + Geometry *geometry = index->volume->geometry; + unsigned int chapter = mapToPhysicalChapter(geometry, vcn); + unsigned int expectedListNumber = 0; + unsigned int indexPageNumber; + for (indexPageNumber = 0; + indexPageNumber < geometry->indexPagesPerChapter; + indexPageNumber++) { + DeltaIndexPage *chapterIndexPage; + int result = getPage(index->volume, chapter, indexPageNumber, + CACHE_PROBE_INDEX_FIRST, NULL, &chapterIndexPage); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "failed to read index page %u" + " in chapter %u", + indexPageNumber, chapter); + } + unsigned int lowestDeltaList = chapterIndexPage->lowestListNumber; + unsigned int highestDeltaList = chapterIndexPage->highestListNumber; + if (lowestDeltaList != expectedListNumber) { + return logErrorWithStringError(UDS_CORRUPT_DATA, + "chapter %u index page %u is corrupt", + chapter, indexPageNumber); + } + result = updateIndexPageMap(index->volume->indexPageMap, vcn, chapter, + indexPageNumber, highestDeltaList); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "failed to update chapter %u index page" + " %u", + chapter, indexPageNumber); + } + expectedListNumber = highestDeltaList + 1; + } + return UDS_SUCCESS; +} + +/** + * Add an entry to the master index when rebuilding. + * + * @param index The index to query. + * @param name The block name of interest. + * @param virtualChapter The virtual chapter number to write to the + * master index + * @param willBeSparseChapter True if this entry will be in the sparse portion + * of the index at the end of rebuilding + * + * @return UDS_SUCCESS or an error code + **/ +static int replayRecord(Index *index, + const UdsChunkName *name, + uint64_t virtualChapter, + bool willBeSparseChapter) +{ + if (willBeSparseChapter && !isMasterIndexSample(index->masterIndex, name)) { + // This entry will be in a sparse chapter after the rebuild completes, + // and it is not a sample, so just skip over it. + return UDS_SUCCESS; + } + + MasterIndexRecord record; + int result = getMasterIndexRecord(index->masterIndex, name, &record); + if (result != UDS_SUCCESS) { + return result; + } + + bool updateRecord; + if (record.isFound) { + if (record.isCollision) { + if (record.virtualChapter == virtualChapter) { + /* The record is already correct, so we don't need to do anything */ + return UDS_SUCCESS; + } + updateRecord = true; + } else if (record.virtualChapter == virtualChapter) { + /* + * There is a master index entry pointing to the current + * chapter, but we don't know if it is for the same name as the + * one we are currently working on or not. For now, we're just + * going to assume that it isn't. 
This will create one extra + * collision record if there was a deleted record in the current + * chapter. + */ + updateRecord = false; + } else { + /* + * If we're rebuilding, we don't normally want to go to disk to see if + * the record exists, since we will likely have just read the record from + * disk (i.e. we know it's there). The exception to this is when we + * already find an entry in the master index that has a different chapter. + * In this case, we need to search that chapter to determine if the + * master index entry was for the same record or a different one. + */ + result = searchVolumePageCache(index->volume, NULL, name, + record.virtualChapter, NULL, + &updateRecord); + if (result != UDS_SUCCESS) { + return result; + } + } + } else { + updateRecord = false; + } + + if (updateRecord) { + /* + * Update the master index to reference the new chapter for the block. + * If the record had been deleted or dropped from the chapter index, it + * will be back. + */ + result = setMasterIndexRecordChapter(&record, virtualChapter); + } else { + /* + * Add a new entry to the master index referencing the open + * chapter. This should be done regardless of whether we are a brand + * new record or a sparse record, i.e. one that doesn't exist in the + * index but does on disk, since for a sparse record, we would want to + * un-sparsify if it did exist. + */ + result = putMasterIndexRecord(&record, virtualChapter); + } + + if ((result == UDS_DUPLICATE_NAME) || (result == UDS_OVERFLOW)) { + /* Ignore duplicate record and delta list overflow errors */ + return UDS_SUCCESS; + } + + return result; +} + +/**********************************************************************/ +void beginSave(Index *index, bool checkpoint, uint64_t openChapterNumber) +{ + index->prevCheckpoint = index->lastCheckpoint; + index->lastCheckpoint = ((openChapterNumber == 0) + ? NO_LAST_CHECKPOINT + : openChapterNumber - 1); + + const char *what = (checkpoint ? "checkpoint" : "save"); + logInfo("beginning %s (vcn %llu)", what, index->lastCheckpoint); +} + +/** + * Suspend the index if necessary and wait for a signal to resume. + * + * @param index The index to replay + * + * @return true if the replay should terminate + **/ +static bool checkForSuspend(Index *index) +{ + if (index->loadContext == NULL) { + return false; + } + + lockMutex(&index->loadContext->mutex); + if (index->loadContext->status != INDEX_SUSPENDING) { + unlockMutex(&index->loadContext->mutex); + return false; + } + + // Notify that we are suspended and wait for the resume. + index->loadContext->status = INDEX_SUSPENDED; + broadcastCond(&index->loadContext->cond); + + while ((index->loadContext->status != INDEX_OPENING) + && (index->loadContext->status != INDEX_FREEING)) { + waitCond(&index->loadContext->cond, &index->loadContext->mutex); + } + + bool retVal = (index->loadContext->status == INDEX_FREEING); + unlockMutex(&index->loadContext->mutex); + return retVal; +} + +/**********************************************************************/ +int replayVolume(Index *index, uint64_t fromVCN) +{ + int result; + uint64_t uptoVCN = index->newestVirtualChapter; + logInfo("Replaying volume from chapter %llu through chapter %" + PRIu64, + fromVCN, uptoVCN); + setMasterIndexOpenChapter(index->masterIndex, uptoVCN); + setMasterIndexOpenChapter(index->masterIndex, fromVCN); + + /* + * At least two cases to deal with here! 
+ * - index loaded but replaying from lastCheckpoint; maybe full, maybe not + * - index failed to load, full rebuild + * Starts empty, then dense-only, then dense-plus-sparse. + * Need to sparsify while processing individual chapters. + */ + IndexLookupMode oldLookupMode = index->volume->lookupMode; + index->volume->lookupMode = LOOKUP_FOR_REBUILD; + /* + * Go through each record page of each chapter and add the records back to + * the master index. This should not cause anything to be written to either + * the open chapter or on disk volume. Also skip the on disk chapter + * corresponding to upto, as this would have already been + * purged from the master index when the chapter was opened. + * + * Also, go through each index page for each chapter and rebuild the + * index page map. + */ + const Geometry *geometry = index->volume->geometry; + uint64_t oldIPMupdate = getLastUpdate(index->volume->indexPageMap); + uint64_t vcn; + for (vcn = fromVCN; vcn < uptoVCN; ++vcn) { + if (checkForSuspend(index)) { + logInfo("Replay interrupted by index shutdown at chapter %llu", vcn); + return UDS_SHUTTINGDOWN; + } + + bool willBeSparseChapter = isChapterSparse(geometry, fromVCN, uptoVCN, + vcn); + unsigned int chapter = mapToPhysicalChapter(geometry, vcn); + prefetchVolumePages(&index->volume->volumeStore, + mapToPhysicalPage(geometry, chapter, 0), + geometry->pagesPerChapter); + setMasterIndexOpenChapter(index->masterIndex, vcn); + result = rebuildIndexPageMap(index, vcn); + if (result != UDS_SUCCESS) { + index->volume->lookupMode = oldLookupMode; + return logErrorWithStringError(result, + "could not rebuild index page map for" + " chapter %u", + chapter); + } + + unsigned int j; + for (j = 0; j < geometry->recordPagesPerChapter; j++) { + unsigned int recordPageNumber = geometry->indexPagesPerChapter + j; + byte *recordPage; + result = getPage(index->volume, chapter, recordPageNumber, + CACHE_PROBE_RECORD_FIRST, &recordPage, NULL); + if (result != UDS_SUCCESS) { + index->volume->lookupMode = oldLookupMode; + return logUnrecoverable(result, "could not get page %d", + recordPageNumber); + } + unsigned int k; + for (k = 0; k < geometry->recordsPerPage; k++) { + const byte *nameBytes = recordPage + (k * BYTES_PER_RECORD); + + UdsChunkName name; + memcpy(&name.name, nameBytes, UDS_CHUNK_NAME_SIZE); + + result = replayRecord(index, &name, vcn, willBeSparseChapter); + if (result != UDS_SUCCESS) { + char hexName[(2 * UDS_CHUNK_NAME_SIZE) + 1]; + if (chunkNameToHex(&name, hexName, sizeof(hexName)) != UDS_SUCCESS) { + strncpy(hexName, "", sizeof(hexName)); + } + index->volume->lookupMode = oldLookupMode; + return logUnrecoverable(result, + "could not find block %s during rebuild", + hexName); + } + } + } + } + index->volume->lookupMode = oldLookupMode; + + // We also need to reap the chapter being replaced by the open chapter + setMasterIndexOpenChapter(index->masterIndex, uptoVCN); + + uint64_t newIPMupdate = getLastUpdate(index->volume->indexPageMap); + + if (newIPMupdate != oldIPMupdate) { + logInfo("replay changed index page map update from %llu to %llu", + oldIPMupdate, newIPMupdate); + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +void getIndexStats(Index *index, UdsIndexStats *counters) +{ + uint64_t cwAllocated = getChapterWriterMemoryAllocated(index->chapterWriter); + // We're accessing the master index while not on a zone thread, but that's + // safe to do when acquiring statistics. 
+ MasterIndexStats denseStats, sparseStats; + getMasterIndexStats(index->masterIndex, &denseStats, &sparseStats); + + counters->entriesIndexed = (denseStats.recordCount + + sparseStats.recordCount); + counters->memoryUsed = ((uint64_t) denseStats.memoryAllocated + + (uint64_t) sparseStats.memoryAllocated + + (uint64_t) getCacheSize(index->volume) + + cwAllocated); + counters->collisions = (denseStats.collisionCount + + sparseStats.collisionCount); + counters->entriesDiscarded = (denseStats.discardCount + + sparseStats.discardCount); + counters->checkpoints = getCheckpointCount(index->checkpoint); +} + +/**********************************************************************/ +void advanceActiveChapters(Index *index) +{ + index->newestVirtualChapter++; + if (areSamePhysicalChapter(index->volume->geometry, + index->newestVirtualChapter, + index->oldestVirtualChapter)) { + index->oldestVirtualChapter++; + } +} + +/**********************************************************************/ +uint64_t triageIndexRequest(Index *index, Request *request) +{ + MasterIndexTriage triage; + lookupMasterIndexName(index->masterIndex, &request->chunkName, &triage); + if (!triage.inSampledChapter) { + // Not indexed or not a hook. + return UINT64_MAX; + } + + IndexZone *zone = getRequestZone(index, request); + if (!isZoneChapterSparse(zone, triage.virtualChapter)) { + return UINT64_MAX; + } + + // XXX Optimize for a common case by remembering the chapter from the most + // recent barrier message and skipping this chapter if is it the same. + + // Return the sparse chapter number to trigger the barrier messages. + return triage.virtualChapter; +} diff --git a/uds/index.h b/uds/index.h new file mode 100644 index 0000000..d2bc805 --- /dev/null +++ b/uds/index.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/index.h#3 $ + */ + +#ifndef INDEX_H +#define INDEX_H + +#include "chapterWriter.h" +#include "indexLayout.h" +#include "indexSession.h" +#include "indexZone.h" +#include "loadType.h" +#include "masterIndexOps.h" +#include "volume.h" + + +/** + * Index checkpoint state private to indexCheckpoint.c. + **/ +typedef struct indexCheckpoint IndexCheckpoint; + +typedef struct index { + bool existed; + bool hasSavedOpenChapter; + LoadType loadedType; + IndexLoadContext *loadContext; + IndexLayout *layout; + IndexState *state; + MasterIndex *masterIndex; + Volume *volume; + unsigned int zoneCount; + IndexZone **zones; + + /* + * ATTENTION!!! + * The meaning of the next two fields has changed. + * + * They now represent the oldest and newest chapters only at load time, + * and when the index is quiescent. At other times, they may lag individual + * zones' views of the index depending upon the progress made by the chapter + * writer. 
+ */ + uint64_t oldestVirtualChapter; + uint64_t newestVirtualChapter; + + uint64_t lastCheckpoint; + uint64_t prevCheckpoint; + ChapterWriter *chapterWriter; + + // checkpoint state used by indexCheckpoint.c + IndexCheckpoint *checkpoint; +} Index; + +/** + * Construct a new index from the given configuration. + * + * @param layout The index layout + * @param config The configuration to use + * @param userParams The index session parameters. If NULL, the default + * session parameters will be used. + * @param zoneCount The number of zones for this index to use + * @param loadType How to create the index: it can be create only, allow + * loading from files, and allow rebuilding from the volume + * @param loadContext The load context to use + * @param newIndex A pointer to hold a pointer to the new index + * + * @return UDS_SUCCESS or an error code + **/ +int makeIndex(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + unsigned int zoneCount, + LoadType loadType, + IndexLoadContext *loadContext, + Index **newIndex) + __attribute__((warn_unused_result)); + +/** + * Save an index. + * + * Before saving an index and while saving an index, the caller must ensure + * that there are no index requests in progress. + * + * Some users follow saveIndex immediately with a freeIndex. But some tests + * use the IndexLayout to modify the saved index. The Index will then have + * some cached information that does not reflect these updates. + * + * @param index The index to save + * + * @return UDS_SUCCESS if successful + **/ +int saveIndex(Index *index) __attribute__((warn_unused_result)); + +/** + * Clean up the index and its memory. + * + * @param index The index to destroy. + **/ +void freeIndex(Index *index); + +/** + * Perform the index operation specified by the action field of a UDS request. + * + * For UDS API requests, this searches the index for the chunk name in the + * request. If the chunk name is already present in the index, the location + * field of the request will be set to the IndexRegion where it was found. If + * the action is not DELETE, the oldMetadata field of the request will also be + * filled in with the prior metadata for the name. + * + * If the API request action is: + * + * REQUEST_INDEX, a record will be added to the open chapter with the + * metadata in the request for new records, and the existing metadata for + * existing records + * + * REQUEST_UPDATE, a record will be added to the open chapter with the + * metadata in the request + * + * REQUEST_QUERY, if the update flag is set in the request, any record + * found will be moved to the open chapter. In all other cases the contents + * of the index will remain unchanged. + * + * REQUEST_REMOVE, the any entry with the name will removed from the index + * + * For non-API requests, no chunk name search is involved. + * + * @param index The index + * @param request The originating request + * + * @return UDS_SUCCESS, UDS_QUEUED, or an error code + **/ +int dispatchIndexRequest(Index *index, Request *request) + __attribute__((warn_unused_result)); + +/** + * Internal helper to prepare the index for saving. + * + * @param index the index + * @param checkpoint whether the save is a checkpoint + * @param openChapterNumber the virtual chapter number of the open chapter + **/ +void beginSave(Index *index, bool checkpoint, uint64_t openChapterNumber); + +/** + * Replay the volume file to repopulate the master index. 
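Putting this header's API together, a typical caller builds the index from an existing layout and configuration, services requests through dispatchIndexRequest(), and then saves and frees it. A sketch under simplifying assumptions (single zone, no load context, default session parameters); request handling is left to the surrounding session code:

#include "index.h"

static int exerciseIndex(IndexLayout *layout, const Configuration *config)
{
  Index *index = NULL;
  int result = makeIndex(layout, config,
                         NULL,         /* default session parameters */
                         1,            /* zoneCount */
                         LOAD_REBUILD, /* load, falling back to a rebuild */
                         NULL,         /* no load context */
                         &index);
  if (result != UDS_SUCCESS) {
    return result;
  }

  /* ... call dispatchIndexRequest(index, request) as requests arrive ... */

  result = saveIndex(index);
  freeIndex(index);
  return result;
}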
+ * + * @param index The index + * @param fromVCN The virtual chapter to start replaying + * + * @return UDS_SUCCESS if successful + **/ +int replayVolume(Index *index, uint64_t fromVCN) + __attribute__((warn_unused_result)); + +/** + * Gather statistics from the master index, volume, and cache. + * + * @param index The index + * @param counters the statistic counters for the index + **/ +void getIndexStats(Index *index, UdsIndexStats *counters); + +/** + * Set lookup state for this index. Disabling lookups means assume + * all records queried are new (intended for debugging uses, e.g., + * albfill). + * + * @param index The index + * @param enabled The new lookup state + **/ +void setIndexLookupState(Index *index, bool enabled); + +/** + * Advance the newest virtual chapter. If this will overwrite the oldest + * virtual chapter, advance that also. + * + * @param index The index to advance + **/ +void advanceActiveChapters(Index *index); + +/** + * Triage an index request, deciding whether it requires that a sparse cache + * barrier message precede it. + * + * This resolves the chunk name in the request in the master index, + * determining if it is a hook or not, and if a hook, what virtual chapter (if + * any) it might be found in. If a virtual chapter is found, it checks whether + * that chapter appears in the sparse region of the index. If all these + * conditions are met, the (sparse) virtual chapter number is returned. In all + * other cases it returns UINT64_MAX. + * + * @param index the index that will process the request + * @param request the index request containing the chunk name to triage + * + * @return the sparse chapter number for the sparse cache barrier message, or + * UINT64_MAX if the request does not require a barrier + **/ +uint64_t triageIndexRequest(Index *index, Request *request) + __attribute__((warn_unused_result)); + +#endif /* INDEX_H */ diff --git a/uds/indexCheckpoint.c b/uds/indexCheckpoint.c new file mode 100644 index 0000000..9c803b6 --- /dev/null +++ b/uds/indexCheckpoint.c @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexCheckpoint.c#2 $ + */ + +#include "indexCheckpoint.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "threads.h" +#include "typeDefs.h" + +/** + * index checkpointState values + * + * @note The order of these values is significant, + * see indexState.c doIndexStateCheckpointInZone(). + **/ +typedef enum checkpointState { + NOT_CHECKPOINTING, + CHECKPOINT_IN_PROGRESS, + CHECKPOINT_ABORTING +} CheckpointState; + +/** + * Private structure which tracks checkpointing. 
+ **/ +struct indexCheckpoint { + Mutex mutex; // covers this group of fields + uint64_t chapter; // vcn of the starting chapter + CheckpointState state; // is checkpoint in progress or aborting + unsigned int zonesBusy; // count of zones not yet done + unsigned int frequency; // number of chapters between checkpoints + uint64_t checkpoints; // number of checkpoints this session +}; + +/** + * Enum return value of indexCheckpointTrigger function. + **/ +typedef enum indexCheckpointTriggerValue { + ICTV_IDLE, //< no checkpointing right now + ICTV_START, //< start a new checkpoint now + ICTV_CONTINUE, //< continue checkpointing if needed + ICTV_FINISH, //< finish checkpointing, next time will start new cycle + ICTV_ABORT //< immediately abort checkpointing +} IndexCheckpointTriggerValue; + +typedef int CheckpointFunction(Index *index, unsigned int zone); + +// These functions are called while holding the checkpoint->mutex but are +// expected to release it. +// +static CheckpointFunction doCheckpointStart; +static CheckpointFunction doCheckpointProcess; +static CheckpointFunction doCheckpointFinish; +static CheckpointFunction doCheckpointAbort; + +CheckpointFunction *const checkpointFuncs[] = { + NULL, + doCheckpointStart, + doCheckpointProcess, + doCheckpointFinish, + doCheckpointAbort +}; + +/**********************************************************************/ +int makeIndexCheckpoint(Index *index) +{ + IndexCheckpoint *checkpoint; + int result + = ALLOCATE(1, IndexCheckpoint, "IndexCheckpoint", &checkpoint); + if (result != UDS_SUCCESS) { + return result; + } + + result = initMutex(&checkpoint->mutex); + if (result != UDS_SUCCESS) { + FREE(checkpoint); + return result; + } + + checkpoint->checkpoints = 0; + + index->checkpoint = checkpoint; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeIndexCheckpoint(IndexCheckpoint *checkpoint) +{ + if (checkpoint != NULL) { + destroyMutex(&checkpoint->mutex); + FREE(checkpoint); + } +} + +/**********************************************************************/ +unsigned int getIndexCheckpointFrequency(IndexCheckpoint *checkpoint) +{ + lockMutex(&checkpoint->mutex); + unsigned int frequency = checkpoint->frequency; + unlockMutex(&checkpoint->mutex); + return frequency; +} + +/**********************************************************************/ +unsigned int setIndexCheckpointFrequency(IndexCheckpoint *checkpoint, + unsigned int frequency) +{ + lockMutex(&checkpoint->mutex); + unsigned int oldFrequency = checkpoint->frequency; + checkpoint->frequency = frequency; + unlockMutex(&checkpoint->mutex); + return oldFrequency; +} + +/**********************************************************************/ +uint64_t getCheckpointCount(IndexCheckpoint *checkpoint) +{ + return checkpoint->checkpoints; +} + +/**********************************************************************/ +static IndexCheckpointTriggerValue +getCheckpointAction(IndexCheckpoint *checkpoint, + uint64_t virtualChapter) +{ + if (checkpoint->frequency == 0) { + return ICTV_IDLE; + } + unsigned int value = virtualChapter % checkpoint->frequency; + if (checkpoint->state == CHECKPOINT_ABORTING) { + return ICTV_ABORT; + } else if (checkpoint->state == CHECKPOINT_IN_PROGRESS) { + if (value == checkpoint->frequency - 1) { + return ICTV_FINISH; + } else { + return ICTV_CONTINUE; + } + } else { + if (value == 0) { + return ICTV_START; + } else { + return ICTV_IDLE; + } + } +} + 
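+/*
+ * Illustrative sketch only: how the trigger values above translate into a
+ * checkpoint schedule. Assume a hypothetical frequency of 4 (as set by
+ * setIndexCheckpointFrequency()) and let value = virtualChapter % 4. Then,
+ * barring an abort:
+ *
+ *   value == 0, not checkpointing  -> ICTV_START     begin a new checkpoint
+ *   value == 1 or 2, in progress   -> ICTV_CONTINUE  make incremental progress
+ *   value == 3, in progress        -> ICTV_FINISH    complete the checkpoint
+ *   value != 0, not checkpointing  -> ICTV_IDLE      nothing to do
+ *
+ * A frequency of 0 disables checkpointing entirely (always ICTV_IDLE). Each
+ * zone drives this schedule by calling processCheckpointing() as it opens a
+ * new chapter, roughly:
+ *
+ *   int result = processCheckpointing(index, zone, newVirtualChapter);
+ *   if (result != UDS_SUCCESS) {
+ *     return result;
+ *   }
+ */
+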
+/**********************************************************************/ +int processCheckpointing(Index *index, + unsigned int zone, + uint64_t newVirtualChapter) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + lockMutex(&checkpoint->mutex); + + IndexCheckpointTriggerValue ictv + = getCheckpointAction(checkpoint, newVirtualChapter); + + if (ictv == ICTV_START) { + checkpoint->chapter = newVirtualChapter; + } + + CheckpointFunction *func = checkpointFuncs[ictv]; + if (func == NULL) { + // nothing to do in idle state + unlockMutex(&checkpoint->mutex); + return UDS_SUCCESS; + } + + return (*func)(index, zone); +} + +/**********************************************************************/ +int processChapterWriterCheckpointSaves(Index *index) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + + int result = UDS_SUCCESS; + + lockMutex(&checkpoint->mutex); + if (checkpoint->state == CHECKPOINT_IN_PROGRESS) { + result = + performIndexStateCheckpointChapterSynchronizedSaves(index->state); + + if (result != UDS_SUCCESS) { + checkpoint->state = CHECKPOINT_ABORTING; + logInfo("checkpoint failed"); + index->lastCheckpoint = index->prevCheckpoint; + } + } + + unlockMutex(&checkpoint->mutex); + return result; +} + +/** + * Helper function used to abort checkpoint if an error has occurred. + * + * @param index the index + * @param result the error result + * + * @return result + **/ +static int abortCheckpointing(Index *index, int result) +{ + if (index->checkpoint->state != NOT_CHECKPOINTING) { + index->checkpoint->state = CHECKPOINT_ABORTING; + logInfo("checkpoint failed"); + index->lastCheckpoint = index->prevCheckpoint; + } + return result; +} + +/**********************************************************************/ +int finishCheckpointing(Index *index) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + + int result = processChapterWriterCheckpointSaves(index); + if (result != UDS_SUCCESS) { + return result; + } + + lockMutex(&checkpoint->mutex); + + unsigned int z; + for (z = 0; z < index->zoneCount; ++z) { + if (checkpoint->state != CHECKPOINT_IN_PROGRESS) { + break; + } + result = doCheckpointFinish(index, z); + // reacquire mutex released by doCheckpointFinish + lockMutex(&checkpoint->mutex); + if (result != UDS_SUCCESS) { + break; + } + } + + if ((result == UDS_SUCCESS) && + (checkpoint->state == CHECKPOINT_IN_PROGRESS)) { + result = finishIndexStateCheckpoint(index->state); + if (result == UDS_SUCCESS) { + checkpoint->state = NOT_CHECKPOINTING; + } + } + + unlockMutex(&checkpoint->mutex); + return result; +} + +/** + * Starts an incremental checkpoint. + * + * Called by the first zone to finish a chapter which starts a checkpoint. 
+ * + * @param index the index + * @param zone the zone number + * + * @return UDS_SUCCESS or an error code + **/ +static int doCheckpointStart(Index *index, unsigned int zone) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + beginSave(index, true, checkpoint->chapter); + int result = startIndexStateCheckpoint(index->state); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "cannot start index checkpoint"); + index->lastCheckpoint = index->prevCheckpoint; + unlockMutex(&checkpoint->mutex); + return result; + } + + checkpoint->state = CHECKPOINT_IN_PROGRESS; + checkpoint->zonesBusy = index->zoneCount; + + return doCheckpointProcess(index, zone); +} + +/**********************************************************************/ +static int doCheckpointProcess(Index *index, unsigned int zone) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + unlockMutex(&checkpoint->mutex); + CompletionStatus status = CS_NOT_COMPLETED; + int result = performIndexStateCheckpointInZone(index->state, zone, &status); + if (result != UDS_SUCCESS) { + lockMutex(&checkpoint->mutex); + logErrorWithStringError(result, "cannot continue index checkpoint"); + result = abortCheckpointing(index, result); + unlockMutex(&checkpoint->mutex); + } else if (status == CS_JUST_COMPLETED) { + lockMutex(&checkpoint->mutex); + if (--checkpoint->zonesBusy == 0) { + checkpoint->checkpoints += 1; + logInfo("finished checkpoint"); + result = finishIndexStateCheckpoint(index->state); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "%s checkpoint finish failed", + __func__); + } + checkpoint->state = NOT_CHECKPOINTING; + } + unlockMutex(&checkpoint->mutex); + } + return result; +} + +/**********************************************************************/ +static int doCheckpointAbort(Index *index, unsigned int zone) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + CompletionStatus status = CS_NOT_COMPLETED; + int result = abortIndexStateCheckpointInZone(index->state, zone, &status); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "cannot abort index checkpoint"); + } else if (status == CS_JUST_COMPLETED) { + if (--checkpoint->zonesBusy == 0) { + logInfo("aborted checkpoint"); + result = abortIndexStateCheckpoint(index->state); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "checkpoint abort failed"); + } + checkpoint->state = NOT_CHECKPOINTING; + } + } + unlockMutex(&checkpoint->mutex); + + return result; +} + +/**********************************************************************/ +static int doCheckpointFinish(Index *index, unsigned int zone) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + CompletionStatus status = CS_NOT_COMPLETED; + unlockMutex(&checkpoint->mutex); + int result = finishIndexStateCheckpointInZone(index->state, zone, &status); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "cannot finish index checkpoint"); + lockMutex(&checkpoint->mutex); + result = abortCheckpointing(index, result); + unlockMutex(&checkpoint->mutex); + } else if (status == CS_JUST_COMPLETED) { + lockMutex(&checkpoint->mutex); + if (--checkpoint->zonesBusy == 0) { + checkpoint->checkpoints += 1; + logInfo("finished checkpoint"); + result = finishIndexStateCheckpoint(index->state); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "%s checkpoint finish failed", + __func__); + } + checkpoint->state = NOT_CHECKPOINTING; + } + unlockMutex(&checkpoint->mutex); + } + return result; +} diff --git a/uds/indexCheckpoint.h 
b/uds/indexCheckpoint.h
new file mode 100644
index 0000000..02d2936
--- /dev/null
+++ b/uds/indexCheckpoint.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/indexCheckpoint.h#1 $
+ */
+
+#ifndef INDEX_CHECKPOINT_H
+#define INDEX_CHECKPOINT_H
+
+#include "index.h"
+
+/**
+ * Construct and initialize the checkpoint sub-structure of an index.
+ *
+ * @param index the index to receive the new checkpoint structure.
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int makeIndexCheckpoint(Index *index) __attribute__((warn_unused_result));
+
+/**
+ * Free the checkpoint sub-structure of an index.
+ *
+ * @param checkpoint the structure to free
+ **/
+void freeIndexCheckpoint(IndexCheckpoint *checkpoint);
+
+/**
+ * Get the current checkpointing frequency of an index.
+ *
+ * @param checkpoint the checkpoint state of the index
+ *
+ * @return the number of chapters between checkpoints
+ **/
+unsigned int getIndexCheckpointFrequency(IndexCheckpoint *checkpoint)
+  __attribute__((warn_unused_result));
+
+/**
+ * Set checkpointing frequency for the index.
+ *
+ * @param checkpoint the checkpoint state of the index
+ * @param frequency  The new checkpointing frequency
+ *
+ * @return the old checkpointing frequency
+ **/
+unsigned int setIndexCheckpointFrequency(IndexCheckpoint *checkpoint,
+                                         unsigned int frequency);
+
+/**
+ * Get the number of checkpoints completed during the lifetime of this index.
+ *
+ * @param checkpoint the checkpoint state of the index
+ *
+ * @return the number of checkpoints completed
+ **/
+uint64_t getCheckpointCount(IndexCheckpoint *checkpoint)
+  __attribute__((warn_unused_result));
+
+/**
+ * If incremental checkpointing is in progress, finish it.
+ *
+ * @param index The index
+ *
+ * @return UDS_SUCCESS or an error code
+ *
+ * @note This function is called automatically during normal operation;
+ *       its presence here is for tests that expect checkpointing to
+ *       have completed at some point in their logic. It is not an
+ *       error to call this function if checkpointing is not in
+ *       progress; it silently returns success.
+ **/
+int finishCheckpointing(Index *index) __attribute__((warn_unused_result));
+
+/**
+ * Process one zone's incremental checkpoint operation. Automatically
+ * starts, processes, and finishes a checkpoint over multiple invocations
+ * as successive chapters are closed and written.
+ *
+ * Uses its own mutex to serialize the starting and finishing or aborting,
+ * but allows parallel execution of the incremental progress.
+ *
+ * @param index             The index to checkpoint
+ * @param zone              The current zone number
+ * @param newVirtualChapter The number of the chapter which the calling
+ *                          zone has just opened
+ *
+ * @return UDS_SUCCESS or an error code.
+ **/ +int processCheckpointing(Index *index, + unsigned int zone, + uint64_t newVirtualChapter) + __attribute__((warn_unused_result)); + +/** + * Process saves done outside any zone by the chapter writer. + * + * Grabs the mutex associated with processCheckpointing(). + * + * @param index The index to process. + * + * @return UDS_SUCCESS or an error code. + **/ +int processChapterWriterCheckpointSaves(Index *index) + __attribute__((warn_unused_result)); + +#endif // INDEX_CHECKPOINT_H diff --git a/uds/indexComponent.c b/uds/indexComponent.c new file mode 100644 index 0000000..c932b8d --- /dev/null +++ b/uds/indexComponent.c @@ -0,0 +1,745 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexComponent.c#8 $ + */ + +#include "indexComponent.h" + +#include "compiler.h" +#include "errors.h" +#include "indexLayout.h" +#include "indexState.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "typeDefs.h" + +/*****************************************************************************/ +int makeIndexComponent(IndexState *state, + const IndexComponentInfo *info, + unsigned int zoneCount, + void *data, + void *context, + IndexComponent **componentPtr) +{ + if ((info == NULL) || (info->name == NULL)) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "invalid component or directory specified"); + } + if (info->loader == NULL) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "no .loader function specified " + "for component %s", + info->name); + } + if ((info->saver == NULL) && (info->incremental == NULL)) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "neither .saver function nor .incremental " + "function specified for component %s", + info->name); + } + + IndexComponent *component = NULL; + int result = ALLOCATE(1, IndexComponent, "index component", &component); + if (result != UDS_SUCCESS) { + return result; + } + + component->componentData = data; + component->context = context; + component->info = info; + component->numZones = info->multiZone ? 
zoneCount : 1; + component->state = state; + component->writeZones = NULL; + *componentPtr = component; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static void freeWriteZones(IndexComponent *component) +{ + if (component->writeZones != NULL) { + unsigned int z; + for (z = 0; z < component->numZones; ++z) { + WriteZone *wz = component->writeZones[z]; + if (wz == NULL) { + continue; + } + freeBufferedWriter(wz->writer); + FREE(wz); + } + FREE(component->writeZones); + component->writeZones = NULL; + } +} + +/*****************************************************************************/ +void freeIndexComponent(IndexComponent **componentPtr) +{ + if (componentPtr == NULL) { + return; + } + IndexComponent *component = *componentPtr; + if (component == NULL) { + return; + } + *componentPtr = NULL; + + freeWriteZones(component); + FREE(component); +} + +/** + * Destroy, deallocate, and expunge a read portal. + * + * @param readPortal the readzone array + **/ +static void freeReadPortal(ReadPortal *readPortal) +{ + if (readPortal == NULL) { + return; + } + unsigned int z; + for (z = 0; z < readPortal->zones; ++z) { + if (readPortal->readers[z] != NULL) { + freeBufferedReader(readPortal->readers[z]); + } + } + FREE(readPortal->readers); + FREE(readPortal); +} + +/*****************************************************************************/ +int getBufferedReaderForPortal(ReadPortal *portal, + unsigned int part, + BufferedReader **readerPtr) +{ + if (part >= portal->zones) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "%s: cannot access zone %u of %u", + __func__, part, portal->zones); + } + IndexComponent *component = portal->component; + if (component->info->ioStorage && (portal->readers[part] == NULL)) { + int result = openStateBufferedReader(component->state, + component->info->kind, part, + &portal->readers[part]); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "%s: cannot make buffered reader " + "for zone %u", __func__, part); + } + } + *readerPtr = portal->readers[part]; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int readIndexComponent(IndexComponent *component) +{ + ReadPortal *portal; + int result = ALLOCATE(1, ReadPortal, "index component read portal", &portal); + if (result != UDS_SUCCESS) { + return result; + } + int readZones = component->state->loadZones; + result = ALLOCATE(readZones, BufferedReader *, "read zone buffered readers", + &portal->readers); + if (result != UDS_SUCCESS) { + FREE(portal); + return result; + } + + portal->component = component; + portal->zones = readZones; + result = (*component->info->loader)(portal); + freeReadPortal(portal); + return result; +} + +/** + * Determine the writeZone structure for the specified component and zone. 
+ * + * @param [in] component the index component + * @param [in] zone the zone number + * @param [out] writeZonePtr the resulting write zone instance + * + * @return UDS_SUCCESS or an error code + **/ +static int resolveWriteZone(const IndexComponent *component, + unsigned int zone, + WriteZone **writeZonePtr) +{ + int result = ASSERT(writeZonePtr != NULL, + "output parameter is null"); + if (result != UDS_SUCCESS) { + return result; + } + + if (component->writeZones == NULL) { + return logErrorWithStringError(UDS_BAD_STATE, + "cannot resolve index component write zone:" + " not allocated"); + } + + if (zone >= component->numZones) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "cannot resolve index component write zone:" + " zone out of range"); + } + *writeZonePtr = component->writeZones[zone]; + return UDS_SUCCESS; +} + +/** + * Non-incremental save function used to emulate a regular save + * using an incremental save function as a basis. + * + * @param component the index component + * @param writer the buffered writer + * @param zone the zone number + * + * @return UDS_SUCCESS or an error code + **/ +static int indexComponentSaverIncrementalWrapper(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone) +{ + IncrementalWriter incrFunc = component->info->incremental; + bool completed = false; + + int result = (*incrFunc)(component, writer, zone, IWC_START, &completed); + if (result != UDS_SUCCESS) { + return result; + } + + if (!completed) { + result = (*incrFunc)(component, writer, zone, IWC_FINISH, &completed); + if (result != UDS_SUCCESS) { + return result; + } + } + + result = flushBufferedWriter(writer); + if (result != UDS_SUCCESS) { + return result; + } + + return UDS_SUCCESS; +} + +/** + * Specify that writing to a specific zone file has finished. + * + * If a syncer has been registered with the index component, the file + * descriptor will be enqueued upon it for fsyncing and closing. + * If not, or if the enqueue fails, the file will be fsynced and closed + * immediately. + * + * @param writeZone the index component write zone + * + * @return UDS_SUCCESS or an error code + **/ +static int doneWithZone(WriteZone *writeZone) +{ + const IndexComponent *component = writeZone->component; + if (writeZone->writer != NULL) { + int result = flushBufferedWriter(writeZone->writer); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot flush buffered writer for " + "%s component (zone %u)", + component->info->name, writeZone->zone); + } + } + return UDS_SUCCESS; +} + +/** + * Construct the array of WriteZone instances for this component. + * + * @param component the index component + * + * @return UDS_SUCCESS or an error code + * + * If this is a multizone component, each zone will be fully defined, + * otherwise zone 0 stands in for the single state file. 
+ **/ +static int makeWriteZones(IndexComponent *component) +{ + unsigned int z; + if (component->writeZones != NULL) { + // just reinitialize states + for (z = 0; z < component->numZones; ++z) { + WriteZone *wz = component->writeZones[z]; + wz->phase = IWC_IDLE; + } + return UDS_SUCCESS; + } + + int result = ALLOCATE(component->numZones, WriteZone *, + "index component write zones", &component->writeZones); + if (result != UDS_SUCCESS) { + return result; + } + + for (z = 0; z < component->numZones; ++z) { + result = ALLOCATE(1, WriteZone, "plain write zone", + &component->writeZones[z]); + if (result != UDS_SUCCESS) { + freeWriteZones(component); + return result; + } + *component->writeZones[z] = (WriteZone) { + .component = component, + .phase = IWC_IDLE, + .zone = z, + }; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static int openBufferedWriters(IndexComponent *component) +{ + int result = UDS_SUCCESS; + WriteZone **wzp; + for (wzp = component->writeZones; + wzp < component->writeZones + component->numZones; + ++wzp) { + WriteZone *wz = *wzp; + wz->phase = IWC_START; + + result = ASSERT(wz->writer == NULL, "write zone writer already exists"); + if (result != UDS_SUCCESS) { + return result; + } + + if (component->info->ioStorage) { + int result = openStateBufferedWriter(component->state, + component->info->kind, wz->zone, + &wz->writer); + if (result != UDS_SUCCESS) { + return result; + } + } + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static int startIndexComponentSave(IndexComponent *component) +{ + int result = makeWriteZones(component); + if (result != UDS_SUCCESS) { + return result; + } + + result = openBufferedWriters(component); + if (result != UDS_SUCCESS) { + return result; + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int startIndexComponentIncrementalSave(IndexComponent *component) +{ + return startIndexComponentSave(component); +} + +/*****************************************************************************/ +int writeIndexComponent(IndexComponent *component) +{ + Saver saver = component->info->saver; + if ((saver == NULL) && (component->info->incremental != NULL)) { + saver = indexComponentSaverIncrementalWrapper; + } + + int result = startIndexComponentSave(component); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int z; + for (z = 0; z < component->numZones; ++z) { + WriteZone *writeZone = component->writeZones[z]; + + result = (*saver)(component, writeZone->writer, z); + if (result != UDS_SUCCESS) { + break; + } + + result = doneWithZone(writeZone); + if (result != UDS_SUCCESS) { + break; + } + + freeBufferedWriter(writeZone->writer); + writeZone->writer = NULL; + } + + if (result != UDS_SUCCESS) { + freeWriteZones(component); + return logErrorWithStringError(result, "index component write failed"); + } + + return UDS_SUCCESS; +} + +/** + * Close a specific buffered writer in a component write zone. 
+ * + * @param writeZone the write zone + * + * @return UDS_SUCCESS or an error code + * + * @note closing a buffered writer causes its file descriptor to be + * passed to doneWithZone + **/ +static int closeBufferedWriter(WriteZone *writeZone) +{ + if (writeZone->writer == NULL) { + return UDS_SUCCESS; + } + + int result = doneWithZone(writeZone); + freeBufferedWriter(writeZone->writer); + writeZone->writer = NULL; + + return result; +} + +/** + * Faux incremental saver function for index components which only define + * a simple saver. Conforms to IncrementalWriter signature. + * + * @param [in] component the index component + * @param [in] writer the buffered writer that does the output + * @param [in] zone the zone number + * @param [in] command the incremental writer command + * @param [out] completed if non-NULL, set to whether the save is complete + * + * @return UDS_SUCCESS or an error code + * + * @note This wrapper always calls the non-incremental saver when + * the IWC_START command is issued, and always reports that + * the save is complete unless the saver failed. + **/ +static int wrapSaverAsIncremental(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone, + IncrementalWriterCommand command, + bool *completed) +{ + int result = UDS_SUCCESS; + + if ((command >= IWC_START) && (command <= IWC_FINISH)) { + result = (*component->info->saver)(component, writer, zone); + if ((result == UDS_SUCCESS) && (writer != NULL)) { + noteBufferedWriterUsed(writer); + } + } + if ((result == UDS_SUCCESS) && (completed != NULL)) { + *completed = true; + } + return result; +} + +/** + * Return the appropriate incremental writer function depending on + * the component's type and whether this is the first zone. + * + * @param component the index component + * + * @return the correct IncrementalWriter function to use, or + * NULL signifying no progress can be made at this time. 
+ **/ +static IncrementalWriter getIncrementalWriter(IndexComponent *component) +{ + IncrementalWriter incrFunc = component->info->incremental; + + if (incrFunc == NULL) { + incrFunc = &wrapSaverAsIncremental; + } + + return incrFunc; +} + +/*****************************************************************************/ +int performIndexComponentZoneSave(IndexComponent *component, + unsigned int zone, + CompletionStatus *completed) +{ + CompletionStatus comp = CS_NOT_COMPLETED; + + WriteZone *wz = NULL; + int result = resolveWriteZone(component, zone, &wz); + if (result != UDS_SUCCESS) { + return result; + } + + if (wz->phase == IWC_IDLE) { + comp = CS_COMPLETED_PREVIOUSLY; + } else if (wz->phase == IWC_DONE) { + comp = CS_JUST_COMPLETED; + wz->phase = IWC_IDLE; + } else if (!component->info->chapterSync) { + bool done = false; + IncrementalWriter incrFunc = getIncrementalWriter(component); + int result = (*incrFunc)(component, wz->writer, zone, wz->phase, &done); + if (result != UDS_SUCCESS) { + if (wz->phase == IWC_ABORT) { + wz->phase = IWC_IDLE; + } else { + wz->phase = IWC_ABORT; + } + return result; + } + if (done) { + comp = CS_JUST_COMPLETED; + wz->phase = IWC_IDLE; + } else if (wz->phase == IWC_START) { + wz->phase = IWC_CONTINUE; + } + } + + if (completed != NULL) { + *completed = comp; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int performIndexComponentChapterWriterSave(IndexComponent *component) +{ + WriteZone *wz = NULL; + int result = resolveWriteZone(component, 0, &wz); + if (result != UDS_SUCCESS) { + return result; + } + + if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { + bool done = false; + IncrementalWriter incrFunc = getIncrementalWriter(component); + int result = ASSERT(incrFunc != NULL, "no writer function"); + if (result != UDS_SUCCESS) { + return result; + } + result = (*incrFunc)(component, wz->writer, 0, wz->phase, &done); + if (result != UDS_SUCCESS) { + if (wz->phase == IWC_ABORT) { + wz->phase = IWC_IDLE; + } else { + wz->phase = IWC_ABORT; + } + return result; + } + if (done) { + wz->phase = IWC_DONE; + } else if (wz->phase == IWC_START) { + wz->phase = IWC_CONTINUE; + } + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int finishIndexComponentZoneSave(IndexComponent *component, + unsigned int zone, + CompletionStatus *completed) +{ + WriteZone *wz = NULL; + int result = resolveWriteZone(component, zone, &wz); + if (result != UDS_SUCCESS) { + return result; + } + + CompletionStatus comp; + switch (wz->phase) { + case IWC_IDLE: + comp = CS_COMPLETED_PREVIOUSLY; + break; + + case IWC_DONE: + comp = CS_JUST_COMPLETED; + break; + + default: + comp = CS_NOT_COMPLETED; + } + + IncrementalWriter incrFunc = getIncrementalWriter(component); + if ((wz->phase >= IWC_START) && (wz->phase < IWC_ABORT)) { + bool done = false; + int result = (*incrFunc)(component, wz->writer, zone, IWC_FINISH, &done); + if (result != UDS_SUCCESS) { + wz->phase = IWC_ABORT; + return result; + } + if (!done) { + logWarning("finish incremental save did not complete for %s zone %u", + component->info->name, zone); + return UDS_CHECKPOINT_INCOMPLETE; + } + wz->phase = IWC_IDLE; + comp = CS_JUST_COMPLETED; + } + + if (completed != NULL) { + *completed = comp; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int finishIndexComponentIncrementalSave(IndexComponent *component) +{ + unsigned int 
zone; + for (zone = 0; zone < component->numZones; ++zone) { + WriteZone *wz = component->writeZones[zone]; + IncrementalWriter incrFunc = getIncrementalWriter(component); + if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { + // Note: this is only safe if no other threads are currently processing + // this particular index + bool done = false; + int result = (*incrFunc)(component, wz->writer, zone, IWC_FINISH, &done); + if (result != UDS_SUCCESS) { + return result; + } + if (!done) { + logWarning("finishing incremental save did not complete for %s zone %u", + component->info->name, zone); + return UDS_UNEXPECTED_RESULT; + } + wz->phase = IWC_IDLE; + } + + if ((wz->writer != NULL) && !wasBufferedWriterUsed(wz->writer)) { + return logErrorWithStringError(UDS_CHECKPOINT_INCOMPLETE, + "component %s zone %u did not get written", + component->info->name, zone); + } + + int result = closeBufferedWriter(wz); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int abortIndexComponentZoneSave(IndexComponent *component, + unsigned int zone, + CompletionStatus *status) +{ + WriteZone *wz = NULL; + int result = resolveWriteZone(component, zone, &wz); + if (result != UDS_SUCCESS) { + return result; + } + + CompletionStatus comp = CS_COMPLETED_PREVIOUSLY; + + IncrementalWriter incrFunc = getIncrementalWriter(component); + if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { + result = (*incrFunc)(component, wz->writer, zone, IWC_ABORT, NULL); + wz->phase = IWC_IDLE; + if (result != UDS_SUCCESS) { + return result; + } + comp = CS_JUST_COMPLETED; + } + + if (status != NULL) { + *status = comp; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int abortIndexComponentIncrementalSave(IndexComponent *component) +{ + int result = UDS_SUCCESS; + unsigned int zone; + for (zone = 0; zone < component->numZones; ++zone) { + WriteZone *wz = component->writeZones[zone]; + IncrementalWriter incrFunc = getIncrementalWriter(component); + if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { + // Note: this is only safe if no other threads are currently processing + // this particular index + result = (*incrFunc)(component, wz->writer, zone, IWC_ABORT, NULL); + wz->phase = IWC_IDLE; + if (result != UDS_SUCCESS) { + return result; + } + } + + int result = closeBufferedWriter(wz); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int discardIndexComponent(IndexComponent *component) +{ + if (!component->info->ioStorage) { + return UDS_INVALID_ARGUMENT; + } + + unsigned int numZones = 0; + unsigned int saveSlot = 0; + int result = findLatestIndexSaveSlot(component->state->layout, &numZones, + &saveSlot); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int oldSaveSlot = component->state->saveSlot; + component->state->saveSlot = saveSlot; + + unsigned int z; + for (z = 0; z < numZones; ++z) { + BufferedWriter *writer; + int result = openStateBufferedWriter(component->state, + component->info->kind, z, &writer); + if (result != UDS_SUCCESS) { + break; + } + result = writeZerosToBufferedWriter(writer, UDS_BLOCK_SIZE); + if (result != UDS_SUCCESS) { + break; + } + result = flushBufferedWriter(writer); + if (result != UDS_SUCCESS) { + break; + } + freeBufferedWriter(writer); + } + + component->state->saveSlot 
= oldSaveSlot; + return result; +} diff --git a/uds/indexComponent.h b/uds/indexComponent.h new file mode 100644 index 0000000..22066b1 --- /dev/null +++ b/uds/indexComponent.h @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexComponent.h#5 $ + */ + +#ifndef INDEX_COMPONENT_H +#define INDEX_COMPONENT_H 1 + +#include "common.h" + +#include "bufferedReader.h" +#include "bufferedWriter.h" +#include "compiler.h" +#include "regionIdentifiers.h" + +typedef enum completionStatus { + CS_NOT_COMPLETED, // operation has not completed + CS_JUST_COMPLETED, // operation just completed + CS_COMPLETED_PREVIOUSLY // operation completed previously +} CompletionStatus; + +typedef struct readPortal { + struct indexComponent *component; + BufferedReader **readers; + unsigned int zones; +} ReadPortal; + +/** + * Prototype for functions which can load an index component from its + * saved state. + * + * @param portal A component portal which can be used to load the + * specified component. + * @return UDS_SUCCESS or an error code + **/ +typedef int (*Loader)(ReadPortal *portal); + +/** + * Prototype for functions which can save an index component. + * + * @param component The index component. + * @param writer A buffered writer. + * @param zone The zone number. + * + * @return UDS_SUCCESS or an error code + **/ +typedef int (*Saver)(struct indexComponent *component, + BufferedWriter *writer, + unsigned int zone); + +/** + * Command code used by IncrementalWriter function protocol. + **/ +typedef enum incrementalWriterCommand { + IWC_START, //< start an incremental save + IWC_CONTINUE, //< continue an incremental save + IWC_FINISH, //< force finish of incremental save + IWC_ABORT, //< abort incremental save + IWC_IDLE = -1,//< not a command, used internally to signify not in progress + IWC_DONE = -2 //< not a command, used internally to signify async completion +} IncrementalWriterCommand; + +typedef struct writeZone { + struct indexComponent *component; + IncrementalWriterCommand phase; + BufferedWriter *writer; + unsigned int zone; +} WriteZone; + +/** + * @param [in] component The index component. + * @param [in] writer A buffered writer. + * @param [in] zone The zone number (0 for non-multi-zone). + * @param [in] command The incremental writer command. + * @param [out] completed If non-NULL, set to whether save is done. + * + * @return UDS_SUCCESS or an error code + **/ +typedef int (*IncrementalWriter)(struct indexComponent *component, + BufferedWriter *writer, + unsigned int zone, + IncrementalWriterCommand command, + bool *completed); + +/** + * The structure describing how to load or save an index component. + * At least one of saver or incremental must be specified. 
+ **/
+typedef struct indexComponentInfo {
+  RegionKind         kind;        // Region kind
+  const char        *name;        // The name of the component (for logging)
+  bool               saveOnly;    // Used for saves but not checkpoints
+  bool               chapterSync; // Saved by the chapter writer
+  bool               multiZone;   // Does this component have multiple zones?
+  bool               ioStorage;   // Do we do I/O directly to storage?
+  Loader             loader;      // The function to load this component
+  Saver              saver;       // The function to store this component
+  IncrementalWriter  incremental; // The function for incremental writing
+} IndexComponentInfo;
+
+/**
+ * The structure representing a savable (and loadable) part of an index.
+ **/
+typedef struct indexComponent {
+  const IndexComponentInfo *info;          // IndexComponentInfo specification
+  void                     *componentData; // The object to load or save
+  void                     *context;       // The context used to load or save
+  struct indexState        *state;         // The index state
+  unsigned int              numZones;      // Number of zones in write portal
+  WriteZone               **writeZones;    // State for writing component
+} IndexComponent;
+
+/**
+ * Make an index component.
+ *
+ * @param state         The index state in which this component instance
+ *                      shall reside.
+ * @param info          The component info specification for this component.
+ * @param zoneCount     How many active zones are in use.
+ * @param data          Component-specific data.
+ * @param context       Component-specific context.
+ * @param componentPtr  Where to store the resulting component.
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int makeIndexComponent(struct indexState *state,
+                       const IndexComponentInfo *info,
+                       unsigned int zoneCount,
+                       void *data,
+                       void *context,
+                       IndexComponent **componentPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Destroy an index component.
+ *
+ * @param componentPtr A pointer to the component to be freed.
+ **/
+void freeIndexComponent(IndexComponent **componentPtr);
+
+/**
+ * Return the index component name for this component.
+ **/
+static INLINE const char *indexComponentName(IndexComponent *component)
+{
+  return component->info->name;
+}
+
+/**
+ * Return the index component data for this component.
+ **/
+static INLINE void *indexComponentData(IndexComponent *component)
+{
+  return component->componentData;
+}
+
+/**
+ * Return the index component context for this component.
+ **/
+static INLINE void *indexComponentContext(IndexComponent *component)
+{
+  return component->context;
+}
+
+/**
+ * Determine whether this component may be skipped for a checkpoint.
+ *
+ * @param component the component
+ *
+ * @return whether the component may be skipped
+ **/
+static INLINE bool skipIndexComponentOnCheckpoint(IndexComponent *component)
+{
+  return component->info->saveOnly;
+}
+
+/**
+ * Determine whether actual saving during a checkpoint should be
+ * invoked by the chapter writer thread.
+ **/
+static INLINE bool
+deferIndexComponentCheckpointToChapterWriter(IndexComponent *component)
+{
+  return component->info->chapterSync;
+}
+
+/**
+ * Determine whether a replay is required if the component is missing.
+ *
+ * @param component the component
+ *
+ * @return whether the component is final (that is, contains shutdown state)
+ **/
+static INLINE bool
+missingIndexComponentRequiresReplay(IndexComponent *component)
+{
+  return component->info->saveOnly;
+}
+
+/**
+ * Read a component's state.
+ *
+ * @param component The component to read.
+ *
+ * @return UDS_SUCCESS, an error code from reading, or UDS_INVALID_ARGUMENT
+ *         if the component is NULL.
+ **/ +int readIndexComponent(IndexComponent *component) + __attribute__((warn_unused_result)); + +/** + * Write a state file. + * + * @param component The component to write + * + * @return UDS_SUCCESS, an error code from writing, or UDS_INVALID_ARGUMENT + * if the component is NULL. + **/ +int writeIndexComponent(IndexComponent *component) + __attribute__((warn_unused_result)); + +/** + * Start an incremental save for this component (all zones). + * + * @param [in] component The index component. + * + * @return UDS_SUCCESS or an error code. + **/ +int startIndexComponentIncrementalSave(IndexComponent *component) + __attribute__((warn_unused_result)); + +/** + * Perform an incremental save for a component in a particular zone. + * + * @param [in] component The index component. + * @param [in] zone The zone number. + * @param [out] completed Pointer to hold completion status result. + * + * @return UDS_SUCCESS or an error code. + * + * @note If an incremental save is not supported, a regular + * save will be performed if this is the first call in zone 0. + **/ + int performIndexComponentZoneSave(IndexComponent *component, + unsigned int zone, + CompletionStatus *completed) + __attribute__((warn_unused_result)); + +/** + * Perform an incremental save for a non-multizone component synchronized + * with the chapter writer. + * + * @param component The index component. + **/ +int performIndexComponentChapterWriterSave(IndexComponent *component) + __attribute__((warn_unused_result)); + +/** + * Force the completion of an incremental save currently in progress in + * a particular zone. + * + * @param [in] component The index component. + * @param [in] zone The zone number. + * @param [out] completed Pointer to hold completion status result. + * + * @return UDS_SUCCESS or an error code. + **/ +int finishIndexComponentZoneSave(IndexComponent *component, + unsigned int zone, + CompletionStatus *completed) + __attribute__((warn_unused_result)); + +/** + * Force the completion of an incremental save in all zones and complete + * the overal save. + * + * @param [in] component The index component. + * + * @return UDS_SUCCESS or an error code. + * + * @note If all zones call finishIndexComponentZoneSave first, only + * the common non-index-related completion code is required, + * which protects access to the index data structures from the + * invoking thread. + **/ +int finishIndexComponentIncrementalSave(IndexComponent *component) + __attribute__((warn_unused_result)); + +/** + * Abort the incremental save currently in progress in a particular zone. + * + * @param [in] component The index component. + * @param [in] zone The zone number. + * @param [out] completed Pointer to hold completion status result. + * + * @return UDS_SUCCESS or an error code. + * + * @note "Completed" in this case means completed or aborted. + * Once any zone calls this function the entire save is + * useless unless every zone indicates CS_COMPLETED_PREVIOUSLY. + **/ +int abortIndexComponentZoneSave(IndexComponent *component, + unsigned int zone, + CompletionStatus *completed) + __attribute__((warn_unused_result)); + +/** + * Abort an incremental save currently in progress + * + * @param [in] component The index component. + * + * @return UDS_SUCCESS or an error code. + * + * @note If all zones call abortIndexComponentZoneSave first, only + * the common non-index-related completion code is required, + * which protects access to the index data structures from the + * invoking thread. 
+ **/ +int abortIndexComponentIncrementalSave(IndexComponent *component) + __attribute__((warn_unused_result)); + +/** + * Remove or invalidate component state. + * + * @param component The component whose file is to be removed. If NULL + * no action is taken. + **/ +__attribute__((warn_unused_result)) +int discardIndexComponent(IndexComponent *component); + +/** + * Get a buffered reader for the specified component part. + * + * @param [in] portal The component portal. + * @param [in] part The component ordinal number. + * @param [out] readerPtr Where to put the buffered reader. + * + * @return UDS_SUCCESS or an error code. + * + * @note the reader is managed by the component portal + **/ +__attribute__((warn_unused_result)) +int getBufferedReaderForPortal(ReadPortal *portal, + unsigned int part, + BufferedReader **readerPtr); + +#endif /* INDEX_COMPONENT_H */ diff --git a/uds/indexConfig.c b/uds/indexConfig.c new file mode 100644 index 0000000..7ef86f2 --- /dev/null +++ b/uds/indexConfig.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/indexConfig.c#2 $ + */ + +#include "indexConfig.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" + +static const byte INDEX_CONFIG_MAGIC[] = "ALBIC"; +static const byte INDEX_CONFIG_VERSION[] = "06.02"; +static const byte INDEX_CONFIG_VERSION_6_01[] = "06.01"; + +enum { + INDEX_CONFIG_MAGIC_LENGTH = sizeof(INDEX_CONFIG_MAGIC) - 1, + INDEX_CONFIG_VERSION_LENGTH = sizeof(INDEX_CONFIG_VERSION) - 1 +}; + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeIndexConfig(Buffer *buffer, UdsConfiguration config) +{ + int result = getUInt32LEFromBuffer(buffer, &config->recordPagesPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->chaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->sparseChaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->cacheChapters); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->checkpointFrequency); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->masterIndexMeanDelta); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->bytesPerPage); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->sparseSampleRate); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &config->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer) - contentLength(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + result = UDS_CORRUPT_COMPONENT; + } + return result; +} + +/**********************************************************************/ +static int readVersion(BufferedReader *reader, + UdsConfiguration conf, + const char **versionPtr) +{ + byte buffer[INDEX_CONFIG_VERSION_LENGTH]; + int result = readFromBufferedReader(reader, buffer, + INDEX_CONFIG_VERSION_LENGTH); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot read index config version"); + } + if (memcmp(INDEX_CONFIG_VERSION, buffer, INDEX_CONFIG_VERSION_LENGTH) == 0) { + Buffer *buffer; + result = makeBuffer(sizeof(*conf), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logErrorWithStringError(result, "cannot read config data"); + } + clearBuffer(buffer); + result = decodeIndexConfig(buffer, conf); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + if (versionPtr != NULL) { + *versionPtr = "current"; + } + return result; + } else if (memcmp(INDEX_CONFIG_VERSION_6_01, buffer, + INDEX_CONFIG_VERSION_LENGTH) == 0) { + struct udsConfiguration6_01 oldConf; + result = readFromBufferedReader(reader, &oldConf, sizeof(oldConf)); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, + "failed to read version 6.01 config file"); + return result; + } + conf->recordPagesPerChapter = oldConf.recordPagesPerChapter; + conf->chaptersPerVolume = oldConf.chaptersPerVolume; + conf->sparseChaptersPerVolume 
= oldConf.sparseChaptersPerVolume; + conf->cacheChapters = oldConf.cacheChapters; + conf->checkpointFrequency = oldConf.checkpointFrequency; + conf->masterIndexMeanDelta = oldConf.masterIndexMeanDelta; + conf->bytesPerPage = oldConf.bytesPerPage; + conf->sparseSampleRate = oldConf.sparseSampleRate; + conf->nonce = 0; + if (versionPtr != NULL) { + *versionPtr = "6.01"; + } + return UDS_UNSUPPORTED_VERSION; + } + + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unsupported configuration version: '%.*s'", + INDEX_CONFIG_VERSION_LENGTH, buffer); +} + +/**********************************************************************/ +int readConfigContents(BufferedReader *reader, + UdsConfiguration config) +{ + int result = verifyBufferedData(reader, INDEX_CONFIG_MAGIC, + INDEX_CONFIG_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + const char *version = NULL; + result = readVersion(reader, config, &version); + if (result != UDS_SUCCESS) { + if (result == UDS_UNSUPPORTED_VERSION) { + logNoticeWithStringError(result, "Found index config version %s", + version); + } else { + logErrorWithStringError(result, "Failed to read index config"); + } + } + return result; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int encodeIndexConfig(Buffer *buffer, UdsConfiguration config) +{ + int result = putUInt32LEIntoBuffer(buffer, config->recordPagesPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->chaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->sparseChaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->cacheChapters); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config-> checkpointFrequency); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->masterIndexMeanDelta); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->bytesPerPage); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->sparseSampleRate); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, config->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(*config), + "%zu bytes encoded, of %zu expected", + contentLength(buffer), sizeof(*config)); + return result; +} + +/**********************************************************************/ +int writeConfigContents(BufferedWriter *writer, + UdsConfiguration config) +{ + int result = writeToBufferedWriter(writer, INDEX_CONFIG_MAGIC, + INDEX_CONFIG_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + result = writeToBufferedWriter(writer, INDEX_CONFIG_VERSION, + INDEX_CONFIG_VERSION_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + Buffer *buffer; + result = makeBuffer(sizeof(*config), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = encodeIndexConfig(buffer, config); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + return result; +} + +/**********************************************************************/ +int 
makeConfiguration(UdsConfiguration conf, Configuration **configPtr) +{ + *configPtr = NULL; + if (conf == NULL) { + return logErrorWithStringError(UDS_CONF_REQUIRED, + "received an invalid config"); + } + + Configuration *config; + int result = ALLOCATE(1, Configuration, "configuration", &config); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeGeometry(conf->bytesPerPage, + conf->recordPagesPerChapter, + conf->chaptersPerVolume, + conf->sparseChaptersPerVolume, + &config->geometry); + if (result != UDS_SUCCESS) { + freeConfiguration(config); + return result; + } + + config->sparseSampleRate = conf->sparseSampleRate; + config->cacheChapters = conf->cacheChapters; + config->masterIndexMeanDelta = conf->masterIndexMeanDelta; + + *configPtr = config; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeConfiguration(Configuration *config) +{ + if (config != NULL) { + freeGeometry(config->geometry); + FREE(config); + } +} diff --git a/uds/indexConfig.h b/uds/indexConfig.h new file mode 100644 index 0000000..dab3d6a --- /dev/null +++ b/uds/indexConfig.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexConfig.h#2 $ + */ + +#ifndef INDEX_CONFIG_H +#define INDEX_CONFIG_H 1 + +#include "config.h" +#include "geometry.h" + +/** + * A set of configuration parameters for the indexer. + **/ +struct configuration { + /* Parameters for the volume */ + + /* The volume layout */ + Geometry *geometry; + + /* Size of the page cache and sparse chapter index cache, in chapters */ + unsigned int cacheChapters; + + /** Parameters for the master index */ + + /* The mean delta for the master index */ + unsigned int masterIndexMeanDelta; + + /* Sampling rate for sparse indexing */ + unsigned int sparseSampleRate; +}; + +#endif /* INDEX_CONFIG_H */ diff --git a/uds/indexInternals.c b/uds/indexInternals.c new file mode 100644 index 0000000..48268c7 --- /dev/null +++ b/uds/indexInternals.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexInternals.c#7 $ + */ + +#include "indexInternals.h" + +#include "errors.h" +#include "indexCheckpoint.h" +#include "indexStateData.h" +#include "indexZone.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "openChapter.h" +#include "request.h" +#include "stringUtils.h" +#include "threads.h" +#include "typeDefs.h" +#include "volume.h" +#include "zone.h" + +static const unsigned int MAX_COMPONENT_COUNT = 4; + +/**********************************************************************/ +int allocateIndex(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + unsigned int zoneCount, + LoadType loadType, + Index **newIndex) +{ + unsigned int checkpoint_frequency + = userParams == NULL ? 0 : userParams->checkpoint_frequency; + if (checkpoint_frequency >= config->geometry->chaptersPerVolume) { + return UDS_BAD_CHECKPOINT_FREQUENCY; + } + + Index *index; + int result = ALLOCATE(1, Index, "index", &index); + if (result != UDS_SUCCESS) { + return result; + } + + index->existed = (loadType != LOAD_CREATE); + index->hasSavedOpenChapter = true; + index->loadedType = LOAD_UNDEFINED; + + result = makeIndexCheckpoint(index); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + setIndexCheckpointFrequency(index->checkpoint, checkpoint_frequency); + + getIndexLayout(layout, &index->layout); + index->zoneCount = zoneCount; + + result = ALLOCATE(index->zoneCount, IndexZone *, "zones", + &index->zones); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + result = makeIndexState(layout, index->zoneCount, MAX_COMPONENT_COUNT, + &index->state); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + result = addIndexStateComponent(index->state, &INDEX_STATE_INFO, index, + NULL); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + result = makeVolume(config, index->layout, userParams, + VOLUME_CACHE_DEFAULT_MAX_QUEUED_READS, index->zoneCount, + &index->volume); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + index->volume->lookupMode = LOOKUP_NORMAL; + + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + result = makeIndexZone(index, i); + if (result != UDS_SUCCESS) { + freeIndex(index); + return logErrorWithStringError(result, "Could not create index zone"); + } + } + + result = addIndexStateComponent(index->state, &OPEN_CHAPTER_INFO, index, + NULL); + if (result != UDS_SUCCESS) { + freeIndex(index); + return logErrorWithStringError(result, "Could not create open chapter"); + } + + *newIndex = index; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void releaseIndex(Index *index) +{ + if (index == NULL) { + return; + } + + if (index->zones != NULL) { + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + freeIndexZone(index->zones[i]); + } + FREE(index->zones); + } + + freeVolume(index->volume); + + freeIndexState(&index->state); + freeIndexCheckpoint(index->checkpoint); + putIndexLayout(&index->layout); + FREE(index); +} diff --git a/uds/indexInternals.h b/uds/indexInternals.h new file mode 100644 index 0000000..16cb56a --- /dev/null +++ b/uds/indexInternals.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Red 
Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexInternals.h#3 $ + */ + +#ifndef INDEX_INTERNALS_H +#define INDEX_INTERNALS_H + +#include "index.h" +#include "loadType.h" +#include "request.h" + +/** + * Construct a new index from the given configuration. + * + * @param layout The index layout to use + * @param config The configuration to use + * @param userParams The index session parameters. If NULL, the default + * session parameters will be used. + * @param zoneCount The number of zones for this index to use + * @param loadType How to create the index: it can be create only, allow + * loading from files, and allow rebuilding from the volume + * @param newIndex A pointer to hold a pointer to the new index + * + * @return UDS_SUCCESS or an error code + **/ +int allocateIndex(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + unsigned int zoneCount, + LoadType loadType, + Index **newIndex) + __attribute__((warn_unused_result)); + +/** + * Clean up the index and its memory. + * + * @param index The index to destroy. + **/ +void releaseIndex(Index *index); + +#endif /* INDEX_INTERNALS_H */ diff --git a/uds/indexLayout.c b/uds/indexLayout.c new file mode 100644 index 0000000..cb019ff --- /dev/null +++ b/uds/indexLayout.c @@ -0,0 +1,2409 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexLayout.c#19 $ + */ + +#include "indexLayout.h" + +#include "buffer.h" +#include "compiler.h" +#include "config.h" +#include "indexConfig.h" +#include "layoutRegion.h" +#include "logger.h" +#include "masterIndexOps.h" +#include "memoryAlloc.h" +#include "nonce.h" +#include "openChapter.h" + +/* + * Overall layout of an index on disk: + * + * The layout is divided into a number of fixed-size regions, the sizes of + * which are computed when the index is created. Every header and region + * begins on 4K block boundary. Save regions are further sub-divided into + * regions of their own. + * + * Each region has a kind and an instance number. 
Some kinds only have one + * instance and therefore use RL_SOLE_INSTANCE (-1) as the instance number. + * The RL_KIND_INDEX uses instances to represent sub-indices, where used. + * A save region can either hold a checkpoint or a clean shutdown (determined + * by the type). The instances determine which available save slot is used. + * The RL_KIND_MASTER_INDEX uses instances to record which zone is being saved. + * + * +-+-+--------+--------+--------+-----+--- -+-+ + * | | | I N D E X 0 101, 0 | ... | | + * |H|C+--------+--------+--------+-----+--- -+S| + * |D|f| Volume | Save | Save | | |e| + * |R|g| Region | Region | Region | ... | ... |a| + * | | | 201 -1 | 202 0 | 202 1 | | |l| + * +-+-+--------+--------+--------+-----+--- -+-+ + * + * The header contains the encoded regional layout table as well as + * the saved index configuration record. The sub-index regions and their + * subdivisions are maintained in the same table. + * + * There are at least two save regions per sub-index to preserve the old + * state should the saving of a state be incomplete. They are used in + * a round-robin fashion. + * + * Anatomy of a save region: + * + * +-+-----+------+------+-----+ -+-----+ + * |H| IPM | MI | MI | | | OC | + * |D| | zone | zone | ... | | | + * |R| 301 | 302 | 302 | | | 303 | + * | | -1 | 0 | 1 | | | -1 | + * +-+-----+------+------+-----+ -+-----+ + * + * Every region header has a type (and version). In save regions, + * the open chapter only appears in RL_TYPE_SAVE not RL_TYPE_CHECKPOINT, + * although the same space is reserved for both. + * + * The header contains the encoded regional layout table as well as the + * index state record for that save or checkpoint. Each save or checkpoint + * has a unique generation number and nonce which is used to seed the + * checksums of those regions. + */ + +typedef struct indexSaveData_v1 { + uint64_t timestamp; // ms since epoch... + uint64_t nonce; + uint32_t version; // 1 + uint32_t unused__; +} IndexSaveData; + +typedef struct indexSaveLayout { + LayoutRegion indexSave; + LayoutRegion header; + unsigned int numZones; + LayoutRegion indexPageMap; + LayoutRegion freeSpace; + LayoutRegion *masterIndexZones; + LayoutRegion *openChapter; + IndexSaveType saveType; + IndexSaveData saveData; + Buffer *indexStateBuffer; + bool read; + bool written; +} IndexSaveLayout; + +typedef struct subIndexLayout { + LayoutRegion subIndex; + uint64_t nonce; + LayoutRegion volume; + IndexSaveLayout *saves; +} SubIndexLayout; + +typedef struct superBlockData_v1 { + byte magicLabel[32]; + byte nonceInfo[32]; + uint64_t nonce; + uint32_t version; // 2 + uint32_t blockSize; // for verification + uint16_t numIndexes; // 1 + uint16_t maxSaves; + uint64_t openChapterBlocks; + uint64_t pageMapBlocks; +} SuperBlockData; + +struct indexLayout { + IOFactory *factory; + off_t offset; + struct index_version indexVersion; + SuperBlockData super; + LayoutRegion header; + LayoutRegion config; + SubIndexLayout index; + LayoutRegion seal; + uint64_t totalBlocks; + int refCount; +}; + +/** + * Structure used to compute single file layout sizes. + * + * Note that the masterIndexBlocks represent all zones and are sized for + * the maximum number of blocks that would be needed regardless of the number + * of zones (up to the maximum value) that are used at run time. + * + * Similarly, the number of saves is sized for the minimum safe value + * assuming checkpointing is enabled, since that is also a run-time parameter. 
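+ * + * In computeSizes() below, numSaves is 2 + numCheckpoints, and each + * sub-index is laid out as its volume followed by numSaves save regions.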
+ **/ +typedef struct saveLayoutSizes { + Configuration config; // this is a captive copy + Geometry geometry; // this is a captive copy + unsigned int numSaves; // per sub-index + size_t blockSize; // in bytes + uint64_t volumeBlocks; // per sub-index + uint64_t masterIndexBlocks; // per save + uint64_t pageMapBlocks; // per save + uint64_t openChapterBlocks; // per save + uint64_t saveBlocks; // per sub-index + uint64_t subIndexBlocks; // per sub-index + uint64_t totalBlocks; // for whole layout +} SaveLayoutSizes; + +enum { + INDEX_STATE_BUFFER_SIZE = 512, + MAX_SAVES = 5, +}; + +static const byte SINGLE_FILE_MAGIC_1[32] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; +enum { + SINGLE_FILE_MAGIC_1_LENGTH = sizeof(SINGLE_FILE_MAGIC_1), +}; + +static int reconstituteSingleFileLayout(IndexLayout *layout, + SuperBlockData *super, + RegionTable *table, + uint64_t firstBlock) + __attribute__((warn_unused_result)); +static int writeIndexSaveLayout(IndexLayout *layout, IndexSaveLayout *isl) + __attribute__((warn_unused_result)); + +/*****************************************************************************/ +static INLINE uint64_t blockCount(uint64_t bytes, uint32_t blockSize) +{ + uint64_t blocks = bytes / blockSize; + if (bytes % blockSize > 0) { + ++blocks; + } + return blocks; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int computeSizes(SaveLayoutSizes *sls, + const UdsConfiguration config, + size_t blockSize, + unsigned int numCheckpoints) +{ + if (config->bytesPerPage % blockSize != 0) { + return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, + "page size not a multiple of block size"); + } + + Configuration *cfg = NULL; + int result = makeConfiguration(config, &cfg); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot compute layout size"); + } + + memset(sls, 0, sizeof(*sls)); + + // internalize the configuration and geometry... 
+ + sls->geometry = *cfg->geometry; + sls->config = *cfg; + sls->config.geometry = &sls->geometry; + + freeConfiguration(cfg); + + sls->numSaves = 2 + numCheckpoints; + sls->blockSize = blockSize; + sls->volumeBlocks = sls->geometry.bytesPerVolume / blockSize; + + result = computeMasterIndexSaveBlocks(&sls->config, blockSize, + &sls->masterIndexBlocks); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot compute index save size"); + } + + sls->pageMapBlocks = + blockCount(computeIndexPageMapSaveSize(&sls->geometry), blockSize); + sls->openChapterBlocks = + blockCount(computeSavedOpenChapterSize(&sls->geometry), blockSize); + sls->saveBlocks = 1 + (sls->masterIndexBlocks + + sls->pageMapBlocks + sls->openChapterBlocks); + sls->subIndexBlocks = sls->volumeBlocks + (sls->numSaves * sls->saveBlocks); + sls->totalBlocks = 3 + sls->subIndexBlocks; + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int udsComputeIndexSize(const UdsConfiguration config, + unsigned int numCheckpoints, + uint64_t *indexSize) +{ + SaveLayoutSizes sizes; + int result = computeSizes(&sizes, config, UDS_BLOCK_SIZE, numCheckpoints); + if (result != UDS_SUCCESS) { + return result; + } + + if (indexSize != NULL) { + *indexSize = sizes.totalBlocks * sizes.blockSize; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int openLayoutReader(IndexLayout *layout, + LayoutRegion *lr, + BufferedReader **readerPtr) +{ + off_t start = lr->startBlock * layout->super.blockSize; + size_t size = lr->numBlocks * layout->super.blockSize; + return openBufferedReader(layout->factory, start, size, readerPtr); +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int openLayoutWriter(IndexLayout *layout, + LayoutRegion *lr, + BufferedWriter **writerPtr) +{ + off_t start = lr->startBlock * layout->super.blockSize; + size_t size = lr->numBlocks * layout->super.blockSize; + return openBufferedWriter(layout->factory, start, size, writerPtr); +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int decodeIndexSaveData(Buffer *buffer, IndexSaveData *saveData) +{ + int result = getUInt64LEFromBuffer(buffer, &saveData->timestamp); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &saveData->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &saveData->version); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &saveData->unused__); + if (result != UDS_SUCCESS) { + return result; + } + // The unused padding has to be zeroed for correct nonce calculation + if (saveData->unused__ != 0) { + return UDS_CORRUPT_COMPONENT; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer), sizeof(*saveData)); + if (result != UDS_SUCCESS) { + return UDS_CORRUPT_COMPONENT; + } + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int decodeRegionHeader(Buffer *buffer, RegionHeader *header) +{ + int result = getUInt64LEFromBuffer(buffer, &header->magic); + if (result != UDS_SUCCESS) { + return result; + } + result = 
getUInt64LEFromBuffer(buffer, &header->regionBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &header->type); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &header->version); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &header->numRegions); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &header->payload); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer), sizeof(*header)); + if (result != UDS_SUCCESS) { + return UDS_CORRUPT_COMPONENT; + } + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int decodeLayoutRegion(Buffer *buffer, LayoutRegion *region) +{ + size_t cl1 = contentLength(buffer); + + int result = getUInt64LEFromBuffer(buffer, ®ion->startBlock); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, ®ion->numBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, ®ion->checksum); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, ®ion->kind); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, ®ion->instance); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(cl1 - contentLength(buffer) == sizeof(*region), + "%zu bytes decoded, of %zu expected", + cl1 - contentLength(buffer), sizeof(*region)); + if (result != UDS_SUCCESS) { + return UDS_CORRUPT_COMPONENT; + } + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int loadRegionTable(BufferedReader *reader, RegionTable **tablePtr) +{ + Buffer *buffer; + int result = makeBuffer(sizeof(RegionHeader), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logErrorWithStringError(result, "cannot read region table header"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + RegionHeader header; + result = decodeRegionHeader(buffer, &header); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + if (header.magic != REGION_MAGIC) { + return UDS_NO_INDEX; + } + if (header.version != 1) { + return logErrorWithStringError(UDS_UNSUPPORTED_VERSION, + "unknown region table version %" PRIu16, + header.version); + } + + RegionTable *table; + result = ALLOCATE_EXTENDED(RegionTable, header.numRegions, LayoutRegion, + "single file layout region table", &table); + if (result != UDS_SUCCESS) { + return result; + } + + table->header = header; + result = makeBuffer(header.numRegions * sizeof(LayoutRegion), &buffer); + if (result != UDS_SUCCESS) { + FREE(table); + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + FREE(table); + freeBuffer(&buffer); + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "cannot read region table layouts"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if 
(result != UDS_SUCCESS) { + FREE(table); + freeBuffer(&buffer); + return result; + } + unsigned int i; + for (i = 0; i < header.numRegions; i++){ + result = decodeLayoutRegion(buffer, &table->regions[i]); + if (result != UDS_SUCCESS) { + FREE(table); + freeBuffer(&buffer); + return result; + } + } + freeBuffer(&buffer); + *tablePtr = table; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int decodeSuperBlockData(Buffer *buffer, SuperBlockData *super) +{ + int result = getBytesFromBuffer(buffer, 32, super->magicLabel); + if (result != UDS_SUCCESS) { + return result; + } + result = getBytesFromBuffer(buffer, 32, super->nonceInfo); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &super->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &super->version); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &super->blockSize); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &super->numIndexes); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &super->maxSaves); + if (result != UDS_SUCCESS) { + return result; + } + result = skipForward(buffer, 4); // aligment + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &super->openChapterBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &super->pageMapBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer), sizeof(*super)); + if (result != UDS_SUCCESS) { + return UDS_CORRUPT_COMPONENT; + } + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int readSuperBlockData(BufferedReader *reader, + SuperBlockData *super, + size_t savedSize) +{ + if (savedSize != sizeof(SuperBlockData)) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected super block data size %zu", + savedSize); + } + + if (sizeof(super->magicLabel) != SINGLE_FILE_MAGIC_1_LENGTH) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "super block magic label size incorrect"); + } + + Buffer *buffer; + int result = makeBuffer(savedSize, &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logErrorWithStringError(result, "cannot read region table header"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = decodeSuperBlockData(buffer, super); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot read super block data"); + } + + if (memcmp(super->magicLabel, SINGLE_FILE_MAGIC_1, + SINGLE_FILE_MAGIC_1_LENGTH) != 0) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unknown superblock magic label"); + } + + if ((super->version < SUPER_VERSION_MINIMUM) + || (super->version > SUPER_VERSION_MAXIMUM)) { + return logErrorWithStringError(UDS_UNSUPPORTED_VERSION, + "unknown superblock version number %" + PRIu32, + super->version); + } + + // 
We dropped the usage of multiple subindices before we ever ran UDS code in + // the kernel. We do not have code that will handle multiple subindices. + if (super->numIndexes != 1) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "invalid subindex count %" PRIu32, + super->numIndexes); + } + + if (generateMasterNonce(super->nonceInfo, sizeof(super->nonceInfo)) != + super->nonce) + { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "inconsistent superblock nonce"); + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int allocateSingleFileParts(IndexLayout *layout, + SuperBlockData *super) +{ + int result = ALLOCATE(super->maxSaves, IndexSaveLayout, __func__, + &layout->index.saves); + if (result != UDS_SUCCESS) { + return result; + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int loadSuperBlock(IndexLayout *layout, + size_t blockSize, + uint64_t firstBlock, + BufferedReader *reader) +{ + RegionTable *table = NULL; + int result = loadRegionTable(reader, &table); + if (result != UDS_SUCCESS) { + return result; + } + + if (table->header.type != RH_TYPE_SUPER) { + FREE(table); + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "not a superblock region table"); + } + + SuperBlockData superBlockData; + result = readSuperBlockData(reader, &superBlockData, table->header.payload); + if (result != UDS_SUCCESS) { + FREE(table); + return logErrorWithStringError(result, "unknown superblock format"); + } + + if (superBlockData.blockSize != blockSize) { + FREE(table); + return logErrorWithStringError(UDS_WRONG_INDEX_CONFIG, + "superblock saved blockSize %" PRIu32 + " differs from supplied blockSize %zu", + superBlockData.blockSize, blockSize); + } + initializeIndexVersion(&layout->indexVersion, superBlockData.version); + + result = allocateSingleFileParts(layout, &superBlockData); + if (result != UDS_SUCCESS) { + FREE(table); + return result; + } + + result = reconstituteSingleFileLayout(layout, &superBlockData, table, + firstBlock); + FREE(table); + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int readIndexSaveData(BufferedReader *reader, + IndexSaveData *saveData, + size_t savedSize, + Buffer **bufferPtr) +{ + int result = UDS_SUCCESS; + if (savedSize == 0) { + memset(saveData, 0, sizeof(*saveData)); + } else { + if (savedSize < sizeof(IndexSaveData)) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected index save data size %zu", + savedSize); + } + + Buffer *buffer; + result = makeBuffer(sizeof(*saveData), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logErrorWithStringError(result, "cannot read index save data"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + + result = decodeIndexSaveData(buffer, saveData); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + + savedSize -= sizeof(IndexSaveData); + + if (saveData->version > 1) { + return logErrorWithStringError(UDS_UNSUPPORTED_VERSION, + "unknown index save version number %" + PRIu32, + saveData->version); + } + 
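+ // Whatever remains of the payload after the fixed IndexSaveData fields + // is the saved index state, which must fit in the fixed-size state buffer.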
+ if (savedSize > INDEX_STATE_BUFFER_SIZE) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected index state buffer size %zu", + savedSize); + } + } + + Buffer *buffer = NULL; + + if (saveData->version != 0) { + result = makeBuffer(INDEX_STATE_BUFFER_SIZE, &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + if (savedSize > 0) { + result = readFromBufferedReader(reader, getBufferContents(buffer), + savedSize); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = resetBufferEnd(buffer, savedSize); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + } + } + + *bufferPtr = buffer; + return UDS_SUCCESS; +} + +/*****************************************************************************/ + +typedef struct { + LayoutRegion *nextRegion; + LayoutRegion *lastRegion; + uint64_t nextBlock; + int result; +} RegionIterator; + +/*****************************************************************************/ +__attribute__((format(printf, 2, 3))) +static void iterError(RegionIterator *iter, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + int r = vLogWithStringError(LOG_ERR, UDS_UNEXPECTED_RESULT, fmt, args); + va_end(args); + if (iter->result == UDS_SUCCESS) { + iter->result = r; + } +} + +/** + * Set the next layout region in the layout according to a region table + * iterator, unless the iterator already contains an error + * + * @param expect whether to record an error or return false + * @param lr the layout region field to set + * @param iter the region iterator, which also holds the cumulative + * result + * @param numBlocks if non-zero, the expected number of blocks + * @param kind the expected kind of the region + * @param instance the expected instance number of the region + * + * @return true if we meet expectations, false if we do not + **/ +static bool expectLayout(bool expect, + LayoutRegion *lr, + RegionIterator *iter, + uint64_t numBlocks, + RegionKind kind, + unsigned int instance) +{ + if (iter->result != UDS_SUCCESS) { + return false; + } + + if (iter->nextRegion == iter->lastRegion) { + if (expect) { + iterError(iter, "ran out of layout regions in region table"); + } + return false; + } + + if (iter->nextRegion->startBlock != iter->nextBlock) { + iterError(iter, "layout region not at expected offset"); + return false; + } + + if (iter->nextRegion->kind != kind) { + if (expect) { + iterError(iter, "layout region has incorrect kind"); + } + return false; + } + + if (iter->nextRegion->instance != instance) { + iterError(iter, "layout region has incorrect instance"); + return false; + } + + if (numBlocks > 0 && iter->nextRegion->numBlocks != numBlocks) { + iterError(iter, "layout region size is incorrect"); + return false; + } + + if (lr != NULL) { + *lr = *iter->nextRegion; + } + + iter->nextBlock += iter->nextRegion->numBlocks; + iter->nextRegion++; + return true; +} + +/*****************************************************************************/ +static void setupLayout(LayoutRegion *lr, + uint64_t *nextAddrPtr, + uint64_t regionSize, + unsigned int kind, + unsigned int instance) +{ + *lr = (LayoutRegion) { + .startBlock = *nextAddrPtr, + .numBlocks = regionSize, + .checksum = 0, + .kind = kind, + .instance = instance, + }; + *nextAddrPtr += regionSize; +} + +/*****************************************************************************/ +static void populateIndexSaveLayout(IndexSaveLayout *isl, + SuperBlockData *super, + unsigned int numZones, + IndexSaveType saveType) +{ + 
uint64_t nextBlock = isl->indexSave.startBlock; + + setupLayout(&isl->header, &nextBlock, 1, RL_KIND_HEADER, RL_SOLE_INSTANCE); + setupLayout(&isl->indexPageMap, &nextBlock, super->pageMapBlocks, + RL_KIND_INDEX_PAGE_MAP, RL_SOLE_INSTANCE); + + uint64_t blocksAvail = (isl->indexSave.numBlocks - + (nextBlock - isl->indexSave.startBlock) - + super->openChapterBlocks); + + if (numZones > 0) { + uint64_t miBlockCount = blocksAvail / numZones; + unsigned int z; + for (z = 0; z < numZones; ++z) { + LayoutRegion *miz = &isl->masterIndexZones[z]; + setupLayout(miz, &nextBlock, miBlockCount, RL_KIND_MASTER_INDEX, z); + } + } + if (saveType == IS_SAVE && isl->openChapter != NULL) { + setupLayout(isl->openChapter, &nextBlock, super->openChapterBlocks, + RL_KIND_OPEN_CHAPTER, RL_SOLE_INSTANCE); + } + setupLayout(&isl->freeSpace, &nextBlock, + (isl->indexSave.numBlocks - + (nextBlock - isl->indexSave.startBlock)), + RL_KIND_SCRATCH, RL_SOLE_INSTANCE); +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int reconstructIndexSave(IndexSaveLayout *isl, + IndexSaveData *saveData, + SuperBlockData *super, + RegionTable *table) +{ + isl->numZones = 0; + isl->saveData = *saveData; + isl->read = false; + isl->written = false; + + if (table->header.type == RH_TYPE_SAVE) { + isl->saveType = IS_SAVE; + } else if (table->header.type == RH_TYPE_CHECKPOINT) { + isl->saveType = IS_CHECKPOINT; + } else { + isl->saveType = NO_SAVE; + } + + if ((table->header.numRegions == 0) || + ((table->header.numRegions == 1) && + (table->regions[0].kind == RL_KIND_SCRATCH))) + { + populateIndexSaveLayout(isl, super, 0, NO_SAVE); + return UDS_SUCCESS; + } + + RegionIterator iter = { + .nextRegion = table->regions, + .lastRegion = table->regions + table->header.numRegions, + .nextBlock = isl->indexSave.startBlock, + .result = UDS_SUCCESS, + }; + + expectLayout(true, &isl->header, &iter, 1, RL_KIND_HEADER, RL_SOLE_INSTANCE); + expectLayout(true, &isl->indexPageMap, &iter, 0, + RL_KIND_INDEX_PAGE_MAP, RL_SOLE_INSTANCE); + unsigned int n = 0; + RegionIterator tmpIter; + for (tmpIter = iter; + expectLayout(false, NULL, &tmpIter, 0, RL_KIND_MASTER_INDEX, n); + ++n) + ; + isl->numZones = n; + + int result = UDS_SUCCESS; + + if (isl->numZones > 0) { + result = ALLOCATE(n, LayoutRegion, "master index layout regions", + &isl->masterIndexZones); + if (result != UDS_SUCCESS) { + return result; + } + } + + if (isl->saveType == IS_SAVE) { + result = ALLOCATE(1, LayoutRegion, "open chapter layout region", + &isl->openChapter); + if (result != UDS_SUCCESS) { + FREE(isl->masterIndexZones); + return result; + } + } + + unsigned int z; + for (z = 0; z < isl->numZones; ++z) { + expectLayout(true, &isl->masterIndexZones[z], &iter, 0, + RL_KIND_MASTER_INDEX, z); + } + if (isl->saveType == IS_SAVE) { + expectLayout(true, isl->openChapter, &iter, 0, + RL_KIND_OPEN_CHAPTER, RL_SOLE_INSTANCE); + } + if (!expectLayout(false, &isl->freeSpace, &iter, 0, + RL_KIND_SCRATCH, RL_SOLE_INSTANCE)) + { + isl->freeSpace = (LayoutRegion) { + .startBlock = iter.nextBlock, + .numBlocks = (isl->indexSave.startBlock + + isl->indexSave.numBlocks) - iter.nextBlock, + .checksum = 0, + .kind = RL_KIND_SCRATCH, + .instance = RL_SOLE_INSTANCE, + }; + iter.nextBlock = isl->freeSpace.startBlock + isl->freeSpace.numBlocks; + } + + if (iter.result != UDS_SUCCESS) { + return iter.result; + } + if (iter.nextRegion != iter.lastRegion) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "expected %ld 
additional regions", + iter.lastRegion - iter.nextRegion); + } + if (iter.nextBlock != isl->indexSave.startBlock + isl->indexSave.numBlocks) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "index save layout table incomplete"); + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int loadIndexSave(IndexSaveLayout *isl, + SuperBlockData *super, + BufferedReader *reader, + unsigned int saveId) +{ + RegionTable *table = NULL; + int result = loadRegionTable(reader, &table); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot read index 0 save %u header", + saveId); + } + + if (table->header.regionBlocks != isl->indexSave.numBlocks) { + uint64_t regionBlocks = table->header.regionBlocks; + FREE(table); + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected index 0 save %u " + "region block count %llu", + saveId, regionBlocks); + } + + if (table->header.type != RH_TYPE_SAVE && + table->header.type != RH_TYPE_CHECKPOINT && + table->header.type != RH_TYPE_UNSAVED) + { + unsigned int type = table->header.type; + FREE(table); + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, "unexpected" + " index 0 save %u header type %u", + saveId, type); + } + + IndexSaveData indexSaveData; + result = readIndexSaveData(reader, &indexSaveData, table->header.payload, + &isl->indexStateBuffer); + if (result != UDS_SUCCESS) { + FREE(table); + return logErrorWithStringError(result, + "unknown index 0 save %u data format", + saveId); + } + + result = reconstructIndexSave(isl, &indexSaveData, super, table); + FREE(table); + + if (result != UDS_SUCCESS) { + freeBuffer(&isl->indexStateBuffer); + return logErrorWithStringError(result, + "cannot reconstruct index 0 save %u", + saveId); + } + isl->read = true; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int loadSubIndexRegions(IndexLayout *layout) +{ + unsigned int j; + for (j = 0; j < layout->super.maxSaves; ++j) { + IndexSaveLayout *isl = &layout->index.saves[j]; + + BufferedReader *reader; + int result = openLayoutReader(layout, &isl->indexSave, &reader); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "cannot get reader for index 0 save %u", + j); + while (j-- > 0) { + IndexSaveLayout *isl = &layout->index.saves[j]; + FREE(isl->masterIndexZones); + FREE(isl->openChapter); + freeBuffer(&isl->indexStateBuffer); + } + return result; + } + + result = loadIndexSave(isl, &layout->super, reader, j); + freeBufferedReader(reader); + if (result != UDS_SUCCESS) { + while (j-- > 0) { + IndexSaveLayout *isl = &layout->index.saves[j]; + FREE(isl->masterIndexZones); + FREE(isl->openChapter); + freeBuffer(&isl->indexStateBuffer); + } + return result; + } + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static int loadIndexLayout(IndexLayout *layout) +{ + BufferedReader *reader; + int result = openBufferedReader(layout->factory, layout->offset, + UDS_BLOCK_SIZE, &reader); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "unable to read superblock"); + } + + result = loadSuperBlock(layout, UDS_BLOCK_SIZE, + layout->offset / UDS_BLOCK_SIZE, reader); + freeBufferedReader(reader); + if (result != UDS_SUCCESS) { + FREE(layout->index.saves); + layout->index.saves = NULL; + return result; + } + + result = 
loadSubIndexRegions(layout); + if (result != UDS_SUCCESS) { + FREE(layout->index.saves); + layout->index.saves = NULL; + return result; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static void generateSuperBlockData(size_t blockSize, + unsigned int maxSaves, + uint64_t openChapterBlocks, + uint64_t pageMapBlocks, + SuperBlockData *super) +{ + memset(super, 0, sizeof(*super)); + memcpy(super->magicLabel, SINGLE_FILE_MAGIC_1, SINGLE_FILE_MAGIC_1_LENGTH); + createUniqueNonceData(super->nonceInfo, sizeof(super->nonceInfo)); + + super->nonce = generateMasterNonce(super->nonceInfo, + sizeof(super->nonceInfo)); + super->version = SUPER_VERSION_CURRENT; + super->blockSize = blockSize; + super->numIndexes = 1; + super->maxSaves = maxSaves; + super->openChapterBlocks = openChapterBlocks; + super->pageMapBlocks = pageMapBlocks; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int resetIndexSaveLayout(IndexSaveLayout *isl, + uint64_t *nextBlockPtr, + uint64_t saveBlocks, + uint64_t pageMapBlocks, + unsigned int instance) +{ + uint64_t startBlock = *nextBlockPtr; + + if (isl->masterIndexZones) { + FREE(isl->masterIndexZones); + } + if (isl->openChapter) { + FREE(isl->openChapter); + } + if (isl->indexStateBuffer) { + freeBuffer(&isl->indexStateBuffer); + } + memset(isl, 0, sizeof(*isl)); + isl->saveType = NO_SAVE; + setupLayout(&isl->indexSave, &startBlock, saveBlocks, RL_KIND_SAVE, + instance); + setupLayout(&isl->header, nextBlockPtr, 1, RL_KIND_HEADER, + RL_SOLE_INSTANCE); + setupLayout(&isl->indexPageMap, nextBlockPtr, pageMapBlocks, + RL_KIND_INDEX_PAGE_MAP, RL_SOLE_INSTANCE); + uint64_t remaining = startBlock - *nextBlockPtr; + setupLayout(&isl->freeSpace, nextBlockPtr, remaining, RL_KIND_SCRATCH, + RL_SOLE_INSTANCE); + // number of zones is a save-time parameter + // presence of open chapter is a save-time parameter + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static void defineSubIndexNonce(SubIndexLayout *sil, + uint64_t masterNonce, + unsigned int indexId) +{ + struct subIndexNonceData { + uint64_t offset; + uint16_t indexId; + }; + byte buffer[sizeof(struct subIndexNonceData)] = { 0 }; + size_t offset = 0; + encodeUInt64LE(buffer, &offset, sil->subIndex.startBlock); + encodeUInt16LE(buffer, &offset, indexId); + sil->nonce = generateSecondaryNonce(masterNonce, buffer, sizeof(buffer)); + if (sil->nonce == 0) { + sil->nonce = generateSecondaryNonce(~masterNonce + 1, + buffer, sizeof(buffer)); + } +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int setupSubIndex(SubIndexLayout *sil, + uint64_t *nextBlockPtr, + SaveLayoutSizes *sls, + unsigned int instance, + uint64_t masterNonce) +{ + uint64_t startBlock = *nextBlockPtr; + + setupLayout(&sil->subIndex, &startBlock, sls->subIndexBlocks, + RL_KIND_INDEX, instance); + setupLayout(&sil->volume, nextBlockPtr, sls->volumeBlocks, + RL_KIND_VOLUME, RL_SOLE_INSTANCE); + unsigned int i; + for (i = 0; i < sls->numSaves; ++i) { + int result = resetIndexSaveLayout(&sil->saves[i], nextBlockPtr, + sls->saveBlocks, sls->pageMapBlocks, i); + if (result != UDS_SUCCESS) { + return result; + } + } + + if (startBlock != *nextBlockPtr) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "sub index layout regions don't agree"); + } + + defineSubIndexNonce(sil, 
masterNonce, instance); + return UDS_SUCCESS; +} + +/*****************************************************************************/ +/** + * Initialize a single file layout using the save layout sizes specified. + * + * @param layout the layout to initialize + * @param offset the offset in bytes from the start of the backing storage + * @param size the size in bytes of the backing storage + * @param sls a populated SaveLayoutSizes object + * + * @return UDS_SUCCESS or an error code, potentially + * UDS_INSUFFICIENT_INDEX_SPACE if the size of the backing store + * is not sufficient for the index configuration, + * UDS_BAD_INDEX_ALIGNMENT if the offset specified does not + * align properly with the index block and page sizes] + * various other errors + **/ +__attribute__((warn_unused_result)) +static int initSingleFileLayout(IndexLayout *layout, + uint64_t offset, + uint64_t size, + SaveLayoutSizes *sls) +{ + layout->totalBlocks = sls->totalBlocks; + + if (size < sls->totalBlocks * sls->blockSize) { + return logErrorWithStringError(UDS_INSUFFICIENT_INDEX_SPACE, + "not enough space for index as configured"); + } + + generateSuperBlockData(sls->blockSize, sls->numSaves, sls->openChapterBlocks, + sls->pageMapBlocks, &layout->super); + initializeIndexVersion(&layout->indexVersion, SUPER_VERSION_CURRENT); + + int result = allocateSingleFileParts(layout, &layout->super); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t nextBlock = offset / sls->blockSize; + + setupLayout(&layout->header, &nextBlock, 1, RL_KIND_HEADER, + RL_SOLE_INSTANCE); + setupLayout(&layout->config, &nextBlock, 1, RL_KIND_CONFIG, + RL_SOLE_INSTANCE); + result = setupSubIndex(&layout->index, &nextBlock, sls, 0, + layout->super.nonce); + if (result != UDS_SUCCESS) { + return result; + } + setupLayout(&layout->seal, &nextBlock, 1, RL_KIND_SEAL, RL_SOLE_INSTANCE); + if (nextBlock * sls->blockSize > offset + size) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "layout does not fit as expected"); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static void expectSubIndex(SubIndexLayout *sil, + RegionIterator *iter, + SuperBlockData *super, + unsigned int instance) +{ + if (iter->result != UDS_SUCCESS) { + return; + } + + uint64_t startBlock = iter->nextBlock; + + expectLayout(true, &sil->subIndex, iter, 0, RL_KIND_INDEX, instance); + + uint64_t endBlock = iter->nextBlock; + iter->nextBlock = startBlock; + + expectLayout(true, &sil->volume, iter, 0, RL_KIND_VOLUME, RL_SOLE_INSTANCE); + + unsigned int i; + for (i = 0; i < super->maxSaves; ++i) { + IndexSaveLayout *isl = &sil->saves[i]; + expectLayout(true, &isl->indexSave, iter, 0, RL_KIND_SAVE, i); + } + + if (iter->nextBlock != endBlock) { + iterError(iter, "sub index region does not span all saves"); + } + + defineSubIndexNonce(sil, super->nonce, instance); +} + +/*****************************************************************************/ + +/** + * Initialize a single file layout from the region table and super block data + * stored in stable storage. 
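+ * Each region's offset, kind, and instance are checked against the table + * as the layout is rebuilt.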
+ * + * @param layout the layout to initialize + * @param region the IO region for this layout + * @param super the super block data read from the superblock + * @param table the region table read from the superblock + * @param firstBlock the first block number in the region + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int reconstituteSingleFileLayout(IndexLayout *layout, + SuperBlockData *super, + RegionTable *table, + uint64_t firstBlock) +{ + layout->super = *super; + layout->totalBlocks = table->header.regionBlocks; + + RegionIterator iter = { + .nextRegion = table->regions, + .lastRegion = table->regions + table->header.numRegions, + .nextBlock = firstBlock, + .result = UDS_SUCCESS + }; + + expectLayout(true, &layout->header, &iter, 1, RL_KIND_HEADER, + RL_SOLE_INSTANCE); + expectLayout(true, &layout->config, &iter, 1, RL_KIND_CONFIG, + RL_SOLE_INSTANCE); + expectSubIndex(&layout->index, &iter, &layout->super, 0); + expectLayout(true, &layout->seal, &iter, 1, RL_KIND_SEAL, RL_SOLE_INSTANCE); + + if (iter.result != UDS_SUCCESS) { + return iter.result; + } + + if (iter.nextBlock != firstBlock + layout->totalBlocks) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "layout table does not span total blocks"); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int saveSubIndexRegions(IndexLayout *layout) +{ + SubIndexLayout *sil = &layout->index; + unsigned int j; + for (j = 0; j < layout->super.maxSaves; ++j) { + IndexSaveLayout *isl = &sil->saves[j]; + int result = writeIndexSaveLayout(layout, isl); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "unable to format index %u save 0 layout", + j); + } + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int makeSingleFileRegionTable(IndexLayout *layout, + unsigned int *numRegionsPtr, + RegionTable **tablePtr) +{ + unsigned int numRegions = + 1 + // header + 1 + // config + 1 + // index + 1 + // volume + layout->super.maxSaves + // saves + 1; // seal + + RegionTable *table; + int result = ALLOCATE_EXTENDED(RegionTable, numRegions, LayoutRegion, + "layout region table", &table); + if (result != UDS_SUCCESS) { + return result; + } + + LayoutRegion *lr = &table->regions[0]; + *lr++ = layout->header; + *lr++ = layout->config; + SubIndexLayout *sil = &layout->index; + *lr++ = sil->subIndex; + *lr++ = sil->volume; + unsigned int j; + for (j = 0; j < layout->super.maxSaves; ++j) { + *lr++ = sil->saves[j].indexSave; + } + *lr++ = layout->seal; + + result = ASSERT((lr == &table->regions[numRegions]), + "incorrect number of regions"); + if (result != UDS_SUCCESS) { + return result; + } + + *numRegionsPtr = numRegions; + *tablePtr = table; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int encodeIndexSaveData(Buffer *buffer, IndexSaveData *saveData) +{ + int result = putUInt64LEIntoBuffer(buffer, saveData->timestamp); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, saveData->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, saveData->version); + if (result != UDS_SUCCESS) { + return result; + } + result = zeroBytes(buffer, 4); /* padding */ + if (result != 
UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof *saveData, + "%zu bytes encoded of %zu expected", + contentLength(buffer), sizeof(*saveData)); + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int encodeRegionHeader(Buffer *buffer, RegionHeader *header) +{ + size_t startingLength = contentLength(buffer); + int result = putUInt64LEIntoBuffer(buffer, REGION_MAGIC); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->regionBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, header->type); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, header->version); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, header->numRegions); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, header->payload); + if (result != UDS_SUCCESS) { + return result; + } + result + = ASSERT_LOG_ONLY(contentLength(buffer) - startingLength == sizeof(*header), + "%zu bytes encoded, of %zu expected", + contentLength(buffer) - startingLength, sizeof(*header)); + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int encodeLayoutRegion(Buffer *buffer, LayoutRegion *region) +{ + size_t startingLength = contentLength(buffer); + int result = putUInt64LEIntoBuffer(buffer, region->startBlock); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, region->numBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, region->checksum); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, region->kind); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, region->instance); + if (result != UDS_SUCCESS) { + return result; + } + result + = ASSERT_LOG_ONLY(contentLength(buffer) - startingLength == sizeof(*region), + "%zu bytes encoded, of %zu expected", + contentLength(buffer) - startingLength, sizeof(*region)); + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int encodeSuperBlockData(Buffer *buffer, SuperBlockData *super) +{ + int result = putBytes(buffer, 32, &super->magicLabel); + if (result != UDS_SUCCESS) { + return result; + } + result = putBytes(buffer, 32, &super->nonceInfo); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, super->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, super->version); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, super->blockSize); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, super->numIndexes); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, super->maxSaves); + if (result != UDS_SUCCESS) { + return result; + } + result = zeroBytes(buffer, 4); // aligment + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, super->openChapterBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = 
putUInt64LEIntoBuffer(buffer, super->pageMapBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(SuperBlockData), + "%zu bytes encoded, of %zu expected", + contentLength(buffer), sizeof(SuperBlockData)); + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int writeSingleFileHeader(IndexLayout *layout, + RegionTable *table, + unsigned int numRegions, + BufferedWriter *writer) +{ + table->header = (RegionHeader) { + .magic = REGION_MAGIC, + .regionBlocks = layout->totalBlocks, + .type = RH_TYPE_SUPER, + .version = 1, + .numRegions = numRegions, + .payload = sizeof(layout->super), + }; + + size_t tableSize = sizeof(RegionTable) + numRegions * sizeof(LayoutRegion); + + Buffer *buffer; + int result = makeBuffer(tableSize, &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodeRegionHeader(buffer, &table->header); + + unsigned int i; + for (i = 0; i < numRegions; i++) { + if (result == UDS_SUCCESS) { + result = encodeLayoutRegion(buffer, &table->regions[i]); + } + } + + if (result == UDS_SUCCESS) { + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + } + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeBuffer(sizeof(layout->super), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodeSuperBlockData(buffer, &layout->super); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + return flushBufferedWriter(writer); +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int saveSingleFileConfiguration(IndexLayout *layout) +{ + int result = saveSubIndexRegions(layout); + if (result != UDS_SUCCESS) { + return result; + } + + RegionTable *table; + unsigned int numRegions; + result = makeSingleFileRegionTable(layout, &numRegions, &table); + if (result != UDS_SUCCESS) { + return result; + } + + BufferedWriter *writer = NULL; + result = openLayoutWriter(layout, &layout->header, &writer); + if (result != UDS_SUCCESS) { + FREE(table); + return result; + } + + result = writeSingleFileHeader(layout, table, numRegions, writer); + FREE(table); + freeBufferedWriter(writer); + + return result; +} + +/*****************************************************************************/ +void putIndexLayout(IndexLayout **layoutPtr) +{ + if (layoutPtr == NULL) { + return; + } + IndexLayout *layout = *layoutPtr; + *layoutPtr = NULL; + if ((layout == NULL) || (--layout->refCount > 0)) { + return; + } + + SubIndexLayout *sil = &layout->index; + if (sil->saves != NULL) { + unsigned int j; + for (j = 0; j < layout->super.maxSaves; ++j) { + IndexSaveLayout *isl = &sil->saves[j]; + FREE(isl->masterIndexZones); + FREE(isl->openChapter); + freeBuffer(&isl->indexStateBuffer); + } + } + FREE(sil->saves); + + if (layout->factory != NULL) { + putIOFactory(layout->factory); + } + FREE(layout); +} + +/*****************************************************************************/ +void getIndexLayout(IndexLayout *layout, IndexLayout **layoutPtr) +{ + ++layout->refCount; + *layoutPtr = layout; +} + +/*****************************************************************************/ 
+const struct index_version *getIndexVersion(IndexLayout *layout) +{ + return &layout->indexVersion; +} + +/*****************************************************************************/ +int writeIndexConfig(IndexLayout *layout, UdsConfiguration config) +{ + BufferedWriter *writer = NULL; + int result = openLayoutWriter(layout, &layout->config, &writer); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "failed to open config region"); + } + + result = writeConfigContents(writer, config); + if (result != UDS_SUCCESS) { + freeBufferedWriter(writer); + return logErrorWithStringError(result, "failed to write config region"); + } + result = flushBufferedWriter(writer); + if (result != UDS_SUCCESS) { + freeBufferedWriter(writer); + return logErrorWithStringError(result, "cannot flush config writer"); + } + freeBufferedWriter(writer); + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int verifyIndexConfig(IndexLayout *layout, UdsConfiguration config) +{ + BufferedReader *reader = NULL; + int result = openLayoutReader(layout, &layout->config, &reader); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "failed to open config reader"); + } + + struct udsConfiguration storedConfig; + result = readConfigContents(reader, &storedConfig); + if (result != UDS_SUCCESS) { + freeBufferedReader(reader); + return logErrorWithStringError(result, "failed to read config region"); + } + freeBufferedReader(reader); + + return (areUdsConfigurationsEqual(&storedConfig, config) + ? UDS_SUCCESS + : UDS_NO_INDEX); +} + +#ifdef __KERNEL__ +/*****************************************************************************/ +int openVolumeBufio(IndexLayout *layout, + size_t blockSize, + unsigned int reservedBuffers, + struct dm_bufio_client **clientPtr) +{ + off_t offset = layout->index.volume.startBlock * layout->super.blockSize; + return makeBufio(layout->factory, offset, blockSize, reservedBuffers, + clientPtr); +} +#else +/*****************************************************************************/ +int openVolumeRegion(IndexLayout *layout, IORegion **regionPtr) +{ + LayoutRegion *lr = &layout->index.volume; + off_t start = lr->startBlock * layout->super.blockSize; + size_t size = lr->numBlocks * layout->super.blockSize; + int result = makeIORegion(layout->factory, start, size, regionPtr); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot access index volume region"); + } + return UDS_SUCCESS; +} +#endif + +/*****************************************************************************/ +uint64_t getVolumeNonce(IndexLayout *layout) +{ + return layout->index.nonce; +} + +/*****************************************************************************/ +static uint64_t generateIndexSaveNonce(uint64_t volumeNonce, + IndexSaveLayout *isl) +{ + struct SaveNonceData { + IndexSaveData data; + uint64_t offset; + } nonceData; + + nonceData.data = isl->saveData; + nonceData.data.nonce = 0; + nonceData.offset = isl->indexSave.startBlock; + + byte buffer[sizeof(nonceData)]; + size_t offset = 0; + encodeUInt64LE(buffer, &offset, nonceData.data.timestamp); + encodeUInt64LE(buffer, &offset, nonceData.data.nonce); + encodeUInt32LE(buffer, &offset, nonceData.data.version); + encodeUInt32LE(buffer, &offset, 0U); // padding + encodeUInt64LE(buffer, &offset, nonceData.offset); + ASSERT_LOG_ONLY(offset == sizeof(nonceData), + "%zu bytes encoded of %zu expected", + offset, sizeof(nonceData)); + return 
generateSecondaryNonce(volumeNonce, buffer, sizeof(buffer)); +} + +/*****************************************************************************/ +static int validateIndexSaveLayout(IndexSaveLayout *isl, + uint64_t volumeNonce, + uint64_t *saveTimePtr) +{ + if (isl->saveType == NO_SAVE || isl->numZones == 0 || + isl->saveData.timestamp == 0) + { + return UDS_BAD_STATE; + } + if (isl->saveData.nonce != generateIndexSaveNonce(volumeNonce, isl)) { + return UDS_BAD_STATE; + } + if (saveTimePtr != NULL) { + *saveTimePtr = isl->saveData.timestamp; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int selectOldestIndexSaveLayout(SubIndexLayout *sil, + unsigned int maxSaves, + IndexSaveLayout **islPtr) +{ + IndexSaveLayout *oldest = NULL; + uint64_t oldestTime = 0; + + // find the oldest valid or first invalid slot + IndexSaveLayout *isl; + for (isl = sil->saves; isl < sil->saves + maxSaves; ++isl) { + uint64_t saveTime = 0; + int result = validateIndexSaveLayout(isl, sil->nonce, &saveTime); + if (result != UDS_SUCCESS) { + saveTime = 0; + } + if (oldest == NULL || saveTime < oldestTime) { + oldest = isl; + oldestTime = saveTime; + } + } + + int result = ASSERT((oldest != NULL), "no oldest or free save slot"); + if (result != UDS_SUCCESS) { + return result; + } + *islPtr = oldest; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int selectLatestIndexSaveLayout(SubIndexLayout *sil, + unsigned int maxSaves, + IndexSaveLayout **islPtr) +{ + IndexSaveLayout *latest = NULL; + uint64_t latestTime = 0; + + // find the latest valid save slot + IndexSaveLayout *isl; + for (isl = sil->saves; isl < sil->saves + maxSaves; ++isl) { + uint64_t saveTime = 0; + int result = validateIndexSaveLayout(isl, sil->nonce, &saveTime); + if (result != UDS_SUCCESS) { + continue; + } + if (saveTime > latestTime) { + latest = isl; + latestTime = saveTime; + } + } + + if (latest == NULL) { + return UDS_INDEX_NOT_SAVED_CLEANLY; + } + *islPtr = latest; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static uint64_t getTimeMS(AbsTime time) +{ + time_t t = asTimeT(time); + RelTime r = timeDifference(time, fromTimeT(t)); + return (uint64_t) t * 1000 + relTimeToMilliseconds(r); +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int instantiateIndexSaveLayout(IndexSaveLayout *isl, + SuperBlockData *super, + uint64_t volumeNonce, + unsigned int numZones, + IndexSaveType saveType) +{ + int result = UDS_SUCCESS; + if (isl->openChapter && saveType == IS_CHECKPOINT) { + FREE(isl->openChapter); + isl->openChapter = NULL; + } else if (isl->openChapter == NULL && saveType == IS_SAVE) { + result = ALLOCATE(1, LayoutRegion, "open chapter layout", + &isl->openChapter); + if (result != UDS_SUCCESS) { + return result; + } + } + if (numZones != isl->numZones) { + if (isl->masterIndexZones != NULL) { + FREE(isl->masterIndexZones); + } + result = ALLOCATE(numZones, LayoutRegion, "master index zone layouts", + &isl->masterIndexZones); + if (result != UDS_SUCCESS) { + return result; + } + isl->numZones = numZones; + } + + populateIndexSaveLayout(isl, super, numZones, saveType); + + result = makeBuffer(INDEX_STATE_BUFFER_SIZE, &isl->indexStateBuffer); + if (result != UDS_SUCCESS) { + return result; + } + + 
isl->read = isl->written = false; + isl->saveType = saveType; + memset(&isl->saveData, 0, sizeof(isl->saveData)); + isl->saveData.timestamp = getTimeMS(currentTime(CLOCK_REALTIME)); + isl->saveData.version = 1; + + isl->saveData.nonce = generateIndexSaveNonce(volumeNonce, isl); + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int invalidateOldSave(IndexLayout *layout, IndexSaveLayout *isl) +{ + uint64_t startBlock = isl->indexSave.startBlock; + uint64_t saveBlocks = isl->indexSave.numBlocks; + unsigned int save = isl->indexSave.instance; + + int result = resetIndexSaveLayout(isl, &startBlock, saveBlocks, + layout->super.pageMapBlocks, save); + if (result != UDS_SUCCESS) { + return result; + } + + return writeIndexSaveLayout(layout, isl); +} + +/*****************************************************************************/ +int setupIndexSaveSlot(IndexLayout *layout, + unsigned int numZones, + IndexSaveType saveType, + unsigned int *saveSlotPtr) +{ + SubIndexLayout *sil = &layout->index; + + IndexSaveLayout *isl = NULL; + int result = selectOldestIndexSaveLayout(sil, layout->super.maxSaves, &isl); + if (result != UDS_SUCCESS) { + return result; + } + + result = invalidateOldSave(layout, isl); + if (result != UDS_SUCCESS) { + return result; + } + + result = instantiateIndexSaveLayout(isl, &layout->super, sil->nonce, + numZones, saveType); + if (result != UDS_SUCCESS) { + return result; + } + + *saveSlotPtr = isl - sil->saves; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int findLatestIndexSaveSlot(IndexLayout *layout, + unsigned int *numZonesPtr, + unsigned int *slotPtr) +{ + SubIndexLayout *sil = &layout->index; + + IndexSaveLayout *isl = NULL; + int result = selectLatestIndexSaveLayout(sil, layout->super.maxSaves, &isl); + if (result != UDS_SUCCESS) { + return result; + } + + if (numZonesPtr != NULL) { + *numZonesPtr = isl->numZones; + } + if (slotPtr != NULL) { + *slotPtr = isl - sil->saves; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int makeIndexSaveRegionTable(IndexSaveLayout *isl, + unsigned int *numRegionsPtr, + RegionTable **tablePtr) +{ + unsigned int numRegions = + 1 + // header + 1 + // index page map + isl->numZones + // master index zones + (bool) isl->openChapter; // open chapter if needed + + if (isl->freeSpace.numBlocks > 0) { + numRegions++; + } + + RegionTable *table; + int result = ALLOCATE_EXTENDED(RegionTable, numRegions, LayoutRegion, + "layout region table for ISL", &table); + if (result != UDS_SUCCESS) { + return result; + } + + LayoutRegion *lr = &table->regions[0]; + *lr++ = isl->header; + *lr++ = isl->indexPageMap; + unsigned int z; + for (z = 0; z < isl->numZones; ++z) { + *lr++ = isl->masterIndexZones[z]; + } + if (isl->openChapter) { + *lr++ = *isl->openChapter; + } + if (isl->freeSpace.numBlocks > 0) { + *lr++ = isl->freeSpace; + } + + result = ASSERT((lr == &table->regions[numRegions]), + "incorrect number of ISL regions"); + if (result != UDS_SUCCESS) { + return result; + } + + *numRegionsPtr = numRegions; + *tablePtr = table; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static unsigned int regionTypeForSaveType(IndexSaveType saveType) +{ + switch (saveType) { + case IS_SAVE: + return RH_TYPE_SAVE; + + case 
IS_CHECKPOINT: + return RH_TYPE_CHECKPOINT; + + default: + break; + } + + return RH_TYPE_UNSAVED; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int writeIndexSaveHeader(IndexSaveLayout *isl, + RegionTable *table, + unsigned int numRegions, + BufferedWriter *writer) +{ + size_t payload = sizeof(isl->saveData); + if (isl->indexStateBuffer != NULL) { + payload += contentLength(isl->indexStateBuffer); + } + + table->header = (RegionHeader) { + .magic = REGION_MAGIC, + .regionBlocks = isl->indexSave.numBlocks, + .type = regionTypeForSaveType(isl->saveType), + .version = 1, + .numRegions = numRegions, + .payload = payload, + }; + + size_t tableSize = sizeof(RegionTable) + numRegions * sizeof(LayoutRegion); + Buffer *buffer; + int result = makeBuffer(tableSize, &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodeRegionHeader(buffer, &table->header); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + + unsigned int i; + for (i = 0; i < numRegions; i++) { + result = encodeLayoutRegion(buffer, &table->regions[i]); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == tableSize, + "%zu bytes encoded of %zu expected", + contentLength(buffer), tableSize); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeBuffer(sizeof(isl->saveData), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodeIndexSaveData(buffer, &isl->saveData); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + + if (isl->indexStateBuffer != NULL) { + result = writeToBufferedWriter(writer, + getBufferContents(isl->indexStateBuffer), + contentLength(isl->indexStateBuffer)); + if (result != UDS_SUCCESS) { + return result; + } + } + + return flushBufferedWriter(writer); +} + +/*****************************************************************************/ +static int writeIndexSaveLayout(IndexLayout *layout, IndexSaveLayout *isl) +{ + unsigned int numRegions; + RegionTable *table; + int result = makeIndexSaveRegionTable(isl, &numRegions, &table); + if (result != UDS_SUCCESS) { + return result; + } + + BufferedWriter *writer = NULL; + result = openLayoutWriter(layout, &isl->header, &writer); + if (result != UDS_SUCCESS) { + FREE(table); + return result; + } + + result = writeIndexSaveHeader(isl, table, numRegions, writer); + FREE(table); + freeBufferedWriter(writer); + + isl->written = true; + return result; +} + +/*****************************************************************************/ +int commitIndexSave(IndexLayout *layout, unsigned int saveSlot) +{ + int result = ASSERT((saveSlot < layout->super.maxSaves), + "save slot out of range"); + if (result != UDS_SUCCESS) { + return result; + } + + IndexSaveLayout *isl = &layout->index.saves[saveSlot]; + + if (bufferUsed(isl->indexStateBuffer) == 0) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "%s: no index state data saved", __func__); + } + + return writeIndexSaveLayout(layout, isl); +} + 
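Taken together, setupIndexSaveSlot(), the per-region writers, and commitIndexSave() define the lifecycle of a single save slot: the oldest slot is claimed and invalidated, the caller serializes its state, and the slot is either committed or cancelled. The sketch below shows only that slot bookkeeping, assuming the functions declared in indexLayout.h; serializeIndexState() is a hypothetical stand-in for the index-state code that fills the slot's buffer and writes the region data through openIndexBufferedWriter().

#include "indexLayout.h"

/* Hypothetical serializer: fills the slot's state buffer with index state. */
int serializeIndexState(Buffer *stateBuffer);

__attribute__((warn_unused_result))
static int saveToLayout(IndexLayout *layout, unsigned int numZones)
{
  unsigned int slot;
  // Claim the oldest save slot; this also invalidates its previous contents
  // on storage and regenerates the slot's timestamp and nonce.
  int result = setupIndexSaveSlot(layout, numZones, IS_SAVE, &slot);
  if (result != UDS_SUCCESS) {
    return result;
  }

  // commitIndexSave() refuses to run while the slot's index state buffer is
  // empty, so the state must be serialized into it first.
  Buffer *stateBuffer = getIndexStateBuffer(layout, slot);
  result = serializeIndexState(stateBuffer);
  if (result != UDS_SUCCESS) {
    int cancelResult = cancelIndexSave(layout, slot);
    return (cancelResult == UDS_SUCCESS) ? result : cancelResult;
  }

  // Writes the region table and IndexSaveData header, making the save
  // discoverable by findLatestIndexSaveSlot() on the next load.
  return commitIndexSave(layout, slot);
}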
+/*****************************************************************************/ + +static void mutilateIndexSaveInfo(IndexSaveLayout *isl) +{ + memset(&isl->saveData, 0, sizeof(isl->saveData)); + isl->read = isl->written = 0; + isl->saveType = NO_SAVE; + isl->numZones = 0; + freeBuffer(&isl->indexStateBuffer); +} + +/*****************************************************************************/ +int cancelIndexSave(IndexLayout *layout, unsigned int saveSlot) +{ + int result = ASSERT((saveSlot < layout->super.maxSaves), + "save slot out of range"); + if (result != UDS_SUCCESS) { + return result; + } + + mutilateIndexSaveInfo(&layout->index.saves[saveSlot]); + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int discardIndexSaves(IndexLayout *layout, bool all) +{ + int result = UDS_SUCCESS; + SubIndexLayout *sil = &layout->index; + + if (all) { + unsigned int i; + for (i = 0; i < layout->super.maxSaves; ++i) { + IndexSaveLayout *isl = &sil->saves[i]; + result = firstError(result, invalidateOldSave(layout, isl)); + } + } else { + IndexSaveLayout *isl; + result = selectLatestIndexSaveLayout(sil, layout->super.maxSaves, &isl); + if (result == UDS_SUCCESS) { + result = invalidateOldSave(layout, isl); + } + } + + return result; +} + +/*****************************************************************************/ +static int createIndexLayout(IndexLayout *layout, + uint64_t size, + const UdsConfiguration config) +{ + if (config == NULL) { + return UDS_CONF_PTR_REQUIRED; + } + + SaveLayoutSizes sizes; + int result = computeSizes(&sizes, config, UDS_BLOCK_SIZE, 0); + if (result != UDS_SUCCESS) { + return result; + } + + if (size < sizes.totalBlocks * sizes.blockSize) { + return logErrorWithStringError(UDS_INSUFFICIENT_INDEX_SPACE, + "layout requires at least %" PRIu64 + " bytes", + sizes.totalBlocks * sizes.blockSize); + } + + result = initSingleFileLayout(layout, layout->offset, size, &sizes); + if (result != UDS_SUCCESS) { + return result; + } + + result = saveSingleFileConfiguration(layout); + if (result != UDS_SUCCESS) { + return result; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +Buffer *getIndexStateBuffer(IndexLayout *layout, unsigned int slot) +{ + return layout->index.saves[slot].indexStateBuffer; +} + +/*****************************************************************************/ +static int findLayoutRegion(IndexLayout *layout, + unsigned int slot, + const char *operation, + RegionKind kind, + unsigned int zone, + LayoutRegion **lrPtr) +{ + int result = ASSERT((slot < layout->super.maxSaves), "%s not started", + operation); + if (result != UDS_SUCCESS) { + return result; + } + + IndexSaveLayout *isl = &layout->index.saves[slot]; + + LayoutRegion *lr = NULL; + switch (kind) { + case RL_KIND_INDEX_PAGE_MAP: + lr = &isl->indexPageMap; + break; + + case RL_KIND_OPEN_CHAPTER: + if (isl->openChapter == NULL) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "%s: %s has no open chapter", + __func__, operation); + } + lr = isl->openChapter; + break; + + case RL_KIND_MASTER_INDEX: + if (isl->masterIndexZones == NULL || zone >= isl->numZones) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "%s: %s has no master index zone %u", + __func__, operation, zone); + } + lr = &isl->masterIndexZones[zone]; + break; + + default: + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "%s: unexpected kind %u", + __func__, kind); + } + + *lrPtr = lr; + return 
UDS_SUCCESS; +} + +/*****************************************************************************/ +int openIndexBufferedReader(IndexLayout *layout, + unsigned int slot, + RegionKind kind, + unsigned int zone, + BufferedReader **readerPtr) +{ + LayoutRegion *lr = NULL; + int result = findLayoutRegion(layout, slot, "load", kind, zone, &lr); + if (result != UDS_SUCCESS) { + return result; + } + return openLayoutReader(layout, lr, readerPtr); +} + +/*****************************************************************************/ +int openIndexBufferedWriter(IndexLayout *layout, + unsigned int slot, + RegionKind kind, + unsigned int zone, + BufferedWriter **writerPtr) +{ + LayoutRegion *lr = NULL; + int result = findLayoutRegion(layout, slot, "save", kind, zone, &lr); + if (result != UDS_SUCCESS) { + return result; + } + return openLayoutWriter(layout, lr, writerPtr); +} + +/*****************************************************************************/ +int makeIndexLayoutFromFactory(IOFactory *factory, + off_t offset, + uint64_t namedSize, + bool newLayout, + const UdsConfiguration config, + IndexLayout **layoutPtr) +{ + // Get the device size and round it down to a multiple of UDS_BLOCK_SIZE. + size_t size = getWritableSize(factory) & -UDS_BLOCK_SIZE; + if (namedSize > size) { + return logErrorWithStringError(UDS_INSUFFICIENT_INDEX_SPACE, + "index storage (%zu) is smaller than the" + " requested size %llu", + size, namedSize); + } + if ((namedSize > 0) && (namedSize < size)) { + size = namedSize; + } + + // Get the index size according the the config + uint64_t configSize; + int result = udsComputeIndexSize(config, 0, &configSize); + if (result != UDS_SUCCESS) { + return result; + } + if (size < configSize) { + return logErrorWithStringError(UDS_INSUFFICIENT_INDEX_SPACE, + "index storage (%zu) is smaller than the" + " required size %llu", + size, configSize); + } + size = configSize; + + IndexLayout *layout = NULL; + result = ALLOCATE(1, IndexLayout, __func__, &layout); + if (result != UDS_SUCCESS) { + return result; + } + layout->refCount = 1; + + getIOFactory(factory); + layout->factory = factory; + layout->offset = offset; + + if (newLayout) { + // Populate the layout from the UDSConfiguration + result = createIndexLayout(layout, size, config); + } else { + // Populate the layout from the saved index. + result = loadIndexLayout(layout); + } + if (result != UDS_SUCCESS) { + putIndexLayout(&layout); + return result; + } + *layoutPtr = layout; + return UDS_SUCCESS; +} diff --git a/uds/indexLayout.h b/uds/indexLayout.h new file mode 100644 index 0000000..4144799 --- /dev/null +++ b/uds/indexLayout.h @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/indexLayout.h#13 $ + */ + +#ifndef INDEX_LAYOUT_H +#define INDEX_LAYOUT_H + +#include "buffer.h" +#include "indexState.h" +#include "indexVersion.h" +#include "ioFactory.h" +#include "uds.h" + +typedef struct indexLayout IndexLayout; + +/** + * Construct an index layout. This is a platform specific function that uses + * the name string, a flag that indicates old vs. new indices, and a + * UDSConfiguration (for new indices) to make an IOFactory and invoke + * makeIndexLayoutFromFactory. + * + * @param name String naming the index. Each platform will use its own + * conventions to interpret the string, but in general it is + * a space-separated sequence of param=value settings. For + * backward compatibility a string without an equals is + * treated as a platform-specific default parameter value. + * @param newLayout Whether this is a new layout. + * @param config The UdsConfiguration required for a new layout. + * @param layoutPtr Where to store the new index layout + * + * @return UDS_SUCCESS or an error code. + **/ +int makeIndexLayout(const char *name, + bool newLayout, + const UdsConfiguration config, + IndexLayout **layoutPtr) + __attribute__((warn_unused_result)); + +/** + * Construct an index layout using an IOFactory. This method is common to all + * platforms. + * + * @param factory The IOFactory for the block storage containing the index. + * @param offset The offset of the start of the index within the block + * storage address space. + * @param namedSize The size in bytes of the space within the block storage + * address space, as specified in the name string. + * @param newLayout Whether this is a new layout. + * @param config The UdsConfiguration required for a new layout. + * @param layoutPtr Where to store the new index layout + * + * @return UDS_SUCCESS or an error code. + **/ +int makeIndexLayoutFromFactory(IOFactory *factory, + off_t offset, + uint64_t namedSize, + bool newLayout, + const UdsConfiguration config, + IndexLayout **layoutPtr) + __attribute__((warn_unused_result)); + +/** + * Decrement the use count of an index layout. If the count goes to zero, free + * the index layout. + * + * @param layoutPtr Where the layout is being stored. Always reset to NULL. + **/ +void putIndexLayout(IndexLayout **layoutPtr); + +/*****************************************************************************/ +int cancelIndexSave(IndexLayout *layout, unsigned int saveSlot) + __attribute__((warn_unused_result)); + +/*****************************************************************************/ +int commitIndexSave(IndexLayout *layout, unsigned int saveSlot) + __attribute__((warn_unused_result)); + +/*****************************************************************************/ +int discardIndexSaves(IndexLayout *layout, bool all) + __attribute__((warn_unused_result)); + +/** + * Find the latest index save slot. + * + * @param [in] layout The single file layout. + * @param [out] numZonesPtr Where to store the actual number of zones + * that were saved. + * @param [out] slotPtr Where to store the slot number we found. + * + * @return UDS_SUCCESS or an error code. + **/ +int findLatestIndexSaveSlot(IndexLayout *layout, + unsigned int *numZonesPtr, + unsigned int *slotPtr) + __attribute__((warn_unused_result)); + +/** + * Get another reference to an index layout, incrementing it's use count. + * + * @param layout The index layout. + * @param layoutPtr Where the new layout pointer is being stored. 
+ **/
+void getIndexLayout(IndexLayout *layout, IndexLayout **layoutPtr);
+
+/**
+ * Open a BufferedReader for a specified state, kind, and zone.
+ *
+ * @param layout     The index layout
+ * @param slot       The save slot
+ * @param kind       The kind of index save region to open.
+ * @param zone       The zone number for the region.
+ * @param readerPtr  Where to store the BufferedReader.
+ *
+ * @return UDS_SUCCESS or an error code.
+ **/
+int openIndexBufferedReader(IndexLayout *layout,
+                            unsigned int slot,
+                            RegionKind kind,
+                            unsigned int zone,
+                            BufferedReader **readerPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Open a BufferedWriter for a specified state, kind, and zone.
+ *
+ * @param layout     The index layout
+ * @param slot       The save slot
+ * @param kind       The kind of index save region to open.
+ * @param zone       The zone number for the region.
+ * @param writerPtr  Where to store the BufferedWriter.
+ *
+ * @return UDS_SUCCESS or an error code.
+ **/
+int openIndexBufferedWriter(IndexLayout *layout,
+                            unsigned int slot,
+                            RegionKind kind,
+                            unsigned int zone,
+                            BufferedWriter **writerPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Obtain the nonce to be used to store or validate the loading of volume index
+ * pages.
+ *
+ * @param [in]  layout   The index layout.
+ *
+ * @return The nonce to use.
+ **/
+uint64_t getVolumeNonce(IndexLayout *layout)
+  __attribute__((warn_unused_result));
+
+#ifdef __KERNEL__
+/**
+ * Obtain a dm_bufio_client for the specified index volume.
+ *
+ * @param [in]  layout           The index layout.
+ * @param [in]  blockSize        The size of a volume page
+ * @param [in]  reservedBuffers  The count of reserved buffers
+ * @param [out] clientPtr        Where to put the new dm_bufio_client
+ *
+ * @return UDS_SUCCESS or an error code.
+ **/
+int openVolumeBufio(IndexLayout *layout,
+                    size_t blockSize,
+                    unsigned int reservedBuffers,
+                    struct dm_bufio_client **clientPtr)
+  __attribute__((warn_unused_result));
+#else
+/**
+ * Obtain an IORegion for the specified index volume.
+ *
+ * @param [in]  layout     The index layout.
+ * @param [out] regionPtr  Where to put the new region.
+ *
+ * @return UDS_SUCCESS or an error code.
+ **/
+int openVolumeRegion(IndexLayout *layout, struct ioRegion **regionPtr)
+  __attribute__((warn_unused_result));
+#endif
+
+/**
+ * Read the index configuration, and verify that it matches the given
+ * configuration.
+ *
+ * @param layout  the generic index layout
+ * @param config  the index configuration
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int verifyIndexConfig(IndexLayout *layout, UdsConfiguration config)
+  __attribute__((warn_unused_result));
+
+/**
+ * Determine which index save slot to use for a new index save.
+ *
+ * Also allocates the masterIndex regions and, if needed, the openChapter
+ * region.
+ *
+ * @param [in]  layout       The index layout.
+ * @param [in]  numZones     Actual number of zones currently in use.
+ * @param [in]  saveType     The index save type.
+ * @param [out] saveSlotPtr  Where to store the save slot number.
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int setupIndexSaveSlot(IndexLayout *layout,
+                       unsigned int numZones,
+                       IndexSaveType saveType,
+                       unsigned int *saveSlotPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Write the index configuration.
+ * + * @param layout the generic index layout + * @param config the index configuration to write + * + * @return UDS_SUCCESS or an error code + **/ +int writeIndexConfig(IndexLayout *layout, UdsConfiguration config) + __attribute__((warn_unused_result)); + +/** + * Get the index state buffer + * + * @param layout the index layout + * @param slot the save slot + * + * @return UDS_SUCCESS or an error code + **/ +Buffer *getIndexStateBuffer(IndexLayout *layout, unsigned int slot) + __attribute__((warn_unused_result)); + +/** + * Get the index version parameters. + * + * @param layout the index layout + * + * @return the index version parameters. + **/ +const struct index_version *getIndexVersion(IndexLayout *layout) + __attribute__((warn_unused_result)); + +#endif // INDEX_LAYOUT_H diff --git a/uds/indexLayoutLinuxKernel.c b/uds/indexLayoutLinuxKernel.c new file mode 100644 index 0000000..8301166 --- /dev/null +++ b/uds/indexLayoutLinuxKernel.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/indexLayoutLinuxKernel.c#5 $ + */ + +#include "indexLayout.h" +#include "indexLayoutParser.h" +#include "memoryAlloc.h" + +/*****************************************************************************/ +int makeIndexLayout(const char *name, + bool newLayout, + const UdsConfiguration config, + IndexLayout **layoutPtr) +{ + char *dev = NULL; + uint64_t offset = 0; + uint64_t size = 0; + + LayoutParameter parameterTable[] = { + { "dev", LP_STRING | LP_DEFAULT, { .str = &dev } }, + { "offset", LP_UINT64, { .num = &offset } }, + { "size", LP_UINT64, { .num = &size } }, + }; + size_t numParameters = sizeof(parameterTable) / sizeof(*parameterTable); + + char *params = NULL; + int result = duplicateString(name, "makeIndexLayout parameters", ¶ms); + if (result != UDS_SUCCESS) { + return result; + } + + // note dev will be set to memory owned by params + result = parseLayoutString(params, parameterTable, numParameters); + if (result != UDS_SUCCESS) { + FREE(params); + return result; + } + + IOFactory *factory = NULL; + result = makeIOFactory(dev, &factory); + FREE(params); + if (result != UDS_SUCCESS) { + return result; + } + IndexLayout *layout; + result = makeIndexLayoutFromFactory(factory, offset, size, newLayout, config, + &layout); + putIOFactory(factory); + if (result != UDS_SUCCESS) { + return result; + } + *layoutPtr = layout; + return UDS_SUCCESS; +} diff --git a/uds/indexLayoutParser.c b/uds/indexLayoutParser.c new file mode 100644 index 0000000..808def7 --- /dev/null +++ b/uds/indexLayoutParser.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/indexLayoutParser.c#2 $
+ */
+
+#include "indexLayoutParser.h"
+
+#include "errors.h"
+#include "logger.h"
+#include "permassert.h"
+#include "stringUtils.h"
+#include "typeDefs.h"
+#include "uds.h"
+
+/*****************************************************************************/
+__attribute__((warn_unused_result))
+static int setParameterValue(LayoutParameter *lp, char *data)
+{
+  if ((lp->type & LP_TYPE_MASK) == LP_UINT64) {
+    int result = parseUint64(data, lp->value.num);
+    if (result != UDS_SUCCESS) {
+      return logErrorWithStringError(UDS_INDEX_NAME_REQUIRED,
+                                     "bad numeric value %s", data);
+    }
+  } else if ((lp->type & LP_TYPE_MASK) == LP_STRING) {
+    *lp->value.str = data;
+  } else {
+    return logErrorWithStringError(UDS_INVALID_ARGUMENT,
+                                   "unknown LayoutParameter type code %x",
+                                   (lp->type & LP_TYPE_MASK));
+  }
+  return UDS_SUCCESS;
+}
+
+/*****************************************************************************/
+int parseLayoutString(char *info, LayoutParameter *params, size_t count)
+{
+  if (!strchr(info, '=')) {
+    LayoutParameter *lp;
+    for (lp = params; lp < params + count; ++lp) {
+      if (lp->type & LP_DEFAULT) {
+        int result = setParameterValue(lp, info);
+        if (result != UDS_SUCCESS) {
+          return result;
+        }
+        break;
+      }
+    }
+  } else {
+    char *data = NULL;
+    char *token;
+    for (token = nextToken(info, " ", &data);
+         token;
+         token = nextToken(NULL, " ", &data))
+    {
+      char *equal = strchr(token, '=');
+      LayoutParameter *lp;
+      for (lp = params; lp < params + count; ++lp) {
+        if (!equal && (lp->type & LP_DEFAULT)) {
+          break;
+        } else if (equal != NULL &&
+                   strncmp(token, lp->name, equal - token) == 0 &&
+                   strlen(lp->name) == (size_t) (equal - token)) {
+          // Only named (key=value) tokens can match a non-default parameter.
+          break;
+        }
+      }
+      // A token that matches no table entry leaves lp just past the end of
+      // the table (it is never NULL), so compare against params + count.
+      if (lp == params + count) {
+        return logErrorWithStringError(UDS_INDEX_NAME_REQUIRED,
+                                       "unknown index parameter %s",
+                                       token);
+      }
+      if (lp->seen) {
+        return logErrorWithStringError(UDS_INDEX_NAME_REQUIRED,
+                                       "duplicate index parameter %s",
+                                       token);
+      }
+      lp->seen = true;
+      int result = setParameterValue(lp, equal ? equal + 1 : token);
+      if (result != UDS_SUCCESS) {
+        return result;
+      }
+    }
+  }
+  return UDS_SUCCESS;
+}
diff --git a/uds/indexLayoutParser.h b/uds/indexLayoutParser.h
new file mode 100644
index 0000000..35b492a
--- /dev/null
+++ b/uds/indexLayoutParser.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexLayoutParser.h#1 $ + */ + +#ifndef INDEX_LAYOUT_PARSER_H +#define INDEX_LAYOUT_PARSER_H + +#include "typeDefs.h" + +typedef enum { + LP_STRING = 0x001, + LP_UINT64 = 0x002, + LP_TYPE_MASK = 0x0FF, + LP_DEFAULT = 0x100, +} LPType; + +typedef struct layoutParameter { + const char *name; + LPType type; + union { + char **str; + uint64_t *num; + } value; + bool seen; +} LayoutParameter; + +/** + * Function to parse an index layout specification. + * + * This parser treats the specification as a set of name=value parameters + * or, in the absence of an '=' character, a single value for a default + * parameter. The list of acceptable parameters is specified as an array + * of LayoutParameter entries. Each such parameter contains the address + * of the variable in which the value is to be stored. + * + * @param info A copy of the index layout specification that + * will be altered by the parser to insert null + * characters after each value. Note that string + * parameter values will point into the memory of + * this string, so this specification cannot be + * deallocated until all uses of the parameter + * values are over. + * @param params The table of parameters the caller expects to + * find in the ``info'' string. Currently this + * parser can handle string and uint64_t values. + * @param count The size of the parameter table. + * + * @return UDS_SUCCESS or an error code, particularly + * UDS_INDEX_NAME_REQUIRED for all parsing errors. + **/ +int parseLayoutString(char *info, LayoutParameter *params, size_t count) + __attribute__((warn_unused_result)); + +#endif // INDEX_LAYOUT_PARSER_H diff --git a/uds/indexPageMap.c b/uds/indexPageMap.c new file mode 100644 index 0000000..a915179 --- /dev/null +++ b/uds/indexPageMap.c @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/indexPageMap.c#4 $ + */ + +#include "indexPageMap.h" + +#include "buffer.h" +#include "bufferedWriter.h" +#include "compiler.h" +#include "errors.h" +#include "hashUtils.h" +#include "indexComponent.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "stringUtils.h" +#include "threads.h" +#include "uds.h" + +static int readIndexPageMap(ReadPortal *portal); +static int writeIndexPageMap(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone); + +static const byte INDEX_PAGE_MAP_MAGIC[] = "ALBIPM02"; +enum { + INDEX_PAGE_MAP_MAGIC_LENGTH = sizeof(INDEX_PAGE_MAP_MAGIC) - 1, +}; + +const IndexComponentInfo INDEX_PAGE_MAP_INFO = { + .kind = RL_KIND_INDEX_PAGE_MAP, + .name = "index page map", + .saveOnly = false, + .chapterSync = true, + .multiZone = false, + .ioStorage = true, + .loader = readIndexPageMap, + .saver = writeIndexPageMap, + .incremental = NULL, +}; + +/*****************************************************************************/ +static INLINE size_t numEntries(const Geometry *geometry) +{ + return geometry->chaptersPerVolume * (geometry->indexPagesPerChapter - 1); +} + +/*****************************************************************************/ +int makeIndexPageMap(const Geometry *geometry, IndexPageMap **mapPtr) +{ + unsigned int deltaListsPerChapter = geometry->deltaListsPerChapter; + int result + = ASSERT_WITH_ERROR_CODE(((deltaListsPerChapter - 1) <= UINT16_MAX), + UDS_BAD_STATE, + "delta lists per chapter (%u) is too large", + deltaListsPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + + IndexPageMap *map; + result = ALLOCATE(1, IndexPageMap, "Index Page Map", &map); + if (result != UDS_SUCCESS) { + return result; + } + + map->geometry = geometry; + + result = ALLOCATE(numEntries(geometry), + IndexPageMapEntry, + "Index Page Map Entries", + &map->entries); + if (result != UDS_SUCCESS) { + freeIndexPageMap(map); + return result; + } + + *mapPtr = map; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +void freeIndexPageMap(IndexPageMap *map) +{ + if (map != NULL) { + FREE(map->entries); + FREE(map); + } +} + +/*****************************************************************************/ +uint64_t getLastUpdate(const IndexPageMap *map) +{ + return map->lastUpdate; +} + +/*****************************************************************************/ +int updateIndexPageMap(IndexPageMap *map, + uint64_t virtualChapterNumber, + unsigned int chapterNumber, + unsigned int indexPageNumber, + unsigned int deltaListNumber) +{ + const Geometry *geometry = map->geometry; + if ((virtualChapterNumber < map->lastUpdate) + || (virtualChapterNumber > map->lastUpdate + 1)) { + // if the lastUpdate is 0, this is likely to be normal because we are + // replaying the volume + if (map->lastUpdate != 0) { + logWarning("unexpected index page map update, jumping from %" PRIu64 + " to %llu", + map->lastUpdate, virtualChapterNumber); + } + } + map->lastUpdate = virtualChapterNumber; + + if (chapterNumber >= geometry->chaptersPerVolume) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "chapter number %u exceeds maximum %u", + chapterNumber, geometry->chaptersPerVolume - 1); + } + if (indexPageNumber >= geometry->indexPagesPerChapter) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "index page number %u exceeds maximum %u", + indexPageNumber, geometry->indexPagesPerChapter - 1); + } + if (deltaListNumber 
>= geometry->deltaListsPerChapter) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "delta list number %u exceeds maximum %u", + deltaListNumber, geometry->deltaListsPerChapter - 1); + } + + if (indexPageNumber == (geometry->indexPagesPerChapter - 1)) { + /* + * There is no entry for the last index page of a chapter since its entry + * would always be geometry->deltaListsPerChapter - 1. + */ + return UDS_SUCCESS; + } + + size_t slot + = (chapterNumber * (geometry->indexPagesPerChapter - 1)) + indexPageNumber; + map->entries[slot] = (IndexPageMapEntry) deltaListNumber; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int findIndexPageNumber(const IndexPageMap *map, + const UdsChunkName *name, + unsigned int chapterNumber, + unsigned int *indexPageNumberPtr) +{ + const Geometry *geometry = map->geometry; + if (chapterNumber >= geometry->chaptersPerVolume) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "chapter number %u exceeds maximum %u", + chapterNumber, geometry->chaptersPerVolume - 1); + } + + unsigned int deltaListNumber = hashToChapterDeltaList(name, geometry); + unsigned int slot = (chapterNumber * (geometry->indexPagesPerChapter - 1)); + unsigned int limit = slot + (geometry->indexPagesPerChapter - 1); + unsigned int indexPageNumber = 0; + for (; slot < limit; indexPageNumber++, slot++) { + if (deltaListNumber <= map->entries[slot]) { + break; + } + } + + // This should be a clear post-condition of the loop above, but just in case + // it's not obvious, the check is cheap. + int result = ASSERT((indexPageNumber < geometry->indexPagesPerChapter), + "index page number too large"); + if (result != UDS_SUCCESS) { + return result; + } + + *indexPageNumberPtr = indexPageNumber; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getListNumberBounds(const IndexPageMap *map, + unsigned int chapterNumber, + unsigned int indexPageNumber, + IndexPageBounds *bounds) +{ + const Geometry *geometry = map->geometry; + int result = ASSERT((chapterNumber < geometry->chaptersPerVolume), + "chapter number is valid"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT((indexPageNumber < geometry->indexPagesPerChapter), + "index page number is valid"); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int slot = chapterNumber * (geometry->indexPagesPerChapter - 1); + bounds->lowestList = ((indexPageNumber == 0) + ? 0 + : map->entries[slot + indexPageNumber - 1] + 1); + bounds->highestList = ((indexPageNumber == geometry->indexPagesPerChapter - 1) + ? 
geometry->deltaListsPerChapter - 1 + : map->entries[slot + indexPageNumber]); + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +size_t indexPageMapSize(const Geometry *geometry) +{ + return sizeof(IndexPageMapEntry) * numEntries(geometry); +} + +/*****************************************************************************/ +static int writeIndexPageMap(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone) +{ + int result = ASSERT((zone == 0), "unimplemented zone %d", zone); + if (result != UDS_SUCCESS) { + return result; + } + + IndexPageMap *map = indexComponentData(component); + + Buffer *buffer; + result = makeBuffer(INDEX_PAGE_MAP_MAGIC_LENGTH + sizeof(map->lastUpdate), + &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = putBytes(buffer, INDEX_PAGE_MAP_MAGIC_LENGTH, INDEX_PAGE_MAP_MAGIC); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = putUInt64LEIntoBuffer(buffer, map->lastUpdate); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot write index page map header"); + } + result = makeBuffer(indexPageMapSize(map->geometry), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result + = putUInt16LEsIntoBuffer(buffer, numEntries(map->geometry), map->entries); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot write index page map data"); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +uint64_t computeIndexPageMapSaveSize(const Geometry *geometry) +{ + return indexPageMapSize(geometry) + + INDEX_PAGE_MAP_MAGIC_LENGTH + sizeof(((IndexPageMap *) 0)->lastUpdate); +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeIndexPageMap(Buffer *buffer, IndexPageMap *map) +{ + int result = getUInt64LEFromBuffer(buffer, &map->lastUpdate); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEsFromBuffer(buffer, numEntries(map->geometry), + map->entries); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer) - contentLength(buffer), + bufferLength(buffer)); + return result; +} + +/*****************************************************************************/ +static int readIndexPageMap(ReadPortal *portal) +{ + IndexPageMap *map = indexComponentData(portal->component); + + BufferedReader *reader = NULL; + + int result = getBufferedReaderForPortal(portal, 0, &reader); + if (result != UDS_SUCCESS) { + return result; + } + + result = verifyBufferedData(reader, INDEX_PAGE_MAP_MAGIC, + INDEX_PAGE_MAP_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "bad index page map saved magic"); + } + + Buffer *buffer; + result + = makeBuffer(sizeof(map->lastUpdate) + indexPageMapSize(map->geometry), + &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, 
getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + logErrorWithStringError(result, "cannot read index page map data"); + return result; + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = decodeIndexPageMap(buffer, map); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + logDebug("read index page map, last update %llu", map->lastUpdate); + return UDS_SUCCESS; +} diff --git a/uds/indexPageMap.h b/uds/indexPageMap.h new file mode 100644 index 0000000..3767cdd --- /dev/null +++ b/uds/indexPageMap.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexPageMap.h#2 $ + */ + +#ifndef INDEX_PAGE_MAP_H +#define INDEX_PAGE_MAP_H 1 + +#include "common.h" +#include "geometry.h" +#include "indexComponent.h" + +extern const IndexComponentInfo INDEX_PAGE_MAP_INFO; + +typedef struct indexPageMap IndexPageMap; + +typedef struct { + unsigned int lowestList; + unsigned int highestList; +} IndexPageBounds; + +/* + * Notes on IndexPageMap + * + * Each volume maintains an index page map which records how the chapter delta + * lists are distributed among the index pages for that chapter. + * + * The map is conceptually a two-dimensional array indexed by chapter number + * and index page number within the chapter. Each entry contains the number + * of the last delta list on that index page. In order to save memory, the + * information for the last page in each chapter is not recorded, as it is + * known from the geometry. + */ + +typedef uint16_t IndexPageMapEntry; + +struct indexPageMap { + const Geometry *geometry; + uint64_t lastUpdate; + IndexPageMapEntry *entries; +}; + +/** + * Create an index page map. + * + * @param geometry The geometry governing the index. + * @param mapPtr A pointer to hold the new map. + * + * @return A success or error code. + **/ +int makeIndexPageMap(const Geometry *geometry, IndexPageMap **mapPtr) + __attribute__((warn_unused_result)); + +/** + * Free an index page map. + * + * @param map The index page map to destroy. + **/ +void freeIndexPageMap(IndexPageMap *map); + +/** + * Get the virtual chapter number of the last update to the index page map. + * + * @param map The index page map + * + * @return the virtual chapter number of the last chapter updated + **/ +uint64_t getLastUpdate(const IndexPageMap *map); + +/** + * Update an index page map entry. + * + * @param map The map to update + * @param virtualChapterNumber The virtual chapter number being updated. 
+ * @param chapterNumber The chapter of the entry to update + * @param indexPageNumber The index page of the entry to update + * @param deltaListNumber The value of the new entry + * + * @return UDS_SUCCESS or an error code + **/ +int updateIndexPageMap(IndexPageMap *map, + uint64_t virtualChapterNumber, + unsigned int chapterNumber, + unsigned int indexPageNumber, + unsigned int deltaListNumber) + __attribute__((warn_unused_result)); + +/** + * Find the page number of the index page in a chapter that will contain the + * chapter index entry for a given chunk name, if it exists. + * + * @param [in] map The map to search + * @param [in] name The chunk name + * @param [in] chapterNumber The chapter containing the index page + * @param [out] indexPageNumberPtr A pointer to hold the result, guaranteed to + * be a valid index page number on UDS_SUCCESS + * + * @return UDS_SUCCESS, or UDS_INVALID_ARGUMENT if the chapter number + * is out of range + **/ +int findIndexPageNumber(const IndexPageMap *map, + const UdsChunkName *name, + unsigned int chapterNumber, + unsigned int *indexPageNumberPtr) + __attribute__((warn_unused_result)); + +/** + * Get the lowest and highest numbered delta lists for the given immutable + * chapter index page from the index page map. + * + * @param map The index page map + * @param chapterNumber The chapter containing the delta list + * @param indexPageNumber The index page number within the chapter + * @param bounds A structure to hold the list number bounds + * for the given page + * + * @return UDS_SUCCESS or an error code + **/ +int getListNumberBounds(const IndexPageMap *map, + unsigned int chapterNumber, + unsigned int indexPageNumber, + IndexPageBounds *bounds) + __attribute__((warn_unused_result)); + +/** + * Compute the size of the index page map save image, including all headers. + * + * @param geometry The index geometry. + * + * @return The number of bytes required to save the index page map. + **/ +uint64_t computeIndexPageMapSaveSize(const Geometry *geometry); + +/** + * Escaped for testing.... + * + * @param geometry The index geometry. + * + * @return The number of bytes required for the page map data, + * exclusive of headers. + **/ +size_t indexPageMapSize(const Geometry *geometry) + __attribute__((warn_unused_result)); + +#endif // INDEX_PAGE_MAP_H diff --git a/uds/indexRouter.c b/uds/indexRouter.c new file mode 100644 index 0000000..b9b0a9e --- /dev/null +++ b/uds/indexRouter.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/indexRouter.c#7 $ + */ + +#include "indexRouter.h" + +#include "compiler.h" +#include "indexCheckpoint.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "requestQueue.h" +#include "zone.h" + +/** + * This is the request processing function invoked by the zone's RequestQueue + * worker thread. + * + * @param request the request to be indexed or executed by the zone worker + **/ +static void executeZoneRequest(Request *request) +{ + executeIndexRouterRequest(request->router, request); +} + +/** + * Construct and enqueue asynchronous control messages to add the chapter + * index for a given virtual chapter to the sparse chapter index cache. + * + * @param router the router containing the relevant queues + * @param index the index with the relevant cache and chapter + * @param virtualChapter the virtual chapter number of the chapter to cache + **/ +static void enqueueBarrierMessages(IndexRouter *router, + Index *index, + uint64_t virtualChapter) +{ + ZoneMessage barrier = { + .index = index, + .data = { + .barrier = { + .virtualChapter = virtualChapter, + } + } + }; + unsigned int zone; + for (zone = 0; zone < router->zoneCount; zone++) { + int result = launchZoneControlMessage(REQUEST_SPARSE_CACHE_BARRIER, + barrier, zone, router); + ASSERT_LOG_ONLY((result == UDS_SUCCESS), "barrier message allocation"); + } +} + +/** + * This is the request processing function for the triage stage queue. Each + * request is resolved in the master index, determining if it is a hook or + * not, and if a hook, what virtual chapter (if any) it might be found in. If + * a virtual chapter is found, this enqueues a sparse chapter cache barrier in + * every zone before enqueueing the request in its zone. + * + * @param request the request to triage + **/ +static void triageRequest(Request *request) +{ + IndexRouter *router = request->router; + Index *index = router->index; + + // Check if the name is a hook in the index pointing at a sparse chapter. + uint64_t sparseVirtualChapter = triageIndexRequest(index, request); + if (sparseVirtualChapter != UINT64_MAX) { + // Generate and place a barrier request on every zone queue. + enqueueBarrierMessages(router, index, sparseVirtualChapter); + } + + enqueueRequest(request, STAGE_INDEX); +} + +/** + * Initialize the zone queues and the triage queue. + * + * @param router the router containing the queues + * @param geometry the geometry governing the indexes + * + * @return UDS_SUCCESS or error code + **/ +static int initializeLocalIndexQueues(IndexRouter *router, + const Geometry *geometry) +{ + unsigned int i; + for (i = 0; i < router->zoneCount; i++) { + int result = makeRequestQueue("indexW", &executeZoneRequest, + &router->zoneQueues[i]); + if (result != UDS_SUCCESS) { + return result; + } + } + + // The triage queue is only needed for sparse multi-zone indexes. 
+ if ((router->zoneCount > 1) && isSparse(geometry)) { + int result = makeRequestQueue("triageW", &triageRequest, + &router->triageQueue); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +static INLINE RequestQueue *getZoneQueue(IndexRouter *router, + unsigned int zoneNumber) +{ + return router->zoneQueues[zoneNumber]; +} + +/**********************************************************************/ +int makeIndexRouter(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + LoadType loadType, + IndexLoadContext *loadContext, + IndexRouterCallback callback, + IndexRouter **routerPtr) +{ + unsigned int zoneCount = getZoneCount(userParams); + IndexRouter *router; + int result = ALLOCATE_EXTENDED(IndexRouter, zoneCount, RequestQueue *, + "index router", &router); + if (result != UDS_SUCCESS) { + return result; + } + + router->callback = callback; + router->zoneCount = zoneCount; + + result = initializeLocalIndexQueues(router, config->geometry); + if (result != UDS_SUCCESS) { + freeIndexRouter(router); + return result; + } + + result = makeIndex(layout, config, userParams, router->zoneCount, loadType, + loadContext, &router->index); + if (result != UDS_SUCCESS) { + freeIndexRouter(router); + return logErrorWithStringError(result, "failed to create index"); + } + + router->needToSave = (router->index->loadedType != LOAD_LOAD); + *routerPtr = router; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int saveIndexRouter(IndexRouter *router) +{ + if (!router->needToSave) { + return UDS_SUCCESS; + } + int result = saveIndex(router->index); + router->needToSave = (result != UDS_SUCCESS); + return result; +} + +/**********************************************************************/ +void freeIndexRouter(IndexRouter *router) +{ + if (router == NULL) { + return; + } + requestQueueFinish(router->triageQueue); + unsigned int i; + for (i = 0; i < router->zoneCount; i++) { + requestQueueFinish(router->zoneQueues[i]); + } + freeIndex(router->index); + FREE(router); +} + +/**********************************************************************/ +RequestQueue *selectIndexRouterQueue(IndexRouter *router, + Request *request, + RequestStage nextStage) +{ + if (request->isControlMessage) { + return getZoneQueue(router, request->zoneNumber); + } + + if (nextStage == STAGE_TRIAGE) { + // The triage queue is only needed for multi-zone sparse indexes and won't + // be allocated by the router if not needed, so simply check for NULL. + if (router->triageQueue != NULL) { + return router->triageQueue; + } + // Dense index or single zone, so route it directly to the zone queue. 
+ } else if (nextStage != STAGE_INDEX) { + ASSERT_LOG_ONLY(false, "invalid index stage: %d", nextStage); + return NULL; + } + + Index *index = router->index; + request->zoneNumber = getMasterIndexZone(index->masterIndex, + &request->chunkName); + return getZoneQueue(router, request->zoneNumber); +} + +/**********************************************************************/ +void executeIndexRouterRequest(IndexRouter *router, Request *request) +{ + if (request->isControlMessage) { + int result = dispatchIndexZoneControlRequest(request); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "error executing control message: %d", + request->action); + } + request->status = result; + enterCallbackStage(request); + return; + } + + router->needToSave = true; + if (request->requeued && !isSuccessful(request->status)) { + request->status = makeUnrecoverable(request->status); + router->callback(request); + return; + } + + Index *index = router->index; + int result = dispatchIndexRequest(index, request); + if (result == UDS_QUEUED) { + // Take the request off the pipeline. + return; + } + + request->status = result; + router->callback(request); +} diff --git a/uds/indexRouter.h b/uds/indexRouter.h new file mode 100644 index 0000000..a96262b --- /dev/null +++ b/uds/indexRouter.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexRouter.h#3 $ + */ + +#ifndef INDEX_ROUTER_H +#define INDEX_ROUTER_H + +#include "compiler.h" +#include "index.h" +#include "indexSession.h" +#include "request.h" + +/** + * Callback after a query, update or remove request completes and fills in + * select fields in the request: status for all requests, oldMetadata and + * hashExists for query and update requests. + * + * @param request request object. + **/ +typedef void (*IndexRouterCallback)(Request *request); + +struct indexRouter { + IndexRouterCallback callback; + unsigned int zoneCount; + bool needToSave; + Index *index; + RequestQueue *triageQueue; + RequestQueue *zoneQueues[]; +}; + +/** + * Construct and initialize an IndexRouter instance. + * + * @param layout the IndexLayout that describes the stored index + * @param config the configuration to use + * @param userParams the index session parameters. If NULL, the default + * session parameters will be used. 
+ * @param loadType selects whether to create, load, or rebuild the index + * @param loadContext the index load context to use + * @param callback the function to invoke when a request completes or fails + * @param routerPtr a pointer in which to store the new router + * + * @return UDS_SUCCESS or an error code + **/ +int makeIndexRouter(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + LoadType loadType, + IndexLoadContext *loadContext, + IndexRouterCallback callback, + IndexRouter **routerPtr) + __attribute__((warn_unused_result)); + +/** + * Executes the index operation for a UDS request and calls the callback upon + * completion. + * + * @param router The index router. + * @param request A pointer to the Request to process. + **/ +void executeIndexRouterRequest(IndexRouter *router, Request *request); + +/** + * Save the index router state to persistent storage. + * + * It is the responsibility of the caller to ensure that there are no other + * uses of the index during a call to this method. It is necessary that there + * be no index requests from any block context nor any other attempt to save + * the index until after a call to saveIndexRouter returns. + * + * @param router the index router to save + * + * @return UDS_SUCCESS if successful. + **/ +int saveIndexRouter(IndexRouter *router) __attribute__((warn_unused_result)); + +/** + * Destroy the index router and free its memory. + * + * @param router the index router to destroy (may be NULL) + * + * @return UDS_SUCCESS if successful. + **/ +void freeIndexRouter(IndexRouter *router); + +/** + * Select and return the request queue responsible for executing the next + * index stage of a request, updating the request with any associated state + * (such as the zone number for UDS requests on a local index). + * + * @param router The index router. + * @param request The Request destined for the queue. + * @param nextStage The next request stage (STAGE_TRIAGE or STAGE_INDEX). + * + * @return the next index stage queue (the local triage queue, local zone + * queue, or remote RPC send queue) + **/ +RequestQueue *selectIndexRouterQueue(IndexRouter *router, + Request *request, + RequestStage nextStage); + +/** + * Wait for the index router to finish all operations that access a local + * storage device. + * + * @param router The index router. + **/ +static INLINE void waitForIdleIndexRouter(IndexRouter *router) +{ + waitForIdleChapterWriter(router->index->chapterWriter); +} + +#endif /* INDEX_ROUTER_H */ diff --git a/uds/indexSession.c b/uds/indexSession.c new file mode 100644 index 0000000..15e5b3f --- /dev/null +++ b/uds/indexSession.c @@ -0,0 +1,554 @@ +/* + * %Copyright% + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/indexSession.c#10 $ + */ + +#include "indexSession.h" + +#include "indexCheckpoint.h" +#include "indexRouter.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "requestQueue.h" + +/**********************************************************************/ +static void collectStats(const struct uds_index_session *indexSession, + UdsContextStats *stats) +{ + const SessionStats *sessionStats = &indexSession->stats; + + stats->currentTime = asTimeT(currentTime(CLOCK_REALTIME)); + + stats->postsFound = READ_ONCE(sessionStats->postsFound); + stats->inMemoryPostsFound = READ_ONCE(sessionStats->postsFoundOpenChapter); + stats->densePostsFound = READ_ONCE(sessionStats->postsFoundDense); + stats->sparsePostsFound = READ_ONCE(sessionStats->postsFoundSparse); + stats->postsNotFound = READ_ONCE(sessionStats->postsNotFound); + stats->updatesFound = READ_ONCE(sessionStats->updatesFound); + stats->updatesNotFound = READ_ONCE(sessionStats->updatesNotFound); + stats->deletionsFound = READ_ONCE(sessionStats->deletionsFound); + stats->deletionsNotFound = READ_ONCE(sessionStats->deletionsNotFound); + stats->queriesFound = READ_ONCE(sessionStats->queriesFound); + stats->queriesNotFound = READ_ONCE(sessionStats->queriesNotFound); + stats->requests = READ_ONCE(sessionStats->requests); +} + +/**********************************************************************/ +static void handleCallbacks(Request *request) +{ + if (request->status == UDS_SUCCESS) { + // Measure the turnaround time of this request and include that time, + // along with the rest of the request, in the context's StatCounters. + updateRequestContextStats(request); + } + + if (request->callback != NULL) { + // The request has specified its own callback and does not expect to be + // freed. + struct uds_index_session *indexSession = request->session; + request->found = (request->location != LOC_UNAVAILABLE); + request->callback((UdsRequest *) request); + // We do this release after the callback because of the contract of the + // udsFlushIndexSession method. + releaseIndexSession(indexSession); + return; + } + + // Should not get here, because this is either a control message or it has a + // callback method. 
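+  // Nothing is waiting on this request, so just reclaim its memory.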
+ freeRequest(request); +} + +/**********************************************************************/ +int checkIndexSession(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + unsigned int state = indexSession->state; + unlockMutex(&indexSession->requestMutex); + + if (state == IS_FLAG_LOADED) { + return UDS_SUCCESS; + } else if (state & IS_FLAG_DISABLED) { + return UDS_DISABLED; + } else if ((state & IS_FLAG_LOADING) + || (state & IS_FLAG_SUSPENDED) + || (state & IS_FLAG_WAITING)) { + return UDS_SUSPENDED; + } + + return UDS_NO_INDEXSESSION; +} + +/**********************************************************************/ +int getIndexSession(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + indexSession->requestCount++; + unlockMutex(&indexSession->requestMutex); + + int result = checkIndexSession(indexSession); + if (result != UDS_SUCCESS) { + releaseIndexSession(indexSession); + return result; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +void releaseIndexSession(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + if (--indexSession->requestCount == 0) { + broadcastCond(&indexSession->requestCond); + } + unlockMutex(&indexSession->requestMutex); +} + +/**********************************************************************/ +int startLoadingIndexSession(struct uds_index_session *indexSession) +{ + int result; + lockMutex(&indexSession->requestMutex); + if (indexSession->state & IS_FLAG_SUSPENDED) { + result = UDS_SUSPENDED; + } else if (indexSession->state != 0) { + result = UDS_INDEXSESSION_IN_USE; + } else { + indexSession->state |= IS_FLAG_LOADING; + result = UDS_SUCCESS; + } + unlockMutex(&indexSession->requestMutex); + return result; +} + +/**********************************************************************/ +void finishLoadingIndexSession(struct uds_index_session *indexSession, + int result) +{ + lockMutex(&indexSession->requestMutex); + indexSession->state &= ~IS_FLAG_LOADING; + if (result == UDS_SUCCESS) { + indexSession->state |= IS_FLAG_LOADED; + } + broadcastCond(&indexSession->requestCond); + unlockMutex(&indexSession->requestMutex); +} + +/**********************************************************************/ +void disableIndexSession(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + indexSession->state |= IS_FLAG_DISABLED; + unlockMutex(&indexSession->requestMutex); +} + +/**********************************************************************/ +int makeEmptyIndexSession(struct uds_index_session **indexSessionPtr) +{ + struct uds_index_session *session; + int result = ALLOCATE(1, struct uds_index_session, __func__, &session); + if (result != UDS_SUCCESS) { + return result; + } + + result = initMutex(&session->requestMutex); + if (result != UDS_SUCCESS) { + FREE(session); + return result; + } + + result = initCond(&session->requestCond); + if (result != UDS_SUCCESS) { + destroyMutex(&session->requestMutex); + FREE(session); + return result; + } + + result = initMutex(&session->loadContext.mutex); + if (result != UDS_SUCCESS) { + destroyCond(&session->requestCond); + destroyMutex(&session->requestMutex); + FREE(session); + return result; + } + + result = initCond(&session->loadContext.cond); + if (result != UDS_SUCCESS) { + destroyMutex(&session->loadContext.mutex); + destroyCond(&session->requestCond); + destroyMutex(&session->requestMutex); + FREE(session); + return 
result; + } + + result = makeRequestQueue("callbackW", &handleCallbacks, + &session->callbackQueue); + if (result != UDS_SUCCESS) { + destroyCond(&session->loadContext.cond); + destroyMutex(&session->loadContext.mutex); + destroyCond(&session->requestCond); + destroyMutex(&session->requestMutex); + FREE(session); + return result; + } + + *indexSessionPtr = session; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int udsSuspendIndexSession(struct uds_index_session *session, bool save) +{ + int result; + bool saveIndex = false; + bool suspendIndex = false; + lockMutex(&session->requestMutex); + // Wait for any pending close operation to complete. + while (session->state & IS_FLAG_CLOSING) { + waitCond(&session->requestCond, &session->requestMutex); + } + if ((session->state & IS_FLAG_WAITING) + || (session->state & IS_FLAG_DESTROYING)) { + result = EBUSY; + } else if (session->state & IS_FLAG_SUSPENDED) { + result = UDS_SUCCESS; + } else if (session->state & IS_FLAG_LOADING) { + session->state |= IS_FLAG_WAITING; + suspendIndex = true; + result = UDS_SUCCESS; + } else if (!(session->state & IS_FLAG_LOADED)) { + session->state |= IS_FLAG_SUSPENDED; + broadcastCond(&session->requestCond); + result = UDS_SUCCESS; + } else { + saveIndex = save; + if (saveIndex) { + session->state |= IS_FLAG_WAITING; + } else { + session->state |= IS_FLAG_SUSPENDED; + broadcastCond(&session->requestCond); + } + result = UDS_SUCCESS; + } + unlockMutex(&session->requestMutex); + + if (!saveIndex && !suspendIndex) { + return result; + } + + if (saveIndex) { + result = udsSaveIndex(session); + lockMutex(&session->requestMutex); + session->state &= ~IS_FLAG_WAITING; + session->state |= IS_FLAG_SUSPENDED; + broadcastCond(&session->requestCond); + unlockMutex(&session->requestMutex); + return result; + } + + lockMutex(&session->loadContext.mutex); + switch (session->loadContext.status) { + case INDEX_OPENING: + session->loadContext.status = INDEX_SUSPENDING; + + // Wait until the index indicates that it is not replaying. + while ((session->loadContext.status != INDEX_SUSPENDED) + && (session->loadContext.status != INDEX_READY)) { + waitCond(&session->loadContext.cond, + &session->loadContext.mutex); + } + break; + + case INDEX_READY: + // Index load does not need to be suspended. + break; + + case INDEX_SUSPENDED: + case INDEX_SUSPENDING: + case INDEX_FREEING: + default: + // These cases should not happen. 
+ ASSERT_LOG_ONLY(false, "Bad load context state %u", + session->loadContext.status); + break; + } + unlockMutex(&session->loadContext.mutex); + + lockMutex(&session->requestMutex); + session->state &= ~IS_FLAG_WAITING; + session->state |= IS_FLAG_SUSPENDED; + broadcastCond(&session->requestCond); + unlockMutex(&session->requestMutex); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int udsResumeIndexSession(struct uds_index_session *session) +{ + lockMutex(&session->requestMutex); + if (session->state & IS_FLAG_WAITING) { + unlockMutex(&session->requestMutex); + return EBUSY; + } + + /* If not suspended, just succeed */ + if (!(session->state & IS_FLAG_SUSPENDED)) { + unlockMutex(&session->requestMutex); + return UDS_SUCCESS; + } + + if (!(session->state & IS_FLAG_LOADING)) { + session->state &= ~IS_FLAG_SUSPENDED; + unlockMutex(&session->requestMutex); + return UDS_SUCCESS; + } + + session->state |= IS_FLAG_WAITING; + unlockMutex(&session->requestMutex); + + lockMutex(&session->loadContext.mutex); + switch (session->loadContext.status) { + case INDEX_SUSPENDED: + session->loadContext.status = INDEX_OPENING; + // Notify the index to start replaying again. + broadcastCond(&session->loadContext.cond); + break; + + case INDEX_READY: + // There is no index rebuild to resume. + break; + + case INDEX_OPENING: + case INDEX_SUSPENDING: + case INDEX_FREEING: + default: + // These cases should not happen; do nothing. + ASSERT_LOG_ONLY(false, "Bad load context state %u", + session->loadContext.status); + break; + } + unlockMutex(&session->loadContext.mutex); + + lockMutex(&session->requestMutex); + session->state &= ~IS_FLAG_WAITING; + session->state &= ~IS_FLAG_SUSPENDED; + broadcastCond(&session->requestCond); + unlockMutex(&session->requestMutex); + return UDS_SUCCESS; +} + +/**********************************************************************/ +static void waitForNoRequestsInProgress(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + while (indexSession->requestCount > 0) { + waitCond(&indexSession->requestCond, &indexSession->requestMutex); + } + unlockMutex(&indexSession->requestMutex); +} + +/**********************************************************************/ +int saveAndFreeIndex(struct uds_index_session *indexSession) +{ + int result = UDS_SUCCESS; + IndexRouter *router = indexSession->router; + if (router != NULL) { + lockMutex(&indexSession->requestMutex); + bool suspended = (indexSession->state & IS_FLAG_SUSPENDED); + unlockMutex(&indexSession->requestMutex); + if (!suspended) { + result = saveIndexRouter(router); + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "ignoring error from saveIndexRouter"); + } + } + freeIndexRouter(router); + indexSession->router = NULL; + + // Reset all index state that happens to be in the index session, so it + // doesn't affect any future index. + lockMutex(&indexSession->loadContext.mutex); + indexSession->loadContext.status = INDEX_OPENING; + unlockMutex(&indexSession->loadContext.mutex); + + lockMutex(&indexSession->requestMutex); + // Only the suspend bit will remain relevant. 
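+    // The masking assignment clears every flag except IS_FLAG_SUSPENDED.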
+ indexSession->state &= IS_FLAG_SUSPENDED; + unlockMutex(&indexSession->requestMutex); + } + + logDebug("Closed index"); + return result; +} + +/**********************************************************************/ +int udsCloseIndex(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + + // Wait for any pending suspend, resume or close operations to complete. + while ((indexSession->state & IS_FLAG_WAITING) + || (indexSession->state & IS_FLAG_CLOSING)) { + waitCond(&indexSession->requestCond, &indexSession->requestMutex); + } + + int result = UDS_SUCCESS; + if (indexSession->state & IS_FLAG_SUSPENDED) { + result = UDS_SUSPENDED; + } else if ((indexSession->state & IS_FLAG_DESTROYING) + || !(indexSession->state & IS_FLAG_LOADED)) { + // The index doesn't exist, hasn't finished loading, or is being destroyed. + result = UDS_NO_INDEXSESSION; + } else { + indexSession->state |= IS_FLAG_CLOSING; + } + unlockMutex(&indexSession->requestMutex); + if (result != UDS_SUCCESS) { + return result; + } + + logDebug("Closing index"); + waitForNoRequestsInProgress(indexSession); + result = saveAndFreeIndex(indexSession); + + lockMutex(&indexSession->requestMutex); + indexSession->state &= ~IS_FLAG_CLOSING; + broadcastCond(&indexSession->requestCond); + unlockMutex(&indexSession->requestMutex); + return result; +} + +/**********************************************************************/ +int udsDestroyIndexSession(struct uds_index_session *indexSession) +{ + logDebug("Destroying index session"); + + bool loadPending = false; + lockMutex(&indexSession->requestMutex); + + // Wait for any pending suspend, resume, or close operations to complete. + while ((indexSession->state & IS_FLAG_WAITING) + || (indexSession->state & IS_FLAG_CLOSING)) { + waitCond(&indexSession->requestCond, &indexSession->requestMutex); + } + + if (indexSession->state & IS_FLAG_DESTROYING) { + unlockMutex(&indexSession->requestMutex); + return EBUSY; + } + + indexSession->state |= IS_FLAG_DESTROYING; + loadPending = ((indexSession->state & IS_FLAG_LOADING) + && (indexSession->state & IS_FLAG_SUSPENDED)); + unlockMutex(&indexSession->requestMutex); + + if (loadPending) { + // Tell the index to terminate the rebuild. + lockMutex(&indexSession->loadContext.mutex); + if (indexSession->loadContext.status == INDEX_SUSPENDED) { + indexSession->loadContext.status = INDEX_FREEING; + broadcastCond(&indexSession->loadContext.cond); + } + unlockMutex(&indexSession->loadContext.mutex); + + // Wait until the load exits before proceeding. 
+ lockMutex(&indexSession->requestMutex); + while (indexSession->state & IS_FLAG_LOADING) { + waitCond(&indexSession->requestCond, &indexSession->requestMutex); + } + unlockMutex(&indexSession->requestMutex); + } + + waitForNoRequestsInProgress(indexSession); + int result = saveAndFreeIndex(indexSession); + requestQueueFinish(indexSession->callbackQueue); + indexSession->callbackQueue = NULL; + destroyCond(&indexSession->loadContext.cond); + destroyMutex(&indexSession->loadContext.mutex); + destroyCond(&indexSession->requestCond); + destroyMutex(&indexSession->requestMutex); + logDebug("Destroyed index session"); + FREE(indexSession); + return result; +} + +/**********************************************************************/ +int udsFlushIndexSession(struct uds_index_session *indexSession) +{ + waitForNoRequestsInProgress(indexSession); + // Wait until any open chapter writes are complete + waitForIdleIndexRouter(indexSession->router); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int udsSaveIndex(struct uds_index_session *indexSession) +{ + waitForNoRequestsInProgress(indexSession); + // saveIndexRouter waits for open chapter writes to complete + return saveIndexRouter(indexSession->router); +} + +/**********************************************************************/ +int udsSetCheckpointFrequency(struct uds_index_session *indexSession, + unsigned int frequency) +{ + setIndexCheckpointFrequency(indexSession->router->index->checkpoint, + frequency); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int udsGetIndexConfiguration(struct uds_index_session *indexSession, + UdsConfiguration *conf) +{ + if (conf == NULL) { + return logErrorWithStringError(UDS_CONF_PTR_REQUIRED, + "received a NULL config pointer"); + } + int result = ALLOCATE(1, struct udsConfiguration, __func__, conf); + if (result == UDS_SUCCESS) { + **conf = indexSession->userConfig; + } + return result; +} + +/**********************************************************************/ +int udsGetIndexStats(struct uds_index_session *indexSession, + UdsIndexStats *stats) +{ + if (stats == NULL) { + return logErrorWithStringError(UDS_INDEX_STATS_PTR_REQUIRED, + "received a NULL index stats pointer"); + } + getIndexStats(indexSession->router->index, stats); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int udsGetIndexSessionStats(struct uds_index_session *indexSession, + UdsContextStats *stats) +{ + if (stats == NULL) { + return logWarningWithStringError(UDS_CONTEXT_STATS_PTR_REQUIRED, + "received a NULL context stats pointer"); + } + collectStats(indexSession, stats); + return UDS_SUCCESS; +} diff --git a/uds/indexSession.h b/uds/indexSession.h new file mode 100644 index 0000000..1467fd2 --- /dev/null +++ b/uds/indexSession.h @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/indexSession.h#6 $
+ */
+
+#ifndef INDEX_SESSION_H
+#define INDEX_SESSION_H
+
+#include "atomicDefs.h"
+#include "config.h"
+#include "cpu.h"
+#include "opaqueTypes.h"
+#include "threads.h"
+#include "uds.h"
+
+/**
+ * The bit position of flags used to indicate index session states.
+ **/
+typedef enum {
+  IS_FLAG_BIT_START = 8,
+  /** Flag indicating that the session is loading */
+  IS_FLAG_BIT_LOADING = IS_FLAG_BIT_START,
+  /** Flag indicating that the session has been loaded */
+  IS_FLAG_BIT_LOADED,
+  /** Flag indicating that the session is disabled permanently */
+  IS_FLAG_BIT_DISABLED,
+  /** Flag indicating that the session is suspended */
+  IS_FLAG_BIT_SUSPENDED,
+  /** Flag indicating that the session is waiting for an index state change */
+  IS_FLAG_BIT_WAITING,
+  /** Flag indicating that the session is closing */
+  IS_FLAG_BIT_CLOSING,
+  /** Flag indicating that the session is being destroyed */
+  IS_FLAG_BIT_DESTROYING,
+} IndexSessionFlagBit;
+
+/**
+ * The index session state flags.
+ **/
+typedef enum {
+  IS_FLAG_LOADED = (1 << IS_FLAG_BIT_LOADED),
+  IS_FLAG_LOADING = (1 << IS_FLAG_BIT_LOADING),
+  IS_FLAG_DISABLED = (1 << IS_FLAG_BIT_DISABLED),
+  IS_FLAG_SUSPENDED = (1 << IS_FLAG_BIT_SUSPENDED),
+  IS_FLAG_WAITING = (1 << IS_FLAG_BIT_WAITING),
+  IS_FLAG_CLOSING = (1 << IS_FLAG_BIT_CLOSING),
+  IS_FLAG_DESTROYING = (1 << IS_FLAG_BIT_DESTROYING),
+} IndexSessionFlag;
+
+typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) sessionStats {
+  uint64_t postsFound;            /* Post calls that found an entry */
+  uint64_t postsFoundOpenChapter; /* Post calls found in the open chapter */
+  uint64_t postsFoundDense;       /* Post calls found in the dense index */
+  uint64_t postsFoundSparse;      /* Post calls found in the sparse index */
+  uint64_t postsNotFound;         /* Post calls that did not find an entry */
+  uint64_t updatesFound;          /* Update calls that found an entry */
+  uint64_t updatesNotFound;       /* Update calls that did not find an entry */
+  uint64_t deletionsFound;        /* Delete calls that found an entry */
+  uint64_t deletionsNotFound;     /* Delete calls that did not find an entry */
+  uint64_t queriesFound;          /* Query calls that found an entry */
+  uint64_t queriesNotFound;       /* Query calls that did not find an entry */
+  uint64_t requests;              /* Total number of requests */
+} SessionStats;
+
+/**
+ * States used in the index load context, reflecting the state of the index.
+ **/
+typedef enum {
+  /** The index has not been loaded or rebuilt completely */
+  INDEX_OPENING = 0,
+  /** The index is able to handle requests */
+  INDEX_READY,
+  /** The index has a pending request to suspend */
+  INDEX_SUSPENDING,
+  /** The index is suspended in the midst of a rebuild */
+  INDEX_SUSPENDED,
+  /** The index is being shut down while suspended */
+  INDEX_FREEING,
+} IndexSuspendStatus;
+
+/**
+ * The CondVar here must be notified when the status changes to
+ * INDEX_SUSPENDED, in order to wake up the waiting udsSuspendIndexSession()
+ * call. It must also be notified when the status changes away from
+ * INDEX_SUSPENDED, to resume rebuilding the index from checkForSuspend() in
+ * the index.
+ **/
+typedef struct indexLoadContext {
+  Mutex mutex;
+  CondVar cond;
+  IndexSuspendStatus status;  // Covered by indexLoadContext.mutex.
+} IndexLoadContext; + +/** + * The request CondVar here must be notified when IS_FLAG_WAITING is cleared, + * in case udsCloseIndex() or udsDestroyIndexSession() is waiting on that flag. + * It must also be notified when IS_FLAG_CLOSING is cleared, in case + * udsSuspendIndexSession(), udsCloseIndex() or udsDestroyIndexSession() is + * waiting on that flag. + * Finally, it must also be notified when IS_FLAG_LOADING is cleared, to inform + * udsDestroyIndexSession() that the index session can be safely freed. + **/ +struct uds_index_session { + unsigned int state; // Covered by requestMutex. + IndexRouter *router; + RequestQueue *callbackQueue; + struct udsConfiguration userConfig; + IndexLoadContext loadContext; + // Asynchronous Request synchronization + Mutex requestMutex; + CondVar requestCond; + int requestCount; + // Request statistics, all owned by the callback thread + SessionStats stats; +}; + +/** + * Check that the index session is usable. + * + * @param indexSession the session to query + * + * @return UDS_SUCCESS or an error code + **/ +int checkIndexSession(struct uds_index_session *indexSession) + __attribute__((warn_unused_result)); + +/** + * Make sure that the IndexSession is allowed to load an index, and if so, set + * its state to indicate that the load has started. + * + * @param indexSession the session to load with + * + * @return UDS_SUCCESS, or an error code if an index already exists. + **/ +int startLoadingIndexSession(struct uds_index_session *indexSession) + __attribute__((warn_unused_result)); + +/** + * Update the IndexSession state after attempting to load an index, to indicate + * that the load has completed, and whether or not it succeeded. + * + * @param indexSession the session that was loading + * @param result the result of the load operation + **/ +void finishLoadingIndexSession(struct uds_index_session *indexSession, + int result); + +/** + * Disable an index session due to an error. + * + * @param indexSession the session to be disabled + **/ +void disableIndexSession(struct uds_index_session *indexSession); + +/** + * Acquire the index session for an asynchronous index request. + * + * The pointer must eventually be released with a corresponding call to + * releaseIndexSession(). + * + * @param indexSession The index session + * + * @return UDS_SUCCESS or an error code + **/ +int getIndexSession(struct uds_index_session *indexSession) + __attribute__((warn_unused_result)); + +/** + * Release a pointer to an index session. + * + * @param indexSession The session to release + **/ +void releaseIndexSession(struct uds_index_session *indexSession); + +/** + * Construct a new, empty index session. + * + * @param indexSessionPtr The pointer to receive the new session + * + * @return UDS_SUCCESS or an error code + **/ +int makeEmptyIndexSession(struct uds_index_session **indexSessionPtr) + __attribute__((warn_unused_result)); + +/** + * Save an index while the session is quiescent. + * + * During the call to #udsSaveIndex, there should be no other call to + * #udsSaveIndex and there should be no calls to #udsStartChunkOperation. + * + * @param indexSession The session to save + * + * @return Either #UDS_SUCCESS or an error code + **/ +int udsSaveIndex(struct uds_index_session *indexSession) + __attribute__((warn_unused_result)); + +/** + * Close the index by saving the underlying index. 
+ * + * @param indexSession The index session to be shut down and freed + **/ +int saveAndFreeIndex(struct uds_index_session *indexSession); + +/** + * Set the checkpoint frequency of the grid. + * + * @param session The index session to be modified. + * @param frequency New checkpoint frequency. + * + * @return Either UDS_SUCCESS or an error code. + * + **/ +int udsSetCheckpointFrequency(struct uds_index_session *session, + unsigned int frequency) + __attribute__((warn_unused_result)); + +#endif /* INDEX_SESSION_H */ diff --git a/uds/indexState.c b/uds/indexState.c new file mode 100644 index 0000000..86b9fd3 --- /dev/null +++ b/uds/indexState.c @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexState.c#6 $ + */ + +#include "indexState.h" + +#include "errors.h" +#include "indexComponent.h" +#include "indexLayout.h" +#include "logger.h" +#include "memoryAlloc.h" + + +/*****************************************************************************/ +int makeIndexState(IndexLayout *layout, + unsigned int numZones, + unsigned int maxComponents, + IndexState **statePtr) +{ + if (maxComponents == 0) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "cannot make index state with maxComponents 0"); + } + + IndexState *state = NULL; + int result = ALLOCATE_EXTENDED(IndexState, maxComponents, IndexComponent *, + "index state", &state); + if (result != UDS_SUCCESS) { + return result; + } + + state->count = 0; + state->layout = layout; + state->length = maxComponents; + state->loadZones = 0; + state->loadSlot = UINT_MAX; + state->saveSlot = UINT_MAX; + state->saving = false; + state->zoneCount = numZones; + + *statePtr = state; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +void freeIndexState(IndexState **statePtr) +{ + IndexState *state = *statePtr; + *statePtr = NULL; + if (state != NULL) { + unsigned int i; + for (i = 0; i < state->count; ++i) { + freeIndexComponent(&state->entries[i]); + } + FREE(state); + } +} + +/*****************************************************************************/ +/** + * Add a component to the index state. + * + * @param state The index state. + * @param component The index component. + * + * @return UDS_SUCCESS or an error code. 
+ **/ +static int addComponentToIndexState(IndexState *state, + IndexComponent *component) +{ + if (findIndexComponent(state, component->info) != NULL) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "cannot add state component %s: already present", + component->info->name); + } + + if (state->count >= state->length) { + return logErrorWithStringError( + UDS_RESOURCE_LIMIT_EXCEEDED, + "cannot add state component %s, %u components already added", + component->info->name, state->count); + } + + state->entries[state->count] = component; + ++state->count; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int addIndexStateComponent(IndexState *state, + const IndexComponentInfo *info, + void *data, + void *context) +{ + IndexComponent *component = NULL; + int result = makeIndexComponent(state, info, state->zoneCount, data, context, + &component); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot make region index component"); + } + + result = addComponentToIndexState(state, component); + if (result != UDS_SUCCESS) { + freeIndexComponent(&component); + return result; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +IndexComponent *findIndexComponent(const IndexState *state, + const IndexComponentInfo *info) +{ + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (info == component->info) { + return component; + } + } + return NULL; +} + +/*****************************************************************************/ +static const char *indexSaveTypeName(IndexSaveType saveType) +{ + return saveType == IS_SAVE ? "save" : "checkpoint"; +} + +/*****************************************************************************/ +int loadIndexState(IndexState *state, bool *replayPtr) +{ + int result = findLatestIndexSaveSlot(state->layout, &state->loadZones, + &state->loadSlot); + if (result != UDS_SUCCESS) { + return result; + } + + bool replayRequired = false; + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + result = readIndexComponent(component); + if (result != UDS_SUCCESS) { + if (!missingIndexComponentRequiresReplay(component)) { + state->loadZones = 0; + state->loadSlot = UINT_MAX; + return logErrorWithStringError(result, "index component %s", + indexComponentName(component)); + } + replayRequired = true; + } + } + + state->loadZones = 0; + state->loadSlot = UINT_MAX; + if (replayPtr != NULL) { + *replayPtr = replayRequired; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int prepareToSaveIndexState(IndexState *state, IndexSaveType saveType) +{ + if (state->saving) { + return logErrorWithStringError(UDS_BAD_STATE, + "already saving the index state"); + } + int result = setupIndexSaveSlot(state->layout, state->zoneCount, saveType, + &state->saveSlot); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot prepare index %s", + indexSaveTypeName(saveType)); + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +/** + * Complete the saving of an index state. 
+ * + * @param state the index state + * + * @return UDS_SUCCESS or an error code + **/ +static int completeIndexSaving(IndexState *state) +{ + state->saving = false; + int result = commitIndexSave(state->layout, state->saveSlot); + state->saveSlot = UINT_MAX; + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot commit index state"); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static int cleanupSave(IndexState *state) +{ + int result = cancelIndexSave(state->layout, state->saveSlot); + state->saveSlot = UINT_MAX; + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot cancel index save"); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int saveIndexState(IndexState *state) +{ + int result = prepareToSaveIndexState(state, IS_SAVE); + if (result != UDS_SUCCESS) { + return result; + } + + +unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + result = writeIndexComponent(component); + if (result != UDS_SUCCESS) { + cleanupSave(state); + return result; + } + } + return completeIndexSaving(state); +} + +/*****************************************************************************/ +int writeIndexStateCheckpoint(IndexState *state) +{ + int result = prepareToSaveIndexState(state, IS_CHECKPOINT); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component)) { + continue; + } + result = writeIndexComponent(component); + if (result != UDS_SUCCESS) { + cleanupSave(state); + return result; + } + } + + return completeIndexSaving(state); +} + +/*****************************************************************************/ +int startIndexStateCheckpoint(IndexState *state) +{ + int result = prepareToSaveIndexState(state, IS_CHECKPOINT); + if (result != UDS_SUCCESS) { + return result; + } + + state->saving = true; + + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component)) { + continue; + } + result = startIndexComponentIncrementalSave(component); + if (result != UDS_SUCCESS) { + abortIndexStateCheckpoint(state); + return result; + } + } + + return result; +} + +/*****************************************************************************/ +int performIndexStateCheckpointChapterSynchronizedSaves(IndexState *state) +{ + if (!state->saving) { + return UDS_SUCCESS; + } + + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component) || + !deferIndexComponentCheckpointToChapterWriter(component)) { + continue; + } + int result = performIndexComponentChapterWriterSave(component); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/** + * Wrapper function to do a zone-based checkpoint operation. 
+ * + * @param [in] state the index state + * @param [in] zone the zone number + * @param [in] compFunc the index component function to use + * @param [out] completed if non-NULL, where to save the completion status + * + * @return UDS_SUCCESS or an error code + * + **/ +static int doIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + int (*compFunc)(IndexComponent *, + unsigned int, + CompletionStatus *), + CompletionStatus *completed) +{ + if (!state->saving) { + if (completed != NULL) { + *completed = CS_COMPLETED_PREVIOUSLY; + } + return UDS_SUCCESS; + } + + CompletionStatus status = CS_COMPLETED_PREVIOUSLY; + + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component)) { + continue; + } + if (zone > 0 && !component->info->multiZone) { + continue; + } + CompletionStatus componentStatus = CS_NOT_COMPLETED; + int result = (*compFunc)(component, zone, &componentStatus); + if (result != UDS_SUCCESS) { + return result; + } + // compute rolling least status + if (componentStatus < status) { + status = componentStatus; + } + } + + if (completed != NULL) { + *completed = status; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int performIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed) +{ + return doIndexStateCheckpointInZone(state, zone, + &performIndexComponentZoneSave, + completed); +} + +/*****************************************************************************/ +int finishIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed) +{ + return doIndexStateCheckpointInZone(state, zone, + &finishIndexComponentZoneSave, + completed); +} + +/*****************************************************************************/ +int abortIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed) +{ + return doIndexStateCheckpointInZone(state, zone, + &abortIndexComponentZoneSave, completed); +} + +/*****************************************************************************/ +int finishIndexStateCheckpoint(IndexState *state) +{ + if (!state->saving) { + return UDS_SUCCESS; + } + + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component)) { + continue; + } + int result = finishIndexComponentIncrementalSave(component); + if (result != UDS_SUCCESS) { + abortIndexStateCheckpoint(state); + return result; + } + } + + int result = completeIndexSaving(state); + if (result != UDS_SUCCESS) { + return result; + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int abortIndexStateCheckpoint(IndexState *state) +{ + if (!state->saving) { + return logErrorWithStringError(UDS_BAD_STATE, + "not saving the index state"); + } + + logError("aborting index state checkpoint"); + + int result = UDS_SUCCESS; + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component)) { + continue; + } + int tmp = abortIndexComponentIncrementalSave(component); + if (result == UDS_SUCCESS) { + result = tmp; + } + } + + cleanupSave(state); + state->saving = false; + + return result; +} + +/*****************************************************************************/ +int 
discardIndexStateData(IndexState *state) +{ + int result = discardIndexSaves(state->layout, true); + state->saveSlot = UINT_MAX; + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "%s: cannot destroy all index saves", + __func__); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int discardLastIndexStateSave(IndexState *state) +{ + int result = discardIndexSaves(state->layout, false); + state->saveSlot = UINT_MAX; + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "%s: cannot destroy latest index save", + __func__); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +Buffer *getStateIndexStateBuffer(IndexState *state, IOAccessMode mode) +{ + unsigned int slot = mode == IO_READ ? state->loadSlot : state->saveSlot; + return getIndexStateBuffer(state->layout, slot); +} + +/*****************************************************************************/ +int openStateBufferedReader(IndexState *state, + RegionKind kind, + unsigned int zone, + BufferedReader **readerPtr) +{ + return openIndexBufferedReader(state->layout, state->loadSlot, kind, zone, + readerPtr); +} + +/*****************************************************************************/ +int openStateBufferedWriter(IndexState *state, + RegionKind kind, + unsigned int zone, + BufferedWriter **writerPtr) +{ + return openIndexBufferedWriter(state->layout, state->saveSlot, kind, zone, + writerPtr); +} diff --git a/uds/indexState.h b/uds/indexState.h new file mode 100644 index 0000000..82899c1 --- /dev/null +++ b/uds/indexState.h @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexState.h#5 $ + */ + +#ifndef INDEX_STATE_H +#define INDEX_STATE_H 1 + +#include "buffer.h" +#include "indexComponent.h" + + +/** + * Used here and in SingleFileLayout. + **/ +typedef enum { + IS_SAVE, + IS_CHECKPOINT, + NO_SAVE = 9999, +} IndexSaveType; + +/* + * Used in getStateIndexStateBuffer to identify whether the index state buffer + * is for the index being loaded or the index being saved. + */ +typedef enum { + IO_READ = 0x1, + IO_WRITE = 0x2, +} IOAccessMode; + +/** + * The index state structure controls the loading and saving of the index + * state. 
+ **/ +typedef struct indexState { + struct indexLayout *layout; + unsigned int zoneCount; // number of index zones to use + unsigned int loadZones; + unsigned int loadSlot; + unsigned int saveSlot; + unsigned int count; // count of registered entries (<= length) + unsigned int length; // total span of array allocation + bool saving; // incremental save in progress + IndexComponent *entries[]; // array of index component entries +} IndexState; + +/** + * Make an index state object, + * + * @param [in] layout The index layout. + * @param [in] numZones The number of zones to use. + * @param [in] maxComponents The maximum number of components to be handled. + * @param [out] statePtr Where to store the index state object. + * + * @return UDS_SUCCESS or an error code + **/ +int makeIndexState(struct indexLayout *layout, + unsigned int numZones, + unsigned int maxComponents, + IndexState **statePtr) + __attribute__((warn_unused_result)); + +/** + * Free an index state (generically). + * + * @param statePtr The pointer to the index state to be freed and + * set to NULL. + **/ +void freeIndexState(IndexState **statePtr); + +/** + * Add an index component to an index state. + * + * @param state The index directory in which to add this component. + * @param info The index component file specification. + * @param data The per-component data structure. + * @param context The load/save context of the component. + * + * @return UDS_SUCCESS or an error code. + **/ +int addIndexStateComponent(IndexState *state, + const IndexComponentInfo *info, + void *data, + void *context) + __attribute__((warn_unused_result)); + +/** + * Load index state + * + * @param state The index state. + * @param replayPtr If set, the place to hold whether a replay is required. + * + * @return UDS_SUCCESS or error + **/ +int loadIndexState(IndexState *state, bool *replayPtr) + __attribute__((warn_unused_result)); + +/** + * Save the current index state, including the open chapter. + * + * @param state The index state. + * + * @return UDS_SUCCESS or error + **/ +int saveIndexState(IndexState *state) __attribute__((warn_unused_result)); + +/** + * Prepare to save the index state. + * + * @param state the index state + * @param saveType whether a checkpoint or save + * + * @return UDS_SUCCESS or an error code + **/ +int prepareToSaveIndexState(IndexState *state, IndexSaveType saveType) + __attribute__((warn_unused_result)); + +/** + * Write index checkpoint non-incrementally (for testing). + * + * @param state The index state. + * + * @return UDS_SUCCESS or error + **/ +int writeIndexStateCheckpoint(IndexState *state) + __attribute__((warn_unused_result)); + +/** + * Sets up an index state checkpoint which will proceed incrementally. + * May create the directory but does not actually write any data. + * + * @param state The index state. + * + * @return UDS_SUCCESS or an error code. + **/ +int startIndexStateCheckpoint(IndexState *state) + __attribute__((warn_unused_result)); + +/** + * Perform operations on index state checkpoints that are synchronized to + * the chapter writer thread. + * + * @param state The index state. + * + * @return UDS_SUCCESS or an error code. + **/ +int performIndexStateCheckpointChapterSynchronizedSaves(IndexState *state) + __attribute__((warn_unused_result)); + +/** + * Performs zone-specific (and, for zone 0, general) incremental checkpointing. + * + * @param [in] state The index state. + * @param [in] zone The zone number. 
+ * @param [out] completed Set to whether the checkpoint has completed + * for this zone. + * + * @return UDS_SUCCESS or an error code. + **/ +int performIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed) + __attribute__((warn_unused_result)); + +/** + * Force the completion of an incremental index state checkpoint + * for a particular zone. + * + * @param [in] state The index state. + * @param [in] zone The zone number. + * @param [out] completed Set to whether the checkpoint has completed + * for this zone. + * + * @return UDS_SUCCESS or an error code. + **/ +int finishIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed) + __attribute__((warn_unused_result)); + +/** + * Force the completion of an incremental index state checkpoint once + * all zones are completed. + * + * @param [in] state The index state. + * + * @return UDS_SUCCESS or an error code. + **/ +int finishIndexStateCheckpoint(IndexState *state) + __attribute__((warn_unused_result)); + +/** + * Aborts an index state checkpoint which is proceeding incrementally + * for a particular zone. + * + * @param [in] state The index state. + * @param [in] zone The zone number. + * @param [out] completed Set to whether the checkpoint has completed or + * aborted for this zone. + * + * @return UDS_SUCCESS or an error code. + **/ +int abortIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed); + +/** + * Aborts an index state checkpoint which is proceeding incrementally, + * once all the zones are aborted. + * + * @param [in] state The index state. + * + * @return UDS_SUCCESS or an error code. + **/ +int abortIndexStateCheckpoint(IndexState *state); + +/** + * Remove or disable the index state data, for testing. + * + * @param state The index state + * + * @return UDS_SUCCESS or an error code + * + * @note the return value of this function is frequently ignored + **/ +int discardIndexStateData(IndexState *state); + +/** + * Discard the last index state save, for testing. + * + * @param state The index state + * + * @return UDS_SUCCESS or an error code + * + * @note the return value of this function is frequently ignored + **/ +int discardLastIndexStateSave(IndexState *state); + +/** + * Find index component, for testing. + * + * @param state The index state + * @param info The index component file specification + * + * @return The index component, or NULL if not found + **/ +IndexComponent *findIndexComponent(const IndexState *state, + const IndexComponentInfo *info) + __attribute__((warn_unused_result)); + +/** + * Get the indexStateBuffer for a specified mode. + * + * @param state The index state. + * @param mode One of IO_READ or IO_WRITE. + * + * @return the index state buffer + **/ +Buffer *getStateIndexStateBuffer(IndexState *state, IOAccessMode mode) + __attribute__((warn_unused_result)); + +/** + * Open a BufferedReader for a specified state, kind, and zone. + * This helper function is used by IndexComponent. + * + * @param state The index state. + * @param kind The kind if index save region to open. + * @param zone The zone number for the region. + * @param readerPtr Where to store the BufferedReader. + * + * @return UDS_SUCCESS or an error code. + **/ +int openStateBufferedReader(IndexState *state, + RegionKind kind, + unsigned int zone, + BufferedReader **readerPtr) + __attribute__((warn_unused_result)); + +/** + * Open a BufferedWriter for a specified state, kind, and zone. 
+ * This helper function is used by IndexComponent. + * + * @param state The index state. + * @param kind The kind if index save region to open. + * @param zone The zone number for the region. + * @param writerPtr Where to store the BufferedWriter. + * + * @return UDS_SUCCESS or an error code. + **/ +int openStateBufferedWriter(IndexState *state, + RegionKind kind, + unsigned int zone, + BufferedWriter **writerPtr) + __attribute__((warn_unused_result)); + +#endif // INDEX_STATE_H diff --git a/uds/indexStateData.c b/uds/indexStateData.c new file mode 100644 index 0000000..62038f0 --- /dev/null +++ b/uds/indexStateData.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexStateData.c#3 $ + */ + +#include "indexStateData.h" + +#include "buffer.h" +#include "errors.h" +#include "index.h" +#include "logger.h" +#include "uds.h" + +/* The index state version header */ +typedef struct { + int32_t signature; + int32_t versionID; +} IndexStateVersion; + +/* The version 301 index state */ +typedef struct { + uint64_t newestChapter; + uint64_t oldestChapter; + uint64_t lastCheckpoint; + uint32_t unused; + uint32_t padding; +} IndexStateData301; + +static const IndexStateVersion INDEX_STATE_VERSION_301 = { + .signature = -1, + .versionID = 301, +}; + +/** + * The index state index component reader. 
+ * + * @param portal the ReadPortal that handles the read of the component + * + * @return UDS_SUCCESS or an error code + **/ +static int readIndexStateData(ReadPortal *portal) +{ + Buffer *buffer = getStateIndexStateBuffer(portal->component->state, IO_READ); + int result = rewindBuffer(buffer, uncompactedAmount(buffer)); + if (result != UDS_SUCCESS) { + return result; + } + + IndexStateVersion fileVersion; + result = getInt32LEFromBuffer(buffer, &fileVersion.signature); + if (result != UDS_SUCCESS) { + return result; + } + result = getInt32LEFromBuffer(buffer, &fileVersion.versionID); + if (result != UDS_SUCCESS) { + return result; + } + + if (fileVersion.signature != -1 || fileVersion.versionID != 301) { + return logErrorWithStringError(UDS_UNSUPPORTED_VERSION, + "Index state version %d,%d is unsupported", + fileVersion.signature, + fileVersion.versionID); + } + + IndexStateData301 state; + result = getUInt64LEFromBuffer(buffer, &state.newestChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &state.oldestChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &state.lastCheckpoint); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &state.unused); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &state.padding); + if (result != UDS_SUCCESS) { + return result; + } + + if ((state.unused != 0) || (state.padding != 0)) { + return UDS_CORRUPT_COMPONENT; + } + + Index *index = indexComponentData(portal->component); + index->newestVirtualChapter = state.newestChapter; + index->oldestVirtualChapter = state.oldestChapter; + index->lastCheckpoint = state.lastCheckpoint; + return UDS_SUCCESS; +} + +/** + * The index state index component writer. + * + * @param component The component whose state is to be saved (an Index) + * @param writer The buffered writer. + * @param zone The zone to write. 
+ * + * @return UDS_SUCCESS or an error code + **/ +static int writeIndexStateData(IndexComponent *component, + BufferedWriter *writer __attribute__((unused)), + unsigned int zone __attribute__((unused))) +{ + Buffer *buffer = getStateIndexStateBuffer(component->state, IO_WRITE); + int result = resetBufferEnd(buffer, 0); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, INDEX_STATE_VERSION_301.signature); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, INDEX_STATE_VERSION_301.versionID); + if (result != UDS_SUCCESS) { + return result; + } + + Index *index = indexComponentData(component); + IndexStateData301 state = { + .newestChapter = index->newestVirtualChapter, + .oldestChapter = index->oldestVirtualChapter, + .lastCheckpoint = index->lastCheckpoint, + }; + + result = putUInt64LEIntoBuffer(buffer, state.newestChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, state.oldestChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, state.lastCheckpoint); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, state.unused); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, state.padding); + if (result != UDS_SUCCESS) { + return result; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ + +const IndexComponentInfo INDEX_STATE_INFO = { + .kind = RL_KIND_INDEX_STATE, + .name = "index state", + .saveOnly = false, + .chapterSync = true, + .multiZone = false, + .ioStorage = false, + .loader = readIndexStateData, + .saver = writeIndexStateData, + .incremental = NULL, +}; diff --git a/uds/indexStateData.h b/uds/indexStateData.h new file mode 100644 index 0000000..b6aa9b2 --- /dev/null +++ b/uds/indexStateData.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexStateData.h#1 $ + */ + +#ifndef INDEX_STATE_DATA_H +#define INDEX_STATE_DATA_H 1 + +#include "indexComponent.h" + +extern const IndexComponentInfo INDEX_STATE_INFO; + +#endif /* not INDEX_STATE_DATA_H */ diff --git a/uds/indexVersion.c b/uds/indexVersion.c new file mode 100644 index 0000000..df16e73 --- /dev/null +++ b/uds/indexVersion.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/indexVersion.c#1 $
+ */
+
+#include "indexVersion.h"
+
+void initializeIndexVersion(struct index_version *version,
+                            uint32_t superVersion)
+{
+  /*
+   * Version 1 was introduced for the first single file layout. It was used in
+   * RHEL7 and in RHEL8.0 Beta. No kernel index ever used an earlier version.
+   */
+
+  /*
+   * Version 2 was created when we discovered that the volume header page was
+   * written in native endian format. It was used in RHEL8.0 and RHEL8.1. We
+   * stopped reading the volume header page, and changed to version 2 so
+   * that an index created on RHEL8 cannot be taken back and used on RHEL7.
+   *
+   * Versions 1 and 2 are identical in normal operation (i.e. after the index
+   * is loaded).
+   */
+
+  /*
+   * Version 3 was created when we discovered that the chapter index headers
+   * were written in native endian format. It was first used in RHEL8.2 and is
+   * the current version for new indices.
+   *
+   * Versions before 3 read and write native endian chapter headers. Version 3
+   * reads chapter headers in any endian order, and writes little-endian
+   * chapter headers.
+   */
+  bool chapterIndexHeaderNativeEndian = superVersion < 3;
+
+  *version = (struct index_version) {
+    .chapterIndexHeaderNativeEndian = chapterIndexHeaderNativeEndian,
+  };
+}
diff --git a/uds/indexVersion.h b/uds/indexVersion.h
new file mode 100644
index 0000000..f46b2e9
--- /dev/null
+++ b/uds/indexVersion.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/indexVersion.h#1 $
+ */
+
+#ifndef INDEX_VERSION_H
+#define INDEX_VERSION_H
+
+#include "typeDefs.h"
+
+struct index_version {
+  bool chapterIndexHeaderNativeEndian;
+};
+
+enum {
+  SUPER_VERSION_MINIMUM = 1,
+  SUPER_VERSION_MAXIMUM = 3,
+  SUPER_VERSION_CURRENT = 3,
+};
+
+/**
+ * Initialize the version parameters that we normally learn when loading the
+ * index but need to use during index operation.
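+ *
+ * For example, an index whose super block records version 2 (written by
+ * RHEL8.0 or RHEL8.1) gets chapterIndexHeaderNativeEndian == true, so its
+ * chapter index headers are treated as native-endian; a version 3 super
+ * block yields false, so chapter index headers are written little-endian.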
+ * + * @param version The version parameters + * @param superVersion The SuperBlock version number + **/ +void initializeIndexVersion(struct index_version *version, + uint32_t superVersion); + +#endif // INDEX_VERSION_H diff --git a/uds/indexZone.c b/uds/indexZone.c new file mode 100644 index 0000000..f3cd8ed --- /dev/null +++ b/uds/indexZone.c @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexZone.c#4 $ + */ + +#include "indexZone.h" + +#include "errors.h" +#include "index.h" +#include "indexCheckpoint.h" +#include "indexRouter.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "request.h" +#include "sparseCache.h" +#include "uds.h" + +/**********************************************************************/ +int makeIndexZone(struct index *index, unsigned int zoneNumber) +{ + IndexZone *zone; + int result = ALLOCATE(1, IndexZone, "index zone", &zone); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeOpenChapter(index->volume->geometry, index->zoneCount, + &zone->openChapter); + if (result != UDS_SUCCESS) { + freeIndexZone(zone); + return result; + } + + result = makeOpenChapter(index->volume->geometry, index->zoneCount, + &zone->writingChapter); + if (result != UDS_SUCCESS) { + freeIndexZone(zone); + return result; + } + + zone->index = index; + zone->id = zoneNumber; + index->zones[zoneNumber] = zone; + + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeIndexZone(IndexZone *zone) +{ + if (zone == NULL) { + return; + } + + freeOpenChapter(zone->openChapter); + freeOpenChapter(zone->writingChapter); + FREE(zone); +} + +/**********************************************************************/ +bool isZoneChapterSparse(const IndexZone *zone, + uint64_t virtualChapter) +{ + return isChapterSparse(zone->index->volume->geometry, + zone->oldestVirtualChapter, + zone->newestVirtualChapter, + virtualChapter); +} + +/**********************************************************************/ +void setActiveChapters(IndexZone *zone) +{ + zone->oldestVirtualChapter = zone->index->oldestVirtualChapter; + zone->newestVirtualChapter = zone->index->newestVirtualChapter; +} + +/** + * Swap the open and writing chapters after blocking until there are no active + * chapter writers on the index. 
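+ *
+ * Each zone keeps two OpenChapterZone structures and alternates between
+ * them: new records accumulate in openChapter while the chapter writer
+ * persists writingChapter, and this swap exchanges the two roles once the
+ * previous chapter write has finished.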
+ * + * @param zone The zone swapping chapters + * + * @return UDS_SUCCESS or a return code + **/ +static int swapOpenChapter(IndexZone *zone) +{ + // Wait for any currently writing chapter to complete + int result = finishPreviousChapter(zone->index->chapterWriter, + zone->newestVirtualChapter); + if (result != UDS_SUCCESS) { + return result; + } + + // Swap the writing and open chapters + OpenChapterZone *tempChapter = zone->openChapter; + zone->openChapter = zone->writingChapter; + zone->writingChapter = tempChapter; + return UDS_SUCCESS; +} + +/** + * Advance to a new open chapter, and forget the oldest chapter in the + * index if necessary. + * + * @param zone The zone containing the chapter to reap + * + * @return UDS_SUCCESS or an error code + **/ +static int reapOldestChapter(IndexZone *zone) +{ + Index *index = zone->index; + unsigned int chaptersPerVolume = index->volume->geometry->chaptersPerVolume; + int result + = ASSERT(((zone->newestVirtualChapter - zone->oldestVirtualChapter) + <= chaptersPerVolume), + "newest (%llu) and oldest (%llu) virtual chapters " + "less than or equal to chapters per volume (%u)", + zone->newestVirtualChapter, zone->oldestVirtualChapter, + chaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + + setMasterIndexZoneOpenChapter(index->masterIndex, zone->id, + zone->newestVirtualChapter); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int executeSparseCacheBarrierMessage(IndexZone *zone, + BarrierMessageData *barrier) +{ + /* + * Check if the chapter index for the virtual chapter is already in the + * cache, and if it's not, rendezvous with the other zone threads to add the + * chapter index to the sparse index cache. + */ + return updateSparseCache(zone, barrier->virtualChapter); +} + +/** + * Handle notification that some other zone has closed its open chapter. If + * the chapter that was closed is still the open chapter for this zone, + * close it now in order to minimize skew. + * + * @param zone The zone receiving the notification + * @param chapterClosed The notification + * + * @return UDS_SUCCESS or an error code + **/ +static int handleChapterClosed(IndexZone *zone, + ChapterClosedMessageData *chapterClosed) +{ + if (zone->newestVirtualChapter == chapterClosed->virtualChapter) { + return openNextChapter(zone, NULL); + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int dispatchIndexZoneControlRequest(Request *request) +{ + ZoneMessage *message = &request->zoneMessage; + IndexZone *zone = message->index->zones[request->zoneNumber]; + + switch (request->action) { + case REQUEST_SPARSE_CACHE_BARRIER: + return executeSparseCacheBarrierMessage(zone, &message->data.barrier); + + case REQUEST_ANNOUNCE_CHAPTER_CLOSED: + return handleChapterClosed(zone, &message->data.chapterClosed); + + default: + return ASSERT_FALSE("valid control message type: %d", request->action); + } +} + +/** + * Announce the closure of the current open chapter to the other zones. + * + * @param request The request which caused the chapter to close + * (may be NULL) + * @param zone The zone which first closed the chapter + * @param closedChapter The chapter which was closed + * + * @return UDS_SUCCESS or an error code + **/ +static int announceChapterClosed(Request *request, + IndexZone *zone, + uint64_t closedChapter) +{ + IndexRouter *router = ((request != NULL) ? 
request->router : NULL); + + ZoneMessage zoneMessage = { + .index = zone->index, + .data = { + .chapterClosed = { .virtualChapter = closedChapter } + } + }; + + unsigned int i; + for (i = 0; i < zone->index->zoneCount; i++) { + if (zone->id == i) { + continue; + } + int result; + if (router != NULL) { + result = launchZoneControlMessage(REQUEST_ANNOUNCE_CHAPTER_CLOSED, + zoneMessage, i, router); + } else { + // We're in a test which doesn't have zone queues, so we can just + // call the message function directly. + result = handleChapterClosed(zone->index->zones[i], + &zoneMessage.data.chapterClosed); + } + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int openNextChapter(IndexZone *zone, Request *request) +{ + logDebug("closing chapter %llu of zone %d after %u entries (%u short)", + zone->newestVirtualChapter, zone->id, zone->openChapter->size, + zone->openChapter->capacity - zone->openChapter->size); + + int result = swapOpenChapter(zone); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t closedChapter = zone->newestVirtualChapter++; + result = reapOldestChapter(zone); + if (result != UDS_SUCCESS) { + return logUnrecoverable(result, "reapOldestChapter failed"); + } + + resetOpenChapter(zone->openChapter); + + // begin, continue, or finish the checkpoint processing + // moved above startClosingChapter because some of the + // checkpoint processing now done by the chapter writer thread + result = processCheckpointing(zone->index, + zone->id, + zone->newestVirtualChapter); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int finishedZones = startClosingChapter(zone->index->chapterWriter, + zone->id, + zone->writingChapter); + if ((finishedZones == 1) && (zone->index->zoneCount > 1)) { + // This is the first zone of a multi-zone index to close this chapter, + // so inform the other zones in order to control zone skew. + result = announceChapterClosed(request, zone, closedChapter); + if (result != UDS_SUCCESS) { + return result; + } + } + + // If the chapter being opened won't overwrite the oldest chapter, we're + // done. + if (!areSamePhysicalChapter(zone->index->volume->geometry, + zone->newestVirtualChapter, + zone->oldestVirtualChapter)) { + return UDS_SUCCESS; + } + + uint64_t victim = zone->oldestVirtualChapter++; + if (finishedZones < zone->index->zoneCount) { + // We are not the last zone to close the chapter, so we're done + return UDS_SUCCESS; + } + + /* + * We are the last zone to close the chapter, so clean up the cache. That + * it is safe to let the last thread out of the previous chapter to do this + * relies on the fact that although the new open chapter shadows the oldest + * chapter in the cache, until we write the new open chapter to disk, we'll + * never look for it in the cache. + */ + return forgetChapter(zone->index->volume, victim, INVALIDATION_EXPIRE); +} + +/**********************************************************************/ +IndexRegion computeIndexRegion(const IndexZone *zone, + uint64_t virtualChapter) +{ + if (virtualChapter == zone->newestVirtualChapter) { + return LOC_IN_OPEN_CHAPTER; + } + return (isZoneChapterSparse(zone, virtualChapter) + ? 
LOC_IN_SPARSE : LOC_IN_DENSE); +} + +/**********************************************************************/ +int getRecordFromZone(IndexZone *zone, + Request *request, + bool *found, + uint64_t virtualChapter) +{ + if (virtualChapter == zone->newestVirtualChapter) { + searchOpenChapter(zone->openChapter, &request->chunkName, + &request->oldMetadata, found); + return UDS_SUCCESS; + } + + if ((zone->newestVirtualChapter > 0) + && (virtualChapter == (zone->newestVirtualChapter - 1)) + && (zone->writingChapter->size > 0)) { + // Only search the writing chapter if it is full, else look on disk. + searchOpenChapter(zone->writingChapter, &request->chunkName, + &request->oldMetadata, found); + return UDS_SUCCESS; + } + + // The slow lane thread has determined the location previously. We don't need + // to search again. Just return the location. + if (request->slLocationKnown) { + *found = request->slLocation != LOC_UNAVAILABLE; + return UDS_SUCCESS; + } + + Volume *volume = zone->index->volume; + if (isZoneChapterSparse(zone, virtualChapter) + && sparseCacheContains(volume->sparseCache, virtualChapter, + request->zoneNumber)) { + // The named chunk, if it exists, is in a sparse chapter that is cached, + // so just run the chunk through the sparse chapter cache search. + return searchSparseCacheInZone(zone, request, virtualChapter, found); + } + + return searchVolumePageCache(volume, request, &request->chunkName, + virtualChapter, &request->oldMetadata, found); +} + +/**********************************************************************/ +int putRecordInZone(IndexZone *zone, + Request *request, + const UdsChunkData *metadata) +{ + unsigned int remaining; + int result = putOpenChapter(zone->openChapter, &request->chunkName, metadata, + &remaining); + if (result != UDS_SUCCESS) { + return result; + } + + if (remaining == 0) { + return openNextChapter(zone, request); + } + + return UDS_SUCCESS; +} + +/**************************************************************************/ +int searchSparseCacheInZone(IndexZone *zone, + Request *request, + uint64_t virtualChapter, + bool *found) +{ + int recordPageNumber; + int result = searchSparseCache(zone, &request->chunkName, &virtualChapter, + &recordPageNumber); + if ((result != UDS_SUCCESS) || (virtualChapter == UINT64_MAX)) { + return result; + } + + Volume *volume = zone->index->volume; + // XXX map to physical chapter and validate. It would be nice to just pass + // the virtual in to the slow lane, since it's tracking invalidations. + unsigned int chapter + = mapToPhysicalChapter(volume->geometry, virtualChapter); + + return searchCachedRecordPage(volume, request, &request->chunkName, chapter, + recordPageNumber, &request->oldMetadata, + found); +} diff --git a/uds/indexZone.h b/uds/indexZone.h new file mode 100644 index 0000000..8301894 --- /dev/null +++ b/uds/indexZone.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexZone.h#2 $ + */ + +#ifndef INDEX_ZONE_H +#define INDEX_ZONE_H + +#include "common.h" +#include "openChapterZone.h" +#include "request.h" + +typedef struct { + struct index *index; + OpenChapterZone *openChapter; + OpenChapterZone *writingChapter; + uint64_t oldestVirtualChapter; + uint64_t newestVirtualChapter; + unsigned int id; +} IndexZone; + +/** + * Allocate an index zone. + * + * @param index The index receiving the zone + * @param zoneNumber The number of the zone to allocate + * + * @return UDS_SUCCESS or an error code. + **/ +int makeIndexZone(struct index *index, unsigned int zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Clean up an index zone. + * + * @param zone The index zone to free + * + * @return UDS_SUCCESS or an error code. + **/ +void freeIndexZone(IndexZone *zone); + +/** + * Check whether a chapter is sparse or dense based on the current state of + * the index zone. + * + * @param zone The index zone to check against + * @param virtualChapter The virtual chapter number of the chapter to check + * + * @return true if the chapter is in the sparse part of the volume + **/ +bool isZoneChapterSparse(const IndexZone *zone, + uint64_t virtualChapter) + __attribute__((warn_unused_result)); + +/** + * Set the active chapter numbers for a zone based on its index. The active + * chapters consist of the range of chapters from the current oldest to + * the current newest virtual chapter. + * + * @param zone The zone to set + **/ +void setActiveChapters(IndexZone *zone); + +/** + * Dispatch a control request to an index zone. + * + * @param request The request to dispatch + * + * @return UDS_SUCCESS or an error code + **/ +int dispatchIndexZoneControlRequest(Request *request) + __attribute__((warn_unused_result)); + +/** + * Execute a sparse chapter index cache barrier control request on the zone + * worker thread. This call into the sparse cache to coordinate the cache + * update with the other zones. + * + * @param zone The index zone receiving the barrier message + * @param barrier The barrier control message data + * + * @return UDS_SUCCESS or an error code if the chapter index could not be + * read or decoded + **/ +int executeSparseCacheBarrierMessage(IndexZone *zone, + BarrierMessageData *barrier) + __attribute__((warn_unused_result)); + +/** + * Open the next chapter. + * + * @param zone The zone containing the open chapter + * @param request The request which requires the next chapter to be + * opened + * + * @return UDS_SUCCESS if successful. + **/ +int openNextChapter(IndexZone *zone, Request *request) + __attribute__((warn_unused_result)); + +/** + * Determine the IndexRegion in which a block was found. + * + * @param zone The zone that was searched + * @param virtualChapter The virtual chapter number + * + * @return the IndexRegion of the chapter in which the block was found + **/ +IndexRegion computeIndexRegion(const IndexZone *zone, + uint64_t virtualChapter); + +/** + * Get a record from either the volume or the open chapter in a zone. + * + * @param zone The index zone to query + * @param request The request originating the query + * @param found A pointer to a bool which will be set to + * true if the record was found. 
+ * @param virtualChapter The chapter in which to search + * + * @return UDS_SUCCESS or an error code + **/ +int getRecordFromZone(IndexZone *zone, + Request *request, + bool *found, + uint64_t virtualChapter) + __attribute__((warn_unused_result)); + +/** + * Put a record in the open chapter. If this fills the chapter, the chapter + * will be closed and a new one will be opened. + * + * @param zone The index zone containing the chapter + * @param request The request containing the name of the record + * @param metadata The record metadata + * + * @return UDS_SUCCESS or an error + **/ +int putRecordInZone(IndexZone *zone, + Request *request, + const UdsChunkData *metadata) + __attribute__((warn_unused_result)); + +/** + * Search the cached sparse chapter index, either for a cached sparse hook, or + * as the last chance for finding the record named by a request. + * + * @param [in] zone the index zone + * @param [in] request the request originating the search + * @param [in] virtualChapter if UINT64_MAX, search the entire cache; + * otherwise search this chapter, if cached + * @param [out] found A pointer to a bool which will be set to + * true if the record was found + * + * @return UDS_SUCCESS or an error code + **/ +int searchSparseCacheInZone(IndexZone *zone, + Request *request, + uint64_t virtualChapter, + bool *found) + __attribute__((warn_unused_result)); + +#endif /* INDEX_ZONE_H */ diff --git a/uds/ioFactory.h b/uds/ioFactory.h new file mode 100644 index 0000000..ef6cc90 --- /dev/null +++ b/uds/ioFactory.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/ioFactory.h#7 $ + */ + +#ifndef IO_FACTORY_H +#define IO_FACTORY_H + +#include "bufferedReader.h" +#include "bufferedWriter.h" +#ifdef __KERNEL__ +#include +#else +#include "fileUtils.h" +#include "ioRegion.h" +#endif + +/* + * An IOFactory object is responsible for controlling access to index storage. + * The index is a contiguous range of blocks on a block device or within a + * file. + * + * The IOFactory holds the open device or file and is responsible for closing + * it. The IOFactory has methods to make IORegions that are used to access + * sections of the index. + */ +typedef struct ioFactory IOFactory; + +/* + * Define the UDS block size as 4K. Historically, we wrote the volume file in + * large blocks, but wrote all the other index data into byte streams stored in + * files. When we converted to writing an index into a block device, we + * changed to writing the byte streams into page sized blocks. Now that we + * support multiple architectures, we write into 4K blocks on all platforms. + * + * XXX We must convert all the rogue 4K constants to use UDS_BLOCK_SIZE. 
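+ *
+ * For example, a region handed to openBufferedReader() or
+ * openBufferedWriter() must cover a whole number of these blocks: a 1 MiB
+ * region is 1048576 bytes = 256 * UDS_BLOCK_SIZE, while a size that is not
+ * a multiple of UDS_BLOCK_SIZE is rejected with UDS_INCORRECT_ALIGNMENT by
+ * the kernel-mode implementation in ioFactoryLinuxKernel.c.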
+ */ +enum { UDS_BLOCK_SIZE = 4096 }; + +#ifdef __KERNEL__ +/** + * Create an IOFactory. The IOFactory is returned with a reference count of 1. + * + * @param path The path to the block device or file that contains the + * block stream + * @param factoryPtr The IOFactory is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int makeIOFactory(const char *path, IOFactory **factoryPtr) + __attribute__((warn_unused_result)); +#else +/** + * Create an IOFactory. The IOFactory is returned with a reference count of 1. + * + * @param path The path to the block device or file that contains the + * block stream + * @param access The requested access kind. + * @param factoryPtr The IOFactory is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int makeIOFactory(const char *path, + FileAccess access, + IOFactory **factoryPtr) + __attribute__((warn_unused_result)); +#endif + +/** + * Get another reference to an IOFactory, incrementing its reference count. + * + * @param factory The IOFactory + **/ +void getIOFactory(IOFactory *factory); + +/** + * Free a reference to an IOFactory. If the reference count drops to zero, + * free the IOFactory and release all its resources. + * + * @param factory The IOFactory + **/ +void putIOFactory(IOFactory *factory); + +/** + * Get the maximum potential size of the device or file. For a device, this is + * the actual size of the device. For a file, this is the largest file that we + * can possibly write. + * + * @param factory The IOFactory + * + * @return the writable size (in bytes) + **/ +size_t getWritableSize(IOFactory *factory) __attribute__((warn_unused_result)); + +#ifdef __KERNEL__ +/** + * Create a struct dm_bufio_client for a region of the index. + * + * @param factory The IOFactory + * @param offset The byte offset to the region within the index + * @param size The size of a block, in bytes + * @param reservedBuffers The number of buffers that can be reserved + * @param clientPtr The struct dm_bufio_client is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int makeBufio(IOFactory *factory, + off_t offset, + size_t blockSize, + unsigned int reservedBuffers, + struct dm_bufio_client **clientPtr) + __attribute__((warn_unused_result)); +#else +/** + * Create an IORegion for a region of the index. + * + * @param factory The IOFactory + * @param offset The byte offset to the region within the index + * @param size The size in bytes of the region + * @param regionPtr The IORegion is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int makeIORegion(IOFactory *factory, + off_t offset, + size_t size, + IORegion **regionPtr) + __attribute__((warn_unused_result)); +#endif + +/** + * Create a BufferedReader for a region of the index. + * + * @param factory The IOFactory + * @param offset The byte offset to the region within the index + * @param size The size in bytes of the region + * @param regionPtr The IORegion is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int openBufferedReader(IOFactory *factory, + off_t offset, + size_t size, + BufferedReader **readerPtr) + __attribute__((warn_unused_result)); + +/** + * Create a BufferedWriter for a region of the index. 
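+ *
+ * A minimal kernel-mode usage sketch (the device path is hypothetical,
+ * error handling is abbreviated, and offset and size are byte values
+ * assumed to satisfy the alignment rules described above):
+ *
+ *   IOFactory      *factory;
+ *   BufferedWriter *writer;
+ *   int result = makeIOFactory("/dev/sda1", &factory);
+ *   if (result == UDS_SUCCESS) {
+ *     result = openBufferedWriter(factory, offset, size, &writer);
+ *   }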
+ * + * @param factory The IOFactory + * @param offset The byte offset to the region within the index + * @param size The size in bytes of the region + * @param regionPtr The IORegion is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int openBufferedWriter(IOFactory *factory, + off_t offset, + size_t size, + BufferedWriter **writerPtr) + __attribute__((warn_unused_result)); + +#endif // IO_FACTORY_H diff --git a/uds/ioFactoryLinuxKernel.c b/uds/ioFactoryLinuxKernel.c new file mode 100644 index 0000000..9e45920 --- /dev/null +++ b/uds/ioFactoryLinuxKernel.c @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/ioFactoryLinuxKernel.c#9 $ + */ + +#include +#include + +#include "atomicDefs.h" +#include "ioFactory.h" +#include "logger.h" +#include "memoryAlloc.h" + +enum { BLK_FMODE = FMODE_READ | FMODE_WRITE }; + +/* + * A kernel mode IOFactory object controls access to an index stored on a block + * device. + */ +struct ioFactory { + struct block_device *bdev; + atomic_t refCount; +}; + +/*****************************************************************************/ +void getIOFactory(IOFactory *factory) +{ + atomic_inc(&factory->refCount); +} + +/*****************************************************************************/ +int makeIOFactory(const char *path, IOFactory **factoryPtr) +{ + struct block_device *bdev; + dev_t device = name_to_dev_t(path); + if (device != 0) { + bdev = blkdev_get_by_dev(device, BLK_FMODE, NULL); + } else { + bdev = blkdev_get_by_path(path, BLK_FMODE, NULL); + } + if (IS_ERR(bdev)) { + logErrorWithStringError(-PTR_ERR(bdev), "%s is not a block device", path); + return UDS_INVALID_ARGUMENT; + } + + IOFactory *factory; + int result = ALLOCATE(1, IOFactory, __func__, &factory); + if (result != UDS_SUCCESS) { + blkdev_put(bdev, BLK_FMODE); + return result; + } + + factory->bdev = bdev; + atomic_set_release(&factory->refCount, 1); + + *factoryPtr = factory; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +void putIOFactory(IOFactory *factory) +{ + if (atomic_add_return(-1, &factory->refCount) <= 0) { + blkdev_put(factory->bdev, BLK_FMODE); + FREE(factory); + } +} + +/*****************************************************************************/ +size_t getWritableSize(IOFactory *factory) +{ + return i_size_read(factory->bdev->bd_inode); +} + +/*****************************************************************************/ +int makeBufio(IOFactory *factory, + off_t offset, + size_t blockSize, + unsigned int reservedBuffers, + struct dm_bufio_client **clientPtr) +{ + if (offset % SECTOR_SIZE != 0) { + return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, + "offset %zd not multiple of %d", + offset, 
SECTOR_SIZE); + } + if (blockSize % UDS_BLOCK_SIZE != 0) { + return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, + "blockSize %zd not multiple of %d", + blockSize, UDS_BLOCK_SIZE); + } + + struct dm_bufio_client *client = dm_bufio_client_create(factory->bdev, + blockSize, + reservedBuffers, 0, + NULL, NULL); + if (IS_ERR(client)) { + return -PTR_ERR(client); + } + + dm_bufio_set_sector_offset(client, offset >> SECTOR_SHIFT); + *clientPtr = client; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int openBufferedReader(IOFactory *factory, + off_t offset, + size_t size, + BufferedReader **readerPtr) +{ + if (size % UDS_BLOCK_SIZE != 0) { + return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, + "region size %zd is not multiple of %d", + size, UDS_BLOCK_SIZE); + } + + struct dm_bufio_client *client = NULL; + int result = makeBufio(factory, offset, UDS_BLOCK_SIZE, 1, &client); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeBufferedReader(factory, client, size / UDS_BLOCK_SIZE, + readerPtr); + if (result != UDS_SUCCESS) { + dm_bufio_client_destroy(client); + } + return result; +} + +/*****************************************************************************/ +int openBufferedWriter(IOFactory *factory, + off_t offset, + size_t size, + BufferedWriter **writerPtr) +{ + if (size % UDS_BLOCK_SIZE != 0) { + return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, + "region size %zd is not multiple of %d", + size, UDS_BLOCK_SIZE); + } + + struct dm_bufio_client *client = NULL; + int result = makeBufio(factory, offset, UDS_BLOCK_SIZE, 1, &client); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeBufferedWriter(factory, client, size / UDS_BLOCK_SIZE, + writerPtr); + if (result != UDS_SUCCESS) { + dm_bufio_client_destroy(client); + } + return result; +} diff --git a/uds/layoutRegion.h b/uds/layoutRegion.h new file mode 100644 index 0000000..b49f979 --- /dev/null +++ b/uds/layoutRegion.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/layoutRegion.h#1 $ + */ + +#ifndef LAYOUT_REGION_H +#define LAYOUT_REGION_H + +/** + * Single file layouts are defined in terms of data regions. Each data region + * is a sub-section of the available space. Some data regions may contain + * subsidiary data regions, for example, a checkpoint or index save will + * contain master index regions (according to the number of zones), an + * index page map region, and possibly an open chapter region. + **/ + +static const uint64_t REGION_MAGIC = 0x416c6252676e3031; // 'AlbRgn01' + +typedef struct regionHeader { + uint64_t magic; // REGION_MAGIC + uint64_t regionBlocks; // size of whole region + uint16_t type; // RH_TYPE_... 
+ uint16_t version; // 1 + uint16_t numRegions; // number of layouts in the table + uint16_t payload; // extra data beyond region table +} RegionHeader; + +typedef struct layoutRegion { + uint64_t startBlock; + uint64_t numBlocks; + uint32_t checksum; // only used for save regions + uint16_t kind; + uint16_t instance; +} LayoutRegion; + +typedef struct regionTable { + RegionHeader header; + LayoutRegion regions[]; +} RegionTable; + +#endif // LAYOUT_REGION_H diff --git a/uds/loadType.c b/uds/loadType.c new file mode 100644 index 0000000..125f8b0 --- /dev/null +++ b/uds/loadType.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/loadType.c#1 $ + */ + +#include "loadType.h" + +#include "logger.h" + +/**********************************************************************/ +const char *getLoadType(LoadType loadType) +{ + switch (loadType) { + case LOAD_CREATE: + return "creating index"; + case LOAD_LOAD: + return "loading index"; + case LOAD_REBUILD: + return "loading or rebuilding index"; + default: + return "no load method specified"; + } +} diff --git a/uds/loadType.h b/uds/loadType.h new file mode 100644 index 0000000..2b93e72 --- /dev/null +++ b/uds/loadType.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/loadType.h#1 $ + */ + +#ifndef LOAD_TYPE_H +#define LOAD_TYPE_H + +/** + * Methods of starting the index. (Keep getLoadType() in sync.) + * + * Usage number 1 is to note the interface method that initiates loading the + * index. As in this table: + * + * name type opened by + * =========== ====== ==================== + * LOAD_CREATE local udsCreateLocalIndex + * LOAD_LOAD local udsLoadLocalIndex + * LOAD_REBUILD local udsRebuildLocalIndex + * + * Usage number 2 is to record how an index was really opened. 
As in this + * table: + * + * LOAD_CREATE new empty index + * LOAD_LOAD loaded saved index + * LOAD_REPLAY loaded checkpoint and replayed new chapters + * LOAD_EMPTY empty master index from empty volume data + * LOAD_REBUILD rebuilt master index from volume data + **/ +typedef enum { + LOAD_UNDEFINED = 0, + LOAD_CREATE, + LOAD_LOAD, + LOAD_REBUILD, + LOAD_EMPTY, + LOAD_REPLAY, +} LoadType; + +/** + * get a string indicating how an index is to be loaded. + * + * @param loadType The load type to log + **/ +const char *getLoadType(LoadType loadType); + +#endif /* LOAD_TYPE_H */ diff --git a/uds/logger.c b/uds/logger.c new file mode 100644 index 0000000..311bae1 --- /dev/null +++ b/uds/logger.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/logger.c#3 $ + */ + +#include "logger.h" + +#include "common.h" +#include "errors.h" +#include "stringUtils.h" +#include "threads.h" +#include "uds.h" + +typedef struct { + const char *name; + const int priority; +} PriorityName; + +static const PriorityName PRIORITIES[] = { + { "ALERT", LOG_ALERT }, + { "CRITICAL", LOG_CRIT }, + { "CRIT", LOG_CRIT }, + { "DEBUG", LOG_DEBUG }, + { "EMERGENCY", LOG_EMERG }, + { "EMERG", LOG_EMERG }, + { "ERROR", LOG_ERR }, + { "ERR", LOG_ERR }, + { "INFO", LOG_INFO }, + { "NOTICE", LOG_NOTICE }, + { "PANIC", LOG_EMERG }, + { "WARN", LOG_WARNING }, + { "WARNING", LOG_WARNING }, + { NULL, -1 }, +}; + +static const char *const PRIORITY_STRINGS[] = { + "EMERGENCY", + "ALERT", + "CRITICAL", + "ERROR", + "WARN", + "NOTICE", + "INFO", + "DEBUG", +}; + +static int logLevel = LOG_INFO; + +/*****************************************************************************/ +int getLogLevel(void) +{ + return logLevel; +} + +/*****************************************************************************/ +void setLogLevel(int newLogLevel) +{ + logLevel = newLogLevel; +} + +/*****************************************************************************/ +int stringToPriority(const char *string) +{ + int i; + for (i = 0; PRIORITIES[i].name != NULL; i++) { + if (strcasecmp(string, PRIORITIES[i].name) == 0) { + return PRIORITIES[i].priority; + } + } + return LOG_INFO; +} + +/*****************************************************************************/ +const char *priorityToString(int priority) +{ + if ((priority < 0) || (priority >= (int) COUNT_OF(PRIORITY_STRINGS))) { + return "unknown"; + } + return PRIORITY_STRINGS[priority]; +} + +/*****************************************************************************/ +void logEmbeddedMessage(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + ...) 
+{ + va_list ap; + va_start(ap, fmt2); + logMessagePack(priority, prefix, fmt1, args1, fmt2, ap); + va_end(ap); +} + +#pragma GCC diagnostic push +/* + * GCC (version 8.1.1 20180502 (Red Hat 8.1.1-1)) on Fedora 28 seems + * to think that this function should get a printf format + * attribute. But we have no second format string, and no additional + * arguments at the call site, and GCC also gets unhappy trying to + * analyze the format and values when there are none. So we'll just + * shut it up. + */ +#pragma GCC diagnostic ignored "-Wsuggest-attribute=format" +/** + * Log a message. + * + * This helper function exists solely to create a valid va_list with + * no useful info. It does the real work of vLogMessage, which wants a + * second va_list object to pass down. + * + * @param priority The syslog priority value for the message. + * @param format The format of the message (a printf style format) + * @param args The variadic argument list of format parameters. + **/ +static void vLogMessageHelper(int priority, + const char *format, + va_list args, + ...) +{ + va_list dummy; + va_start(dummy, args); + logMessagePack(priority, NULL, format, args, NULL, dummy); + va_end(dummy); +} +#pragma GCC diagnostic pop + +/*****************************************************************************/ +void vLogMessage(int priority, const char *format, va_list args) +{ + vLogMessageHelper(priority, format, args); +} + +/*****************************************************************************/ +void logMessage(int priority, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(priority, format, args); + va_end(args); +} + +/*****************************************************************************/ +void logDebug(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_DEBUG, format, args); + va_end(args); +} + +/*****************************************************************************/ +void logInfo(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_INFO, format, args); + va_end(args); +} + +/*****************************************************************************/ +void logNotice(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_NOTICE, format, args); + va_end(args); +} + +/*****************************************************************************/ +void logWarning(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_WARNING, format, args); + va_end(args); +} + +/*****************************************************************************/ +void logError(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_ERR, format, args); + va_end(args); +} + +/*****************************************************************************/ +int vLogWithStringError(int priority, + int errnum, + const char *format, + va_list args) +{ + char errbuf[ERRBUF_SIZE]; + logEmbeddedMessage(priority, NULL, format, args, ": %s (%d)", + stringError(errnum, errbuf, sizeof(errbuf)), + errnum); + return errnum; +} + +/*****************************************************************************/ +int logWithStringError(int priority, int errnum, const char *format, ...) 
+{ + va_list args; + + va_start(args, format); + vLogWithStringError(priority, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logErrorWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_ERR, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logWarningWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_WARNING, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logDebugWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_DEBUG, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logInfoWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_INFO, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logNoticeWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_NOTICE, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logFatalWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_CRIT, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logUnrecoverable(int errnum, const char *format, ...) +{ + if (isSuccessful(errnum)) { + return errnum; + } + va_list args; + va_start(args, format); + vLogWithStringError(LOG_CRIT, errnum, format, args); + va_end(args); + return makeUnrecoverable(errnum); +} + +/*****************************************************************************/ +void logFatal(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_CRIT, format, args); + va_end(args); +} diff --git a/uds/logger.h b/uds/logger.h new file mode 100644 index 0000000..b1f9d56 --- /dev/null +++ b/uds/logger.h @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/logger.h#5 $ + */ + +#ifndef LOGGER_H +#define LOGGER_H 1 + +#ifdef __KERNEL__ +#include +#include +#else +#include +#include "minisyslog.h" +#endif + +#ifdef __KERNEL__ +#define LOG_EMERG 0 /* system is unusable */ +#define LOG_ALERT 1 /* action must be taken immediately */ +#define LOG_CRIT 2 /* critical conditions */ +#define LOG_ERR 3 /* error conditions */ +#define LOG_WARNING 4 /* warning conditions */ +#define LOG_NOTICE 5 /* normal but significant condition */ +#define LOG_INFO 6 /* informational */ +#define LOG_DEBUG 7 /* debug-level messages */ +#endif + +#ifdef __KERNEL__ +// Make it easy to log real pointer values using %px when in development. +#ifdef LOG_INTERNAL +#define PRIptr "px" +#else +#define PRIptr "pK" +#endif +#else // not __KERNEL__ +// For compatibility with hooks we need when compiling in kernel mode. +#define PRIptr "p" +#endif + +/* + * Apply a rate limiter to a log method call. + * + * @param logFunc A method that does logging, which is not invoked if we are + * running in the kernel and the ratelimiter detects that we + * are calling it frequently. + */ +#ifdef __KERNEL__ +#define logRatelimit(logFunc, ...) \ + do { \ + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) { \ + logFunc(__VA_ARGS__); \ + } \ + } while (0) +#else +#define logRatelimit(logFunc, ...) logFunc(__VA_ARGS__) +#endif + +/** + * @file + * + * All of the log() functions will preserve the callers value of errno. + **/ + +#ifndef __KERNEL__ +/* + * In user mode, the functions in this file are not thread safe in the sense + * that nothing prevents multiple threads from closing loggers out from under + * other threads. In reality this isn't a problem since there are no calls to + * closeLogger() in production code. + */ + +/** + * Start the logger. + **/ +void openLogger(void); + +/** + * Stop the logger. + **/ +void closeLogger(void); +#endif + +/** + * Get the current logging level. + * + * @return the current logging priority level. + **/ +int getLogLevel(void); + +/** + * Set the current logging level. + * + * @param newLogLevel the new value for the logging priority level. + **/ +void setLogLevel(int newLogLevel); + +/** + * Return the integer logging priority represented by a name. + * + * @param string the name of the logging priority (case insensitive). + * + * @return the integer priority named by string, or LOG_INFO if not recognized. + **/ +int stringToPriority(const char *string); + +/** + * Return the printable name of a logging priority. + * + * @return the priority name + **/ +const char *priorityToString(int priority); + +/** + * Log a debug message. + * + * @param format The format of the message (a printf style format) + **/ +void logDebug(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log an informational message. + * + * @param format The format of the message (a printf style format) + **/ +void logInfo(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a normal (but notable) condition. + * + * @param format The format of the message (a printf style format) + **/ +void logNotice(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a warning. + * + * @param format The format of the message (a printf style format) + **/ +void logWarning(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log an error. 
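+ *
+ * For example, a (hypothetical) call such as
+ *
+ *   logError("cannot read chapter %llu", (unsigned long long) chapter);
+ *
+ * logs the formatted message at LOG_ERR priority and, like the other
+ * logging functions here, preserves the caller's errno.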
+ * + * @param format The format of the message (a printf style format) + **/ +void logError(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a message embedded within another message. + * + * @param priority the priority at which to log the message + * @param prefix optional string prefix to message, may be NULL + * @param fmt1 format of message first part, may be NULL + * @param args1 arguments for message first part + * @param fmt2 format of message second part + **/ +void logEmbeddedMessage(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + ...) + __attribute__((format(printf, 3, 0), format(printf, 5, 6))); + +/** + * Log a message pack consisting of multiple variable sections. + * + * @param priority the priority at which to log the message + * @param prefix optional string prefix to message, may be NULL + * @param fmt1 format of message first part, may be NULL + * @param args1 arguments for message first part + * @param fmt2 format of message second part, may be NULL + * @param args2 arguments for message second part + **/ +void logMessagePack(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + va_list args2) + __attribute__((format(printf, 3, 0))); + +/** + * Log a stack backtrace. + * + * @param priority The priority at which to log the backtrace + **/ +void logBacktrace(int priority); + +/** + * Log a message with an error from an error code. + * + * @param priority The priority of the logging entry + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * + * @return errnum + **/ +int logWithStringError(int priority, int errnum, const char *format, ...) + __attribute__((format(printf, 3, 4))); + +/** + * Log a message with an error from an error code. + * + * @param priority The priority of the logging entry + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * @param args The list of arguments with format. + * + * @return errnum + **/ +int vLogWithStringError(int priority, + int errnum, + const char *format, + va_list args) + __attribute__((format(printf, 3, 0))); + +/** + * Log an error prefixed with the string associated with the errnum. + * + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * + * @return errnum + **/ +int logErrorWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logDebugWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logInfoWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logNoticeWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logWarningWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logFatalWithStringError(int errnum, const char *format, ...) 
+ __attribute__((format(printf, 2, 3))); + +/** + * IF the result is an error, log a FATAL level message and return the result + * after marking it unrecoverable. The UDS_SUCCESS and UDS_QUEUED results are + * not considered errors and are returned unmodified. + * + * @param errnum int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * + * @return makeUnrecoverable(errnum) or UDS_SUCCESS or UDS_QUEUED + **/ +int logUnrecoverable(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/** + * Log a fatal error. + * + * @param format The format of the message (a printf style format) + **/ +void logFatal(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a message -- for internal use only. + * + * @param priority The syslog priority value for the message. + * @param format The format of the message (a printf style format) + * @param args The variadic argument list of format parameters. + **/ +void vLogMessage(int priority, const char *format, va_list args) + __attribute__((format(printf, 2, 0))); + +/** + * Log a message + * + * @param priority The syslog priority value for the message. + * @param format The format of the message (a printf style format) + **/ +void logMessage(int priority, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/** + * Sleep or delay a short time (likely a few milliseconds) in an attempt allow + * the log buffers to be written out in case they might be overrun. This is + * unnecessary in user-space (and is a no-op there), but is needed when + * quickly issuing a lot of log output in the Linux kernel, as when dumping a + * large number of data structures. + **/ +void pauseForLogger(void); + +#endif /* LOGGER_H */ diff --git a/uds/loggerLinuxKernel.c b/uds/loggerLinuxKernel.c new file mode 100644 index 0000000..bb1ad0b --- /dev/null +++ b/uds/loggerLinuxKernel.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/loggerLinuxKernel.c#2 $ + */ + +#include +#include +#include +#include + +#include "logger.h" + +/**********************************************************************/ +static const char *priorityToLogLevel(int priority) +{ + switch (priority) { + case LOG_EMERG: + case LOG_ALERT: + case LOG_CRIT: + return KERN_CRIT; + case LOG_ERR: + return KERN_ERR; + case LOG_WARNING: + return KERN_WARNING; + case LOG_NOTICE: + return KERN_NOTICE; + case LOG_INFO: + return KERN_INFO; + case LOG_DEBUG: + return KERN_DEBUG; + default: + return ""; + } +} + +/**********************************************************************/ +static const char *getCurrentInterruptType(void) +{ + if (in_nmi()) { + return "NMI"; + } + if (in_irq()) { + return "HI"; + } + if (in_softirq()) { + return "SI"; + } + return "INTR"; +} + +/**********************************************************************/ +void logMessagePack(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + va_list args2) +{ + if (priority > getLogLevel()) { + return; + } + + /* + * The kernel's printk has some magic for indirection to a secondary + * va_list. It wants us to supply a pointer to the va_list. + * + * However, va_list varies across platforms and can be an array + * type, which makes passing it around as an argument kind of + * tricky, due to the automatic conversion to a pointer. This makes + * taking the address of the argument a dicey thing; if we use "&a" + * it works fine for non-array types, but for array types we get the + * address of a pointer. Functions like va_copy and sprintf don't + * care as they get "va_list" values passed and are written to do + * the right thing, but printk explicitly wants the address of the + * va_list. + * + * So, we copy the va_list values to ensure that "&" consistently + * works the way we want. + */ + va_list args1Copy; + va_copy(args1Copy, args1); + va_list args2Copy; + va_copy(args2Copy, args2); + struct va_format vaf1 = { + .fmt = (fmt1 != NULL) ? fmt1 : "", + .va = &args1Copy, + }; + struct va_format vaf2 = { + .fmt = (fmt2 != NULL) ? fmt2 : "", + .va = &args2Copy, + }; + + if (prefix == NULL) { + prefix = ""; + } + + /* + * Context info formats: + * + * interrupt: uds[NMI]: blah + * process: uds: myprog: blah + * + * Fields: module name, interrupt level or process name. + * + * XXX need the equivalent of VDO's deviceInstance here + */ + if (in_interrupt()) { + printk("%s%s[%s]: %s%pV%pV\n", priorityToLogLevel(priority), + THIS_MODULE->name, getCurrentInterruptType(), prefix, &vaf1, &vaf2); + } else { + printk("%s%s: %s: %s%pV%pV\n", priorityToLogLevel(priority), + THIS_MODULE->name, current->comm, prefix, &vaf1, &vaf2); + } + + va_end(args1Copy); + va_end(args2Copy); +} + +/**********************************************************************/ +void logBacktrace(int priority) +{ + if (priority > getLogLevel()) { + return; + } + logMessage(priority, "[backtrace]"); + dump_stack(); +} + +/**********************************************************************/ +void pauseForLogger(void) +{ + // Hopefully, a few milliseconds of sleep will be large enough + // for the kernel log buffer to be flushed. + msleep(4); +} diff --git a/uds/masterIndex005.c b/uds/masterIndex005.c new file mode 100644 index 0000000..3f9a5b2 --- /dev/null +++ b/uds/masterIndex005.c @@ -0,0 +1,1470 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/masterIndex005.c#3 $
+ */
+#include "masterIndex005.h"
+
+#include "buffer.h"
+#include "compiler.h"
+#include "errors.h"
+#include "hashUtils.h"
+#include "logger.h"
+#include "memoryAlloc.h"
+#include "uds.h"
+#include "zone.h"
+
+/*
+ * The master index is kept as a delta index where the payload is a
+ * chapter number. The master index adds two basic functions to the delta
+ * index:
+ *
+ * (1) How to get the delta list number and address out of the chunk name.
+ *
+ * (2) Dealing with chapter numbers, and especially the lazy flushing of
+ *     chapters from the index.
+ *
+ * There are three ways of expressing chapter numbers: virtual, index, and
+ * rolling. The interface to the master index uses virtual chapter
+ * numbers, which are 64 bits long. We do not store such large values in
+ * memory, so we internally use a binary value using the minimal number of
+ * bits.
+ *
+ * The delta index stores the index chapter number, which is the low-order
+ * bits of the virtual chapter number.
+ *
+ * When we need to deal with ordering of index chapter numbers, we roll the
+ * index chapter number around so that the smallest one we are using has
+ * the representation 0. See convertIndexToVirtual() or
+ * flushInvalidEntries() for an example of this technique.
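+ *
+ * As a concrete illustration (the numbers are only an example, not a
+ * required configuration): suppose chapterBits is 10, so chapterMask is
+ * 1023 and the delta index stores virtual chapter numbers modulo 1024.
+ * If a zone currently indexes virtual chapters [5000, 6000], then
+ * virtualChapterLow is 5000 (stored as index chapter 5000 & 1023 == 904)
+ * and virtualChapterHigh is 6000 (stored as 880). A stale entry left over
+ * from virtual chapter 4996 is stored as index chapter 900; converting it
+ * back gives
+ *
+ *   rolling = (900 - 5000) & 1023 == 1020
+ *   virtual = 5000 + 1020       == 6020
+ *
+ * which lies outside [5000, 6000], so the entry is recognized as stale and
+ * removed by the lazy flushing code. An entry for chapter 5005 (stored as
+ * 909) converts to rolling == 5 and virtual == 5005, which is in range and
+ * therefore still valid.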
+ */ + +typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) masterIndexZone { + uint64_t virtualChapterLow; // The lowest virtual chapter indexed + uint64_t virtualChapterHigh; // The highest virtual chapter indexed + long numEarlyFlushes; // The number of early flushes +} MasterIndexZone; + +typedef struct { + MasterIndex common; // Common master index methods + DeltaIndex deltaIndex; // The delta index + uint64_t *flushChapters; // The first chapter to be flushed + MasterIndexZone *masterZones; // The Zones + uint64_t volumeNonce; // The volume nonce + uint64_t chapterZoneBits; // Expected size of a chapter (per zone) + uint64_t maxZoneBits; // Maximum size index (per zone) + unsigned int addressBits; // Number of bits in address mask + unsigned int addressMask; // Mask to get address within delta list + unsigned int chapterBits; // Number of bits in chapter number + unsigned int chapterMask; // Largest storable chapter number + unsigned int numChapters; // Number of chapters used + unsigned int numDeltaLists; // The number of delta lists + unsigned int numZones; // The number of zones +} MasterIndex5; + +typedef struct chapterRange { + unsigned int chapterStart; // The first chapter + unsigned int chapterCount; // The number of chapters +} ChapterRange; + +// Constants for the magic byte of a MasterIndexRecord +static const byte masterIndexRecordMagic = 0xAA; +static const byte badMagic = 0; + +/* + * In production, the default value for minMasterIndexDeltaLists will be + * replaced by MAX_ZONES*MAX_ZONES. Some unit tests will replace + * minMasterIndexDeltaLists with the non-default value 1, because those + * tests really want to run with a single delta list. + */ +unsigned int minMasterIndexDeltaLists; + +/** + * Maximum of two unsigned ints + * + * @param a One unsigned int + * @param b Another unsigned int + * + * @return the bigger one + **/ +static INLINE unsigned int maxUint(unsigned int a, unsigned int b) +{ + return a > b ? a : b; +} + +/** + * Extract the address from a block name. + * + * @param mi5 The master index + * @param name The block name + * + * @return the address + **/ +static INLINE unsigned int extractAddress(const MasterIndex5 *mi5, + const UdsChunkName *name) +{ + return extractMasterIndexBytes(name) & mi5->addressMask; +} + +/** + * Extract the delta list number from a block name. + * + * @param mi5 The master index + * @param name The block name + * + * @return the delta list number + **/ +static INLINE unsigned int extractDListNum(const MasterIndex5 *mi5, + const UdsChunkName *name) +{ + uint64_t bits = extractMasterIndexBytes(name); + return (bits >> mi5->addressBits) % mi5->numDeltaLists; +} + +/** + * Get the master index zone containing a given master index record + * + * @param record The master index record + * + * @return the master index zone + **/ +static INLINE const MasterIndexZone *getMasterZone(const MasterIndexRecord *record) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + return &mi5->masterZones[record->zoneNumber]; +} + +/** + * Convert an index chapter number to a virtual chapter number. 
+ * + * @param record The master index record + * @param indexChapter The index chapter number + * + * @return the virtual chapter number + **/ +static INLINE uint64_t convertIndexToVirtual(const MasterIndexRecord *record, + unsigned int indexChapter) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + const MasterIndexZone *masterZone = getMasterZone(record); + unsigned int rollingChapter + = ((indexChapter - masterZone->virtualChapterLow) & mi5->chapterMask); + return masterZone->virtualChapterLow + rollingChapter; +} + +/** + * Convert a virtual chapter number to an index chapter number. + * + * @param mi5 The master index + * @param virtualChapter The virtual chapter number + * + * @return the index chapter number + **/ +static INLINE unsigned int convertVirtualToIndex(const MasterIndex5 *mi5, + uint64_t virtualChapter) +{ + return virtualChapter & mi5->chapterMask; +} + +/** + * Determine whether a virtual chapter number is in the range being indexed + * + * @param record The master index record + * @param virtualChapter The virtual chapter number + * + * @return true if the virtual chapter number is being indexed + **/ +static INLINE bool isVirtualChapterIndexed(const MasterIndexRecord *record, + uint64_t virtualChapter) +{ + const MasterIndexZone *masterZone = getMasterZone(record); + return ((virtualChapter >= masterZone->virtualChapterLow) + && (virtualChapter <= masterZone->virtualChapterHigh)); +} + +/***********************************************************************/ +/** + * Flush an invalid entry from the master index, advancing to the next + * valid entry. + * + * @param record Updated to describe the next valid record + * @param flushRange Range of chapters to flush from the index + * @param nextChapterToInvalidate Updated to record the next chapter that we + * will need to invalidate + * + * @return UDS_SUCCESS or an error code + **/ +static INLINE int flushInvalidEntries(MasterIndexRecord *record, + ChapterRange *flushRange, + unsigned int *nextChapterToInvalidate) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + int result = nextDeltaIndexEntry(&record->deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + while (!record->deltaEntry.atEnd) { + unsigned int indexChapter = getDeltaEntryValue(&record->deltaEntry); + unsigned int relativeChapter = ((indexChapter - flushRange->chapterStart) + & mi5->chapterMask); + if (likely(relativeChapter >= flushRange->chapterCount)) { + if (relativeChapter < *nextChapterToInvalidate) { + *nextChapterToInvalidate = relativeChapter; + } + break; + } + result = removeDeltaIndexEntry(&record->deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +/** + * Find the delta index entry, or the insertion point for a delta index + * entry, while processing chapter LRU flushing. 
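+ *
+ * As an illustration (the numbers are only an example): suppose
+ * chapterMask == 1023, the zone currently indexes virtual chapters
+ * [5000, 6000], and this delta list was last flushed when virtual chapter
+ * 4990 was the oldest indexed chapter. The caller then passes
+ * flushRange->chapterStart == 4990 & 1023 == 894 and
+ * flushRange->chapterCount == 10. While walking the list, any entry whose
+ * stored chapter C satisfies ((C - 894) & 1023) < 10 belongs to a chapter
+ * that has since been expired and is removed on the fly; an entry for
+ * chapter 5005 (stored as 909, relative chapter 15) is kept. On return the
+ * flush range is collapsed so that the caller can record how far this
+ * delta list has now been flushed.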
+ * + * @param record Updated to describe the entry being looked for + * @param listNumber The delta list number + * @param key The address field being looked for + * @param flushRange The range of chapters to flush from the index + * + * @return UDS_SUCCESS or an error code + **/ +static int getMasterIndexEntry(MasterIndexRecord *record, + unsigned int listNumber, + unsigned int key, + ChapterRange *flushRange) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + unsigned int nextChapterToInvalidate = mi5->chapterMask; + + int result = startDeltaIndexSearch(&mi5->deltaIndex, listNumber, 0, + false, &record->deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + do { + result = flushInvalidEntries(record, flushRange, &nextChapterToInvalidate); + if (result != UDS_SUCCESS) { + return result; + } + } while (!record->deltaEntry.atEnd && (key > record->deltaEntry.key)); + + result = rememberDeltaIndexOffset(&record->deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + + // We probably found the record we want, but we need to keep going + MasterIndexRecord otherRecord = *record; + if (!otherRecord.deltaEntry.atEnd && (key == otherRecord.deltaEntry.key)) { + for (;;) { + result = flushInvalidEntries(&otherRecord, flushRange, + &nextChapterToInvalidate); + if (result != UDS_SUCCESS) { + return result; + } + if (otherRecord.deltaEntry.atEnd + || !otherRecord.deltaEntry.isCollision) { + break; + } + byte collisionName[UDS_CHUNK_NAME_SIZE]; + result = getDeltaEntryCollision(&otherRecord.deltaEntry, collisionName); + if (result != UDS_SUCCESS) { + return result; + } + if (memcmp(collisionName, record->name, UDS_CHUNK_NAME_SIZE) == 0) { + // This collision record is the one we are looking for + *record = otherRecord; + break; + } + } + } + while (!otherRecord.deltaEntry.atEnd) { + result = flushInvalidEntries(&otherRecord, flushRange, + &nextChapterToInvalidate); + if (result != UDS_SUCCESS) { + return result; + } + } + nextChapterToInvalidate += flushRange->chapterStart; + nextChapterToInvalidate &= mi5->chapterMask; + flushRange->chapterStart = nextChapterToInvalidate; + flushRange->chapterCount = 0; + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Terminate and clean up the master index + * + * @param masterIndex The master index to terminate + **/ +static void freeMasterIndex_005(MasterIndex *masterIndex) +{ + if (masterIndex != NULL) { + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + FREE(mi5->flushChapters); + mi5->flushChapters = NULL; + FREE(mi5->masterZones); + mi5->masterZones = NULL; + uninitializeDeltaIndex(&mi5->deltaIndex); + FREE(masterIndex); + } +} + +/** + * Constants and structures for the saved master index file. "MI5" is for + * masterIndex005, and "-XXXX" is a number to increment when the format of + * the data changes. + **/ +enum { MAGIC_SIZE = 8 }; +static const char MAGIC_MI_START[] = "MI5-0005"; + +struct mi005_data { + char magic[MAGIC_SIZE]; // MAGIC_MI_START + uint64_t volumeNonce; + uint64_t virtualChapterLow; + uint64_t virtualChapterHigh; + unsigned int firstList; + unsigned int numLists; +}; + +/***********************************************************************/ +/** + * Set the tag value used when saving and/or restoring a master index. 
+ * + * @param masterIndex The master index + * @param tag The tag value + **/ +static void setMasterIndexTag_005(MasterIndex *masterIndex, byte tag) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + setDeltaIndexTag(&mi5->deltaIndex, tag); +} + +/***********************************************************************/ +__attribute__((warn_unused_result)) +static int encodeMasterIndexHeader(Buffer *buffer, struct mi005_data *header) +{ + int result = putBytes(buffer, MAGIC_SIZE, MAGIC_MI_START); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->volumeNonce); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->virtualChapterLow); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->virtualChapterHigh); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->firstList); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->numLists); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(struct mi005_data), + "%zu bytes of config written, of %zu expected", + contentLength(buffer), sizeof(struct mi005_data)); + return result; +} + +/** + * Start saving a master index to a buffered output stream. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * @param bufferedWriter The index state component being written + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int startSavingMasterIndex_005(const MasterIndex *masterIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + MasterIndexZone *masterZone = &mi5->masterZones[zoneNumber]; + unsigned int firstList = getDeltaIndexZoneFirstList(&mi5->deltaIndex, + zoneNumber); + unsigned int numLists = getDeltaIndexZoneNumLists(&mi5->deltaIndex, + zoneNumber); + + struct mi005_data header; + memset(&header, 0, sizeof(header)); + memcpy(header.magic, MAGIC_MI_START, MAGIC_SIZE); + header.volumeNonce = mi5->volumeNonce; + header.virtualChapterLow = masterZone->virtualChapterLow; + header.virtualChapterHigh = masterZone->virtualChapterHigh; + header.firstList = firstList; + header.numLists = numLists; + + Buffer *buffer; + int result = makeBuffer(sizeof(struct mi005_data), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = encodeMasterIndexHeader(buffer, &header); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(bufferedWriter, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write master index header"); + } + result = makeBuffer(numLists * sizeof(uint64_t), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + uint64_t *firstFlushChapter = &mi5->flushChapters[firstList]; + result = putUInt64LEsIntoBuffer(buffer, numLists, firstFlushChapter); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(bufferedWriter, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write master index flush " + "ranges"); + 
} + + return startSavingDeltaIndex(&mi5->deltaIndex, zoneNumber, bufferedWriter); +} + +/***********************************************************************/ +/** + * Have all the data been written while saving a master index to an output + * stream? If the answer is yes, it is still necessary to call + * finishSavingMasterIndex(), which will return quickly. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return true if all the data are written + **/ +static bool isSavingMasterIndexDone_005(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + return isSavingDeltaIndexDone(&mi5->deltaIndex, zoneNumber); +} + +/***********************************************************************/ +/** + * Finish saving a master index to an output stream. Force the writing of + * all of the remaining data. If an error occurred asynchronously during + * the save operation, it will be returned here. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int finishSavingMasterIndex_005(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + return finishSavingDeltaIndex(&mi5->deltaIndex, zoneNumber); +} + +/***********************************************************************/ +/** + * Abort saving a master index to an output stream. If an error occurred + * asynchronously during the save operation, it will be dropped. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int abortSavingMasterIndex_005(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + return abortSavingDeltaIndex(&mi5->deltaIndex, zoneNumber); +} + +/***********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeMasterIndexHeader(Buffer *buffer, struct mi005_data *header) +{ + int result = getBytesFromBuffer(buffer, sizeof(header->magic), + &header->magic); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &header->volumeNonce); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &header->virtualChapterLow); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &header->virtualChapterHigh); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->firstList); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->numLists); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer) - contentLength(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + result = UDS_CORRUPT_COMPONENT; + } + return result; +} + +/** + * Start restoring the master index from multiple buffered readers + * + * @param masterIndex The master index to restore into + * @param bufferedReaders The buffered readers to read the master index from + * @param numReaders The number of buffered 
readers + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int startRestoringMasterIndex_005(MasterIndex *masterIndex, + BufferedReader **bufferedReaders, + int numReaders) +{ + if (masterIndex == NULL) { + return logWarningWithStringError(UDS_BAD_STATE, + "cannot restore to null master index"); + } + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + emptyDeltaIndex(&mi5->deltaIndex); + + uint64_t virtualChapterLow = 0; + uint64_t virtualChapterHigh = 0; + int i; + for (i = 0; i < numReaders; i++) { + Buffer *buffer; + int result = makeBuffer(sizeof(struct mi005_data), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(bufferedReaders[i], + getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logWarningWithStringError(result, + "failed to read master index header"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + struct mi005_data header; + result = decodeMasterIndexHeader(buffer, &header); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + if (memcmp(header.magic, MAGIC_MI_START, MAGIC_SIZE) != 0) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "master index file had bad magic" + " number"); + } + if (mi5->volumeNonce == 0) { + mi5->volumeNonce = header.volumeNonce; + } else if (header.volumeNonce != mi5->volumeNonce) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "master index volume nonce incorrect"); + } + if (i == 0) { + virtualChapterLow = header.virtualChapterLow; + virtualChapterHigh = header.virtualChapterHigh; + } else if (virtualChapterHigh != header.virtualChapterHigh) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "Inconsistent master index zone files:" + " Chapter range is [%llu,%" + PRIu64 "], chapter range %d is [%" + PRIu64 ",%llu]", + virtualChapterLow, virtualChapterHigh, + i, header.virtualChapterLow, + header.virtualChapterHigh); + } else if (virtualChapterLow < header.virtualChapterLow) { + virtualChapterLow = header.virtualChapterLow; + } + uint64_t *firstFlushChapter = &mi5->flushChapters[header.firstList]; + result = makeBuffer(header.numLists * sizeof(uint64_t), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(bufferedReaders[i], + getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logWarningWithStringError(result, + "failed to read master index flush" + " ranges"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = getUInt64LEsFromBuffer(buffer, header.numLists, + firstFlushChapter); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + } + + unsigned int z; + for (z = 0; z < mi5->numZones; z++) { + memset(&mi5->masterZones[z], 0, sizeof(MasterIndexZone)); + mi5->masterZones[z].virtualChapterLow = virtualChapterLow; + mi5->masterZones[z].virtualChapterHigh = virtualChapterHigh; + } + + int result = startRestoringDeltaIndex(&mi5->deltaIndex, bufferedReaders, + numReaders); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, "restoring delta index failed"); + } + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Have all the data been read while 
restoring a master index from an + * input stream? + * + * @param masterIndex The master index to restore into + * + * @return true if all the data are read + **/ +static bool isRestoringMasterIndexDone_005(const MasterIndex *masterIndex) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + return isRestoringDeltaIndexDone(&mi5->deltaIndex); +} + +/***********************************************************************/ +/** + * Restore a saved delta list + * + * @param masterIndex The master index to restore into + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * + * @return error code or UDS_SUCCESS + **/ +static int restoreDeltaListToMasterIndex_005(MasterIndex *masterIndex, + const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + return restoreDeltaListToDeltaIndex(&mi5->deltaIndex, dlsi, data); +} + +/***********************************************************************/ +/** + * Abort restoring a master index from an input stream. + * + * @param masterIndex The master index + **/ +static void abortRestoringMasterIndex_005(MasterIndex *masterIndex) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + abortRestoringDeltaIndex(&mi5->deltaIndex); +} + +/***********************************************************************/ +static void removeNewestChapters(MasterIndex5 *mi5, + unsigned int zoneNumber, + uint64_t virtualChapter) +{ + // Get the range of delta lists belonging to this zone + unsigned int firstList = getDeltaIndexZoneFirstList(&mi5->deltaIndex, + zoneNumber); + unsigned int numLists = getDeltaIndexZoneNumLists(&mi5->deltaIndex, + zoneNumber); + unsigned int lastList = firstList + numLists - 1; + + if (virtualChapter > mi5->chapterMask) { + // The virtual chapter number is large enough so that we can use the + // normal LRU mechanism without an unsigned underflow. + virtualChapter -= mi5->chapterMask + 1; + // Eliminate the newest chapters by renumbering them to become the + // oldest chapters + unsigned int i; + for (i = firstList; i <= lastList; i++) { + if (virtualChapter < mi5->flushChapters[i]) { + mi5->flushChapters[i] = virtualChapter; + } + } + } else { + // Underflow will prevent the fast path. Do it the slow and painful way. + MasterIndexZone *masterZone = &mi5->masterZones[zoneNumber]; + ChapterRange range; + range.chapterStart = convertVirtualToIndex(mi5, virtualChapter); + range.chapterCount = (mi5->chapterMask + 1 + - (virtualChapter - masterZone->virtualChapterLow)); + UdsChunkName name; + memset(&name, 0, sizeof(UdsChunkName)); + MasterIndexRecord record = (MasterIndexRecord) { + .magic = masterIndexRecordMagic, + .masterIndex = &mi5->common, + .name = &name, + .zoneNumber = zoneNumber, + }; + unsigned int i; + for (i = firstList; i <= lastList; i++) { + ChapterRange tempRange = range; + getMasterIndexEntry(&record, i, 0, &tempRange); + } + } +} + +/***********************************************************************/ +/** + * Set the open chapter number on a zone. The master index zone will be + * modified to index the proper number of chapters ending with the new open + * chapter. 
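+ *
+ * For example (illustrative values): if the zone currently indexes virtual
+ * chapters [5000, 6000] and numChapters == 1024, then setting the open
+ * chapter to:
+ *
+ *   6001 is normal forward motion; the range becomes [5000, 6001],
+ *        since 6001 - 1024 + 1 is still below 5000;
+ *   6500 is forward motion that expires old chapters; the range
+ *        becomes [5477, 6500];
+ *   8000 jumps entirely past the old range; the range collapses
+ *        to [8000, 8000];
+ *   5500 is backward motion overlapping the old range; the newest
+ *        chapters are removed and the range becomes [5000, 5500].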
+ * + * @param masterIndex The master index + * @param zoneNumber The zone number + * @param virtualChapter The new open chapter number + **/ +static void setMasterIndexZoneOpenChapter_005(MasterIndex *masterIndex, + unsigned int zoneNumber, + uint64_t virtualChapter) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + MasterIndexZone *masterZone = &mi5->masterZones[zoneNumber]; + // Take care here to avoid underflow of an unsigned value. Note that + // this is the smallest valid virtual low. We may or may not actually + // use this value. + uint64_t newVirtualLow = (virtualChapter >= mi5->numChapters + ? virtualChapter - mi5->numChapters + 1 + : 0); + + if (virtualChapter <= masterZone->virtualChapterLow) { + /* + * Moving backwards and the new range is totally before the old range. + * Note that moving to the lowest virtual chapter counts as totally before + * the old range, as we need to remove the entries in the open chapter. + */ + emptyDeltaIndexZone(&mi5->deltaIndex, zoneNumber); + masterZone->virtualChapterLow = virtualChapter; + masterZone->virtualChapterHigh = virtualChapter; + } else if (virtualChapter <= masterZone->virtualChapterHigh) { + // Moving backwards and the new range overlaps the old range. Note + // that moving to the same open chapter counts as backwards, as we need + // to remove the entries in the open chapter. + removeNewestChapters(mi5, zoneNumber, virtualChapter); + masterZone->virtualChapterHigh = virtualChapter; + } else if (newVirtualLow < masterZone->virtualChapterLow) { + // Moving forwards and we can keep all the old chapters + masterZone->virtualChapterHigh = virtualChapter; + } else if (newVirtualLow <= masterZone->virtualChapterHigh) { + // Moving forwards and we can keep some old chapters + masterZone->virtualChapterLow = newVirtualLow; + masterZone->virtualChapterHigh = virtualChapter; + } else { + // Moving forwards and the new range is totally after the old range + masterZone->virtualChapterLow = virtualChapter; + masterZone->virtualChapterHigh = virtualChapter; + } + // Check to see if the zone data has grown to be too large + if (masterZone->virtualChapterLow < masterZone->virtualChapterHigh) { + uint64_t usedBits = getDeltaIndexZoneDlistBitsUsed(&mi5->deltaIndex, + zoneNumber); + if (usedBits > mi5->maxZoneBits) { + // Expire enough chapters to free the desired space + uint64_t expireCount + = 1 + (usedBits - mi5->maxZoneBits) / mi5->chapterZoneBits; + if (expireCount == 1) { + logRatelimit(logInfo, + "masterZone %u: At chapter %" PRIu64 + ", expiring chapter %llu early", + zoneNumber, virtualChapter, + masterZone->virtualChapterLow); + masterZone->numEarlyFlushes++; + masterZone->virtualChapterLow++; + } else { + uint64_t firstExpired = masterZone->virtualChapterLow; + if (firstExpired + expireCount < masterZone->virtualChapterHigh) { + masterZone->numEarlyFlushes += expireCount; + masterZone->virtualChapterLow += expireCount; + } else { + masterZone->numEarlyFlushes + += masterZone->virtualChapterHigh - masterZone->virtualChapterLow; + masterZone->virtualChapterLow = masterZone->virtualChapterHigh; + } + logRatelimit(logInfo, + "masterZone %u: At chapter %" PRIu64 + ", expiring chapters %llu to %llu early", + zoneNumber, virtualChapter, firstExpired, + masterZone->virtualChapterLow - 1); + } + } + } +} + +/***********************************************************************/ +/** + * Set the open chapter number. 
The master index will be modified to index + * the proper number of chapters ending with the new open chapter. + * + * @param masterIndex The master index + * @param virtualChapter The new open chapter number + **/ +static void setMasterIndexOpenChapter_005(MasterIndex *masterIndex, + uint64_t virtualChapter) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + unsigned int z; + for (z = 0; z < mi5->numZones; z++) { + // In normal operation, we advance forward one chapter at a time. + // Log all abnormal changes. + MasterIndexZone *masterZone = &mi5->masterZones[z]; + bool logMove = virtualChapter != masterZone->virtualChapterHigh + 1; + if (logMove) { + logDebug("masterZone %u: The range of indexed chapters is moving from [%" + PRIu64 ", %llu] ...", + z, + masterZone->virtualChapterLow, + masterZone->virtualChapterHigh); + } + + setMasterIndexZoneOpenChapter_005(masterIndex, z, virtualChapter); + + if (logMove) { + logDebug("masterZone %u: ... and moving to [%llu, %llu]", + z, + masterZone->virtualChapterLow, + masterZone->virtualChapterHigh); + } + } +} + +/***********************************************************************/ +/** + * Find the master index zone associated with a chunk name + * + * @param masterIndex The master index + * @param name The chunk name + * + * @return the zone that the chunk name belongs to + **/ +static unsigned int getMasterIndexZone_005(const MasterIndex *masterIndex, + const UdsChunkName *name) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + unsigned int deltaListNumber = extractDListNum(mi5, name); + return getDeltaIndexZone(&mi5->deltaIndex, deltaListNumber); +} + +/***********************************************************************/ +/** + * Do a quick read-only lookup of the chunk name and return information + * needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name + * + * @return UDS_SUCCESS or an error code + **/ +static int lookupMasterIndexName_005(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage) +{ + triage->isSample = false; + triage->inSampledChapter = false; + triage->zone = getMasterIndexZone_005(masterIndex, name); + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Do a quick read-only lookup of the sampled chunk name and return + * information needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name. The zone and + * isSample fields are already filled in. Set + * inSampledChapter and virtualChapter if the chunk + * name is found in the index. 
+ * + * @return UDS_SUCCESS or an error code + **/ +static int lookupMasterIndexSampledName_005(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + unsigned int address = extractAddress(mi5, name); + unsigned int deltaListNumber = extractDListNum(mi5, name); + DeltaIndexEntry deltaEntry; + int result = getDeltaIndexEntry(&mi5->deltaIndex, deltaListNumber, address, + name->name, true, &deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + triage->inSampledChapter = !deltaEntry.atEnd && (deltaEntry.key == address); + if (triage->inSampledChapter) { + const MasterIndexZone *masterZone = &mi5->masterZones[triage->zone]; + unsigned int indexChapter = getDeltaEntryValue(&deltaEntry); + unsigned int rollingChapter = ((indexChapter + - masterZone->virtualChapterLow) + & mi5->chapterMask); + triage->virtualChapter = masterZone->virtualChapterLow + rollingChapter; + if (triage->virtualChapter > masterZone->virtualChapterHigh) { + triage->inSampledChapter = false; + } + } + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Find the master index record associated with a block name + * + * This is always the first routine to be called when dealing with a delta + * master index entry. The fields of the record parameter should be + * examined to determine the state of the record: + * + * If isFound is false, then we did not find an entry for the block + * name. Information is saved in the MasterIndexRecord so that + * putMasterIndexRecord() will insert an entry for that block name at + * the proper place. + * + * If isFound is true, then we did find an entry for the block name. + * Information is saved in the MasterIndexRecord so that the "chapter" + * and "isCollision" fields reflect the entry found. + * Calls to removeMasterIndexRecord() will remove the entry, calls to + * setMasterIndexRecordChapter() can modify the entry, and calls to + * putMasterIndexRecord() can insert a collision record with this + * entry. + * + * @param masterIndex The master index to search + * @param name The chunk name + * @param record Set to the info about the record searched for + * + * @return UDS_SUCCESS or an error code + **/ +static int getMasterIndexRecord_005(MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexRecord *record) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + unsigned int address = extractAddress(mi5, name); + unsigned int deltaListNumber = extractDListNum(mi5, name); + uint64_t flushChapter = mi5->flushChapters[deltaListNumber]; + record->magic = masterIndexRecordMagic; + record->masterIndex = masterIndex; + record->mutex = NULL; + record->name = name; + record->zoneNumber = getDeltaIndexZone(&mi5->deltaIndex, deltaListNumber); + const MasterIndexZone *masterZone = getMasterZone(record); + + int result; + if (flushChapter < masterZone->virtualChapterLow) { + ChapterRange range; + uint64_t flushCount = masterZone->virtualChapterLow - flushChapter; + range.chapterStart = convertVirtualToIndex(mi5, flushChapter); + range.chapterCount = (flushCount > mi5->chapterMask + ? 
mi5->chapterMask + 1 + : flushCount); + result = getMasterIndexEntry(record, deltaListNumber, address, &range); + flushChapter = convertIndexToVirtual(record, range.chapterStart); + if (flushChapter > masterZone->virtualChapterHigh) { + flushChapter = masterZone->virtualChapterHigh; + } + mi5->flushChapters[deltaListNumber] = flushChapter; + } else { + result = getDeltaIndexEntry(&mi5->deltaIndex, deltaListNumber, address, + name->name, false, &record->deltaEntry); + } + if (result != UDS_SUCCESS) { + return result; + } + record->isFound = (!record->deltaEntry.atEnd + && (record->deltaEntry.key == address)); + if (record->isFound) { + unsigned int indexChapter = getDeltaEntryValue(&record->deltaEntry); + record->virtualChapter = convertIndexToVirtual(record, indexChapter); + } + record->isCollision = record->deltaEntry.isCollision; + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Create a new record associated with a block name. + * + * @param record The master index record found by getRecord() + * @param virtualChapter The chapter number where block info is found + * + * @return UDS_SUCCESS or an error code + **/ +int putMasterIndexRecord(MasterIndexRecord *record, uint64_t virtualChapter) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + if (record->magic != masterIndexRecordMagic) { + return logWarningWithStringError(UDS_BAD_STATE, + "bad magic number in master index record"); + } + if (!isVirtualChapterIndexed(record, virtualChapter)) { + const MasterIndexZone *masterZone = getMasterZone(record); + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot put record into chapter number %" + PRIu64 " that is out of the valid range %" + PRIu64 " to %llu", + virtualChapter, + masterZone->virtualChapterLow, + masterZone->virtualChapterHigh); + } + unsigned int address = extractAddress(mi5, record->name); + if (unlikely(record->mutex != NULL)) { + lockMutex(record->mutex); + } + int result = putDeltaIndexEntry(&record->deltaEntry, address, + convertVirtualToIndex(mi5, virtualChapter), + record->isFound ? record->name->name : NULL); + if (unlikely(record->mutex != NULL)) { + unlockMutex(record->mutex); + } + switch (result) { + case UDS_SUCCESS: + record->virtualChapter = virtualChapter; + record->isCollision = record->deltaEntry.isCollision; + record->isFound = true; + break; + case UDS_OVERFLOW: + logRatelimit(logWarningWithStringError, UDS_OVERFLOW, + "Master index entry dropped due to overflow condition"); + logDeltaIndexEntry(&record->deltaEntry); + break; + default: + break; + } + return result; +} + +/**********************************************************************/ +static INLINE int validateRecord(MasterIndexRecord *record) +{ + if (record->magic != masterIndexRecordMagic) { + return logWarningWithStringError( + UDS_BAD_STATE, "bad magic number in master index record"); + } + if (!record->isFound) { + return logWarningWithStringError(UDS_BAD_STATE, + "illegal operation on new record"); + } + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Remove an existing record. 
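+ *
+ * Illustrative sketch of how a caller typically gets here (error handling
+ * omitted; "name" and "openChapter" stand for whatever chunk name and open
+ * virtual chapter the caller is working with):
+ *
+ *   MasterIndexRecord record;
+ *   masterIndex->getMasterIndexRecord(masterIndex, &name, &record);
+ *   if (record.isFound) {
+ *     // Either forget the existing entry ...
+ *     removeMasterIndexRecord(&record);
+ *     // ... or instead update it in place:
+ *     //   setMasterIndexRecordChapter(&record, openChapter);
+ *   } else {
+ *     // Nothing to remove; insert a new entry for the open chapter.
+ *     putMasterIndexRecord(&record, openChapter);
+ *   }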
+ * + * @param record The master index record found by getRecord() + * + * @return UDS_SUCCESS or an error code + **/ +int removeMasterIndexRecord(MasterIndexRecord *record) +{ + int result = validateRecord(record); + if (result != UDS_SUCCESS) { + return result; + } + // Mark the record so that it cannot be used again + record->magic = badMagic; + if (unlikely(record->mutex != NULL)) { + lockMutex(record->mutex); + } + result = removeDeltaIndexEntry(&record->deltaEntry); + if (unlikely(record->mutex != NULL)) { + unlockMutex(record->mutex); + } + return result; +} + +/***********************************************************************/ +/** + * Set the chapter number associated with a block name. + * + * @param record The master index record found by getRecord() + * @param virtualChapter The chapter number where the block info is now found. + * + * @return UDS_SUCCESS or an error code + **/ +int setMasterIndexRecordChapter(MasterIndexRecord *record, + uint64_t virtualChapter) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + int result = validateRecord(record); + if (result != UDS_SUCCESS) { + return result; + } + if (!isVirtualChapterIndexed(record, virtualChapter)) { + const MasterIndexZone *masterZone = getMasterZone(record); + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot set chapter number %" PRIu64 + " that is out of the valid range %" PRIu64 + " to %llu", + virtualChapter, + masterZone->virtualChapterLow, + masterZone->virtualChapterHigh); + } + if (unlikely(record->mutex != NULL)) { + lockMutex(record->mutex); + } + result = setDeltaEntryValue(&record->deltaEntry, + convertVirtualToIndex(mi5, virtualChapter)); + if (unlikely(record->mutex != NULL)) { + unlockMutex(record->mutex); + } + if (result != UDS_SUCCESS) { + return result; + } + record->virtualChapter = virtualChapter; + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Get the number of bytes used for master index entries. + * + * @param masterIndex The master index + * + * @return The number of bytes in use + **/ +static size_t getMasterIndexMemoryUsed_005(const MasterIndex *masterIndex) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + uint64_t bits = getDeltaIndexDlistBitsUsed(&mi5->deltaIndex); + return (bits + CHAR_BIT - 1) / CHAR_BIT; +} + +/***********************************************************************/ +/** + * Return the master index stats. There is only one portion of the master + * index in this implementation, and we call it the dense portion of the + * index. 
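+ *
+ * Illustrative sketch of use (the log message shown is only an example):
+ *
+ *   MasterIndexStats dense, sparse;
+ *   masterIndex->getMasterIndexStats(masterIndex, &dense, &sparse);
+ *   logInfo("master index holds %llu records (%llu collisions)",
+ *           (unsigned long long) dense.recordCount,
+ *           (unsigned long long) dense.collisionCount);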
+ * + * @param masterIndex The master index + * @param dense Stats for the dense portion of the index + * @param sparse Stats for the sparse portion of the index + **/ +static void getMasterIndexStats_005(const MasterIndex *masterIndex, + MasterIndexStats *dense, + MasterIndexStats *sparse) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + DeltaIndexStats dis; + getDeltaIndexStats(&mi5->deltaIndex, &dis); + dense->memoryAllocated = (dis.memoryAllocated + + sizeof(MasterIndex5) + + mi5->numDeltaLists * sizeof(uint64_t) + + mi5->numZones * sizeof(MasterIndexZone)); + dense->rebalanceTime = dis.rebalanceTime; + dense->rebalanceCount = dis.rebalanceCount; + dense->recordCount = dis.recordCount; + dense->collisionCount = dis.collisionCount; + dense->discardCount = dis.discardCount; + dense->overflowCount = dis.overflowCount; + dense->numLists = dis.numLists; + dense->earlyFlushes = 0; + unsigned int z; + for (z = 0; z < mi5->numZones; z++) { + dense->earlyFlushes += mi5->masterZones[z].numEarlyFlushes; + } + memset(sparse, 0, sizeof(MasterIndexStats)); +} + +/***********************************************************************/ +/** + * Determine whether a given chunk name is a hook. + * + * @param masterIndex The master index + * @param name The block name + * + * @return whether to use as sample + **/ +static bool isMasterIndexSample_005(const MasterIndex *masterIndex + __attribute__((unused)), + const UdsChunkName *name + __attribute__((unused))) +{ + return false; +} + +/***********************************************************************/ +typedef struct { + unsigned int addressBits; // Number of bits in address mask + unsigned int chapterBits; // Number of bits in chapter number + unsigned int meanDelta; // The mean delta + unsigned long numDeltaLists; // The number of delta lists + unsigned long numChapters; // Number of chapters used + size_t numBitsPerChapter; // The number of bits per chapter + size_t memorySize; // The number of bytes of delta list memory + size_t targetFreeSize; // The number of free bytes we desire +} Parameters005; + +/***********************************************************************/ +static int computeMasterIndexParameters005(const Configuration *config, + Parameters005 *params) +{ + enum { DELTA_LIST_SIZE = 256 }; + /* + * For a given zone count, setting the the minimum number of delta lists + * to the square of the number of zones ensures that the distribution of + * delta lists over zones doesn't underflow, leaving the last zone with + * an invalid number of delta lists. See the explanation in + * initializeDeltaIndex(). Because we can restart with a different number + * of zones but the number of delta lists is invariant across restart, + * we must use the largest number of zones to compute this minimum. + */ + unsigned long minDeltaLists = (minMasterIndexDeltaLists + ? 
minMasterIndexDeltaLists + : MAX_ZONES * MAX_ZONES); + + Geometry *geometry = config->geometry; + unsigned long recordsPerChapter = geometry->recordsPerChapter; + params->numChapters = geometry->chaptersPerVolume; + unsigned long recordsPerVolume = recordsPerChapter * params->numChapters; + unsigned int numAddresses = config->masterIndexMeanDelta * DELTA_LIST_SIZE; + params->numDeltaLists + = maxUint(recordsPerVolume / DELTA_LIST_SIZE, minDeltaLists); + params->addressBits = computeBits(numAddresses - 1); + params->chapterBits = computeBits(params->numChapters - 1); + + if ((unsigned int) params->numDeltaLists != params->numDeltaLists) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot initialize master index with %lu" + " delta lists", + params->numDeltaLists); + } + if (params->addressBits > 31) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot initialize master index with %u" + " address bits", + params->addressBits); + } + if (geometry->sparseChaptersPerVolume > 0) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot initialize dense master index" + " with %u sparse chapters", + geometry->sparseChaptersPerVolume); + } + if (recordsPerChapter == 0) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot initialize master index with %lu" + " records per chapter", + recordsPerChapter); + } + if (params->numChapters == 0) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot initialize master index with %lu" + " chapters per volume", + params->numChapters); + } + + /* + * We can now compute the probability that a delta list is not touched during + * the writing of an entire chapter. The computation is: + * + * double pNotTouched = pow((double) (params->numDeltaLists - 1) + * / params->numDeltaLists, + * recordsPerChapter); + * + * For the standard index sizes, about 78% of the delta lists are not + * touched, and therefore contain dead index entries that have not been + * eliminated by the lazy LRU processing. We can then compute how many dead + * index entries accumulate over time. The computation is: + * + * double invalidChapters = pNotTouched / (1.0 - pNotTouched); + * + * For the standard index sizes, we will need about 3.5 chapters of space for + * the dead index entries in a 1K chapter index. Since we do not want to do + * that floating point computation, we use 4 chapters per 1K of chapters. + */ + unsigned long invalidChapters = maxUint(params->numChapters / 256, 2); + unsigned long chaptersInMasterIndex = params->numChapters + invalidChapters; + unsigned long entriesInMasterIndex + = recordsPerChapter * chaptersInMasterIndex; + // Compute the mean delta + unsigned long addressSpan = params->numDeltaLists << params->addressBits; + params->meanDelta = addressSpan / entriesInMasterIndex; + // Project how large we expect a chapter to be + params->numBitsPerChapter = getDeltaMemorySize(recordsPerChapter, + params->meanDelta, + params->chapterBits); + // Project how large we expect the index to be + size_t numBitsPerIndex = params->numBitsPerChapter * chaptersInMasterIndex; + size_t expectedIndexSize = numBitsPerIndex / CHAR_BIT; + /* + * Set the total memory to be 6% larger than the expected index size. We + * want this number to be large enough that the we do not do a great many + * rebalances as the list when the list is full. We use MasterIndex_p1 + * to tune this setting. 
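+ *
+ * As a worked example (the numbers are only illustrative): a volume with
+ * 1024 chapters gets invalidChapters = max(1024 / 256, 2) = 4, so the
+ * index is sized to hold 1028 chapters' worth of entries. If the projected
+ * size of that index is 100 MB, then memorySize is set to 106 MB and
+ * targetFreeSize to 5 MB.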
+ */ + params->memorySize = expectedIndexSize * 106 / 100; + // Set the target free size to 5% of the expected index size + params->targetFreeSize = expectedIndexSize / 20; + return UDS_SUCCESS; +} + +/***********************************************************************/ +int computeMasterIndexSaveBytes005(const Configuration *config, + size_t *numBytes) +{ + Parameters005 params = { .addressBits = 0 }; + int result = computeMasterIndexParameters005(config, ¶ms); + if (result != UDS_SUCCESS) { + return result; + } + // Saving a MasterIndex005 needs a header plus one uint64_t per delta + // list plus the delta index. + *numBytes = (sizeof(struct mi005_data) + + params.numDeltaLists * sizeof(uint64_t) + + computeDeltaIndexSaveBytes(params.numDeltaLists, + params.memorySize)); + return UDS_SUCCESS; +} + +/***********************************************************************/ +int makeMasterIndex005(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) +{ + Parameters005 params = { .addressBits = 0 }; + int result = computeMasterIndexParameters005(config, ¶ms); + if (result != UDS_SUCCESS) { + return result; + } + + MasterIndex5 *mi5; + result = ALLOCATE(1, MasterIndex5, "master index", &mi5); + if (result != UDS_SUCCESS) { + *masterIndex = NULL; + return result; + } + + mi5->common.abortRestoringMasterIndex = abortRestoringMasterIndex_005; + mi5->common.abortSavingMasterIndex = abortSavingMasterIndex_005; + mi5->common.finishSavingMasterIndex = finishSavingMasterIndex_005; + mi5->common.freeMasterIndex = freeMasterIndex_005; + mi5->common.getMasterIndexMemoryUsed = getMasterIndexMemoryUsed_005; + mi5->common.getMasterIndexRecord = getMasterIndexRecord_005; + mi5->common.getMasterIndexStats = getMasterIndexStats_005; + mi5->common.getMasterIndexZone = getMasterIndexZone_005; + mi5->common.isMasterIndexSample = isMasterIndexSample_005; + mi5->common.isRestoringMasterIndexDone = isRestoringMasterIndexDone_005; + mi5->common.isSavingMasterIndexDone = isSavingMasterIndexDone_005; + mi5->common.lookupMasterIndexName = lookupMasterIndexName_005; + mi5->common.lookupMasterIndexSampledName = lookupMasterIndexSampledName_005; + mi5->common.restoreDeltaListToMasterIndex = restoreDeltaListToMasterIndex_005; + mi5->common.setMasterIndexOpenChapter = setMasterIndexOpenChapter_005; + mi5->common.setMasterIndexTag = setMasterIndexTag_005; + mi5->common.setMasterIndexZoneOpenChapter = setMasterIndexZoneOpenChapter_005; + mi5->common.startRestoringMasterIndex = startRestoringMasterIndex_005; + mi5->common.startSavingMasterIndex = startSavingMasterIndex_005; + + mi5->addressBits = params.addressBits; + mi5->addressMask = (1u << params.addressBits) - 1; + mi5->chapterBits = params.chapterBits; + mi5->chapterMask = (1u << params.chapterBits) - 1; + mi5->numChapters = params.numChapters; + mi5->numDeltaLists = params.numDeltaLists; + mi5->numZones = numZones; + mi5->chapterZoneBits = params.numBitsPerChapter / numZones; + mi5->volumeNonce = volumeNonce; + + result = initializeDeltaIndex(&mi5->deltaIndex, numZones, + params.numDeltaLists, params.meanDelta, + params.chapterBits, params.memorySize); + if (result == UDS_SUCCESS) { + mi5->maxZoneBits = ((getDeltaIndexDlistBitsAllocated(&mi5->deltaIndex) + - params.targetFreeSize * CHAR_BIT) + / numZones); + } + + // Initialize the chapter flush ranges to be empty. This depends upon + // allocate returning zeroed memory. 
+ if (result == UDS_SUCCESS) { + result = ALLOCATE(params.numDeltaLists, uint64_t, + "first chapter to flush", &mi5->flushChapters); + } + + // Initialize the virtual chapter ranges to start at zero. This depends + // upon allocate returning zeroed memory. + if (result == UDS_SUCCESS) { + result = ALLOCATE(numZones, MasterIndexZone, "master index zones", + &mi5->masterZones); + } + + if (result == UDS_SUCCESS) { + *masterIndex = &mi5->common; + } else { + freeMasterIndex_005(&mi5->common); + *masterIndex = NULL; + } + return result; +} diff --git a/uds/masterIndex005.h b/uds/masterIndex005.h new file mode 100644 index 0000000..5436c7f --- /dev/null +++ b/uds/masterIndex005.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/masterIndex005.h#1 $ + */ + +#ifndef MASTERINDEX005_H +#define MASTERINDEX005_H 1 + +#include "masterIndexOps.h" + +/** + * Make a new master index. + * + * @param config The configuration of the master index + * @param numZones The number of zones + * @param volumeNonce The nonce used to authenticate the index + * @param masterIndex Location to hold new master index ptr + * + * @return error code or UDS_SUCCESS + **/ +int makeMasterIndex005(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) + __attribute__((warn_unused_result)); + +/** + * Compute the number of bytes required to save a master index of a given + * configuration. + * + * @param config The configuration of the master index + * @param numBytes The number of bytes required to save the master index + * + * @return UDS_SUCCESS or an error code. + **/ +int computeMasterIndexSaveBytes005(const Configuration *config, + size_t *numBytes) + __attribute__((warn_unused_result)); + +#endif /* MASTERINDEX005_H */ diff --git a/uds/masterIndex006.c b/uds/masterIndex006.c new file mode 100644 index 0000000..3e1ef00 --- /dev/null +++ b/uds/masterIndex006.c @@ -0,0 +1,791 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/masterIndex006.c#2 $
+ */
+#include "masterIndex006.h"
+
+#include "buffer.h"
+#include "compiler.h"
+#include "errors.h"
+#include "hashUtils.h"
+#include "logger.h"
+#include "masterIndex005.h"
+#include "memoryAlloc.h"
+#include "permassert.h"
+#include "threads.h"
+#include "uds.h"
+
+/*
+ * The master index is kept as a wrapper around 2 master index
+ * implementations, one for dense chapters and one for sparse chapters.
+ * Methods will be routed to one or the other, or both, depending on the
+ * method and data passed in.
+ *
+ * The master index is divided into zones, and in normal operation there is
+ * one thread operating on each zone. Any operation that operates on all
+ * the zones needs to do its operation at a safe point that ensures that
+ * only one thread is operating on the master index.
+ *
+ * The only multithreaded operation supported by the sparse master index is
+ * the lookupMasterIndexName() method. It is called by the thread that
+ * assigns an index request to the proper zone, and needs to do a master
+ * index query for sampled chunk names. The zone mutexes are used to make
+ * this lookup operation safe.
+ */
+
+typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) masterIndexZone {
+  Mutex hookMutex; // Protects the sampled index in this zone
+} MasterIndexZone;
+
+typedef struct {
+  MasterIndex common; // Common master index methods
+  unsigned int sparseSampleRate; // The sparse sample rate
+  unsigned int numZones; // The number of zones
+  MasterIndex *miNonHook; // The non-hook index
+  MasterIndex *miHook; // The hook index == sample index
+  MasterIndexZone *masterZones; // The zones
+} MasterIndex6;
+
+/**
+ * Determine whether a given chunk name is a hook.
+ *
+ * @param masterIndex The master index
+ * @param name The block name
+ *
+ * @return whether to use as sample
+ **/
+static INLINE bool isMasterIndexSample_006(const MasterIndex *masterIndex,
+                                           const UdsChunkName *name)
+{
+  const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6,
+                                               common);
+  return (extractSamplingBytes(name) % mi6->sparseSampleRate) == 0;
+}
+
+/***********************************************************************/
+/**
+ * Get the subindex for the given chunk name
+ *
+ * @param masterIndex The master index
+ * @param name The block name
+ *
+ * @return the subindex
+ **/
+static INLINE MasterIndex *getSubIndex(const MasterIndex *masterIndex,
+                                       const UdsChunkName *name)
+{
+  const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6,
+                                               common);
+  return (isMasterIndexSample_006(masterIndex, name)
+          ? 
mi6->miHook + : mi6->miNonHook); +} + +/***********************************************************************/ +/** + * Terminate and clean up the master index + * + * @param masterIndex The master index to terminate + **/ +static void freeMasterIndex_006(MasterIndex *masterIndex) +{ + if (masterIndex != NULL) { + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + if (mi6->masterZones != NULL) { + unsigned int zone; + for (zone = 0; zone < mi6->numZones; zone++) { + destroyMutex(&mi6->masterZones[zone].hookMutex); + } + FREE(mi6->masterZones); + mi6->masterZones = NULL; + } + if (mi6->miNonHook != NULL) { + freeMasterIndex(mi6->miNonHook); + mi6->miNonHook = NULL; + } + if (mi6->miHook != NULL) { + freeMasterIndex(mi6->miHook); + mi6->miHook = NULL; + } + FREE(masterIndex); + } +} + +/***********************************************************************/ +/** + * Constants and structures for the saved master index file. "MI6" is for + * masterIndex006, and "-XXXX" is a number to increment when the format of + * the data changes. + **/ +enum { MAGIC_SIZE = 8 }; +static const char MAGIC_MI_START[] = "MI6-0001"; + +struct mi006_data { + char magic[MAGIC_SIZE]; // MAGIC_MI_START + unsigned int sparseSampleRate; +}; + +/***********************************************************************/ +/** + * Set the tag value used when saving and/or restoring a master index. + * + * @param masterIndex The master index + * @param tag The tag value + **/ +static void setMasterIndexTag_006(MasterIndex *masterIndex + __attribute__((unused)), + byte tag __attribute__((unused))) +{ +} + +/***********************************************************************/ +__attribute__((warn_unused_result)) +static int encodeMasterIndexHeader(Buffer *buffer, struct mi006_data *header) +{ + int result = putBytes(buffer, MAGIC_SIZE, MAGIC_MI_START); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->sparseSampleRate); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(struct mi006_data), + "%zu bytes of config written, of %zu expected", + contentLength(buffer), sizeof(struct mi006_data)); + return result; +} + +/** + * Start saving a master index to a buffered output stream. 
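+ *
+ * For this combined index, each zone's stream begins with a small header
+ * (the magic number and the sparse sample rate), followed by the non-hook
+ * and then the hook sub-index data for that zone.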
+ * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * @param bufferedWriter The index state component being written + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int startSavingMasterIndex_006(const MasterIndex *masterIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + Buffer *buffer; + int result = makeBuffer(sizeof(struct mi006_data), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + struct mi006_data header; + memset(&header, 0, sizeof(header)); + memcpy(header.magic, MAGIC_MI_START, MAGIC_SIZE); + header.sparseSampleRate = mi6->sparseSampleRate; + result = encodeMasterIndexHeader(buffer, &header); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(bufferedWriter, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "failed to write master index header"); + return result; + } + + result = startSavingMasterIndex(mi6->miNonHook, zoneNumber, bufferedWriter); + if (result != UDS_SUCCESS) { + return result; + } + + result = startSavingMasterIndex(mi6->miHook, zoneNumber, bufferedWriter); + if (result != UDS_SUCCESS) { + return result; + } + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Have all the data been written while saving a master index to an output + * stream? If the answer is yes, it is still necessary to call + * finishSavingMasterIndex(), which will return quickly. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return true if all the data are written + **/ +static bool isSavingMasterIndexDone_006(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + return (isSavingMasterIndexDone(mi6->miNonHook, zoneNumber) + && isSavingMasterIndexDone(mi6->miHook, zoneNumber)); +} + +/***********************************************************************/ +/** + * Finish saving a master index to an output stream. Force the writing of + * all of the remaining data. If an error occurred asynchronously during + * the save operation, it will be returned here. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int finishSavingMasterIndex_006(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + int result = finishSavingMasterIndex(mi6->miNonHook, zoneNumber); + if (result == UDS_SUCCESS) { + result = finishSavingMasterIndex(mi6->miHook, zoneNumber); + } + return result; +} + +/***********************************************************************/ +/** + * Abort saving a master index to an output stream. If an error occurred + * asynchronously during the save operation, it will be dropped. 
+ * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int abortSavingMasterIndex_006(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + int result = abortSavingMasterIndex(mi6->miNonHook, zoneNumber); + int result2 = abortSavingMasterIndex(mi6->miHook, zoneNumber); + if (result == UDS_SUCCESS) { + result = result2; + } + return result; +} + +/***********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeMasterIndexHeader(Buffer *buffer, struct mi006_data *header) +{ + int result = getBytesFromBuffer(buffer, sizeof(header->magic), + &header->magic); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->sparseSampleRate); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer) - contentLength(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + result = UDS_CORRUPT_COMPONENT; + } + return result; +} + +/** + * Start restoring the master index from multiple buffered readers + * + * @param masterIndex The master index to restore into + * @param bufferedReaders The buffered reader to read the master index from + * @param numReaders The number of buffered readers + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int startRestoringMasterIndex_006(MasterIndex *masterIndex, + BufferedReader **bufferedReaders, + int numReaders) +{ + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + int result = ASSERT_WITH_ERROR_CODE(masterIndex != NULL, UDS_BAD_STATE, + "cannot restore to null master index"); + if (result != UDS_SUCCESS) { + return result; + } + + int i; + for (i = 0; i < numReaders; i++) { + Buffer *buffer; + result = makeBuffer(sizeof(struct mi006_data), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(bufferedReaders[i], + getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logWarningWithStringError(result, + "failed to read master index header"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + struct mi006_data header; + result = decodeMasterIndexHeader(buffer, &header); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + if (memcmp(header.magic, MAGIC_MI_START, MAGIC_SIZE) != 0) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "master index file had bad magic" + " number"); + } + if (i == 0) { + mi6->sparseSampleRate = header.sparseSampleRate; + } else if (mi6->sparseSampleRate != header.sparseSampleRate) { + logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "Inconsistent sparse sample rate in delta" + " index zone files: %u vs. 
%u", + mi6->sparseSampleRate, + header.sparseSampleRate); + return UDS_CORRUPT_COMPONENT; + } + } + + result = startRestoringMasterIndex(mi6->miNonHook, bufferedReaders, + numReaders); + if (result != UDS_SUCCESS) { + return result; + } + return startRestoringMasterIndex(mi6->miHook, bufferedReaders, numReaders); +} + +/***********************************************************************/ +/** + * Have all the data been read while restoring a master index from an + * input stream? + * + * @param masterIndex The master index to restore into + * + * @return true if all the data are read + **/ +static bool isRestoringMasterIndexDone_006(const MasterIndex *masterIndex) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + return (isRestoringMasterIndexDone(mi6->miNonHook) + && isRestoringMasterIndexDone(mi6->miHook)); +} + +/***********************************************************************/ +/** + * Restore a saved delta list + * + * @param masterIndex The master index to restore into + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * + * @return error code or UDS_SUCCESS + **/ +static int restoreDeltaListToMasterIndex_006(MasterIndex *masterIndex, + const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) +{ + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + int result = restoreDeltaListToMasterIndex(mi6->miNonHook, dlsi, data); + if (result != UDS_SUCCESS) { + result = restoreDeltaListToMasterIndex(mi6->miHook, dlsi, data); + } + return result; +} + +/***********************************************************************/ +/** + * Abort restoring a master index from an input stream. + * + * @param masterIndex The master index + **/ +static void abortRestoringMasterIndex_006(MasterIndex *masterIndex) +{ + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + abortRestoringMasterIndex(mi6->miNonHook); + abortRestoringMasterIndex(mi6->miHook); +} + +/***********************************************************************/ +/** + * Set the open chapter number on a zone. The master index zone will be + * modified to index the proper number of chapters ending with the new open + * chapter. + * + * @param masterIndex The master index + * @param zoneNumber The zone number + * @param virtualChapter The new open chapter number + **/ +static void setMasterIndexZoneOpenChapter_006(MasterIndex *masterIndex, + unsigned int zoneNumber, + uint64_t virtualChapter) +{ + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + setMasterIndexZoneOpenChapter(mi6->miNonHook, zoneNumber, virtualChapter); + + // We need to prevent a lookupMasterIndexName() happening while we are + // changing the open chapter number + Mutex *mutex = &mi6->masterZones[zoneNumber].hookMutex; + lockMutex(mutex); + setMasterIndexZoneOpenChapter(mi6->miHook, zoneNumber, virtualChapter); + unlockMutex(mutex); +} + +/***********************************************************************/ +/** + * Set the open chapter number. The master index will be modified to index + * the proper number of chapters ending with the new open chapter. 
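+ *
+ * Since this operates on every zone, the threading rules described at the
+ * top of this file apply: it must be done at a safe point where no other
+ * thread is operating on the master index.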
+ * + * @param masterIndex The master index + * @param virtualChapter The new open chapter number + **/ +static void setMasterIndexOpenChapter_006(MasterIndex *masterIndex, + uint64_t virtualChapter) +{ + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + unsigned int zone; + for (zone = 0; zone < mi6->numZones; zone++) { + setMasterIndexZoneOpenChapter_006(masterIndex, zone, virtualChapter); + } +} + +/***********************************************************************/ +/** + * Find the master index zone associated with a chunk name + * + * @param masterIndex The master index + * @param name The chunk name + * + * @return the zone that the chunk name belongs to + **/ +static unsigned int getMasterIndexZone_006(const MasterIndex *masterIndex, + const UdsChunkName *name) +{ + return getMasterIndexZone(getSubIndex(masterIndex, name), name); +} + +/***********************************************************************/ +/** + * Do a quick read-only lookup of the chunk name and return information + * needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name + * + * @return UDS_SUCCESS or an error code + **/ +static int lookupMasterIndexName_006(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + triage->isSample = isMasterIndexSample_006(masterIndex, name); + triage->inSampledChapter = false; + triage->zone = getMasterIndexZone_006(masterIndex, name); + int result = UDS_SUCCESS; + if (triage->isSample) { + Mutex *mutex = &mi6->masterZones[triage->zone].hookMutex; + lockMutex(mutex); + result = lookupMasterIndexSampledName(mi6->miHook, name, triage); + unlockMutex(mutex); + } + return result; +} + +/***********************************************************************/ +/** + * Do a quick read-only lookup of the sampled chunk name and return + * information needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name. The zone and + * isSample fields are already filled in. Set + * inSampledChapter and virtualChapter if the chunk + * name is found in the index. + * + * @return UDS_SUCCESS or an error code + **/ +static int lookupMasterIndexSampledName_006(const MasterIndex *masterIndex + __attribute__((unused)), + const UdsChunkName *name + __attribute__((unused)), + MasterIndexTriage *triage + __attribute__((unused))) +{ + return ASSERT_WITH_ERROR_CODE(false, UDS_BAD_STATE, + "%s should not be called", __func__); +} + +/***********************************************************************/ +/** + * Find the master index record associated with a block name + * + * This is always the first routine to be called when dealing with a delta + * master index entry. The fields of the record parameter should be + * examined to determine the state of the record: + * + * If isFound is false, then we did not find an entry for the block + * name. Information is saved in the MasterIndexRecord so that + * putMasterIndexRecord() will insert an entry for that block name at + * the proper place. + * + * If isFound is true, then we did find an entry for the block name. + * Information is saved in the MasterIndexRecord so that the "chapter" + * and "isCollision" fields reflect the entry found. 
+ * Calls to removeMasterIndexRecord() will remove the entry, calls to + * setMasterIndexRecordChapter() can modify the entry, and calls to + * putMasterIndexRecord() can insert a collision record with this + * entry. + * + * @param masterIndex The master index to search + * @param name The chunk name + * @param record Set to the info about the record searched for + * + * @return UDS_SUCCESS or an error code + **/ +static int getMasterIndexRecord_006(MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexRecord *record) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + int result; + if (isMasterIndexSample_006(masterIndex, name)) { + /* + * We need to prevent a lookupMasterIndexName() happening while we are + * finding the master index record. Remember that because of lazy LRU + * flushing of the master index, getMasterIndexRecord() is not a + * read-only operation. + */ + unsigned int zone = getMasterIndexZone(mi6->miHook, name); + Mutex *mutex = &mi6->masterZones[zone].hookMutex; + lockMutex(mutex); + result = getMasterIndexRecord(mi6->miHook, name, record); + unlockMutex(mutex); + // Remember the mutex so that other operations on the MasterIndexRecord + // can use it + record->mutex = mutex; + } else { + result = getMasterIndexRecord(mi6->miNonHook, name, record); + } + return result; +} + +/***********************************************************************/ +/** + * Get the number of bytes used for master index entries. + * + * @param masterIndex The master index + * + * @return The number of bytes in use + **/ +static size_t getMasterIndexMemoryUsed_006(const MasterIndex *masterIndex) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + return (getMasterIndexMemoryUsed(mi6->miNonHook) + + getMasterIndexMemoryUsed(mi6->miHook)); +} + +/***********************************************************************/ +/** + * Return the master index stats. There is only one portion of the master + * index in this implementation, and we call it the dense portion of the + * index. 
+ * + * @param masterIndex The master index + * @param dense Stats for the dense portion of the index + * @param sparse Stats for the sparse portion of the index + **/ +static void getMasterIndexStats_006(const MasterIndex *masterIndex, + MasterIndexStats *dense, + MasterIndexStats *sparse) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + MasterIndexStats dummyStats; + getMasterIndexStats(mi6->miNonHook, dense, &dummyStats); + getMasterIndexStats(mi6->miHook, sparse, &dummyStats); +} + +/***********************************************************************/ +typedef struct { + Configuration hookConfig; // Describe the hook part of the index + Geometry hookGeometry; + Configuration nonHookConfig; // Describe the non-hook part of the index + Geometry nonHookGeometry; +} SplitConfig; + +/***********************************************************************/ +static int splitConfiguration006(const Configuration *config, + SplitConfig *split) +{ + int result + = ASSERT_WITH_ERROR_CODE(config->geometry->sparseChaptersPerVolume != 0, + UDS_INVALID_ARGUMENT, + "cannot initialize sparse+dense master index" + " with no sparse chapters"); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_WITH_ERROR_CODE(config->sparseSampleRate != 0, + UDS_INVALID_ARGUMENT, + "cannot initialize sparse+dense master" + " index with a sparse sample rate of %u", + config->sparseSampleRate); + if (result != UDS_SUCCESS) { + return result; + } + + // Start with copies of the base configuration + split->hookConfig = *config; + split->hookGeometry = *config->geometry; + split->hookConfig.geometry = &split->hookGeometry; + split->nonHookConfig = *config; + split->nonHookGeometry = *config->geometry; + split->nonHookConfig.geometry = &split->nonHookGeometry; + + uint64_t sampleRate = config->sparseSampleRate; + uint64_t numChapters = config->geometry->chaptersPerVolume; + uint64_t numSparseChapters = config->geometry->sparseChaptersPerVolume; + uint64_t numDenseChapters = numChapters - numSparseChapters; + uint64_t sampleRecords = config->geometry->recordsPerChapter / sampleRate; + + // Adjust the number of records indexed for each chapter + split->hookGeometry.recordsPerChapter = sampleRecords; + split->nonHookGeometry.recordsPerChapter -= sampleRecords; + + // Adjust the number of chapters indexed + split->hookGeometry.sparseChaptersPerVolume = 0; + split->nonHookGeometry.sparseChaptersPerVolume = 0; + split->nonHookGeometry.chaptersPerVolume = numDenseChapters; + return UDS_SUCCESS; +} + +/***********************************************************************/ +int computeMasterIndexSaveBytes006(const Configuration *config, + size_t *numBytes) +{ + SplitConfig split; + int result = splitConfiguration006(config, &split); + if (result != UDS_SUCCESS) { + return result; + } + size_t hookBytes, nonHookBytes; + result = computeMasterIndexSaveBytes005(&split.hookConfig, &hookBytes); + if (result != UDS_SUCCESS) { + return result; + } + result = computeMasterIndexSaveBytes005(&split.nonHookConfig, &nonHookBytes); + if (result != UDS_SUCCESS) { + return result; + } + // Saving a MasterIndex006 needs a header plus the hook index plus the + // non-hook index + *numBytes = sizeof(struct mi006_data) + hookBytes + nonHookBytes; + return UDS_SUCCESS; +} + +/***********************************************************************/ +int makeMasterIndex006(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) +{ + SplitConfig 
split; + int result = splitConfiguration006(config, &split); + if (result != UDS_SUCCESS) { + return result; + } + + MasterIndex6 *mi6; + result = ALLOCATE(1, MasterIndex6, "master index", &mi6); + if (result != UDS_SUCCESS) { + return result; + } + + mi6->common.abortRestoringMasterIndex = abortRestoringMasterIndex_006; + mi6->common.abortSavingMasterIndex = abortSavingMasterIndex_006; + mi6->common.finishSavingMasterIndex = finishSavingMasterIndex_006; + mi6->common.freeMasterIndex = freeMasterIndex_006; + mi6->common.getMasterIndexMemoryUsed = getMasterIndexMemoryUsed_006; + mi6->common.getMasterIndexRecord = getMasterIndexRecord_006; + mi6->common.getMasterIndexStats = getMasterIndexStats_006; + mi6->common.getMasterIndexZone = getMasterIndexZone_006; + mi6->common.isMasterIndexSample = isMasterIndexSample_006; + mi6->common.isRestoringMasterIndexDone = isRestoringMasterIndexDone_006; + mi6->common.isSavingMasterIndexDone = isSavingMasterIndexDone_006; + mi6->common.lookupMasterIndexName = lookupMasterIndexName_006; + mi6->common.lookupMasterIndexSampledName = lookupMasterIndexSampledName_006; + mi6->common.restoreDeltaListToMasterIndex = restoreDeltaListToMasterIndex_006; + mi6->common.setMasterIndexOpenChapter = setMasterIndexOpenChapter_006; + mi6->common.setMasterIndexTag = setMasterIndexTag_006; + mi6->common.setMasterIndexZoneOpenChapter = setMasterIndexZoneOpenChapter_006; + mi6->common.startRestoringMasterIndex = startRestoringMasterIndex_006; + mi6->common.startSavingMasterIndex = startSavingMasterIndex_006; + + mi6->numZones = numZones; + mi6->sparseSampleRate = config->sparseSampleRate; + + result = ALLOCATE(numZones, MasterIndexZone, "master index zones", + &mi6->masterZones); + unsigned int zone; + for (zone = 0; zone < numZones; zone++) { + if (result == UDS_SUCCESS) { + result = initMutex(&mi6->masterZones[zone].hookMutex); + } + } + if (result != UDS_SUCCESS) { + freeMasterIndex_006(&mi6->common); + return result; + } + + result = makeMasterIndex005(&split.nonHookConfig, numZones, volumeNonce, + &mi6->miNonHook); + if (result != UDS_SUCCESS) { + freeMasterIndex_006(&mi6->common); + return logErrorWithStringError(result, + "Error creating non hook master index"); + } + setMasterIndexTag(mi6->miNonHook, 'd'); + + result = makeMasterIndex005(&split.hookConfig, numZones, volumeNonce, + &mi6->miHook); + if (result != UDS_SUCCESS) { + freeMasterIndex_006(&mi6->common); + return logErrorWithStringError(result, + "Error creating hook master index"); + } + setMasterIndexTag(mi6->miHook, 's'); + + *masterIndex = &mi6->common; + return UDS_SUCCESS; +} diff --git a/uds/masterIndex006.h b/uds/masterIndex006.h new file mode 100644 index 0000000..1d3b377 --- /dev/null +++ b/uds/masterIndex006.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/masterIndex006.h#1 $ + */ + +#ifndef MASTERINDEX006_H +#define MASTERINDEX006_H 1 + +#include "masterIndexOps.h" + +/** + * Make a new master index. + * + * @param config The configuration of the master index + * @param numZones The number of zones + * @param volumeNonce The nonce used to authenticate the index + * @param masterIndex Location to hold new master index ptr + * + * @return error code or UDS_SUCCESS + **/ +int makeMasterIndex006(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) + __attribute__((warn_unused_result)); + +/** + * Compute the number of bytes required to save a master index of a given + * configuration. + * + * @param config The configuration of the master index + * @param numBytes The number of bytes required to save the master index + * + * @return UDS_SUCCESS or an error code. + **/ +int computeMasterIndexSaveBytes006(const Configuration *config, + size_t *numBytes) + __attribute__((warn_unused_result)); + +#endif /* MASTERINDEX006_H */ diff --git a/uds/masterIndexOps.c b/uds/masterIndexOps.c new file mode 100644 index 0000000..1cbd10b --- /dev/null +++ b/uds/masterIndexOps.c @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/masterIndexOps.c#4 $ + */ +#include "masterIndexOps.h" + +#include "compiler.h" +#include "errors.h" +#include "indexComponent.h" +#include "logger.h" +#include "masterIndex005.h" +#include "masterIndex006.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "uds.h" +#include "zone.h" + +/**********************************************************************/ +static INLINE bool usesSparse(const Configuration *config) +{ + return config->geometry->sparseChaptersPerVolume > 0; +} + +/**********************************************************************/ +void getMasterIndexCombinedStats(const MasterIndex *masterIndex, + MasterIndexStats *stats) +{ + MasterIndexStats dense, sparse; + getMasterIndexStats(masterIndex, &dense, &sparse); + stats->memoryAllocated = dense.memoryAllocated + sparse.memoryAllocated; + stats->rebalanceTime = dense.rebalanceTime + sparse.rebalanceTime; + stats->rebalanceCount = dense.rebalanceCount + sparse.rebalanceCount; + stats->recordCount = dense.recordCount + sparse.recordCount; + stats->collisionCount = dense.collisionCount + sparse.collisionCount; + stats->discardCount = dense.discardCount + sparse.discardCount; + stats->overflowCount = dense.overflowCount + sparse.overflowCount; + stats->numLists = dense.numLists + sparse.numLists; + stats->earlyFlushes = dense.earlyFlushes + sparse.earlyFlushes; +} + +/**********************************************************************/ +int makeMasterIndex(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) +{ + if (usesSparse(config)) { + return makeMasterIndex006(config, numZones, volumeNonce, masterIndex); + } else { + return makeMasterIndex005(config, numZones, volumeNonce, masterIndex); + } +} + +/**********************************************************************/ +int computeMasterIndexSaveBlocks(const Configuration *config, + size_t blockSize, uint64_t *blockCount) +{ + size_t numBytes; + int result = (usesSparse(config) + ? 
computeMasterIndexSaveBytes006(config, &numBytes) + : computeMasterIndexSaveBytes005(config, &numBytes)); + if (result != UDS_SUCCESS) { + return result; + } + numBytes += sizeof(DeltaListSaveInfo); + *blockCount = (numBytes + blockSize - 1) / blockSize + MAX_ZONES; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int readMasterIndex(ReadPortal *portal) +{ + MasterIndex *masterIndex = indexComponentContext(portal->component); + unsigned int numZones = portal->zones; + if (numZones > MAX_ZONES) { + return logErrorWithStringError(UDS_BAD_STATE, + "zone count %u must not exceed MAX_ZONES", + numZones); + } + + BufferedReader *readers[MAX_ZONES]; + unsigned int z; + for (z = 0; z < numZones; ++z) { + int result = getBufferedReaderForPortal(portal, z, &readers[z]); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot read component for zone %u", z); + } + } + return restoreMasterIndex(readers, numZones, masterIndex); +} + +/**********************************************************************/ +static int writeMasterIndex(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone, + IncrementalWriterCommand command, + bool *completed) +{ + MasterIndex *masterIndex = indexComponentContext(component); + bool isComplete = false; + + int result = UDS_SUCCESS; + + switch (command) { + case IWC_START: + result = startSavingMasterIndex(masterIndex, zone, writer); + isComplete = result != UDS_SUCCESS; + break; + case IWC_CONTINUE: + isComplete = isSavingMasterIndexDone(masterIndex, zone); + break; + case IWC_FINISH: + result = finishSavingMasterIndex(masterIndex, zone); + if (result == UDS_SUCCESS) { + result = writeGuardDeltaList(writer); + } + isComplete = true; + break; + case IWC_ABORT: + result = abortSavingMasterIndex(masterIndex, zone); + isComplete = true; + break; + default: + result = logWarningWithStringError(UDS_INVALID_ARGUMENT, + "Invalid writer command"); + break; + } + if (completed != NULL) { + *completed = isComplete; + } + return result; +} + +/**********************************************************************/ + +static const IndexComponentInfo MASTER_INDEX_INFO_DATA = { + .kind = RL_KIND_MASTER_INDEX, + .name = "master index", + .saveOnly = false, + .chapterSync = false, + .multiZone = true, + .ioStorage = true, + .loader = readMasterIndex, + .saver = NULL, + .incremental = writeMasterIndex, +}; +const IndexComponentInfo *const MASTER_INDEX_INFO = &MASTER_INDEX_INFO_DATA; + +/**********************************************************************/ +static int restoreMasterIndexBody(BufferedReader **bufferedReaders, + unsigned int numReaders, + MasterIndex *masterIndex, + byte dlData[DELTA_LIST_MAX_BYTE_COUNT]) +{ + // Start by reading the "header" section of the stream + int result = startRestoringMasterIndex(masterIndex, bufferedReaders, + numReaders); + if (result != UDS_SUCCESS) { + return result; + } + // Loop to read the delta lists, stopping when they have all been processed. 
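+  // (The saver ends each zone's data with a guard delta list; the loop
+  // below stops when readSavedDeltaList() reports UDS_END_OF_FILE.)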
+ unsigned int z; + for (z = 0; z < numReaders; z++) { + for (;;) { + DeltaListSaveInfo dlsi; + result = readSavedDeltaList(&dlsi, dlData, bufferedReaders[z]); + if (result == UDS_END_OF_FILE) { + break; + } else if (result != UDS_SUCCESS) { + abortRestoringMasterIndex(masterIndex); + return result; + } + result = restoreDeltaListToMasterIndex(masterIndex, &dlsi, dlData); + if (result != UDS_SUCCESS) { + abortRestoringMasterIndex(masterIndex); + return result; + } + } + } + if (!isRestoringMasterIndexDone(masterIndex)) { + abortRestoringMasterIndex(masterIndex); + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "incomplete delta list data"); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int restoreMasterIndex(BufferedReader **bufferedReaders, + unsigned int numReaders, + MasterIndex *masterIndex) +{ + byte *dlData; + int result = ALLOCATE(DELTA_LIST_MAX_BYTE_COUNT, byte, __func__, &dlData); + if (result != UDS_SUCCESS) { + return result; + } + result = restoreMasterIndexBody(bufferedReaders, numReaders, masterIndex, + dlData); + FREE(dlData); + return result; +} diff --git a/uds/masterIndexOps.h b/uds/masterIndexOps.h new file mode 100644 index 0000000..90802ac --- /dev/null +++ b/uds/masterIndexOps.h @@ -0,0 +1,527 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/masterIndexOps.h#1 $ + */ + +#ifndef MASTERINDEXOPS_H +#define MASTERINDEXOPS_H 1 + +#include "compiler.h" +#include "deltaIndex.h" +#include "indexComponent.h" +#include "indexConfig.h" +#include "threads.h" +#include "uds.h" + +extern const IndexComponentInfo *const MASTER_INDEX_INFO; +extern unsigned int minMasterIndexDeltaLists; + +typedef struct masterIndex MasterIndex; + +typedef struct { + size_t memoryAllocated; // Number of bytes allocated + RelTime rebalanceTime; // The number of seconds spent rebalancing + int rebalanceCount; // Number of memory rebalances + long recordCount; // The number of records in the index + long collisionCount; // The number of collision records + long discardCount; // The number of records removed + long overflowCount; // The number of UDS_OVERFLOWs detected + unsigned int numLists; // The number of delta lists + long earlyFlushes; // Number of early flushes +} MasterIndexStats; + +/* + * The MasterIndexTriage structure is used by lookupMasterIndexName(), + * which is a read-only operation that looks at the chunk name and returns + * some information used by the index to select the thread/queue/code_path + * that will process the chunk. 
+ */ +typedef struct { + uint64_t virtualChapter; // If inSampledChapter is true, then this is the + // chapter containing the entry for the chunk name + unsigned int zone; // The zone containing the chunk name + bool isSample; // If true, this chunk name belongs to the + // sampled index + bool inSampledChapter; // If true, this chunk already has an entry in the + // sampled index and virtualChapter is valid +} MasterIndexTriage; + +/* + * The MasterIndexRecord structure is used for normal index read-write + * processing of a chunk name. The first call must be to + * getMasterIndexRecord() to find the master index record for a chunk name. + * This call can be followed by putMasterIndexRecord() to add a master + * index record, or by setMasterIndexRecordChapter() to associate the chunk + * name with a different chapter, or by removeMasterIndexRecord() to delete + * a master index record. + */ +typedef struct { + // Public fields + uint64_t virtualChapter; // Chapter where the block info is found + bool isCollision; // This record is a collision + bool isFound; // This record is the block searched for + + // Private fields + unsigned char magic; // The magic number for valid records + unsigned int zoneNumber; // Zone that contains this block + MasterIndex *masterIndex; // The master index + Mutex *mutex; // Mutex that must be held while accessing + // this delta index entry; used only for + // a sampled index; otherwise is NULL + const UdsChunkName *name; // The blockname to which this record refers + DeltaIndexEntry deltaEntry; // The delta index entry for this record +} MasterIndexRecord; + +struct masterIndex { + void (*abortRestoringMasterIndex)(MasterIndex *masterIndex); + int (*abortSavingMasterIndex)(const MasterIndex *masterIndex, + unsigned int zoneNumber); + int (*finishSavingMasterIndex)(const MasterIndex *masterIndex, + unsigned int zoneNumber); + void (*freeMasterIndex)(MasterIndex *masterIndex); + size_t (*getMasterIndexMemoryUsed)(const MasterIndex *masterIndex); + int (*getMasterIndexRecord)(MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexRecord *record); + void (*getMasterIndexStats)(const MasterIndex *masterIndex, + MasterIndexStats *dense, + MasterIndexStats *sparse); + unsigned int (*getMasterIndexZone)(const MasterIndex *masterIndex, + const UdsChunkName *name); + bool (*isMasterIndexSample)(const MasterIndex *masterIndex, + const UdsChunkName *name); + bool (*isRestoringMasterIndexDone)(const MasterIndex *masterIndex); + bool (*isSavingMasterIndexDone)(const MasterIndex *masterIndex, + unsigned int zoneNumber); + int (*lookupMasterIndexName)(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage); + int (*lookupMasterIndexSampledName)(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage); + int (*restoreDeltaListToMasterIndex)(MasterIndex *masterIndex, + const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]); + void (*setMasterIndexOpenChapter)(MasterIndex *masterIndex, + uint64_t virtualChapter); + void (*setMasterIndexTag)(MasterIndex *masterIndex, byte tag); + void (*setMasterIndexZoneOpenChapter)(MasterIndex *masterIndex, + unsigned int zoneNumber, + uint64_t virtualChapter); + int (*startRestoringMasterIndex)(MasterIndex *masterIndex, + BufferedReader **bufferedReaders, + int numReaders); + int (*startSavingMasterIndex)(const MasterIndex *masterIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter); +}; + +/** + * Return the combined master 
index stats. + * + * @param masterIndex The master index + * @param stats Combined stats for the index + **/ +void getMasterIndexCombinedStats(const MasterIndex *masterIndex, + MasterIndexStats *stats); + +/** + * Make a new master index. + * + * @param config The configuration of the master index + * @param numZones The number of zones + * @param volumeNonce The nonce used to store the index + * @param masterIndex Location to hold new master index ptr + * + * @return error code or UDS_SUCCESS + **/ +int makeMasterIndex(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) + __attribute__((warn_unused_result)); + +/** + * Compute the number of blocks required to save a master index of a given + * configuration. + * + * @param [in] config The configuration of a master index + * @param [in] blockSize The size of a block in bytes. + * @param [out] blockCount The resulting number of blocks. + * + * @return UDS_SUCCESS or an error code. + **/ +int computeMasterIndexSaveBlocks(const Configuration *config, + size_t blockSize, + uint64_t *blockCount) + __attribute__((warn_unused_result)); + +/** + * Restore a master index. This is exposed for unit tests. + * + * @param readers The readers to read from. + * @param numReaders The number of readers. + * @param masterIndex The master index + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int restoreMasterIndex(BufferedReader **readers, + unsigned int numReaders, + MasterIndex *masterIndex) + __attribute__((warn_unused_result)); + +/** + * Abort restoring a master index from an input stream. + * + * @param masterIndex The master index + **/ +static INLINE void abortRestoringMasterIndex(MasterIndex *masterIndex) +{ + masterIndex->abortRestoringMasterIndex(masterIndex); +} + +/** + * Abort saving a master index to an output stream. If an error occurred + * asynchronously during the save operation, it will be dropped. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static INLINE int abortSavingMasterIndex(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + return masterIndex->abortSavingMasterIndex(masterIndex, zoneNumber); +} + +/** + * Finish saving a master index to an output stream. Force the writing of + * all of the remaining data. If an error occurred asynchronously during + * the save operation, it will be returned here. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static INLINE int finishSavingMasterIndex(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + return masterIndex->finishSavingMasterIndex(masterIndex, zoneNumber); +} + +/** + * Terminate and clean up the master index + * + * @param masterIndex The master index to terminate + **/ +static INLINE void freeMasterIndex(MasterIndex *masterIndex) +{ + masterIndex->freeMasterIndex(masterIndex); +} + +/** + * Get the number of bytes used for master index entries. 
+ * + * @param masterIndex The master index + * + * @return The number of bytes in use + **/ +static INLINE size_t getMasterIndexMemoryUsed(const MasterIndex *masterIndex) +{ + return masterIndex->getMasterIndexMemoryUsed(masterIndex); +} + +/** + * Find the master index record associated with a block name + * + * This is always the first routine to be called when dealing with a delta + * master index entry. The fields of the record parameter should be + * examined to determine the state of the record: + * + * If isFound is false, then we did not find an entry for the block name. + * Information is saved in the MasterIndexRecord so that + * putMasterIndexRecord() will insert an entry for that block name at the + * proper place. + * + * If isFound is true, then we did find an entry for the block name. + * Information is saved in the MasterIndexRecord so that the "chapter" and + * "isCollision" fields reflect the entry found. Calls to + * removeMasterIndexRecord() will remove the entry, calls to + * setMasterIndexRecordChapter() can modify the entry, and calls to + * putMasterIndexRecord() can insert a collision record with this entry. + * + * @param masterIndex The master index to search + * @param name The chunk name + * @param record Set to the info about the record searched for + * + * @return UDS_SUCCESS or an error code + **/ +static INLINE int getMasterIndexRecord(MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexRecord *record) +{ + return masterIndex->getMasterIndexRecord(masterIndex, name, record); +} + +/** + * Return the master index stats. + * + * @param masterIndex The master index + * @param dense Stats for the dense portion of the index + * @param sparse Stats for the sparse portion of the index + **/ +static INLINE void getMasterIndexStats(const MasterIndex *masterIndex, + MasterIndexStats *dense, + MasterIndexStats *sparse) +{ + masterIndex->getMasterIndexStats(masterIndex, dense, sparse); +} + +/** + * Find the master index zone associated with a chunk name + * + * @param masterIndex The master index + * @param name The chunk name + * + * @return the zone that the chunk name belongs to + **/ +static INLINE unsigned int getMasterIndexZone(const MasterIndex *masterIndex, + const UdsChunkName *name) +{ + return masterIndex->getMasterIndexZone(masterIndex, name); +} + +/** + * Determine whether a given chunk name is a hook. + * + * @param masterIndex The master index + * @param name The block name + * + * @return whether to use as sample + **/ +static INLINE bool isMasterIndexSample(const MasterIndex *masterIndex, + const UdsChunkName *name) +{ + return masterIndex->isMasterIndexSample(masterIndex, name); +} + +/** + * Have all the data been read while restoring a master index from an input + * stream? + * + * @param masterIndex The master index to restore into + * + * @return true if all the data are read + **/ +static INLINE bool isRestoringMasterIndexDone(const MasterIndex *masterIndex) +{ + return masterIndex->isRestoringMasterIndexDone(masterIndex); +} + +/** + * Have all the data been written while saving a master index to an + * output stream? If the answer is yes, it is still necessary to call + * finishSavingMasterIndex(), which will return quickly. 
+ * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return true if all the data are written + **/ +static INLINE bool isSavingMasterIndexDone(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + return masterIndex->isSavingMasterIndexDone(masterIndex, zoneNumber); +} + +/** + * Do a quick read-only lookup of the chunk name and return information + * needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name + * + * @return UDS_SUCCESS or an error code + **/ +static INLINE int lookupMasterIndexName(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage) +{ + return masterIndex->lookupMasterIndexName(masterIndex, name, triage); +} + +/** + * Do a quick read-only lookup of the sampled chunk name and return + * information needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name. The zone and + * isSample fields are already filled in. Set + * inSampledChapter and virtualChapter if the chunk + * name is found in the index. + * + * @return UDS_SUCCESS or an error code + **/ +static INLINE int lookupMasterIndexSampledName(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage) +{ + return masterIndex->lookupMasterIndexSampledName(masterIndex, name, triage); +} + +/** + * Create a new record associated with a block name. + * + * @param record The master index record found by getRecord() + * @param virtualChapter The chapter number where block info is found + * + * @return UDS_SUCCESS or an error code + **/ +int putMasterIndexRecord(MasterIndexRecord *record, uint64_t virtualChapter) + __attribute__((warn_unused_result)); + +/** + * Remove an existing record. + * + * @param record The master index record found by getRecord() + * + * @return UDS_SUCCESS or an error code + **/ +int removeMasterIndexRecord(MasterIndexRecord *record) + __attribute__((warn_unused_result)); + +/** + * Restore a saved delta list + * + * @param masterIndex The master index to restore into + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * + * @return error code or UDS_SUCCESS + **/ +static INLINE int restoreDeltaListToMasterIndex(MasterIndex *masterIndex, + const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) +{ + return masterIndex->restoreDeltaListToMasterIndex(masterIndex, dlsi, data); +} + +/** + * Set the open chapter number. The master index will be modified to index + * the proper number of chapters ending with the new open chapter. + * + * In normal operation, the virtual chapter number will be the next chapter + * following the currently open chapter. We will advance the master index + * one chapter forward in the virtual chapter space, invalidating the + * oldest chapter in the index and be prepared to add index entries for the + * newly opened chapter. + * + * In abnormal operation we make a potentially large change to the range of + * chapters being indexed. This happens when we are replaying chapters or + * rebuilding an entire index. If we move the open chapter forward, we + * will invalidate many chapters (potentially the entire index). 
If we + * move the open chapter backward, we invalidate any entry in the newly + * open chapter and any higher numbered chapter (potentially the entire + * index). + * + * @param masterIndex The master index + * @param virtualChapter The new open chapter number + **/ +static INLINE void setMasterIndexOpenChapter(MasterIndex *masterIndex, + uint64_t virtualChapter) +{ + masterIndex->setMasterIndexOpenChapter(masterIndex, virtualChapter); +} + +/** + * Set the chapter number associated with a block name. + * + * @param record The master index record found by getRecord() + * @param virtualChapter The chapter number where block info is now found. + * + * @return UDS_SUCCESS or an error code + **/ +int setMasterIndexRecordChapter(MasterIndexRecord *record, uint64_t chapter) + __attribute__((warn_unused_result)); + +/** + * Set the tag value used when saving and/or restoring a master index. + * + * @param masterIndex The master index + * @param tag The tag value + **/ +static INLINE void setMasterIndexTag(MasterIndex *masterIndex, byte tag) +{ + masterIndex->setMasterIndexTag(masterIndex, tag); +} + +/** + * Set the open chapter number on a zone. The master index zone will be + * modified to index the proper number of chapters ending with the new open + * chapter. + * + * @param masterIndex The master index + * @param zoneNumber The zone number + * @param virtualChapter The new open chapter number + **/ +static INLINE void setMasterIndexZoneOpenChapter(MasterIndex *masterIndex, + unsigned int zoneNumber, + uint64_t virtualChapter) +{ + masterIndex->setMasterIndexZoneOpenChapter(masterIndex, zoneNumber, + virtualChapter); +} + +/** + * Start restoring the master index from multiple buffered readers + * + * @param masterIndex The master index to restore into + * @param bufferedReaders The buffered reader to read the master index from + * @param numReaders The number of buffered readers + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static INLINE int startRestoringMasterIndex(MasterIndex *masterIndex, + BufferedReader **bufferedReaders, + int numReaders) +{ + return masterIndex->startRestoringMasterIndex(masterIndex, bufferedReaders, + numReaders); +} + +/** + * Start saving a master index to a buffered output stream. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * @param bufferedWriter The index state component being written + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static INLINE int startSavingMasterIndex(const MasterIndex *masterIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter) +{ + return masterIndex->startSavingMasterIndex(masterIndex, zoneNumber, + bufferedWriter); +} + +#endif /* MASTERINDEXOPS_H */ diff --git a/uds/memoryAlloc.c b/uds/memoryAlloc.c new file mode 100644 index 0000000..e47494c --- /dev/null +++ b/uds/memoryAlloc.c @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/memoryAlloc.c#1 $ + */ + +#include "memoryAlloc.h" + +#include "stringUtils.h" + +/**********************************************************************/ +int duplicateString(const char *string, const char *what, char **newString) +{ + return memdup(string, strlen(string) + 1, what, newString); +} + +/**********************************************************************/ +int memdup(const void *buffer, size_t size, const char *what, void *dupPtr) +{ + byte *dup; + int result = ALLOCATE(size, byte, what, &dup); + if (result != UDS_SUCCESS) { + return result; + } + + memcpy(dup, buffer, size); + *((void **) dupPtr) = dup; + return UDS_SUCCESS; +} diff --git a/uds/memoryAlloc.h b/uds/memoryAlloc.h new file mode 100644 index 0000000..c669e2b --- /dev/null +++ b/uds/memoryAlloc.h @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/memoryAlloc.h#2 $ + */ + +#ifndef MEMORY_ALLOC_H +#define MEMORY_ALLOC_H 1 + +#include + +#include "compiler.h" +#include "cpu.h" +#include "memoryDefs.h" +#include "permassert.h" + +/** + * Allocate storage based on memory size and alignment, logging an error if + * the allocation fails. The memory will be zeroed. + * + * @param size The size of an object + * @param align The required alignment + * @param what What is being allocated (for error logging) + * @param ptr A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +int allocateMemory(size_t size, size_t align, const char *what, void *ptr) + __attribute__((warn_unused_result)); + +/** + * Free storage + * + * @param ptr The memory to be freed + **/ +void freeMemory(void *ptr); + +/** + * Allocate storage based on element counts, sizes, and alignment. + * + * This is a generalized form of our allocation use case: It allocates + * an array of objects, optionally preceded by one object of another + * type (i.e., a struct with trailing variable-length array), with the + * alignment indicated. + * + * Why is this inline? The sizes and alignment will always be + * constant, when invoked through the macros below, and often the + * count will be a compile-time constant 1 or the number of extra + * bytes will be a compile-time constant 0. So at least some of the + * arithmetic can usually be optimized away, and the run-time + * selection between allocation functions always can. In many cases, + * it'll boil down to just a function call with a constant size. 
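+ *
+ * For example, the ALLOCATE() macro below expands to
+ * doAllocation(COUNT, sizeof(TYPE), 0, __alignof__(TYPE), WHAT, PTR), and
+ * ALLOCATE_EXTENDED() expands to doAllocation(COUNT, sizeof(TYPE2),
+ * sizeof(TYPE1), __alignof__(TYPE1), WHAT, PTR), i.e. one leading object
+ * followed by a trailing array.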
+ * + * @param count The number of objects to allocate + * @param size The size of an object + * @param extra The number of additional bytes to allocate + * @param align The required alignment + * @param what What is being allocated (for error logging) + * @param ptr A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +static INLINE int doAllocation(size_t count, + size_t size, + size_t extra, + size_t align, + const char *what, + void *ptr) +{ + size_t totalSize = count * size + extra; + // Overflow check: + if ((size > 0) && (count > ((SIZE_MAX - extra) / size))) { + /* + * This is kind of a hack: We rely on the fact that SIZE_MAX would + * cover the entire address space (minus one byte) and thus the + * system can never allocate that much and the call will always + * fail. So we can report an overflow as "out of memory" by asking + * for "merely" SIZE_MAX bytes. + */ + totalSize = SIZE_MAX; + } + + return allocateMemory(totalSize, align, what, ptr); +} + +/** + * Reallocate dynamically allocated memory. There are no alignment guarantees + * for the reallocated memory. + * + * @param ptr The memory to reallocate. + * @param oldSize The old size of the memory + * @param size The new size to allocate + * @param what What is being allocated (for error logging) + * @param newPtr A pointer to hold the reallocated pointer + * + * @return UDS_SUCCESS or an error code + **/ +int reallocateMemory(void *ptr, + size_t oldSize, + size_t size, + const char *what, + void *newPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate one or more elements of the indicated type, logging an + * error if the allocation fails. The memory will be zeroed. + * + * @param COUNT The number of objects to allocate + * @param TYPE The type of objects to allocate. This type determines the + * alignment of the allocated memory. + * @param WHAT What is being allocated (for error logging) + * @param PTR A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +#define ALLOCATE(COUNT, TYPE, WHAT, PTR) \ + doAllocation(COUNT, sizeof(TYPE), 0, __alignof__(TYPE), WHAT, PTR) + +/** + * Allocate one object of an indicated type, followed by one or more + * elements of a second type, logging an error if the allocation + * fails. The memory will be zeroed. + * + * @param TYPE1 The type of the primary object to allocate. This type + * determines the alignment of the allocated memory. + * @param COUNT The number of objects to allocate + * @param TYPE2 The type of array objects to allocate + * @param WHAT What is being allocated (for error logging) + * @param PTR A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +#define ALLOCATE_EXTENDED(TYPE1, COUNT, TYPE2, WHAT, PTR) \ + __extension__ ({ \ + TYPE1 **_ptr = (PTR); \ + STATIC_ASSERT(__alignof__(TYPE1) >= __alignof__(TYPE2)); \ + int _result = doAllocation(COUNT, sizeof(TYPE2), sizeof(TYPE1), \ + __alignof__(TYPE1), WHAT, _ptr); \ + _result; \ + }) + +/** + * Free memory allocated with ALLOCATE(). + * + * @param ptr Pointer to the memory to free + **/ +static INLINE void FREE(void *ptr) +{ + freeMemory(ptr); +} + +/** + * Allocate memory starting on a cache line boundary, logging an error if the + * allocation fails. The memory will be zeroed. 
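+ *
+ * A minimal illustrative call (the size and the name are arbitrary):
+ *
+ *   byte *buffer;
+ *   int result = allocateCacheAligned(4096, "aligned buffer", &buffer);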
+ * + * @param size The number of bytes to allocate + * @param what What is being allocated (for error logging) + * @param ptr A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static INLINE int allocateCacheAligned(size_t size, + const char *what, + void *ptr) +{ + return allocateMemory(size, CACHE_LINE_BYTES, what, ptr); +} + +/** + * Duplicate a string. + * + * @param string The string to duplicate + * @param what What is being allocated (for error logging) + * @param newString A pointer to hold the duplicated string + * + * @return UDS_SUCCESS or an error code + **/ +int duplicateString(const char *string, const char *what, char **newString) + __attribute__((warn_unused_result)); + +/** + * Duplicate a buffer, logging an error if the allocation fails. + * + * @param ptr The buffer to copy + * @param size The size of the buffer + * @param what What is being duplicated (for error logging) + * @param dupPtr A pointer to hold the allocated array + * + * @return UDS_SUCCESS or ENOMEM + **/ +int memdup(const void *ptr, size_t size, const char *what, void *dupPtr) + __attribute__((warn_unused_result)); + +/** + * Wrapper which permits freeing a const pointer. + * + * @param pointer the pointer to be freed + **/ +static INLINE void freeConst(const void *pointer) +{ + union { + const void *constP; + void *notConst; + } u = { .constP = pointer }; + FREE(u.notConst); +} + +/** + * Wrapper which permits freeing a volatile pointer. + * + * @param pointer the pointer to be freed + **/ +static INLINE void freeVolatile(volatile void *pointer) +{ + union { + volatile void *volP; + void *notVol; + } u = { .volP = pointer }; + FREE(u.notVol); +} + +#endif /* MEMORY_ALLOC_H */ diff --git a/uds/memoryDefs.h b/uds/memoryDefs.h new file mode 100644 index 0000000..3f8041e --- /dev/null +++ b/uds/memoryDefs.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/memoryDefs.h#2 $ + */ + +#ifndef LINUX_KERNEL_MEMORY_DEFS_H +#define LINUX_KERNEL_MEMORY_DEFS_H 1 + +#include // for PAGE_SIZE + +#include "compiler.h" +#include "threadRegistry.h" +#include "typeDefs.h" + +/** + * Allocate one or more elements of the indicated type, aligning them + * on the boundary that will allow them to be used in io, logging an + * error if the allocation fails. The memory will be zeroed. 
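+ *
+ * For example, with a hypothetical PageData type:
+ *
+ *   PageData *pages;
+ *   int result = ALLOCATE_IO_ALIGNED(8, PageData, "page data", &pages);
+ *
+ * allocates eight zeroed PageData elements aligned to PAGE_SIZE.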
+ * + * @param COUNT The number of objects to allocate + * @param TYPE The type of objects to allocate + * @param WHAT What is being allocated (for error logging) + * @param PTR A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +#define ALLOCATE_IO_ALIGNED(COUNT, TYPE, WHAT, PTR) \ + doAllocation(COUNT, sizeof(TYPE), 0, PAGE_SIZE, WHAT, PTR) + +/** + * Allocate one element of the indicated type immediately, failing if the + * required memory is not immediately available. + * + * @param TYPE The type of objects to allocate + * @param WHAT What is being allocated (for error logging) + * + * @return pointer to the memory, or NULL if the memory is not available. + **/ +#define ALLOCATE_NOWAIT(TYPE, WHAT) allocateMemoryNowait(sizeof(TYPE), WHAT) + +/** + * Perform termination of the memory allocation subsystem. + **/ +void memoryExit(void); + +/** + * Perform initialization of the memory allocation subsystem. + **/ +void memoryInit(void); + +/** + * Allocate storage based on memory size, failing immediately if the required + * memory is not available. The memory will be zeroed. + * + * @param size The size of an object. + * @param what What is being allocated (for error logging) + * + * @return pointer to the allocated memory, or NULL if the required space is + * not available. + **/ +void *allocateMemoryNowait(size_t size, const char *what) + __attribute__((warn_unused_result)); + + +/** + * Register the current thread as an allocating thread. + * + * An optional flag location can be supplied indicating whether, at + * any given point in time, the threads associated with that flag + * should be allocating storage. If the flag is false, a message will + * be logged. + * + * If no flag is supplied, the thread is always allowed to allocate + * storage without complaint. + * + * @param newThread RegisteredThread structure to use for the current thread + * @param flagPtr Location of the allocation-allowed flag + **/ +void registerAllocatingThread(RegisteredThread *newThread, + const bool *flagPtr); + +/** + * Unregister the current thread as an allocating thread. + **/ +void unregisterAllocatingThread(void); + +/** + * Get the memory statistics. + * + * @param bytesUsed A pointer to hold the number of bytes in use + * @param peakBytesUsed A pointer to hold the maximum value bytesUsed has + * attained + **/ +void getMemoryStats(uint64_t *bytesUsed, uint64_t *peakBytesUsed); + +/** + * Report stats on any allocated memory that we're tracking. + * + * Not all allocation types are guaranteed to be tracked in bytes + * (e.g., bios). + **/ +void reportMemoryUsage(void); + + +#endif /* LINUX_KERNEL_MEMORY_DEFS_H */ diff --git a/uds/memoryLinuxKernel.c b/uds/memoryLinuxKernel.c new file mode 100644 index 0000000..5a42583 --- /dev/null +++ b/uds/memoryLinuxKernel.c @@ -0,0 +1,426 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/memoryLinuxKernel.c#6 $ + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "compilerDefs.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + + +/* + ****************************************************************************** + * Production: UDS and VDO keep track of which threads are allowed to allocate + * memory freely, and which threads must be careful to not do a memory + * allocation that does an I/O request. The allocatingThreads ThreadsRegistry + * and its associated methods implement this tracking. + */ + +static ThreadRegistry allocatingThreads; + +/*****************************************************************************/ +static bool allocationsAllowed(void) +{ + const bool *pointer = lookupThread(&allocatingThreads); + return pointer != NULL ? *pointer : false; +} + +/*****************************************************************************/ +void registerAllocatingThread(RegisteredThread *newThread, const bool *flagPtr) +{ + if (flagPtr == NULL) { + static const bool allocationAlwaysAllowed = true; + flagPtr = &allocationAlwaysAllowed; + } + registerThread(&allocatingThreads, newThread, flagPtr); +} + +/*****************************************************************************/ +void unregisterAllocatingThread(void) +{ + unregisterThread(&allocatingThreads); +} + +/* + ****************************************************************************** + * Production: We track how much memory has been allocated and freed. When we + * unload the UDS module, we log an error if we have not freed all the memory + * that we allocated. Nearly all memory allocation and freeing is done using + * this module. + * + * We do not use kernel functions like the kvasprintf() method, which allocate + * memory indirectly using kmalloc. + * + * These data structures and methods are used to track the amount of memory + * used. + */ + +// We allocate very few large objects, and allocation/deallocation isn't done +// in a performance-critical stage for us, so a linked list should be fine. 
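+//
+// Every allocateMemory() call is expected to be balanced by a freeMemory()
+// call before memoryExit() runs, since memoryExit() complains (via
+// ASSERT_LOG_ONLY) if either byte counter is still nonzero. Callers can
+// also sample the counters at any time, for example:
+//
+//   uint64_t bytesUsed, peakBytesUsed;
+//   getMemoryStats(&bytesUsed, &peakBytesUsed);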
+typedef struct vmallocBlockInfo { + void *ptr; + size_t size; + struct vmallocBlockInfo *next; +} VmallocBlockInfo; + +static struct { + spinlock_t lock; + size_t kmallocBlocks; + size_t kmallocBytes; + size_t vmallocBlocks; + size_t vmallocBytes; + size_t peakBytes; + VmallocBlockInfo *vmallocList; +} memoryStats __cacheline_aligned; + +/*****************************************************************************/ +static void updatePeakUsage(void) +{ + size_t totalBytes = memoryStats.kmallocBytes + memoryStats.vmallocBytes; + if (totalBytes > memoryStats.peakBytes) { + memoryStats.peakBytes = totalBytes; + } +} + +/*****************************************************************************/ +static void addKmallocBlock(size_t size) +{ + unsigned long flags; + spin_lock_irqsave(&memoryStats.lock, flags); + memoryStats.kmallocBlocks++; + memoryStats.kmallocBytes += size; + updatePeakUsage(); + spin_unlock_irqrestore(&memoryStats.lock, flags); +} + +/*****************************************************************************/ +static void removeKmallocBlock(size_t size) +{ + unsigned long flags; + spin_lock_irqsave(&memoryStats.lock, flags); + memoryStats.kmallocBlocks--; + memoryStats.kmallocBytes -= size; + spin_unlock_irqrestore(&memoryStats.lock, flags); +} + +/*****************************************************************************/ +static void addVmallocBlock(VmallocBlockInfo *block) +{ + unsigned long flags; + spin_lock_irqsave(&memoryStats.lock, flags); + block->next = memoryStats.vmallocList; + memoryStats.vmallocList = block; + memoryStats.vmallocBlocks++; + memoryStats.vmallocBytes += block->size; + updatePeakUsage(); + spin_unlock_irqrestore(&memoryStats.lock, flags); +} + +/*****************************************************************************/ +static void removeVmallocBlock(void *ptr) +{ + VmallocBlockInfo *block, **blockPtr; + unsigned long flags; + spin_lock_irqsave(&memoryStats.lock, flags); + for (blockPtr = &memoryStats.vmallocList; + (block = *blockPtr) != NULL; + blockPtr = &block->next) { + if (block->ptr == ptr) { + *blockPtr = block->next; + memoryStats.vmallocBlocks--; + memoryStats.vmallocBytes -= block->size; + break; + } + } + spin_unlock_irqrestore(&memoryStats.lock, flags); + if (block != NULL) { + FREE(block); + } else { + logInfo("attempting to remove ptr %" PRIptr " not found in vmalloc list", + ptr); + } +} + + + +/** + * Determine whether allocating a memory block should use kmalloc or vmalloc. + * + * vmalloc can allocate any integral number of pages. + * + * kmalloc can allocate any number of bytes up to a configured limit, which + * defaults to 8 megabytes on some of our systems. kmalloc is especially good + * when memory is being both allocated and freed, and it does this efficiently + * in a multi CPU environment. + * + * kmalloc usually rounds the size of the block up to the next power of two. + * So when the requested block is bigger than PAGE_SIZE / 2 bytes, kmalloc will + * never give you less space than the corresponding vmalloc allocation. + * Sometimes vmalloc will use less overhead than kmalloc. + * + * The advantages of kmalloc do not help out UDS or VDO, because we allocate + * all our memory up front and do not free and reallocate it. Sometimes we + * have problems using kmalloc, because the Linux memory page map can become so + * fragmented that kmalloc will not give us a 32KB chunk. We have used vmalloc + * as a backup to kmalloc in the past, and a followup vmalloc of 32KB will + * work. 
But there is no strong case to be made for using kmalloc over vmalloc + * for these size chunks. + * + * The kmalloc/vmalloc boundary is set at 4KB, and kmalloc gets the 4KB + * requests. There is no strong reason for favoring either kmalloc or vmalloc + * for 4KB requests, except that the keeping of vmalloc statistics uses a + * linked list implementation. Using a simple test, this choice of boundary + * results in 132 vmalloc calls. Using vmalloc for requests of exactly 4KB + * results in an additional 6374 vmalloc calls, which will require a change to + * the code that tracks vmalloc statistics. + * + * @param size How many bytes to allocate + **/ +static INLINE bool useKmalloc(size_t size) +{ + return size <= PAGE_SIZE; +} + +/*****************************************************************************/ +int allocateMemory(size_t size, size_t align, const char *what, void *ptr) +{ + if (ptr == NULL) { + return UDS_INVALID_ARGUMENT; + } + if (size == 0) { + *((void **) ptr) = NULL; + return UDS_SUCCESS; + } + + + /* + * The __GFP_RETRY_MAYFAIL means: The VM implementation will retry memory + * reclaim procedures that have previously failed if there is some indication + * that progress has been made else where. It can wait for other tasks to + * attempt high level approaches to freeing memory such as compaction (which + * removes fragmentation) and page-out. There is still a definite limit to + * the number of retries, but it is a larger limit than with __GFP_NORETRY. + * Allocations with this flag may fail, but only when there is genuinely + * little unused memory. While these allocations do not directly trigger the + * OOM killer, their failure indicates that the system is likely to need to + * use the OOM killer soon. The caller must handle failure, but can + * reasonably do so by failing a higher-level request, or completing it only + * in a much less efficient manner. + */ + const gfp_t gfpFlags = GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL; + + bool allocationsRestricted = !allocationsAllowed(); + unsigned int noioFlags; + if (allocationsRestricted) { + noioFlags = memalloc_noio_save(); + } + + unsigned long startTime = jiffies; + void *p = NULL; + if (useKmalloc(size) && (align < PAGE_SIZE)) { + p = kmalloc(size, gfpFlags | __GFP_NOWARN); + if (p == NULL) { + /* + * If we had just done kmalloc(size, gfpFlags) it is possible that the + * allocation would fail (see VDO-3688). The kernel log would then + * contain a long report about the failure. Although the failure occurs + * because there is no page available to allocate, by the time it logs + * the available space, there is a page available. So hopefully a short + * sleep will allow the page reclaimer to free a single page, which is + * all that we need. + */ + msleep(1); + p = kmalloc(size, gfpFlags); + } + if (p != NULL) { + addKmallocBlock(ksize(p)); + } + } else { + VmallocBlockInfo *block; + if (ALLOCATE(1, VmallocBlockInfo, __func__, &block) == UDS_SUCCESS) { + /* + * If we just do __vmalloc(size, gfpFlags, PAGE_KERNEL) it is possible + * that the allocation will fail (see VDO-3661). The kernel log will + * then contain a long report about the failure. Although the failure + * occurs because there are not enough pages available to allocate, by + * the time it logs the available space, there may enough pages available + * for smaller allocations. So hopefully a short sleep will allow the + * page reclaimer to free enough pages for us. 
+ * + * For larger allocations, the kernel page_alloc code is racing against + * the page reclaimer. If the page reclaimer can stay ahead of + * page_alloc, the __vmalloc will succeed. But if page_alloc overtakes + * the page reclaimer, the allocation fails. It is possible that more + * retries will succeed. + */ + for (;;) { + p = __vmalloc(size, gfpFlags | __GFP_NOWARN, PAGE_KERNEL); + // Try again unless we succeeded or more than 1 second has elapsed. + if ((p != NULL) || (jiffies_to_msecs(jiffies - startTime) > 1000)) { + break; + } + msleep(1); + } + if (p == NULL) { + // Try one more time, logging a failure for this call. + p = __vmalloc(size, gfpFlags, PAGE_KERNEL); + } + if (p == NULL) { + FREE(block); + } else { + block->ptr = p; + block->size = PAGE_ALIGN(size); + addVmallocBlock(block); + } + } + } + + if (allocationsRestricted) { + memalloc_noio_restore(noioFlags); + } + + if (p == NULL) { + unsigned int duration = jiffies_to_msecs(jiffies - startTime); + logError("Could not allocate %zu bytes for %s in %u msecs", + size, what, duration); + return ENOMEM; + } + *((void **) ptr) = p; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +void *allocateMemoryNowait(size_t size, + const char *what __attribute__((unused))) +{ + void *p = kmalloc(size, GFP_NOWAIT | __GFP_ZERO); + if (p != NULL) { + addKmallocBlock(ksize(p)); + } + return p; +} + +/*****************************************************************************/ +void freeMemory(void *ptr) +{ + if (ptr != NULL) { + if (is_vmalloc_addr(ptr)) { + removeVmallocBlock(ptr); + vfree(ptr); + } else { + removeKmallocBlock(ksize(ptr)); + kfree(ptr); + } + } +} + +/*****************************************************************************/ +int reallocateMemory(void *ptr, + size_t oldSize, + size_t size, + const char *what, + void *newPtr) +{ + // Handle special case of zero sized result + if (size == 0) { + FREE(ptr); + *(void **)newPtr = NULL; + return UDS_SUCCESS; + } + + int result = ALLOCATE(size, char, what, newPtr); + if (result != UDS_SUCCESS) { + return result; + } + + if (ptr != NULL) { + if (oldSize < size) { + size = oldSize; + } + memcpy(*((void **) newPtr), ptr, size); + FREE(ptr); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +void memoryInit(void) +{ + + spin_lock_init(&memoryStats.lock); + initializeThreadRegistry(&allocatingThreads); +} + + +/*****************************************************************************/ +void memoryExit(void) +{ + + ASSERT_LOG_ONLY(memoryStats.kmallocBytes == 0, + "kmalloc memory used (%zd bytes in %zd blocks)" + " is returned to the kernel", + memoryStats.kmallocBytes, memoryStats.kmallocBlocks); + ASSERT_LOG_ONLY(memoryStats.vmallocBytes == 0, + "vmalloc memory used (%zd bytes in %zd blocks)" + " is returned to the kernel", + memoryStats.vmallocBytes, memoryStats.vmallocBlocks); + logDebug("%s peak usage %zd bytes", THIS_MODULE->name, + memoryStats.peakBytes); +} + +/**********************************************************************/ +void getMemoryStats(uint64_t *bytesUsed, uint64_t *peakBytesUsed) +{ + unsigned long flags; + spin_lock_irqsave(&memoryStats.lock, flags); + *bytesUsed = memoryStats.kmallocBytes + memoryStats.vmallocBytes; + *peakBytesUsed = memoryStats.peakBytes; + spin_unlock_irqrestore(&memoryStats.lock, flags); +} + +/**********************************************************************/ +void reportMemoryUsage() +{ + unsigned long 
flags; + spin_lock_irqsave(&memoryStats.lock, flags); + uint64_t kmallocBlocks = memoryStats.kmallocBlocks; + uint64_t kmallocBytes = memoryStats.kmallocBytes; + uint64_t vmallocBlocks = memoryStats.vmallocBlocks; + uint64_t vmallocBytes = memoryStats.vmallocBytes; + uint64_t peakUsage = memoryStats.peakBytes; + spin_unlock_irqrestore(&memoryStats.lock, flags); + uint64_t totalBytes = kmallocBytes + vmallocBytes; + logInfo("current module memory tracking" + " (actual allocation sizes, not requested):"); + logInfo(" %llu bytes in %llu kmalloc blocks", + kmallocBytes, kmallocBlocks); + logInfo(" %llu bytes in %llu vmalloc blocks", + vmallocBytes, vmallocBlocks); + logInfo(" total %llu bytes, peak usage %llu bytes", + totalBytes, peakUsage); +} diff --git a/uds/murmur/MurmurHash3.c b/uds/murmur/MurmurHash3.c new file mode 100644 index 0000000..42af11a --- /dev/null +++ b/uds/murmur/MurmurHash3.c @@ -0,0 +1,379 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. + +#include "MurmurHash3.h" + +#include "cpu.h" + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#if __GNUC__ >= 7 +#pragma GCC diagnostic warning "-Wimplicit-fallthrough=0" +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +static inline uint32_t rotl32 ( uint32_t x, int8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +static inline uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +static FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return p[i]; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return __builtin_bswap32(p[i]); +#else +#error "can't figure out byte order" +#endif +} + +static FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return p[i]; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return __builtin_bswap64(p[i]); +#else +#error "can't figure out byte order" +#endif +} + +// Block write +static FORCE_INLINE void putblock (uint32_t *p, int i, uint32_t value) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + p[i] = value; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + p[i] = __builtin_bswap32(value); +#else +#error "can't figure out byte order" +#endif +} + +static FORCE_INLINE void putblock64 (uint64_t *p, int i, uint64_t value) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + p[i] = value; +#elif 
__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + p[i] = __builtin_bswap64(value); +#else +#error "can't figure out byte order" +#endif +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +static FORCE_INLINE uint32_t fmix32 ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +static FORCE_INLINE uint64_t fmix64 ( uint64_t k ) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + uint32_t c1 = 0xcc9e2d51; + uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + int i; + for(i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + default: break; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix32(h1); + + putblock(out, 0, h1); +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + uint32_t c1 = 0x239b961b; + uint32_t c2 = 0xab0e9789; + uint32_t c3 = 0x38b34ae5; + uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + int i; + for(i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i*4+0); + uint32_t k2 = getblock(blocks,i*4+1); + uint32_t k3 = getblock(blocks,i*4+2); + uint32_t k4 = getblock(blocks,i*4+3); + + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + + h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) + { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + case 12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = 
ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + default: break; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix32(h1); + h2 = fmix32(h2); + h3 = fmix32(h3); + h4 = fmix32(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + putblock((uint32_t*)out, 0, h1); + putblock((uint32_t*)out, 1, h2); + putblock((uint32_t*)out, 2, h3); + putblock((uint32_t*)out, 3, h4); +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const int len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + int i; + for(i = 0; i < nblocks; i++) + { + uint64_t k1 = getblock64(blocks,i*2+0); + uint64_t k2 = getblock64(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) + { + case 15: k2 ^= ((uint64_t)tail[14]) << 48; + case 14: k2 ^= ((uint64_t)tail[13]) << 40; + case 13: k2 ^= ((uint64_t)tail[12]) << 32; + case 12: k2 ^= ((uint64_t)tail[11]) << 24; + case 11: k2 ^= ((uint64_t)tail[10]) << 16; + case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; + case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; + case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; + case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; + case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; + case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; + case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; + case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; + case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + default: break; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + putblock64((uint64_t*)out, 0, h1); + putblock64((uint64_t*)out, 1, h2); +} diff --git a/uds/murmur/MurmurHash3.h b/uds/murmur/MurmurHash3.h new file mode 100644 index 0000000..bebb8fa --- /dev/null +++ b/uds/murmur/MurmurHash3.h @@ -0,0 +1,44 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. 
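+//
+// A minimal illustrative call of the 128-bit x64 variant declared below;
+// the input buffer, its length, and the seed are placeholders:
+//
+//   uint8_t digest[16];
+//   MurmurHash3_x64_128(buffer, bufferLength, 0, digest);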
+ +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Linux kernel + +#ifdef __KERNEL__ +# include + +// Microsoft Visual Studio + +#else // defined(__KERNEL__) +# if defined(_MSC_VER) + + typedef unsigned char uint8_t; + typedef unsigned long uint32_t; + typedef unsigned __int64 uint64_t; + +// Other compilers + +# else // defined(_MSC_VER) + +# include + +# endif // !defined(_MSC_VER) +#endif // !defined(__KERNEL__) + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); + +//----------------------------------------------------------------------------- + +#endif // _MURMURHASH3_H_ diff --git a/uds/nonce.c b/uds/nonce.c new file mode 100644 index 0000000..43b0f80 --- /dev/null +++ b/uds/nonce.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/nonce.c#3 $ + */ + +#include "nonce.h" + +#include "murmur/MurmurHash3.h" +#include "numeric.h" +#include "random.h" +#include "stringUtils.h" +#include "timeUtils.h" + +/*****************************************************************************/ +static uint64_t hashStuff(uint64_t start, const void *data, size_t len) +{ + uint32_t seed = start ^ (start >> 27); + byte hashBuffer[16]; + MurmurHash3_x64_128(data, len, seed, hashBuffer); + return getUInt64LE(hashBuffer + 4); +} + +/*****************************************************************************/ +static void *memput(void *buf, void *end, const void *data, size_t len) +{ + byte *bp = buf; + byte *be = end; + + size_t chunk = minSizeT(len, be - bp); + memcpy(bp, data, chunk); + return bp + chunk; +} + +/*****************************************************************************/ +size_t createUniqueNonceData(byte *buffer, size_t length) +{ + AbsTime now = currentTime(CLOCK_REALTIME); + + byte *be = buffer + length; + byte *bp = memput(buffer, be, &now, sizeof(now)); + + uint32_t rand = randomInRange(1, (1<<30) - 1); + + bp = memput(bp, be, &rand, sizeof(rand)); + + while (bp < be) { + size_t n = minSizeT(be - bp, bp - buffer); + memcpy(bp, buffer, n); + bp += n; + } + + return bp - buffer; +} + +/*****************************************************************************/ +uint64_t generateMasterNonce(const void *data, size_t len) +{ + return hashStuff(0xa1b1e0fc, data, len); +} + +/*****************************************************************************/ +uint64_t generateSecondaryNonce(uint64_t nonce, + const void *data, + size_t len) +{ + return hashStuff(nonce + 1, data, len); +} diff --git a/uds/nonce.h b/uds/nonce.h new file mode 100644 index 0000000..43f2054 --- /dev/null +++ b/uds/nonce.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/nonce.h#1 $ + */ + +#ifndef NONCE_H +#define NONCE_H + +#include "typeDefs.h" + +/** + * Create unique data for the master nonce, using system-specific + * methods such as the current time and a random number. + * + * @param buffer A buffer of length specified next. + * @param length Length of the buffer. + * + * @return the amount of the buffer that has been filled with unique data + **/ +size_t createUniqueNonceData(byte *buffer, size_t length); + +/** + * Generate a master nonce, using the specified data. + * + * @param data Some arbitrary information. + * @param len The length of the information. + * + * @return a number which will be fairly unique + **/ +uint64_t generateMasterNonce(const void *data, size_t len); + +/** + * Deterministically generate a secondary nonce based on an existing + * nonce and some arbitrary data. 
Effectively hashes the nonce and + * the data to produce a new nonce which is deterministic. + * + * @param nonce An existing nonce which is well known. + * @param data Some data related to the creation of this nonce. + * @param len The length of the data. + * + * @return a number which will be fairly unique and depend solely on + * the nonce and the data. + **/ +uint64_t generateSecondaryNonce(uint64_t nonce, + const void *data, + size_t len); + +#endif // NONCE_H diff --git a/uds/numeric.c b/uds/numeric.c new file mode 100644 index 0000000..4bc1e2d --- /dev/null +++ b/uds/numeric.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/numeric.c#2 $ + */ + +#include "numeric.h" +#include "permassert.h" + +#define STATIC_ASSERT_ALIGNOF(type, expectedAlignment) \ + STATIC_ASSERT(__alignof__(type) == (expectedAlignment)) + +/**********************************************************************/ +bool multiplyWouldOverflow(uint64_t a, uint64_t b) +{ + return b != 0 && a > UINT64_MAX / b; +} + +/**********************************************************************/ +void numericCompileTimeAssertions(void) +{ + STATIC_ASSERT_SIZEOF(uint64_t, 8); + STATIC_ASSERT_SIZEOF(uint32_t, 4); + STATIC_ASSERT_SIZEOF(uint16_t, 2); + + STATIC_ASSERT_SIZEOF(UNALIGNED_WRAPPER(uint64_t), 8); + STATIC_ASSERT_SIZEOF(UNALIGNED_WRAPPER(uint32_t), 4); + STATIC_ASSERT_SIZEOF(UNALIGNED_WRAPPER(uint16_t), 2); + + STATIC_ASSERT_ALIGNOF(UNALIGNED_WRAPPER(uint64_t), 1); + STATIC_ASSERT_ALIGNOF(UNALIGNED_WRAPPER(uint32_t), 1); + STATIC_ASSERT_ALIGNOF(UNALIGNED_WRAPPER(uint16_t), 1); +} diff --git a/uds/numeric.h b/uds/numeric.h new file mode 100644 index 0000000..06d7eee --- /dev/null +++ b/uds/numeric.h @@ -0,0 +1,721 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/numeric.h#2 $ + */ + +#ifndef NUMERIC_H +#define NUMERIC_H 1 + +#include "compiler.h" +#include "numericDefs.h" +#include "typeDefs.h" + +#if !defined(__ORDER_LITTLE_ENDIAN__) || !defined(__ORDER_BIG_ENDIAN__) \ + || !defined(__BYTE_ORDER__) +#error "GCC byte order macros not defined?" +#endif + +/* + * Define a type describing an integer value that is only byte-aligned + * and may explicitly alias other types. GCC keeps getting better + * about type-based alias analysis (both for optimization and for + * warnings), so simply casting a pointer to pointer-to-uintXX_t isn't + * good enough. + * + * C is okay with defining the structures directly in a cast, but + * C++ is not, and we use this header in some C++ code internally. + */ +#define UNALIGNED_WRAPPER(TYPE) \ + unaligned_wrap_##TYPE +#define UNALIGNED_WRAPPER_DEF(TYPE) \ + typedef struct __attribute__((packed, may_alias)) { TYPE value; } \ + UNALIGNED_WRAPPER(TYPE) +UNALIGNED_WRAPPER_DEF(int64_t); +UNALIGNED_WRAPPER_DEF(uint64_t); +UNALIGNED_WRAPPER_DEF(int32_t); +UNALIGNED_WRAPPER_DEF(uint32_t); +UNALIGNED_WRAPPER_DEF(uint16_t); + +#define GET_UNALIGNED(TYPE,ADDR) \ + (((const UNALIGNED_WRAPPER(TYPE) *)(ADDR))->value) +#define PUT_UNALIGNED(TYPE,ADDR,VALUE) \ + (((UNALIGNED_WRAPPER(TYPE) *)(ADDR))->value = (VALUE)) + +/** + * Find the minimum of two ints. + * + * @param a The first int + * @param b The second int + * + * @return The lesser of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE int minInt(int a, int b) +{ + return ((a < b) ? a : b); +} + +/** + * Find the maximum of two ints. + * + * @param a The first int + * @param b The second int + * + * @return The greater of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE int maxInt(int a, int b) +{ + return ((a > b) ? a : b); +} + +/** + * Find the maximum of two unsigned ints. + * + * @param a The first value + * @param b The second value + * + * @return The greater of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE unsigned int maxUInt(unsigned int a, unsigned int b) +{ + return ((a > b) ? a : b); +} + +/** + * Find the maximum of two signed longs. + * + * @param a The first int + * @param b The second int + * + * @return The greater of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE long maxLong(long a, long b) +{ + return ((a > b) ? a : b); +} + +/** + * Find the maximum of two unsigned longs. + * + * @param a The first int + * @param b The second int + * + * @return The greater of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE unsigned long maxULong(unsigned long a, unsigned long b) +{ + return ((a > b) ? a : b); +} + +/** + * Find the minimum of two size_ts. + * + * @param a The first size_t + * @param b The second size_t + * + * @return The lesser of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE size_t minSizeT(size_t a, size_t b) +{ + return ((a < b) ? a : b); +} + +/** + * Find the maximum of two size_ts. + * + * @param a The first size_t + * @param b The second size_t + * + * @return The greater of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE size_t maxSizeT(size_t a, size_t b) +{ + return ((a > b) ? a : b); +} + +/** + * Find the minimum of two uint64_ts. + * + * @param a The first uint64_t + * @param b The second uint64_t + * + * @return The lesser of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE uint64_t minUInt64(uint64_t a, uint64_t b) +{ + return ((a < b) ? 
a : b); +} + +/** + * Multiply two uint64_t and check for overflow. Does division. + **/ +bool multiplyWouldOverflow(uint64_t a, uint64_t b); + +/** + * Extract a 64 bit unsigned number from a buffer stored in + * big-endian representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint64_t getUInt64BE(const byte* data) +{ + uint64_t num = GET_UNALIGNED(uint64_t, data); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + return num; +} + +/** + * Extract a 64 bit unsigned big-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt64BE(const byte *buffer, + size_t *offset, + uint64_t *decoded) +{ + *decoded = getUInt64BE(buffer + *offset); + *offset += sizeof(uint64_t); +} + +/** + * Store a 64 bit unsigned number in a buffer in + * big-endian representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt64BE(byte* data, uint64_t num) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + PUT_UNALIGNED(uint64_t, data, num); +} + +/** + * Encode a 64 bit unsigned number into a buffer at a given offset + * using a big-endian representation. The offset will be advanced to + * first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt64BE(byte *data, + size_t *offset, + uint64_t toEncode) +{ + storeUInt64BE(data + *offset, toEncode); + *offset += sizeof(uint64_t); +} + +/** + * Extract a 32 bit unsigned number from a buffer stored in big-endian + * representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint32_t getUInt32BE(const byte* data) +{ + uint32_t num = GET_UNALIGNED(uint32_t, data); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + return num; +} + +/** + * Extract a 32 bit unsigned big-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt32BE(const byte *buffer, + size_t *offset, + uint32_t *decoded) +{ + *decoded = getUInt32BE(buffer + *offset); + *offset += sizeof(uint32_t); +} + +/** + * Store a 32 bit number in a buffer in + * big-endian representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt32BE(byte* data, uint32_t num) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + PUT_UNALIGNED(uint32_t, data, num); +} + +/** + * Encode a 32 bit number into a buffer at a given offset using a + * big-endian representation. 
The offset will be advanced to first byte + * after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt32BE(byte *data, + size_t *offset, + uint32_t toEncode) +{ + storeUInt32BE(data + *offset, toEncode); + *offset += sizeof(uint32_t); +} + +/** + * Extract a 16 bit number from a buffer stored in + * big-endian representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint16_t getUInt16BE(const byte* data) +{ + uint16_t num = GET_UNALIGNED(uint16_t, data); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = bswap_16(num); +#endif + return num; +} + +/** + * Extract a 16 bit, big-endian number from a buffer at a specified offset. + * The offset will be advanced to the first byte after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to + * extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt16BE(const byte *buffer, + size_t *offset, + uint16_t *decoded) +{ + *decoded = getUInt16BE(buffer + *offset); + *offset += sizeof(uint16_t); +} + +/** + * Store a 16 bit number in a buffer in + * big-endian representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt16BE(byte* data, uint16_t num) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = bswap_16(num); +#endif + PUT_UNALIGNED(uint16_t, data, num); +} + +/** + * Encode a 16 bit number into a buffer at a given offset using a + * big-endian representation. The offset will be advanced to first byte + * after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt16BE(byte *data, + size_t *offset, + uint16_t toEncode) +{ + storeUInt16BE(data + *offset, toEncode); + *offset += sizeof(uint16_t); +} + +/** + * Extract a 64 bit signed number from a buffer stored in + * little-endian representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE int64_t getInt64LE(const byte* data) +{ + int64_t num = GET_UNALIGNED(int64_t, data); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + return num; +} + +/** + * Extract a 64 bit signed little-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeInt64LE(const byte *buffer, + size_t *offset, + int64_t *decoded) +{ + *decoded = getInt64LE(buffer + *offset); + *offset += sizeof(int64_t); +} + +/** + * Store a signed 64 bit number in a buffer in little-endian + * representation. 
+ * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeInt64LE(byte* data, int64_t num) +{ +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + PUT_UNALIGNED(int64_t, data, num); +} + +/** + * Encode a 64 bit signed number into a buffer at a given offset using + * a little-endian representation. The offset will be advanced to + * first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeInt64LE(byte *data, + size_t *offset, + int64_t toEncode) +{ + storeInt64LE(data + *offset, toEncode); + *offset += sizeof(int64_t); +} + +/** + * Extract a 64 bit number from a buffer stored in + * little-endian representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint64_t getUInt64LE(const byte* data) +{ + uint64_t num = GET_UNALIGNED(uint64_t, data); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + return num; +} + +/** + * Extract a 64 bit unsigned little-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt64LE(const byte *buffer, + size_t *offset, + uint64_t *decoded) +{ + *decoded = getUInt64LE(buffer + *offset); + *offset += sizeof(uint64_t); +} + +/** + * Store a 64 bit unsigned number in a buffer in little-endian + * representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt64LE(byte* data, uint64_t num) +{ +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + PUT_UNALIGNED(uint64_t, data, num); +} + +/** + * Encode a 64 bit unsigned number into a buffer at a given offset + * using a little-endian representation. The offset will be advanced + * to first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt64LE(byte *data, + size_t *offset, + uint64_t toEncode) +{ + storeUInt64LE(data + *offset, toEncode); + *offset += sizeof(uint64_t); +} + +/** + * Extract a 32 bit signed number from a buffer stored in + * little-endian representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE int32_t getInt32LE(const byte* data) +{ + int32_t num = GET_UNALIGNED(int32_t, data); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + return num; +} + +/** + * Extract a 32 bit signed little-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. 
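+ *
+ * These helpers are typically chained so that one shared offset advances
+ * through a serialized buffer; the field names here are hypothetical:
+ *
+ *   uint64_t nonce;
+ *   int32_t version;
+ *   size_t offset = 0;
+ *   decodeUInt64LE(buffer, &offset, &nonce);
+ *   decodeInt32LE(buffer, &offset, &version);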
+ * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeInt32LE(const byte *buffer, + size_t *offset, + int32_t *decoded) +{ + *decoded = getInt32LE(buffer + *offset); + *offset += sizeof(int32_t); +} + +/** + * Store a signed 32 bit number in a buffer in little-endian + * representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeInt32LE(byte* data, int32_t num) +{ +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + PUT_UNALIGNED(int32_t, data, num); +} + +/** + * Encode a 32 bit signed number into a buffer at a given offset using + * a little-endian representation. The offset will be advanced to + * first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeInt32LE(byte *data, + size_t *offset, + int32_t toEncode) +{ + storeInt32LE(data + *offset, toEncode); + *offset += sizeof(int32_t); +} + +/** + * Extract a 32 bit unsigned number from a buffer stored in + * little-endian representation. + + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint32_t getUInt32LE(const byte* data) +{ + uint32_t num = GET_UNALIGNED(uint32_t, data); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + return num; +} + +/** + * Extract a 32 bit unsigned little-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt32LE(const byte *buffer, + size_t *offset, + uint32_t *decoded) +{ + *decoded = getUInt32LE(buffer + *offset); + *offset += sizeof(uint32_t); +} + +/** + * Store a 32 bit unsigned number in a buffer in little-endian + * representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt32LE(byte* data, uint32_t num) +{ +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + PUT_UNALIGNED(uint32_t, data, num); +} + +/** + * Encode a 32 bit unsigned number into a buffer at a given offset + * using a little-endian representation. The offset will be advanced + * to first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt32LE(byte *data, + size_t *offset, + uint32_t toEncode) +{ + storeUInt32LE(data + *offset, toEncode); + *offset += sizeof(uint32_t); +} + +/** + * Extract a 16 bit number from a buffer stored in + * little-endian representation. 
+ * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint16_t getUInt16LE(const byte* data) +{ + uint16_t num = GET_UNALIGNED(uint16_t, data); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = bswap_16(num); +#endif + return num; +} + +/** + * Extract a 16 bit unsigned little-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to + * extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt16LE(const byte *buffer, + size_t *offset, + uint16_t *decoded) +{ + *decoded = getUInt16LE(buffer + *offset); + *offset += sizeof(uint16_t); +} + +/** + * Store a 16 bit number in a buffer in little-endian representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt16LE(byte* data, uint16_t num) +{ +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = bswap_16(num); +#endif + PUT_UNALIGNED(uint16_t, data, num); +} + +/** + * Encode a 16 bit unsigned number into a buffer at a given offset + * using a little-endian representation. The offset will be advanced + * to first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt16LE(byte *data, + size_t *offset, + uint16_t toEncode) +{ + storeUInt16LE(data + *offset, toEncode); + *offset += sizeof(uint16_t); +} + +/** + * Special function wrapper required for compile-time assertions. This + * function will fail to compile if any of the uint*_t types are not of the + * size we expect. This function should never be called. + **/ +void numericCompileTimeAssertions(void); + +#endif /* NUMERIC_H */ diff --git a/uds/numericDefs.h b/uds/numericDefs.h new file mode 100644 index 0000000..c8795a1 --- /dev/null +++ b/uds/numericDefs.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/numericDefs.h#1 $ + */ + +#ifndef LINUX_KERNEL_NUMERIC_DEFS_H +#define LINUX_KERNEL_NUMERIC_DEFS_H 1 + +#ifdef __x86_64__ +/* + * __builtin_bswap16 should work fine here too, but check for a + * performance impact before changing it, just to be safe. 
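+ *
+ * Either path is a plain 16-bit byte swap; for example (illustrative),
+ * bswap_16(0x1234) evaluates to 0x3412.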
+ */ +#define bswap_16(x) \ + (__extension__ \ + ({ register unsigned short int __v, __x = (unsigned short int) (x); \ + __asm__ ("rorw $8, %w0" \ + : "=r" (__v) \ + : "0" (__x) \ + : "cc"); \ + __v; })) +#else +#define bswap_16(x) __builtin_bswap16(x) +#endif + +#endif /* LINUX_KERNEL_NUMERIC_DEFS_H */ diff --git a/uds/opaqueTypes.h b/uds/opaqueTypes.h new file mode 100644 index 0000000..478631a --- /dev/null +++ b/uds/opaqueTypes.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/opaqueTypes.h#3 $ + */ + +#ifndef OPAQUE_TYPES_H +#define OPAQUE_TYPES_H + +/* + * This file contains typedefs of structures internal to the UDS library + * for which many users of those structures do need to know the details + * of the structures themselves. + */ +typedef struct indexRouter IndexRouter; +typedef struct internalRequest Request; +typedef struct requestQueue RequestQueue; + +#endif /* OPAQUE_TYPES_H */ diff --git a/uds/openChapter.c b/uds/openChapter.c new file mode 100644 index 0000000..7a8a613 --- /dev/null +++ b/uds/openChapter.c @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/openChapter.c#4 $ + */ + +#include "openChapter.h" + +#include "compiler.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "zone.h" + +static int readOpenChapters(ReadPortal *portal); +static int writeOpenChapters(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone); + +const IndexComponentInfo OPEN_CHAPTER_INFO = { + .kind = RL_KIND_OPEN_CHAPTER, + .name = "open chapter", + .saveOnly = true, + .chapterSync = false, + .multiZone = false, + .ioStorage = true, + .loader = readOpenChapters, + .saver = writeOpenChapters, + .incremental = NULL, +}; + +static const byte OPEN_CHAPTER_MAGIC[] = "ALBOC"; +static const byte OPEN_CHAPTER_VERSION[] = "02.00"; + +enum { + OPEN_CHAPTER_MAGIC_LENGTH = sizeof(OPEN_CHAPTER_MAGIC) - 1, + OPEN_CHAPTER_VERSION_LENGTH = sizeof(OPEN_CHAPTER_VERSION) - 1 +}; + +/**********************************************************************/ +static int fillDeltaChapterIndex(OpenChapterZone **chapterZones, + unsigned int zoneCount, + OpenChapterIndex *index, + UdsChunkRecord *collatedRecords) +{ + // Find a record to replace any deleted records, and fill the chapter if + // it was closed early. The last record in any filled zone is guaranteed + // to not have been deleted in this chapter, so use one of those. + OpenChapterZone *fillChapterZone = NULL; + UdsChunkRecord *fillRecord = NULL; + unsigned int z; + for (z = 0; z < zoneCount; ++z) { + fillChapterZone = chapterZones[z]; + if (fillChapterZone->size == fillChapterZone->capacity) { + fillRecord = &fillChapterZone->records[fillChapterZone->size]; + break; + } + } + int result = ASSERT((fillRecord != NULL), + "some open chapter zone filled"); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT(!fillChapterZone->slots[fillChapterZone->size].recordDeleted, + "chapter fill record not deleted"); + if (result != UDS_SUCCESS) { + return result; + } + + const Geometry *geometry = index->geometry; + unsigned int pagesPerChapter = geometry->recordPagesPerChapter; + unsigned int recordsPerPage = geometry->recordsPerPage; + int overflowCount = 0; + unsigned int recordsAdded = 0; + unsigned int zone = 0; + + unsigned int page; + for (page = 0; page < pagesPerChapter; page++) { + unsigned int i; + for (i = 0; + i < recordsPerPage; + i++, recordsAdded++, zone = (zone + 1) % zoneCount) { + + // The record arrays are 1-based. + unsigned int recordNumber = 1 + (recordsAdded / zoneCount); + + // If the zone has been exhausted, or the record was deleted, + // add the fill record to the chapter. 
+ if (recordNumber > chapterZones[zone]->size + || chapterZones[zone]->slots[recordNumber].recordDeleted) { + collatedRecords[1 + recordsAdded] = *fillRecord; + continue; + } + + UdsChunkRecord *nextRecord = &chapterZones[zone]->records[recordNumber]; + collatedRecords[1 + recordsAdded] = *nextRecord; + + int result = putOpenChapterIndexRecord(index, &nextRecord->name, page); + switch (result) { + case UDS_SUCCESS: + break; + case UDS_OVERFLOW: + overflowCount++; + break; + default: + logErrorWithStringError(result, "failed to build open chapter index"); + return result; + } + } + } + if (overflowCount > 0) { + logWarning("Failed to add %d entries to chapter index", overflowCount); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int closeOpenChapter(OpenChapterZone **chapterZones, + unsigned int zoneCount, + Volume *volume, + OpenChapterIndex *chapterIndex, + UdsChunkRecord *collatedRecords, + uint64_t virtualChapterNumber) +{ + // Empty the delta chapter index, and prepare it for the new virtual chapter. + emptyOpenChapterIndex(chapterIndex, virtualChapterNumber); + + // Map each non-deleted record name to its record page number in the delta + // chapter index. + int result = fillDeltaChapterIndex(chapterZones, zoneCount, chapterIndex, + collatedRecords); + if (result != UDS_SUCCESS) { + return result; + } + + // Pass the populated chapter index and the records to the volume, which + // will generate and write the index and record pages for the chapter. + return writeChapter(volume, chapterIndex, collatedRecords); +} + +/**********************************************************************/ +int saveOpenChapters(Index *index, BufferedWriter *writer) +{ + int result = writeToBufferedWriter(writer, OPEN_CHAPTER_MAGIC, + OPEN_CHAPTER_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + result = writeToBufferedWriter(writer, OPEN_CHAPTER_VERSION, + OPEN_CHAPTER_VERSION_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + uint32_t totalRecords = 0; + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + totalRecords += openChapterSize(index->zones[i]->openChapter); + } + + // Store the record count in little-endian order. + byte totalRecordData[sizeof(totalRecords)]; + storeUInt32LE(totalRecordData, totalRecords); + + result = writeToBufferedWriter(writer, totalRecordData, + sizeof(totalRecordData)); + if (result != UDS_SUCCESS) { + return result; + } + + // Only write out the records that have been added and not deleted. 
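+  // The loop below emits record 1 from every zone, then record 2 from every
+  // zone, and so on, so the saved stream preserves the interleaved, roughly
+  // temporal ordering of records across zones.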
+ uint32_t recordsAdded = 0; + unsigned int recordIndex = 1; + while(recordsAdded < totalRecords) { + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + if (recordIndex > index->zones[i]->openChapter->size) { + continue; + } + if (index->zones[i]->openChapter->slots[recordIndex].recordDeleted) { + continue; + } + UdsChunkRecord *record + = &index->zones[i]->openChapter->records[recordIndex]; + result = writeToBufferedWriter(writer, record, sizeof(UdsChunkRecord)); + if (result != UDS_SUCCESS) { + return result; + } + recordsAdded++; + } + recordIndex++; + } + + return flushBufferedWriter(writer); +} + +/**********************************************************************/ +uint64_t computeSavedOpenChapterSize(Geometry *geometry) +{ + return OPEN_CHAPTER_MAGIC_LENGTH + OPEN_CHAPTER_VERSION_LENGTH + + sizeof(uint32_t) + geometry->recordsPerChapter * sizeof(UdsChunkRecord); +} + +/**********************************************************************/ +static int writeOpenChapters(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone) +{ + int result = ASSERT((zone == 0), "open chapter write not zoned"); + if (result != UDS_SUCCESS) { + return result; + } + + Index *index = indexComponentData(component); + return saveOpenChapters(index, writer); +} + +/** + * Read the version field from a buffered reader, checking whether it is a + * supported version. Returns (via a pointer parameter) the matching + * version constant, which can be used by comparing to the version + * constants using simple pointer equality. + * + * @param [in] reader A buffered reader. + * @param [out] version The version constant that was matched. + * + * @return UDS_SUCCESS or an error code if the file could not be read or + * the version is invalid or unsupported + **/ +static int readVersion(BufferedReader *reader, const byte **version) +{ + byte buffer[OPEN_CHAPTER_VERSION_LENGTH]; + int result = readFromBufferedReader(reader, buffer, sizeof(buffer)); + if (result != UDS_SUCCESS) { + return result; + } + if (memcmp(OPEN_CHAPTER_VERSION, buffer, sizeof(buffer)) != 0) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "Invalid open chapter version: %.*s", + (int) sizeof(buffer), buffer); + } + *version = OPEN_CHAPTER_VERSION; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int loadVersion20(Index *index, BufferedReader *reader) +{ + byte numRecordsData[sizeof(uint32_t)]; + int result + = readFromBufferedReader(reader, &numRecordsData, sizeof(numRecordsData)); + if (result != UDS_SUCCESS) { + return result; + } + uint32_t numRecords = getUInt32LE(numRecordsData); + + // Keep track of which zones cannot accept any more records. + bool fullFlags[MAX_ZONES] = { false, }; + + // Assign records to the correct zones. + UdsChunkRecord record; + uint32_t records; + for (records = 0; records < numRecords; records++) { + result = readFromBufferedReader(reader, &record, sizeof(UdsChunkRecord)); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int zone = 0; + if (index->zoneCount > 1) { + // A read-only index has no master index, but it also has only one zone. + zone = getMasterIndexZone(index->masterIndex, &record.name); + } + // Add records until the open chapter zone almost runs out of space. + // The chapter can't be closed here, so don't add the last record. 
+ if (!fullFlags[zone]) { + unsigned int remaining; + result = putOpenChapter(index->zones[zone]->openChapter, + &record.name, &record.data, &remaining); + fullFlags[zone] = (remaining <= 1); + if (result != UDS_SUCCESS) { + return result; + } + } + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int loadOpenChapters(Index *index, BufferedReader *reader) +{ + // Read and check the magic number. + int result = + verifyBufferedData(reader, OPEN_CHAPTER_MAGIC, OPEN_CHAPTER_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + // Read and check the version. + const byte *version = NULL; + result = readVersion(reader, &version); + if (result != UDS_SUCCESS) { + return result; + } + + return loadVersion20(index, reader); +} + +/**********************************************************************/ +int readOpenChapters(ReadPortal *portal) +{ + Index *index = indexComponentData(portal->component); + + BufferedReader *reader; + int result = getBufferedReaderForPortal(portal, 0, &reader); + if (result != UDS_SUCCESS) { + return result; + } + return loadOpenChapters(index, reader); +} diff --git a/uds/openChapter.h b/uds/openChapter.h new file mode 100644 index 0000000..381badd --- /dev/null +++ b/uds/openChapter.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/openChapter.h#1 $ + */ + +#ifndef OPENCHAPTER_H +#define OPENCHAPTER_H 1 + +#include "common.h" +#include "geometry.h" +#include "index.h" +#include "indexComponent.h" + +extern const IndexComponentInfo OPEN_CHAPTER_INFO; + +/** + * OpenChapter handles writing the open chapter records to the volume. It also + * manages the open chapter index component, and all the tools to generate and + * parse the open chapter file. The open chapter file interleaves records from + * each openChapterZone structure. + * + *
Once each open chapter zone is filled, the records are interleaved to + * preserve temporal locality, the index pages are generated through a + * delta chapter index, and the record pages are derived by sorting each + * page-sized batch of records by their names. + * + *
Upon index shutdown, the open chapter zone records are again + * interleaved, and the records are stored as a single array. The hash + * slots are not preserved, since the records may be reassigned to new + * zones at load time. + **/ + +/** + * Close the open chapter and write it to disk. + * + * @param chapterZones The zones of the chapter to close + * @param zoneCount The number of zones + * @param volume The volume to which to write the chapter + * @param chapterIndex The OpenChapterIndex to use while writing + * @param collatedRecords Collated records array to use while writing + * @param virtualChapterNumber The virtual chapter number of the open chapter + * + * @return UDS_SUCCESS or an error code + **/ +int closeOpenChapter(OpenChapterZone **chapterZones, + unsigned int zoneCount, + Volume *volume, + OpenChapterIndex *chapterIndex, + UdsChunkRecord *collatedRecords, + uint64_t virtualChapterNumber) + __attribute__((warn_unused_result)); + +/** + * Write out a partially filled chapter to a file. + * + * @param index the index to save the data from + * @param writer the writer to write out the chapters + * + * @return UDS_SUCCESS on success + **/ +int saveOpenChapters(Index *index, BufferedWriter *writer) + __attribute__((warn_unused_result)); + +/** + * Read a partially filled chapter from a file. + * + * @param index the index to load the data into + * @param reader the buffered reader to read from + * + * @return UDS_SUCCESS on success + **/ +int loadOpenChapters(Index *index, BufferedReader *reader) + __attribute__((warn_unused_result)); + +/** + * Compute the size of the maximum open chapter save image. + * + * @param geometry the index geometry + * + * @return the number of bytes of the largest possible open chapter save + * image + **/ +uint64_t computeSavedOpenChapterSize(Geometry *geometry); + +#endif /* OPENCHAPTER_H */ diff --git a/uds/openChapterZone.c b/uds/openChapterZone.c new file mode 100644 index 0000000..f346409 --- /dev/null +++ b/uds/openChapterZone.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/openChapterZone.c#2 $ + */ + +#include "openChapterZone.h" + +#include "compiler.h" +#include "hashUtils.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +/**********************************************************************/ +static INLINE size_t recordsSize(const OpenChapterZone *openChapter) +{ + return (sizeof(UdsChunkRecord) * (1 + openChapter->capacity)); +} + +/**********************************************************************/ +static INLINE size_t slotsSize(size_t slotCount) +{ + return (sizeof(Slot) * slotCount); +} + +/** + * Round up to the first power of two greater than or equal + * to the supplied number. 
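+ *
+ * For example, nextPowerOfTwo(5) is 8 and nextPowerOfTwo(8) is 8. The open
+ * chapter uses this to round its hash slot count up to a power of two.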
+ * + * @param val the number to round up + * + * @return the first power of two not smaller than val for any + * val <= 2^63 + **/ +static INLINE size_t nextPowerOfTwo(size_t val) +{ + if (val == 0) { + return 1; + } + return (1 << computeBits(val - 1)); +} + +/**********************************************************************/ +int makeOpenChapter(const Geometry *geometry, + unsigned int zoneCount, + OpenChapterZone **openChapterPtr) +{ + int result = ASSERT(zoneCount > 0, "zone count must be > 0"); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_WITH_ERROR_CODE(geometry->openChapterLoadRatio > 1, + UDS_BAD_STATE, + "Open chapter hash table is too small"); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_WITH_ERROR_CODE((geometry->recordsPerChapter + <= OPEN_CHAPTER_MAX_RECORD_NUMBER), + UDS_BAD_STATE, + "Too many records (%u) for a single chapter", + geometry->recordsPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + + if (geometry->recordsPerChapter < zoneCount) { + return logUnrecoverable( + UDS_INVALID_ARGUMENT, + "zone count: %u is larger than the records per chapter %u", + zoneCount, geometry->recordsPerChapter); + } + size_t capacity = geometry->recordsPerChapter / zoneCount; + + // The slot count must be at least one greater than the capacity. + // Using a power of two slot count guarantees that hash insertion + // will never fail if the hash table is not full. + size_t slotCount = nextPowerOfTwo(capacity * geometry->openChapterLoadRatio); + OpenChapterZone *openChapter; + result = ALLOCATE_EXTENDED(OpenChapterZone, slotCount, Slot, + "open chapter", &openChapter); + if (result != UDS_SUCCESS) { + return result; + } + openChapter->slotCount = slotCount; + openChapter->capacity = capacity; + result = allocateCacheAligned(recordsSize(openChapter), "record pages", + &openChapter->records); + if (result != UDS_SUCCESS) { + freeOpenChapter(openChapter); + return result; + } + + *openChapterPtr = openChapter; + return UDS_SUCCESS; +} + +/**********************************************************************/ +size_t openChapterSize(const OpenChapterZone *openChapter) +{ + return openChapter->size - openChapter->deleted; +} + +/**********************************************************************/ +void resetOpenChapter(OpenChapterZone *openChapter) +{ + openChapter->size = 0; + openChapter->deleted = 0; + + memset(openChapter->records, 0, recordsSize(openChapter)); + memset(openChapter->slots, 0, slotsSize(openChapter->slotCount)); +} + +/**********************************************************************/ +static UdsChunkRecord *probeChapterSlots(OpenChapterZone *openChapter, + const UdsChunkName *name, + unsigned int *slotPtr, + unsigned int *recordNumberPtr) +{ + unsigned int slots = openChapter->slotCount; + unsigned int probe = nameToHashSlot(name, slots); + unsigned int firstSlot = 0; + + UdsChunkRecord *record; + unsigned int probeSlot; + unsigned int recordNumber; + unsigned int probeAttempts; + + for (probeAttempts = 1; ; ++probeAttempts) { + probeSlot = firstSlot + probe; + recordNumber = openChapter->slots[probeSlot].recordNumber; + + // If the hash slot is empty, we've reached the end of a chain without + // finding the record and should terminate the search. + if (recordNumber == 0) { + record = NULL; + break; + } + + // If the name of the record referenced by the slot matches and has not + // been deleted, then we've found the requested name. 
+ record = &openChapter->records[recordNumber]; + if ((memcmp(&record->name, name, UDS_CHUNK_NAME_SIZE) == 0) + && !openChapter->slots[recordNumber].recordDeleted) { + break; + } + + // Quadratic probing: advance the probe by 1, 2, 3, etc. and try again. + // This performs better than linear probing and works best for 2^N slots. + probe += probeAttempts; + if (probe >= slots) { + probe = probe % slots; + } + } + + // These NULL checks will be optimized away in callers who don't care about + // the values when this function is inlined. + if (slotPtr != NULL) { + *slotPtr = probeSlot; + } + if (recordNumberPtr != NULL) { + *recordNumberPtr = recordNumber; + } + + return record; +} + +/**********************************************************************/ +void searchOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + UdsChunkData *metadata, + bool *found) +{ + UdsChunkRecord *record = probeChapterSlots(openChapter, name, NULL, NULL); + + if (record == NULL) { + *found = false; + } else { + *found = true; + if (metadata != NULL) { + *metadata = record->data; + } + } +} + +/**********************************************************************/ +int putOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + const UdsChunkData *metadata, + unsigned int *remaining) +{ + unsigned int slot; + UdsChunkRecord *record = probeChapterSlots(openChapter, name, &slot, NULL); + + if (record != NULL) { + record->data = *metadata; + *remaining = openChapter->capacity - openChapter->size; + return UDS_SUCCESS; + } + + if (openChapter->size >= openChapter->capacity) { + return makeUnrecoverable(UDS_VOLUME_OVERFLOW); + } + + unsigned int recordNumber = ++openChapter->size; + openChapter->slots[slot].recordNumber = recordNumber; + record = &openChapter->records[recordNumber]; + record->name = *name; + record->data = *metadata; + + *remaining = openChapter->capacity - openChapter->size; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void removeFromOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + bool *removed) +{ + unsigned int recordNumber; + UdsChunkRecord *record + = probeChapterSlots(openChapter, name, NULL, &recordNumber); + + if (record == NULL) { + *removed = false; + return; + } + + // Set the deleted flag on the recordNumber in the slot array so search + // won't find it and close won't index it. + openChapter->slots[recordNumber].recordDeleted = true; + openChapter->deleted += 1; + *removed = true; +} + +/**********************************************************************/ +void freeOpenChapter(OpenChapterZone *openChapter) +{ + if (openChapter != NULL) { + FREE(openChapter->records); + FREE(openChapter); + } +} diff --git a/uds/openChapterZone.h b/uds/openChapterZone.h new file mode 100644 index 0000000..cecee4b --- /dev/null +++ b/uds/openChapterZone.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/openChapterZone.h#1 $ + */ + +#ifndef OPEN_CHAPTER_ZONE_H +#define OPEN_CHAPTER_ZONE_H 1 + +#include "common.h" +#include "geometry.h" +#include "typeDefs.h" + +/** + * OpenChapterZone is the mutable, in-memory representation of one zone's + * section of an Albireo index chapter. + * + *
In addition to providing the same access to records as an on-disk + * chapter, the open chapter zone must allow records to be added or + * modified. It must provide a way to generate the on-disk representation + * without excessive work. It does that by accumulating records in the order + * they are added (maintaining temporal locality), and referencing them (as + * record numbers) from hash slots selected from the name. If the metadata for + * a name changes, the record field is just modified in place. + * + *
Storage for the records (names and metadata) is allocated when the zone + * is created. It keeps no references to the data passed to it, and performs + * no additional allocation when adding records. Opening a new chapter simply + * marks it as being empty. + * + *
Records are stored in a flat array. To allow a value of zero in a + * hash slot to indicate that the slot is empty, records are numbered starting + * at one (1-based). Since C arrays are 0-based, the records array contains + * enough space for N+1 records, and the record that starts at array index + * zero is never used or referenced. + * + *
The array of hash slots is actually two arrays, superimposed: an + * array of record numbers, indexed by hash value, and an array of deleted + * flags, indexed by record number. This overlay is possible because the + * number of hash slots always exceeds the number of records, and is done + * simply to save on memory. + **/ + +enum { + OPEN_CHAPTER_RECORD_NUMBER_BITS = 23, + OPEN_CHAPTER_MAX_RECORD_NUMBER = (1 << OPEN_CHAPTER_RECORD_NUMBER_BITS) - 1 +}; + +typedef struct { + /** If non-zero, the record number addressed by this hash slot */ + unsigned int recordNumber : OPEN_CHAPTER_RECORD_NUMBER_BITS; + /** If true, the record at the index of this hash slot was deleted */ + bool recordDeleted : 1; +} __attribute__((packed)) Slot; + +typedef struct openChapterZone { + /** Maximum number of records that can be stored */ + unsigned int capacity; + /** Number of records stored */ + unsigned int size; + /** Number of deleted records */ + unsigned int deleted; + /** Record data, stored as (name, metadata), 1-based */ + UdsChunkRecord *records; + /** The number of slots in the chapter zone hash table. */ + unsigned int slotCount; + /** Hash table, referencing virtual record numbers */ + Slot slots[]; +} OpenChapterZone; + +/** + * Allocate an open chapter zone. + * + * @param geometry the geometry of the volume + * @param zoneCount the total number of open chapter zones + * @param openChapterPtr a pointer to hold the new open chapter + * + * @return UDS_SUCCESS or an error code + **/ +int makeOpenChapter(const Geometry *geometry, + unsigned int zoneCount, + OpenChapterZone **openChapterPtr) + __attribute__((warn_unused_result)); + +/** + * Return the number of records in the open chapter zone that have not been + * deleted. + * + * @return the number of non-deleted records + **/ +size_t openChapterSize(const OpenChapterZone *openChapter) + __attribute__((warn_unused_result)); + +/** + * Open a chapter by marking it empty. + * + * @param openChapter The chapter to open + **/ +void resetOpenChapter(OpenChapterZone *openChapter); + +/** + * Search the open chapter for a chunk name. + * + * @param openChapter The chapter to search + * @param name The name of the desired chunk + * @param metadata The holder for the metadata associated with the + * chunk, if found (or NULL) + * @param found A pointer which will be set to true if the chunk + * name was found + **/ +void searchOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + UdsChunkData *metadata, + bool *found); + +/** + * Put a record into the open chapter. + * + * @param openChapter The chapter into which to put the record + * @param name The name of the record + * @param metadata The record data + * @param remaining Pointer to an integer set to the number of additional + * records that can be added to this chapter + * + * @return UDS_SUCCESS or an error code + **/ +int putOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + const UdsChunkData *metadata, + unsigned int *remaining) + __attribute__((warn_unused_result)); + +/** + * Remove a record from the open chapter. + * + * @param openChapter The chapter from which to remove the record + * @param name The name of the record + * @param removed Pointer to bool set to true if the + * record was found + **/ +void removeFromOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + bool *removed); + +/** + * Clean up an open chapter and its memory. 
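+ *
+ * A sketch of the typical zone lifecycle, with error checking omitted (the
+ * geometry, name, and metadata values are stand-ins):
+ *
+ *   OpenChapterZone *zone;
+ *   makeOpenChapter(geometry, zoneCount, &zone);
+ *
+ *   unsigned int remaining;
+ *   putOpenChapter(zone, &name, &metadata, &remaining);
+ *
+ *   bool found;
+ *   UdsChunkData data;
+ *   searchOpenChapter(zone, &name, &data, &found);
+ *
+ *   resetOpenChapter(zone);   // reopen the zone, empty, for a new chapter
+ *   freeOpenChapter(zone);    // release the records and the zone itself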
+ * + * @param openChapter the chapter to destroy + **/ +void freeOpenChapter(OpenChapterZone *openChapter); + +#endif /* OPEN_CHAPTER_ZONE_H */ diff --git a/uds/pageCache.c b/uds/pageCache.c new file mode 100644 index 0000000..b2db9a5 --- /dev/null +++ b/uds/pageCache.c @@ -0,0 +1,719 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/pageCache.c#6 $ + */ + +#include "pageCache.h" + +#include "atomicDefs.h" +#include "cacheCounters.h" +#include "chapterIndex.h" +#include "compiler.h" +#include "errors.h" +#include "geometry.h" +#include "hashUtils.h" +#include "indexConfig.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "recordPage.h" +#include "stringUtils.h" +#include "threads.h" +#include "zone.h" + +/**********************************************************************/ +int assertPageInCache(PageCache *cache, CachedPage *page) +{ + int result = ASSERT((page->cp_physicalPage < cache->numIndexEntries), + "physicalPage %u is valid (< %u)", + page->cp_physicalPage, cache->numIndexEntries); + if (result != UDS_SUCCESS) { + return result; + } + + uint16_t pageIndex = cache->index[page->cp_physicalPage]; + return ASSERT((pageIndex < cache->numCacheEntries) + && (&cache->cache[pageIndex] == page), + "page is at expected location in cache"); +} + +/** + * Clear a cache page. Note: this does not clear readPending - a read could + * still be pending and the read thread needs to be able to proceed and restart + * the requests regardless. This page will still be marked invalid, but it + * won't get reused (see getLeastRecentPage()) until the readPending flag + * is cleared. This is a valid case, e.g. the chapter gets forgotten and + * replaced with a new one in LRU. Restarting the requests will lead them to + * not find the records in the MI. + * + * @param cache the cache + * @param page the cached page to clear + * + **/ +static void clearPage(PageCache *cache, CachedPage *page) +{ + page->cp_physicalPage = cache->numIndexEntries; + WRITE_ONCE(page->cp_lastUsed, 0); +} + +/** + * Get a page from the cache, but with no stats + * + * @param cache the cache + * @param physicalPage the physical page to get + * @param queueIndex the index of the page in the read queue if + * queued, -1 otherwise + * @param pagePtr a pointer to hold the page + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int getPageNoStats(PageCache *cache, + unsigned int physicalPage, + int *queueIndex, + CachedPage **pagePtr) +{ + /* + * ASSERTION: We are either a zone thread holding a searchPendingCounter, + * or we are any thread holding the readThreadsMutex. + * + * Holding only a searchPendingCounter is the most frequent case. 
+ */ + + int result = ASSERT((physicalPage < cache->numIndexEntries), + "physical page %u is invalid", physicalPage); + if (result != UDS_SUCCESS) { + return result; + } + + /* + * It would be unlikely that the compiler turns the usage of indexValue into + * two reads of cache->index, but it would be possible and very bad if those + * reads did not return the same bits. + */ + uint16_t indexValue = READ_ONCE(cache->index[physicalPage]); + bool queued = (indexValue & VOLUME_CACHE_QUEUED_FLAG) != 0; + uint16_t index = indexValue & ~VOLUME_CACHE_QUEUED_FLAG; + + if (!queued && (index < cache->numCacheEntries)) { + *pagePtr = &cache->cache[index]; + /* + * We have acquired access to the cached page, but unless we hold the + * readThreadsMutex, we need a read memory barrier now. The corresponding + * write memory barrier is in putPageInCache. + */ + smp_rmb(); + } else { + *pagePtr = NULL; + } + if (queueIndex != NULL) { + *queueIndex = queued ? index : -1; + } + return UDS_SUCCESS; +} + +/** + * Wait for all pending searches on a page in the cache to complete + * + * @param cache the page cache + * @param physicalPage the page to check searches on + **/ +static void waitForPendingSearches(PageCache *cache, unsigned int physicalPage) +{ + /* + * We hold the readThreadsMutex. We are waiting for threads that do not hold + * the readThreadsMutex. Those threads have "locked" their targeted page by + * setting the searchPendingCounter. The corresponding write memory barrier + * is in beginPendingSearch. + */ + smp_mb(); + + InvalidateCounter initialCounters[MAX_ZONES]; + unsigned int i; + for (i = 0; i < cache->zoneCount; i++) { + initialCounters[i] = getInvalidateCounter(cache, i); + } + for (i = 0; i < cache->zoneCount; i++) { + if (searchPending(initialCounters[i]) + && (pageBeingSearched(initialCounters[i]) == physicalPage)) { + // There is an active search using the physical page. + // We need to wait for the search to finish. + while (initialCounters[i] == getInvalidateCounter(cache, i)) { + yieldScheduler(); + } + } + } +} + +/** + * Invalidate a cache page + * + * @param cache the cache + * @param page the cached page + * @param reason the reason for invalidation, for stats + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int invalidatePageInCache(PageCache *cache, + CachedPage *page, + InvalidationReason reason) +{ + // We hold the readThreadsMutex. + if (page == NULL) { + return UDS_SUCCESS; + } + + if (page->cp_physicalPage != cache->numIndexEntries) { + switch (reason) { + case INVALIDATION_EVICT: + cache->counters.evictions++; + break; + case INVALIDATION_EXPIRE: + cache->counters.expirations++; + break; + default: + break; + } + + if (reason != INVALIDATION_ERROR) { + int result = assertPageInCache(cache, page); + if (result != UDS_SUCCESS) { + return result; + } + } + + WRITE_ONCE(cache->index[page->cp_physicalPage], cache->numCacheEntries); + waitForPendingSearches(cache, page->cp_physicalPage); + } + + clearPage(cache, page); + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int findInvalidateAndMakeLeastRecent(PageCache *cache, + unsigned int physicalPage, + QueuedRead *readQueue, + InvalidationReason reason, + bool mustFind) +{ + // We hold the readThreadsMutex. + if (cache == NULL) { + return UDS_SUCCESS; + } + + CachedPage *page; + int queuedIndex = -1; + int result + = getPageNoStats(cache, physicalPage, + ((readQueue != NULL) ? 
&queuedIndex : NULL), &page); + if (result != UDS_SUCCESS) { + return result; + } + + if (page == NULL) { + result = ASSERT(!mustFind, "found page"); + if (result != UDS_SUCCESS) { + return result; + } + + if (queuedIndex > -1) { + logDebug("setting pending read to invalid"); + readQueue[queuedIndex].invalid = true; + } + return UDS_SUCCESS; + } + + // Invalidate the page and unmap it from the cache. + result = invalidatePageInCache(cache, page, reason); + if (result != UDS_SUCCESS) { + return result; + } + + // Move the cached page to the least recently used end of the list + // so it will be replaced before any page with valid data. + WRITE_ONCE(page->cp_lastUsed, 0); + + return UDS_SUCCESS; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int initializePageCache(PageCache *cache, + const Geometry *geometry, + unsigned int chaptersInCache, + unsigned int readQueueMaxSize, + unsigned int zoneCount) +{ + cache->geometry = geometry; + cache->numIndexEntries = geometry->pagesPerVolume + 1; + cache->numCacheEntries = chaptersInCache * geometry->recordPagesPerChapter; + cache->readQueueMaxSize = readQueueMaxSize; + cache->zoneCount = zoneCount; + atomic64_set(&cache->clock, 1); + + int result = ALLOCATE(readQueueMaxSize, QueuedRead, + "volume read queue", &cache->readQueue); + if (result != UDS_SUCCESS) { + return result; + } + + result = ALLOCATE(cache->zoneCount, SearchPendingCounter, + "Volume Cache Zones", &cache->searchPendingCounters); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT((cache->numCacheEntries <= VOLUME_CACHE_MAX_ENTRIES), + "requested cache size, %u, within limit %u", + cache->numCacheEntries, VOLUME_CACHE_MAX_ENTRIES); + if (result != UDS_SUCCESS) { + return result; + } + + result = ALLOCATE(cache->numIndexEntries, uint16_t, "page cache index", + &cache->index); + if (result != UDS_SUCCESS) { + return result; + } + + // Initialize index values to invalid values. 
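+  // (An index entry equal to numCacheEntries is the sentinel meaning the
+  // physical page is neither cached nor queued for a read.)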
+ unsigned int i; + for (i = 0; i < cache->numIndexEntries; i++) { + cache->index[i] = cache->numCacheEntries; + } + + result = ALLOCATE(cache->numCacheEntries, CachedPage, + "page cache cache", &cache->cache); + if (result != UDS_SUCCESS) { + return result; + } + + for (i = 0; i < cache->numCacheEntries; i++) { + CachedPage *page = &cache->cache[i]; + result = initializeVolumePage(geometry, &page->cp_pageData); + if (result != UDS_SUCCESS) { + return result; + } + clearPage(cache, page); + } + + return UDS_SUCCESS; +} + +/*********************************************************************/ +int makePageCache(const Geometry *geometry, + unsigned int chaptersInCache, + unsigned int readQueueMaxSize, + unsigned int zoneCount, + PageCache **cachePtr) +{ + if (chaptersInCache < 1) { + return logWarningWithStringError(UDS_BAD_STATE, + "cache size must be" + " at least one chapter"); + } + if (readQueueMaxSize <= 0) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "read queue max size must be" + " greater than 0"); + } + if (zoneCount < 1) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cache must have at least one zone"); + } + + PageCache *cache; + int result = ALLOCATE(1, PageCache, "volume cache", &cache); + if (result != UDS_SUCCESS) { + return result; + } + + result = initializePageCache(cache, geometry, chaptersInCache, + readQueueMaxSize, zoneCount); + if (result != UDS_SUCCESS) { + freePageCache(cache); + return result; + } + + *cachePtr = cache; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freePageCache(PageCache *cache) +{ + if (cache == NULL) { + return; + } + if (cache->cache != NULL) { + unsigned int i; + for (i = 0; i < cache->numCacheEntries; i++) { + destroyVolumePage(&cache->cache[i].cp_pageData); + } + } + FREE(cache->index); + FREE(cache->cache); + FREE(cache->searchPendingCounters); + FREE(cache->readQueue); + FREE(cache); +} + +/**********************************************************************/ +int invalidatePageCacheForChapter(PageCache *cache, + unsigned int chapter, + unsigned int pagesPerChapter, + InvalidationReason reason) +{ + // We hold the readThreadsMutex. + if ((cache == NULL) || (cache->cache == NULL)) { + return UDS_SUCCESS; + } + + int result; + unsigned int i; + for (i = 0; i < pagesPerChapter; i++) { + unsigned int physicalPage = 1 + (pagesPerChapter * chapter) + i; + result = findInvalidateAndMakeLeastRecent(cache, physicalPage, + cache->readQueue, reason, false); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/*********************************************************************/ +void makePageMostRecent(PageCache *cache, CachedPage *page) +{ + // ASSERTION: We are either a zone thread holding a searchPendingCounter, + // or we are any thread holding the readThreadsMutex. + if (atomic64_read(&cache->clock) != READ_ONCE(page->cp_lastUsed)) { + WRITE_ONCE(page->cp_lastUsed, atomic64_inc_return(&cache->clock)); + } +} + +/** + * Get the least recent valid page from the cache. + * + * @param cache the cache + * @param pagePtr a pointer to hold the new page (will be set to NULL + * if the page was not found) + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int getLeastRecentPage(PageCache *cache, CachedPage **pagePtr) +{ + // We hold the readThreadsMutex. + int oldestIndex = 0; + // Our first candidate is any page that does have a pending read. 
We ensure + // above that there are more entries than read threads, so there must be one. + unsigned int i; + for (i = 0;; i++) { + if (i >= cache->numCacheEntries) { + // This should never happen. + return ASSERT(false, "oldest page is not NULL"); + } + if (!cache->cache[i].cp_readPending) { + oldestIndex = i; + break; + } + } + // Now find the least recently used page that does not have a pending read. + for (i = 0; i < cache->numCacheEntries; i++) { + if (!cache->cache[i].cp_readPending + && (READ_ONCE(cache->cache[i].cp_lastUsed) + <= READ_ONCE(cache->cache[oldestIndex].cp_lastUsed))) { + oldestIndex = i; + } + } + *pagePtr = &cache->cache[oldestIndex]; + return UDS_SUCCESS; +} + +/***********************************************************************/ +int getPageFromCache(PageCache *cache, + unsigned int physicalPage, + int probeType, + CachedPage **pagePtr) +{ + // ASSERTION: We are in a zone thread. + // ASSERTION: We holding a searchPendingCounter or the readThreadsMutex. + if (cache == NULL) { + return logWarningWithStringError(UDS_BAD_STATE, + "cannot get page with NULL cache"); + } + + // Get the cache page from the index + CachedPage *page; + int queueIndex = -1; + int result = getPageNoStats(cache, physicalPage, &queueIndex, &page); + if (result != UDS_SUCCESS) { + return result; + } + + CacheResultKind cacheResult = ((page != NULL) + ? CACHE_RESULT_HIT + : ((queueIndex != -1) + ? CACHE_RESULT_QUEUED + : CACHE_RESULT_MISS)); + incrementCacheCounter(&cache->counters, probeType, cacheResult); + + if (pagePtr != NULL) { + *pagePtr = page; + } + return UDS_SUCCESS; +} + +/***********************************************************************/ +int enqueueRead(PageCache *cache, Request *request, unsigned int physicalPage) +{ + // We hold the readThreadsMutex. + uint16_t first = cache->readQueueFirst; + uint16_t last = cache->readQueueLast; + uint16_t next = (last + 1) % cache->readQueueMaxSize; + uint16_t readQueuePos; + + if ((cache->index[physicalPage] & VOLUME_CACHE_QUEUED_FLAG) == 0) { + /* Not seen before, add this to the read queue and mark it as queued */ + if (next == first) { + /* queue is full */ + return UDS_SUCCESS; + } + /* fill the read queue entry */ + cache->readQueue[last].physicalPage = physicalPage; + cache->readQueue[last].invalid = false; + + /* point the cache index to it */ + readQueuePos = last; + WRITE_ONCE(cache->index[physicalPage], + readQueuePos | VOLUME_CACHE_QUEUED_FLAG); + cache->readQueue[readQueuePos].requestList.first = NULL; + cache->readQueue[readQueuePos].requestList.last = NULL; + /* bump the last pointer */ + cache->readQueueLast = next; + } else { + /* It's already queued, just add on to it */ + readQueuePos = cache->index[physicalPage] & ~VOLUME_CACHE_QUEUED_FLAG; + } + + int result = ASSERT((readQueuePos < cache->readQueueMaxSize), + "queue is not overfull"); + if (result != UDS_SUCCESS) { + return result; + } + + request->nextRequest = NULL; + if (cache->readQueue[readQueuePos].requestList.first == NULL) { + cache->readQueue[readQueuePos].requestList.first = request; + } else { + cache->readQueue[readQueuePos].requestList.last->nextRequest = request; + } + cache->readQueue[readQueuePos].requestList.last = request; + return UDS_QUEUED; +} + +/***********************************************************************/ +bool reserveReadQueueEntry(PageCache *cache, + unsigned int *queuePos, + Request **firstRequest, + unsigned int *physicalPage, + bool *invalid) +{ + // We hold the readThreadsMutex. 
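+  // Entries from readQueueFirst up to readQueueLastRead have already been
+  // claimed by read threads; entries from readQueueLastRead up to
+  // readQueueLast are still waiting to be claimed here.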
+ uint16_t lastRead = cache->readQueueLastRead; + + // No items to dequeue + if (lastRead == cache->readQueueLast) { + return false; + } + + unsigned int pageNo = cache->readQueue[lastRead].physicalPage; + bool isInvalid = cache->readQueue[lastRead].invalid; + + uint16_t indexValue = cache->index[pageNo]; + bool queued = (indexValue & VOLUME_CACHE_QUEUED_FLAG) != 0; + + // ALB-1429 ... need to check to see if its still queued before resetting + if (isInvalid && queued) { + // invalidate cache index slot + WRITE_ONCE(cache->index[pageNo], cache->numCacheEntries); + } + + // If a sync read has taken this page, set invalid to true so we don't + // overwrite, we simply just requeue requests. + if (!queued) { + isInvalid = true; + } + + cache->readQueue[lastRead].reserved = true; + + *queuePos = lastRead; + *firstRequest = cache->readQueue[lastRead].requestList.first; + *physicalPage = pageNo; + *invalid = isInvalid; + cache->readQueueLastRead = (lastRead + 1) % cache->readQueueMaxSize; + + return true; +} + +/************************************************************************/ +void releaseReadQueueEntry(PageCache *cache, unsigned int queuePos) +{ + // We hold the readThreadsMutex. + cache->readQueue[queuePos].reserved = false; + + uint16_t lastRead = cache->readQueueLastRead; + + // Move the readQueueFirst pointer along when we can + while ((cache->readQueueFirst != lastRead) + && (!cache->readQueue[cache->readQueueFirst].reserved)) { + cache->readQueueFirst = + (cache->readQueueFirst + 1) % cache->readQueueMaxSize; + } +} + +/***********************************************************************/ +int selectVictimInCache(PageCache *cache, + CachedPage **pagePtr) +{ + // We hold the readThreadsMutex. + if (cache == NULL) { + return logWarningWithStringError(UDS_BAD_STATE, + "cannot put page in NULL cache"); + } + + CachedPage *page = NULL; + int result = getLeastRecentPage(cache, &page); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT((page != NULL), "least recent page was not NULL"); + if (result != UDS_SUCCESS) { + return result; + } + + // If the page is currently being pointed to by the page map, clear + // it from the page map, and update cache stats + if (page->cp_physicalPage != cache->numIndexEntries) { + cache->counters.evictions++; + WRITE_ONCE(cache->index[page->cp_physicalPage], cache->numCacheEntries); + waitForPendingSearches(cache, page->cp_physicalPage); + } + + page->cp_readPending = true; + + *pagePtr = page; + + return UDS_SUCCESS; +} + +/***********************************************************************/ +int putPageInCache(PageCache *cache, + unsigned int physicalPage, + CachedPage *page) +{ + // We hold the readThreadsMutex. 
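+  // The page passed in is expected to be the victim previously returned by
+  // selectVictimInCache, which set its readPending flag.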
+ if (cache == NULL) { + return logWarningWithStringError(UDS_BAD_STATE, + "cannot complete page in NULL cache"); + } + + int result = ASSERT((page != NULL), "page to install exists"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT((page->cp_readPending), + "page to install has a pending read"); + if (result != UDS_SUCCESS) { + return result; + } + + clearPage(cache, page); + + page->cp_physicalPage = physicalPage; + + // Figure out the index into the cache array using pointer arithmetic + uint16_t value = page - cache->cache; + result = ASSERT((value < cache->numCacheEntries), "cache index is valid"); + if (result != UDS_SUCCESS) { + return result; + } + + makePageMostRecent(cache, page); + + page->cp_readPending = false; + + /* + * We hold the readThreadsMutex, but we must have a write memory barrier + * before making the CachedPage available to the readers that do not hold the + * mutex. The corresponding read memory barrier is in getPageNoStats. + */ + smp_wmb(); + + // Point the page map to the new page. Will clear queued flag + WRITE_ONCE(cache->index[physicalPage], value); + + return UDS_SUCCESS; +} + +/***********************************************************************/ +void cancelPageInCache(PageCache *cache, + unsigned int physicalPage, + CachedPage *page) +{ + // We hold the readThreadsMutex. + if (cache == NULL) { + logWarning("cannot cancel page in NULL cache"); + return; + } + + int result = ASSERT((page != NULL), "page to install exists"); + if (result != UDS_SUCCESS) { + return; + } + + result = ASSERT((page->cp_readPending), + "page to install has a pending read"); + if (result != UDS_SUCCESS) { + return; + } + + clearPage(cache, page); + page->cp_readPending = false; + + // Clear the page map for the new page. Will clear queued flag + WRITE_ONCE(cache->index[physicalPage], cache->numCacheEntries); +} + +/**********************************************************************/ +size_t getPageCacheSize(PageCache *cache) +{ + if (cache == NULL) { + return 0; + } + return sizeof(DeltaIndexPage) * cache->numCacheEntries; +} + diff --git a/uds/pageCache.h b/uds/pageCache.h new file mode 100644 index 0000000..d639b4a --- /dev/null +++ b/uds/pageCache.h @@ -0,0 +1,504 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/pageCache.h#5 $ + */ + +#ifndef PAGE_CACHE_H +#define PAGE_CACHE_H + +#include "atomicDefs.h" +#include "cacheCounters.h" +#include "chapterIndex.h" +#include "common.h" +#include "compiler.h" +#include "indexConfig.h" +#include "opaqueTypes.h" +#include "permassert.h" +#include "request.h" +#include "volumeStore.h" + +typedef struct requestList { + Request *first; + Request *last; +} RequestList; + +typedef struct cachedPage { + /* whether this page is currently being read asynchronously */ + bool cp_readPending; + /* if equal to numCacheEntries, the page is invalid */ + unsigned int cp_physicalPage; + /* the value of the volume clock when this page was last used */ + int64_t cp_lastUsed; + /* the cache page data */ + struct volume_page cp_pageData; + /* the chapter index page. This is here, even for record pages */ + DeltaIndexPage cp_indexPage; +} CachedPage; + +enum { + VOLUME_CACHE_MAX_ENTRIES = (UINT16_MAX >> 1), + VOLUME_CACHE_QUEUED_FLAG = (1 << 15), + VOLUME_CACHE_DEFAULT_MAX_QUEUED_READS = 4096 +}; + +typedef struct queuedRead { + /* whether this queue entry is invalid */ + bool invalid; + /* whether this queue entry has a pending read on it */ + bool reserved; + /* physical page to read */ + unsigned int physicalPage; + /* list of requests waiting on a queued read */ + RequestList requestList; +} QueuedRead; + +// Reason for invalidating a cache entry, used for gathering statistics +typedef enum invalidationReason { + INVALIDATION_EVICT, // cache is full, goodbye + INVALIDATION_EXPIRE, // your chapter is being overwritten + INVALIDATION_ERROR, // error happened; don't try to use data + INVALIDATION_INIT_SHUTDOWN +} InvalidationReason; + +/* + * Value stored atomically in a SearchPendingCounter. The low order 32 bits is + * the physical page number of the cached page being read. The high order 32 + * bits is a sequence number. + * + * An InvalidateCounter is only written by its zone thread by calling the + * beginPendingSearch or endPendingSearch methods. + * + * Any other thread that is accessing an InvalidateCounter is reading the value + * in the waitForPendingSearches method. + */ +typedef int64_t InvalidateCounter; +// Fields of InvalidateCounter. +// These must be 64 bit, so an enum cannot be not used. +#define PAGE_FIELD ((long)UINT_MAX) // The page number field +#define COUNTER_LSB (PAGE_FIELD + 1L) // The LSB of the counter field + +typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) { + atomic64_t atomicValue; +} SearchPendingCounter; + +typedef struct pageCache { + // Geometry governing the volume + const Geometry *geometry; + // The number of zones + unsigned int zoneCount; + // The number of index entries + unsigned int numIndexEntries; + // The max number of cached entries + uint16_t numCacheEntries; + // The index used to quickly access page in cache - top bit is a 'queued' + // flag + uint16_t *index; + // The cache + CachedPage *cache; + // A counter for each zone to keep track of when a search is occurring + // within that zone. + SearchPendingCounter *searchPendingCounters; + // Queued reads, as a circular array, with first and last indexes + QueuedRead *readQueue; + // Cache counters for stats. This is the first field of a PageCache that is + // not constant after the struct is initialized. + CacheCounters counters; + /** + * Entries are enqueued at readQueueLast. + * To 'reserve' entries, we get the entry pointed to by readQueueLastRead + * and increment last read. 
This is done with a lock so if another reader + * thread reserves a read, it will grab the next one. After every read + * is completed, the reader thread calls releaseReadQueueEntry which + * increments readQueueFirst until it is equal to readQueueLastRead, but only + * if the value pointed to by readQueueFirst is no longer pending. + * This means that if n reads are outstanding, readQueueFirst may not + * be incremented until the last of the reads finishes. + * + * First Last + * || | | | | | || + * LR (1) (2) + * + * Read thread 1 increments last read (1), then read thread 2 increments it + * (2). When each read completes, it checks to see if it can increment first, + * when all concurrent reads have completed, readQueueFirst should equal + * readQueueLastRead. + **/ + uint16_t readQueueFirst; + uint16_t readQueueLastRead; + uint16_t readQueueLast; + // The size of the read queue + unsigned int readQueueMaxSize; + // Page access counter + atomic64_t clock; +} PageCache; + +/** + * Allocate a cache for a volume. + * + * @param geometry The geometry governing the volume + * @param chaptersInCache The size (in chapters) of the page cache + * @param readQueueMaxSize The maximum size of the read queue + * @param zoneCount The number of zones in the index + * @param cachePtr A pointer to hold the new page cache + * + * @return UDS_SUCCESS or an error code + **/ +int makePageCache(const Geometry *geometry, + unsigned int chaptersInCache, + unsigned int readQueueMaxSize, + unsigned int zoneCount, + PageCache **cachePtr) + __attribute__((warn_unused_result)); + +/** + * Clean up a volume's cache + * + * @param cache the volumecache + **/ +void freePageCache(PageCache *cache); + +/** + * Invalidates a page cache for a particular chapter + * + * @param cache the page cache + * @param chapter the chapter + * @param pagesPerChapter the number of pages per chapter + * @param reason the reason for invalidation + * + * @return UDS_SUCCESS or an error code + **/ +int invalidatePageCacheForChapter(PageCache *cache, + unsigned int chapter, + unsigned int pagesPerChapter, + InvalidationReason reason) + __attribute__((warn_unused_result)); + +/** + * Find a page, invalidate it, and make its memory the least recent. This + * method is only exposed for the use of unit tests. + * + * @param cache The cache containing the page + * @param physicalPage The id of the page to invalidate + * @param readQueue The queue of pending reads (may be NULL) + * @param reason The reason for the invalidation, for stats + * @param mustFind If true, it is an error if the page + * can't be found + * + * @return UDS_SUCCESS or an error code + **/ +int findInvalidateAndMakeLeastRecent(PageCache *cache, + unsigned int physicalPage, + QueuedRead *readQueue, + InvalidationReason reason, + bool mustFind); + +/** + * Make the page the most recent in the cache + * + * @param cache the page cache + * @param pagePtr the page to make most recent + * + * @return UDS_SUCCESS or an error code + **/ +void makePageMostRecent(PageCache *cache, CachedPage *pagePtr); + +/** + * Verifies that a page is in the cache. This method is only exposed for the + * use of unit tests. + * + * @param cache the cache to verify + * @param page the page to find + * + * @return UDS_SUCCESS or an error code + **/ +int assertPageInCache(PageCache *cache, CachedPage *page) + __attribute__((warn_unused_result)); + +/** + * Gets a page from the cache. 
+ * + * @param [in] cache the page cache + * @param [in] physicalPage the page number + * @param [in] probeType the type of cache access being done (CacheProbeType + * optionally OR'ed with CACHE_PROBE_IGNORE_FAILURE) + * @param [out] pagePtr the found page + * + * @return UDS_SUCCESS or an error code + **/ +int getPageFromCache(PageCache *cache, + unsigned int physicalPage, + int probeType, + CachedPage **pagePtr) + __attribute__((warn_unused_result)); + +/** + * Enqueue a read request + * + * @param cache the page cache + * @param request the request that depends on the read + * @param physicalPage the physicalPage for the request + * + * @return UDS_QUEUED if the page was queued + * UDS_SUCCESS if the queue was full + * an error code if there was an error + **/ +int enqueueRead(PageCache *cache, Request *request, unsigned int physicalPage) + __attribute__((warn_unused_result)); + +/** + * Reserves a queued read for future dequeuing, but does not remove it from + * the queue. Must call releaseReadQueueEntry to complete the process + * + * @param cache the page cache + * @param queuePos the position in the read queue for this pending read + * @param firstRequests list of requests for the pending read + * @param physicalPage the physicalPage for the requests + * @param invalid whether or not this entry is invalid + * + * @return UDS_SUCCESS or an error code + **/ +bool reserveReadQueueEntry(PageCache *cache, + unsigned int *queuePos, + Request **firstRequests, + unsigned int *physicalPage, + bool *invalid); + +/** + * Releases a read from the queue, allowing it to be reused by future + * enqueues + * + * @param cache the page cache + * @param queuePos queue entry position + * + * @return UDS_SUCCESS or an error code + **/ +void releaseReadQueueEntry(PageCache *cache, + unsigned int queuePos); + +/** + * Check for the page cache read queue being empty. + * + * @param cache the page cache for which to check the read queue. + * + * @return true if the read queue for cache is empty, false otherwise. + **/ +static INLINE bool readQueueIsEmpty(PageCache *cache) +{ + return (cache->readQueueFirst == cache->readQueueLast); +} + +/** + * Check for the page cache read queue being full. + * + * @param cache the page cache for which to check the read queue. + * + * @return true if the read queue for cache is full, false otherwise. + **/ +static INLINE bool readQueueIsFull(PageCache *cache) +{ + return (cache->readQueueFirst == + (cache->readQueueLast + 1) % cache->readQueueMaxSize); +} + +/** + * Selects a page in the cache to be used for a read. + * + * This will clear the pointer in the page map and + * set readPending to true on the cache page + * + * @param cache the page cache + * @param pagePtr the page to add + * + * @return UDS_SUCCESS or an error code + **/ +int selectVictimInCache(PageCache *cache, + CachedPage **pagePtr) + __attribute__((warn_unused_result)); + +/** + * Completes an async page read in the cache, so that + * the page can now be used for incoming requests. + * + * This will invalidate the old cache entry and point + * the page map for the new page to this entry + * + * @param cache the page cache + * @param physicalPage the page number + * @param page the page to complete processing on + * + * @return UDS_SUCCESS or an error code + **/ +int putPageInCache(PageCache *cache, + unsigned int physicalPage, + CachedPage *page) + __attribute__((warn_unused_result)); + +/** + * Cancels an async page read in the cache, so that + * the page can now be used for incoming requests. 
+ *
+ * This will invalidate the old cache entry and clear
+ * the read queued flag on the page map entry, if it
+ * was set.
+ *
+ * @param cache         the page cache
+ * @param physicalPage  the page number to clear the queued read flag on
+ * @param page          the page to cancel processing on
+ **/
+void cancelPageInCache(PageCache   *cache,
+                       unsigned int physicalPage,
+                       CachedPage  *page);
+
+/**
+ * Get the page cache size
+ *
+ * @param cache the page cache
+ *
+ * @return the size of the page cache
+ **/
+size_t getPageCacheSize(PageCache *cache)
+  __attribute__((warn_unused_result));
+
+
+/**
+ * Read the InvalidateCounter for the given zone.
+ *
+ * @param cache       the page cache
+ * @param zoneNumber  the zone number
+ *
+ * @return the InvalidateCounter value
+ **/
+static INLINE InvalidateCounter getInvalidateCounter(PageCache    *cache,
+                                                     unsigned int  zoneNumber)
+{
+  return atomic64_read(&cache->searchPendingCounters[zoneNumber].atomicValue);
+}
+
+/**
+ * Write the InvalidateCounter for the given zone.
+ *
+ * @param cache              the page cache
+ * @param zoneNumber         the zone number
+ * @param invalidateCounter  the InvalidateCounter value to write
+ **/
+static INLINE void setInvalidateCounter(PageCache         *cache,
+                                        unsigned int       zoneNumber,
+                                        InvalidateCounter  invalidateCounter)
+{
+  atomic64_set(&cache->searchPendingCounters[zoneNumber].atomicValue,
+               invalidateCounter);
+}
+
+/**
+ * Return the physical page number of the page being searched. The return
+ * value is only valid if searchPending indicates that a search is in progress.
+ *
+ * @param counter  the InvalidateCounter value to check
+ *
+ * @return the page that the zone is searching
+ **/
+static INLINE unsigned int pageBeingSearched(InvalidateCounter counter)
+{
+  return counter & PAGE_FIELD;
+}
+
+/**
+ * Determines whether a given value indicates that a search is occurring.
+ *
+ * @param invalidateCounter  the InvalidateCounter value to check
+ *
+ * @return true if a search is pending, false otherwise
+ **/
+static INLINE bool searchPending(InvalidateCounter invalidateCounter)
+{
+  return (invalidateCounter & COUNTER_LSB) != 0;
+}
+
+/**
+ * Determines whether there is a search occurring for the given zone.
+ *
+ * @param cache       the page cache
+ * @param zoneNumber  the zone number
+ *
+ * @return true if a search is pending, false otherwise
+ **/
+static INLINE bool isSearchPending(PageCache    *cache,
+                                   unsigned int  zoneNumber)
+{
+  return searchPending(getInvalidateCounter(cache, zoneNumber));
+}
+
+/**
+ * Increment the counter for the specified zone to signal that a search has
+ * begun. Also set which page is being searched. The searchPendingCounters
+ * are protecting read access to pages indexed by the cache. This is the
+ * "lock" action.
+ *
+ * @param cache         the page cache
+ * @param physicalPage  the page that the zone is searching
+ * @param zoneNumber    the zone number
+ **/
+static INLINE void beginPendingSearch(PageCache    *cache,
+                                      unsigned int  physicalPage,
+                                      unsigned int  zoneNumber)
+{
+  InvalidateCounter invalidateCounter = getInvalidateCounter(cache,
+                                                             zoneNumber);
+  invalidateCounter &= ~PAGE_FIELD;
+  invalidateCounter |= physicalPage;
+  invalidateCounter += COUNTER_LSB;
+  setInvalidateCounter(cache, zoneNumber, invalidateCounter);
+  ASSERT_LOG_ONLY(searchPending(invalidateCounter),
+                  "Search is pending for zone %u", zoneNumber);
+  /*
+   * This memory barrier ensures that the write to the invalidate counter is
+   * seen by other threads before this thread accesses the cached page. 
The + * corresponding read memory barrier is in waitForPendingSearches. + */ + smp_mb(); +} + +/** + * Increment the counter for the specified zone to signal that a search has + * finished. We do not need to reset the page since we only should ever look + * at the page value if the counter indicates a search is ongoing. The + * searchPendingCounters are protecting read access to pages indexed by the + * cache. This is the "unlock" action. + * + * @param cache the page cache + * @param zoneNumber the zone number + **/ +static INLINE void endPendingSearch(PageCache *cache, + unsigned int zoneNumber) +{ + // This memory barrier ensures that this thread completes reads of the + // cached page before other threads see the write to the invalidate counter. + smp_mb(); + + InvalidateCounter invalidateCounter = getInvalidateCounter(cache, + zoneNumber); + ASSERT_LOG_ONLY(searchPending(invalidateCounter), + "Search is pending for zone %u", zoneNumber); + invalidateCounter += COUNTER_LSB; + setInvalidateCounter(cache, zoneNumber, invalidateCounter); +} + +#endif /* PAGE_CACHE_H */ diff --git a/uds/permassert.c b/uds/permassert.c new file mode 100644 index 0000000..0c8afeb --- /dev/null +++ b/uds/permassert.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/permassert.c#1 $ + */ + +#include "permassert.h" +#include "permassertInternals.h" + +#include "errors.h" + +/*****************************************************************************/ +int assertionFailed(const char *expressionString, + int code, + const char *fileName, + int lineNumber, + const char *format, + ...) +{ + va_list args; + va_start(args, format); + handleAssertionFailure(expressionString, fileName, lineNumber, format, args); + va_end(args); + + return code; +} + +/*****************************************************************************/ +int assertionFailedLogOnly(const char *expressionString, + const char *fileName, + int lineNumber, + const char *format, + ...) +{ + va_list args; + va_start(args, format); + handleAssertionFailure(expressionString, fileName, lineNumber, format, args); + va_end(args); + + return UDS_ASSERTION_FAILED; +} diff --git a/uds/permassert.h b/uds/permassert.h new file mode 100644 index 0000000..d04336b --- /dev/null +++ b/uds/permassert.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/permassert.h#1 $ + */ + +#ifndef PERMASSERT_H +#define PERMASSERT_H + +#include "compiler.h" +#include "errors.h" +#include "uds-error.h" + +#define STRINGIFY(X) #X +#define STRINGIFY_VALUE(X) STRINGIFY(X) + +/* + * A hack to apply the "warn if unused" attribute to an integral expression. + * + * Since GCC doesn't propagate the warn_unused_result attribute to + * conditional expressions incorporating calls to functions with that + * attribute, this function can be used to wrap such an expression. + * With optimization enabled, this function contributes no additional + * instructions, but the warn_unused_result attribute still applies to + * the code calling it. + * + * @param value The value to return + * + * @return The supplied value + */ +__attribute__((warn_unused_result)) +static INLINE int mustUse(int value) +{ + return value; +} + +/* + * A replacement for assert() from assert.h. + * + * @param expr The boolean expression being asserted + * @param code The error code to return on non-fatal assertion + * failure + * @param format A printf() style format for the message to log on + * assertion failure + * @param arguments Any additional arguments required by the format + * + * @return UDS_SUCCESS If expr is true, code if expr is false and + * exitOnAssertionFailure is false. When exitOnAssertionFailure + * is true and expr is false, the program will exit from within + * this macro. + */ +#define ASSERT_WITH_ERROR_CODE(expr, code, ...) \ + mustUse(__builtin_expect(!!(expr), 1) \ + ? UDS_SUCCESS \ + : assertionFailed(STRINGIFY(expr), code, __FILE__, __LINE__, \ + __VA_ARGS__)) + +/* + * A replacement for assert() from assert.h. + * + * @param expr The boolean expression being asserted + * @param format A printf() style format for the message to log on + * assertion failure + * @param arguments Any additional arguments required by the format + * + * @return UDS_SUCCESS If expr is true, UDS_ASSERTION_FAILED if expr is + * false and exitOnAssertionFailure is false. When + * exitOnAssertionFailure is true and expr is false, the + * program will exit from within this macro. + */ +#define ASSERT(expr, ...) \ + ASSERT_WITH_ERROR_CODE(expr, UDS_ASSERTION_FAILED, __VA_ARGS__) + +/* + * A replacement for assert() which logs on failure, but does not return an + * error code. This should be used sparingly. If the expression is false and + * exitOnAssertionFailure is true, the program will exit from within this macro. + * + * @param expr The boolean expression being asserted + * @param format A printf() syle format for the message to log on + * assertion failure + * @param arguments Any additional arguments required by the format + */ +#define ASSERT_LOG_ONLY(expr, ...) \ + (__builtin_expect(!!(expr), 1) \ + ? UDS_SUCCESS \ + : assertionFailedLogOnly(STRINGIFY(expr), __FILE__, __LINE__, __VA_ARGS__)) + +/* + * This macro is a convenient wrapper for ASSERT(false, ...). + */ +#define ASSERT_FALSE(...) 
\ + ASSERT(false, __VA_ARGS__) + +#define STATIC_ASSERT(expr) \ + do { \ + switch (0) { \ + case 0: \ + case expr: \ + ; \ + default: \ + ; \ + } \ + } while(0) + +#define STATIC_ASSERT_SIZEOF(type, expectedSize) \ + STATIC_ASSERT(sizeof(type) == (expectedSize)) + +/** + * Set whether or not to exit on an assertion failure. + * + * @param shouldExit If true assertion failures will cause + * the program to exit + * + * @return The previous setting + **/ +bool setExitOnAssertionFailure(bool shouldExit); + +/** + * Log an assertion failure. + * + * @param expressionString The assertion + * @param errorCode The error code to return + * @param fileName The file in which the assertion appears + * @param lineNumber The line number on which the assertion + * appears + * @param format A printf() style format describing the + * assertion + * + * @return The supplied errorCode unless exitOnAssertionFailure is + * true, in which case the process will be aborted + **/ +int assertionFailed(const char *expressionString, + int errorCode, + const char *fileName, + int lineNumber, + const char *format, + ...) + __attribute__((format(printf, 5, 6), warn_unused_result)); + +/** + * Log an assertion failure. This function is different from + * assertionFailed() in that its return value may be ignored, and so should + * only be used in cases where the return value will be ignored. + * + * @param expressionString The assertion + * @param fileName The file in which the assertion appears + * @param lineNumber The line number on which the assertion + * appears + * @param format A printf() style format describing the + * assertion + * + * @return UDS_ASSERTION_FAILED unless exitOnAssertionFailure is + * true, in which case the process will be aborted + **/ +int assertionFailedLogOnly(const char *expressionString, + const char *fileName, + int lineNumber, + const char *format, + ...) + __attribute__((format(printf, 4, 5))); + +#endif /* PERMASSERT_H */ diff --git a/uds/permassertInternals.h b/uds/permassertInternals.h new file mode 100644 index 0000000..f0a3b95 --- /dev/null +++ b/uds/permassertInternals.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/permassertInternals.h#1 $
+ */
+
+#ifndef PERMASSERT_INTERNALS_H
+#define PERMASSERT_INTERNALS_H
+
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void handleAssertionFailure(const char *expressionString,
+                            const char *fileName,
+                            int         lineNumber,
+                            const char *format,
+                            va_list     args)
+  __attribute__((format(printf, 4, 0)));
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* PERMASSERT_INTERNALS_H */
diff --git a/uds/permassertLinuxKernel.c b/uds/permassertLinuxKernel.c
new file mode 100644
index 0000000..67f66d9
--- /dev/null
+++ b/uds/permassertLinuxKernel.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/kernelLinux/uds/permassertLinuxKernel.c#1 $
+ */
+
+#include "logger.h"
+#include "permassert.h"
+#include "permassertInternals.h"
+
+/**********************************************************************/
+__attribute__((format(printf, 4, 0)))
+void handleAssertionFailure(const char *expressionString,
+                            const char *fileName,
+                            int         lineNumber,
+                            const char *format,
+                            va_list     args)
+{
+  logEmbeddedMessage(LOG_ERR, "assertion \"", format, args,
+                     "\" (%s) failed at %s:%d",
+                     expressionString, fileName, lineNumber);
+  logBacktrace(LOG_ERR);
+}
diff --git a/uds/random.c b/uds/random.c
new file mode 100644
index 0000000..acad146
--- /dev/null
+++ b/uds/random.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/random.c#2 $
+ */
+
+#include "random.h"
+
+#include "permassert.h"
+
+/*****************************************************************************/
+unsigned int randomInRange(unsigned int lo, unsigned int hi)
+{
+  return lo + random() % (hi - lo + 1);
+}
+
+/*****************************************************************************/
+void randomCompileTimeAssertions(void)
+{
+  STATIC_ASSERT((((uint64_t) RAND_MAX + 1) & RAND_MAX) == 0);
+}
+
+#ifndef __KERNEL__
+/*****************************************************************************/
+void fillRandomly(void *ptr, size_t len)
+{
+  uint64_t randNum  = 0;
+  uint64_t randMask = 0;
+  const uint64_t multiplier = (uint64_t) RAND_MAX + 1;
+
+  byte *bp = ptr;
+  for (size_t i = 0; i < len; ++i) {
+    if (randMask < 0xff) {
+      randNum  = randNum * multiplier + random();
+      randMask = randMask * multiplier + RAND_MAX;
+    }
+    bp[i] = randNum & 0xff;
+    randNum  >>= 8;
+    randMask >>= 8;
+  }
+}
+#endif
diff --git a/uds/random.h b/uds/random.h
new file mode 100644
index 0000000..f5d2f49
--- /dev/null
+++ b/uds/random.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/random.h#2 $
+ */
+
+#ifndef RANDOM_H
+#define RANDOM_H
+
+#ifdef __KERNEL__
+#include <linux/random.h>
+#else
+#include <stdlib.h>
+#endif
+
+#include "compiler.h"
+#include "typeDefs.h"
+
+/**
+ * Get a random unsigned integer in a given range
+ *
+ * @param lo  Minimum unsigned integer value
+ * @param hi  Maximum unsigned integer value
+ *
+ * @return unsigned integer in the interval [lo,hi]
+ **/
+unsigned int randomInRange(unsigned int lo, unsigned int hi);
+
+/**
+ * Special function wrapper required for compile-time assertions. This
+ * function will fail to compile if RAND_MAX is not of the form 2^n - 1.
+ **/
+void randomCompileTimeAssertions(void);
+
+/**
+ * Fill bytes with random data.
+ *
+ * @param ptr  where to store bytes
+ * @param len  number of bytes to write
+ **/
+#ifdef __KERNEL__
+static INLINE void fillRandomly(void *ptr, size_t len)
+{
+  prandom_bytes(ptr, len);
+}
+#else
+void fillRandomly(void *ptr, size_t len);
+#endif
+
+#ifdef __KERNEL__
+#define RAND_MAX 2147483647
+
+/**
+ * Random number generator
+ *
+ * @return a random number in the range 0 to RAND_MAX
+ **/
+static INLINE long random(void)
+{
+  long value;
+  fillRandomly(&value, sizeof(value));
+  return value & RAND_MAX;
+}
+#endif
+
+#endif /* RANDOM_H */
diff --git a/uds/recordPage.c b/uds/recordPage.c
new file mode 100644
index 0000000..f4c2572
--- /dev/null
+++ b/uds/recordPage.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/recordPage.c#3 $ + */ + +#include "recordPage.h" + +#include "permassert.h" + +/**********************************************************************/ +static unsigned int encodeTree(byte recordPage[], + const UdsChunkRecord *sortedPointers[], + unsigned int nextRecord, + unsigned int node, + unsigned int nodeCount) +{ + if (node < nodeCount) { + unsigned int child = (2 * node) + 1; + nextRecord = encodeTree(recordPage, sortedPointers, nextRecord, + child, nodeCount); + + // In-order traversal: copy the contents of the next record + // into the page at the node offset. + memcpy(&recordPage[node * BYTES_PER_RECORD], + sortedPointers[nextRecord], + BYTES_PER_RECORD); + ++nextRecord; + + nextRecord = encodeTree(recordPage, sortedPointers, nextRecord, + child + 1, nodeCount); + } + return nextRecord; +} + +/**********************************************************************/ +int encodeRecordPage(const Volume *volume, + const UdsChunkRecord records[], + byte recordPage[]) +{ + unsigned int recordsPerPage = volume->geometry->recordsPerPage; + const UdsChunkRecord **recordPointers = volume->recordPointers; + + // Build an array of record pointers. We'll sort the pointers by the block + // names in the records, which is less work than sorting the record values. + unsigned int i; + for (i = 0; i < recordsPerPage; i++) { + recordPointers[i] = &records[i]; + } + + STATIC_ASSERT(offsetof(UdsChunkRecord, name) == 0); + int result = radixSort(volume->radixSorter, (const byte **) recordPointers, + recordsPerPage, UDS_CHUNK_NAME_SIZE); + if (result != UDS_SUCCESS) { + return result; + } + + // Use the sorted pointers to copy the records from the chapter to the + // record page in tree order. + encodeTree(recordPage, recordPointers, 0, 0, recordsPerPage); + return UDS_SUCCESS; +} + +/**********************************************************************/ +bool searchRecordPage(const byte recordPage[], + const UdsChunkName *name, + const Geometry *geometry, + UdsChunkData *metadata) +{ + // The record page is just an array of chunk records. + const UdsChunkRecord *records = (const UdsChunkRecord *) recordPage; + + // The array of records is sorted by name and stored as a binary tree in + // heap order, so the root of the tree is the first array element. + unsigned int node = 0; + while (node < geometry->recordsPerPage) { + const UdsChunkRecord *record = &records[node]; + int result = memcmp(name, &record->name, UDS_CHUNK_NAME_SIZE); + if (result == 0) { + if (metadata != NULL) { + *metadata = record->data; + } + return true; + } + // The children of node N are in the heap at indexes 2N+1 and 2N+2. + node = ((2 * node) + ((result < 0) ? 
1 : 2)); + } + return false; +} diff --git a/uds/recordPage.h b/uds/recordPage.h new file mode 100644 index 0000000..ecf9ddc --- /dev/null +++ b/uds/recordPage.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/recordPage.h#2 $ + */ + +#ifndef RECORDPAGE_H +#define RECORDPAGE_H 1 + +#include "common.h" +#include "volume.h" + +/** + * Generate the on-disk encoding of a record page from the list of records + * in the open chapter representation. + * + * @param volume The volume + * @param records The records to be encoded + * @param recordPage The record page + * + * @return UDS_SUCCESS or an error code + **/ +int encodeRecordPage(const Volume *volume, + const UdsChunkRecord records[], + byte recordPage[]); + +/** + * Find the metadata for a given block name in this page. + * + * @param recordPage The record page + * @param name The block name to look for + * @param geometry The geometry of the volume + * @param metadata an array in which to place the metadata of the + * record, if one was found + * + * @return true if the record was found + **/ +bool searchRecordPage(const byte recordPage[], + const UdsChunkName *name, + const Geometry *geometry, + UdsChunkData *metadata); + +#endif /* RECORDPAGE_H */ diff --git a/uds/regionIdentifiers.h b/uds/regionIdentifiers.h new file mode 100644 index 0000000..ff72b19 --- /dev/null +++ b/uds/regionIdentifiers.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/regionIdentifiers.h#1 $ + */ + +#ifndef REGION_IDENTIFIERS_H +#define REGION_IDENTIFIERS_H + +enum { + RH_TYPE_FREE = 0, // unused + RH_TYPE_SUPER = 1, + RH_TYPE_SAVE = 2, + RH_TYPE_CHECKPOINT = 3, + RH_TYPE_UNSAVED = 4, + + RL_KIND_SCRATCH = 0, // uninitialized or scrapped + RL_KIND_HEADER = 1, // for self-referential items + RL_KIND_CONFIG = 100, + RL_KIND_INDEX = 101, + RL_KIND_SEAL = 102, + RL_KIND_VOLUME = 201, + RL_KIND_SAVE = 202, + RL_KIND_INDEX_PAGE_MAP = 301, + RL_KIND_MASTER_INDEX = 302, + RL_KIND_OPEN_CHAPTER = 303, + RL_KIND_INDEX_STATE = 401, // not saved as region + + RL_SOLE_INSTANCE = 65535, +}; + +typedef unsigned int RegionType; +typedef unsigned int RegionKind; + +#endif // REGION_IDENTIFIERS_H diff --git a/uds/request.c b/uds/request.c new file mode 100644 index 0000000..c994181 --- /dev/null +++ b/uds/request.c @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/request.c#6 $ + */ + +#include "request.h" + +#include "indexRouter.h" +#include "indexSession.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "requestQueue.h" + +/**********************************************************************/ +int udsStartChunkOperation(UdsRequest *udsRequest) +{ + if (udsRequest->callback == NULL) { + return UDS_CALLBACK_REQUIRED; + } + switch (udsRequest->type) { + case UDS_DELETE: + case UDS_POST: + case UDS_QUERY: + case UDS_UPDATE: + break; + default: + return UDS_INVALID_OPERATION_TYPE; + } + memset(udsRequest->private, 0, sizeof(udsRequest->private)); + Request *request = (Request *)udsRequest; + + int result = getIndexSession(request->session); + if (result != UDS_SUCCESS) { + return sansUnrecoverable(result); + } + + request->found = false; + request->action = (RequestAction) request->type; + request->isControlMessage = false; + request->unbatched = false; + request->router = request->session->router; + + enqueueRequest(request, STAGE_TRIAGE); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int launchZoneControlMessage(RequestAction action, + ZoneMessage message, + unsigned int zone, + IndexRouter *router) +{ + Request *request; + int result = ALLOCATE(1, Request, __func__, &request); + if (result != UDS_SUCCESS) { + return result; + } + + request->router = router; + request->isControlMessage = true; + request->unbatched = true; + request->action = action; + request->zoneNumber = zone; + request->zoneMessage = message; + + enqueueRequest(request, STAGE_INDEX); + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeRequest(Request *request) +{ + if (request != NULL) { + FREE(request); + } +} + 
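+/*
+ * Illustrative sketch only (not part of this change): one way a client of the
+ * public API might drive the pipeline started by udsStartChunkOperation()
+ * above. The names "mySession", "myCallback", and "name" are hypothetical
+ * placeholders, not symbols defined by UDS.
+ *
+ *   UdsRequest request = {
+ *     .chunkName = name,        // the chunk's hash
+ *     .callback  = myCallback,  // required; otherwise UDS_CALLBACK_REQUIRED
+ *     .session   = mySession,   // an open index session
+ *     .type      = UDS_POST,    // or UDS_UPDATE, UDS_QUERY, UDS_DELETE
+ *   };
+ *   int result = udsStartChunkOperation(&request);
+ *   // On UDS_SUCCESS the request enters STAGE_TRIAGE and its completion is
+ *   // reported asynchronously on the callback thread via myCallback.
+ */
+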
+/**********************************************************************/ +static RequestQueue *getNextStageQueue(Request *request, + RequestStage nextStage) +{ + if (nextStage == STAGE_CALLBACK) { + return request->session->callbackQueue; + } + + // Local and remote index routers handle the rest of the pipeline + // differently, so delegate the choice of queue to the router. + return selectIndexRouterQueue(request->router, request, nextStage); +} + +/**********************************************************************/ +static void handleRequestErrors(Request *request) +{ + // XXX Use the router's callback function to hand back the error + // and clean up the request? (Possible thread issues doing that.) + + freeRequest(request); +} + +/**********************************************************************/ +void enqueueRequest(Request *request, RequestStage nextStage) +{ + RequestQueue *nextQueue = getNextStageQueue(request, nextStage); + if (nextQueue == NULL) { + handleRequestErrors(request); + return; + } + + requestQueueEnqueue(nextQueue, request); +} + +/* + * This function pointer allows unit test code to intercept the slow-lane + * requeuing of a request. + */ +static RequestRestarter requestRestarter = NULL; + +/**********************************************************************/ +void restartRequest(Request *request) +{ + request->requeued = true; + if (requestRestarter == NULL) { + enqueueRequest(request, STAGE_INDEX); + } else { + requestRestarter(request); + } +} + +/**********************************************************************/ +void setRequestRestarter(RequestRestarter restarter) +{ + requestRestarter = restarter; +} + +/**********************************************************************/ +static INLINE void increment_once(uint64_t *countPtr) +{ + WRITE_ONCE(*countPtr, READ_ONCE(*countPtr) + 1); +} + +/**********************************************************************/ +void updateRequestContextStats(Request *request) +{ + /* + * We don't need any synchronization since the context stats are only + * modified from the single callback thread. + * + * We increment either 2 or 3 counters in this method. + * + * XXX We always increment the "requests" counter. But there is no code + * that uses the value stored in this counter. + * + * We always increment exactly one of these counters (unless there is an + * error in the code, which never happens): + * postsFound postsNotFound + * updatesFound updatesNotFound + * deletionsFound deletionsNotFound + * queriesFound queriesNotFound + * + * XXX In the case of post request that were found in the index, we increment + * exactly one of these counters. But there is no code that uses the + * value stored in these counters. 
+ * inMemoryPostsFound + * densePostsFound + * sparsePostsFound + */ + + SessionStats *sessionStats = &request->session->stats; + + increment_once(&sessionStats->requests); + bool found = (request->location != LOC_UNAVAILABLE); + + switch (request->action) { + case REQUEST_INDEX: + if (found) { + increment_once(&sessionStats->postsFound); + + if (request->location == LOC_IN_OPEN_CHAPTER) { + increment_once(&sessionStats->postsFoundOpenChapter); + } else if (request->location == LOC_IN_DENSE) { + increment_once(&sessionStats->postsFoundDense); + } else if (request->location == LOC_IN_SPARSE) { + increment_once(&sessionStats->postsFoundSparse); + } + } else { + increment_once(&sessionStats->postsNotFound); + } + break; + + case REQUEST_UPDATE: + if (found) { + increment_once(&sessionStats->updatesFound); + } else { + increment_once(&sessionStats->updatesNotFound); + } + break; + + case REQUEST_DELETE: + if (found) { + increment_once(&sessionStats->deletionsFound); + } else { + increment_once(&sessionStats->deletionsNotFound); + } + break; + + case REQUEST_QUERY: + if (found) { + increment_once(&sessionStats->queriesFound); + } else { + increment_once(&sessionStats->queriesNotFound); + } + break; + + default: + request->status = ASSERT(false, "unknown next action in request: %d", + request->action); + } +} + +/**********************************************************************/ +void enterCallbackStage(Request *request) +{ + if (!request->isControlMessage) { + if (isUnrecoverable(request->status)) { + // Unrecoverable errors must disable the index session + disableIndexSession(request->session); + // The unrecoverable state is internal and must not sent to the client. + request->status = sansUnrecoverable(request->status); + } + + // Handle asynchronous client callbacks in the designated thread. + enqueueRequest(request, STAGE_CALLBACK); + } else { + /* + * Asynchronous control messages are complete when they are executed. + * There should be nothing they need to do on the callback thread. The + * message has been completely processed, so just free it. + */ + freeRequest(request); + } +} diff --git a/uds/request.h b/uds/request.h new file mode 100644 index 0000000..fb6250e --- /dev/null +++ b/uds/request.h @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/request.h#7 $ + */ + +#ifndef REQUEST_H +#define REQUEST_H + +#include "cacheCounters.h" +#include "common.h" +#include "compiler.h" +#include "opaqueTypes.h" +#include "threads.h" +#include "timeUtils.h" +#include "uds.h" +#include "util/funnelQueue.h" + +/** + * RequestAction values indicate what action, command, or query is to be + * performed when processing a Request instance. 
+ **/ +typedef enum { + // Map the API's UdsCallbackType values directly to a corresponding action. + REQUEST_INDEX = UDS_POST, + REQUEST_UPDATE = UDS_UPDATE, + REQUEST_DELETE = UDS_DELETE, + REQUEST_QUERY = UDS_QUERY, + + REQUEST_CONTROL, + + // REQUEST_SPARSE_CACHE_BARRIER is the action for the control request used + // by localIndexRouter. + REQUEST_SPARSE_CACHE_BARRIER, + + // REQUEST_ANNOUNCE_CHAPTER_CLOSED is the action for the control + // request used by an indexZone to signal the other zones that it + // has closed the current open chapter. + REQUEST_ANNOUNCE_CHAPTER_CLOSED, +} RequestAction; + +/** + * The block's rough location in the index, if any. + **/ +typedef enum { + /* the block doesn't exist or the location isn't available */ + LOC_UNAVAILABLE, + /* if the block was found in the open chapter */ + LOC_IN_OPEN_CHAPTER, + /* if the block was found in the dense part of the index */ + LOC_IN_DENSE, + /* if the block was found in the sparse part of the index */ + LOC_IN_SPARSE +} IndexRegion; + +/** + * Abstract request pipeline stages, which can also be viewed as stages in the + * life-cycle of a request. + **/ +typedef enum { + STAGE_TRIAGE, + STAGE_INDEX, + STAGE_CALLBACK, +} RequestStage; + +/** + * Control message fields for the barrier messages used to coordinate the + * addition of a chapter to the sparse chapter index cache. + **/ +typedef struct barrierMessageData { + /** virtual chapter number of the chapter index to add to the sparse cache */ + uint64_t virtualChapter; +} BarrierMessageData; + +/** + * Control message fields for the chapter closed messages used to inform + * lagging zones of the first zone to close a given open chapter. + **/ +typedef struct chapterClosedMessageData { + /** virtual chapter number of the chapter which was closed */ + uint64_t virtualChapter; +} ChapterClosedMessageData; + +/** + * Union of the all the zone control message fields. The RequestAction field + * (or launch function argument) selects which of the members is valid. + **/ +typedef union zoneMessageData { + BarrierMessageData barrier; // for REQUEST_SPARSE_CACHE_BARRIER + ChapterClosedMessageData chapterClosed; // for REQUEST_ANNOUNCE_CHAPTER_CLOSED +} ZoneMessageData; + +typedef struct zoneMessage { + /** the index to which the message is directed */ + struct index *index; + /** the message specific data */ + ZoneMessageData data; +} ZoneMessage; + +/** + * Request context for queuing throughout the uds pipeline + * + * XXX Note that the typedef for this struct defines "Request", and that this + * should therefore be "struct request". However, this conflicts with the + * Linux kernel which also has a "struct request". This is a workaround so + * that we can make upstreaming progress. The real solution is to expose + * this structure as the true "struct uds_request" and do a lot of + * renaming. + **/ +struct internalRequest { + /* + * The first part of this structure must be exactly parallel to the + * UdsRequest structure, which is part of the public UDS API. 
+ */ + UdsChunkName chunkName; // hash value + UdsChunkData oldMetadata; // metadata from index + UdsChunkData newMetadata; // metadata from request + UdsChunkCallback *callback; // callback method when complete + struct uds_index_session *session; // The public index session + UdsCallbackType type; // the type of request + int status; // success or error code for this request + bool found; // True if the block was found in index + bool update; // move record to newest chapter if found + + /* + * The remainder of this structure is private to the UDS implementation. + */ + FunnelQueueEntry requestQueueLink; // for lock-free request queue + Request *nextRequest; + IndexRouter *router; + + // Data for control message requests + ZoneMessage zoneMessage; + bool isControlMessage; + + bool unbatched; // if true, must wake worker when enqueued + bool requeued; + RequestAction action; // the action for the index to perform + unsigned int zoneNumber; // the zone for this request to use + IndexRegion location; // if and where the block was found + + bool slLocationKnown; // slow lane has determined a location + IndexRegion slLocation; // location determined by slowlane +}; + +typedef void (*RequestRestarter)(Request *); + +/** + * Make an asynchronous control message for an index zone and enqueue it for + * processing. + * + * @param action The control action to perform + * @param message The message to send + * @param zone The zone number of the zone to receive the message + * @param router The index router responsible for handling the message + * + * @return UDS_SUCCESS or an error code + **/ +int launchZoneControlMessage(RequestAction action, + ZoneMessage message, + unsigned int zone, + IndexRouter *router) + __attribute__((warn_unused_result)); + +/** + * Free an index request. + * + * @param request The request to free + **/ +void freeRequest(Request *request); + +/** + * Enqueue a request for the next stage of the pipeline. If there is more than + * one possible queue for a stage, this function uses the request to decide + * which queue should handle it. + * + * @param request The request to enqueue + * @param nextStage The next stage of the pipeline to process the request + **/ +void enqueueRequest(Request *request, RequestStage nextStage); + +/** + * A method to restart delayed requests. + * + * @param request The request to restart + **/ +void restartRequest(Request *request); + +/** + * Set the function pointer which is used to restart requests. + * This is needed by albserver code and is used as a test hook by the unit + * tests. + * + * @param restarter The function to call to restart requests. + **/ +void setRequestRestarter(RequestRestarter restarter); + +/** + * Enter the callback stage of processing for a request, notifying the waiting + * thread if the request is synchronous, freeing the request if it is an + * asynchronous control message, or placing it on the callback queue if it is + * an asynchronous client request. + * + * @param request the request which has completed execution + **/ +void enterCallbackStage(Request *request); + +/** + * Update the context statistics to reflect the successful completion of a + * client request. + * + * @param request a client request that has successfully completed execution + **/ +void updateRequestContextStats(Request *request); + +/** + * Compute the CacheProbeType value reflecting the request and page type. 
+ * + * @param request The request being processed, or NULL + * @param isIndexPage Whether the cache probe will be for an index page + * + * @return the cache probe type enumeration + **/ +static INLINE CacheProbeType cacheProbeType(Request *request, + bool isIndexPage) +{ + if ((request != NULL) && request->requeued) { + return isIndexPage ? CACHE_PROBE_INDEX_RETRY : CACHE_PROBE_RECORD_RETRY; + } else { + return isIndexPage ? CACHE_PROBE_INDEX_FIRST : CACHE_PROBE_RECORD_FIRST; + } +} +#endif /* REQUEST_H */ diff --git a/uds/requestQueue.h b/uds/requestQueue.h new file mode 100644 index 0000000..5bf7ef6 --- /dev/null +++ b/uds/requestQueue.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/requestQueue.h#1 $ + */ + +#ifndef REQUEST_QUEUE_H +#define REQUEST_QUEUE_H + +#include "opaqueTypes.h" +#include "typeDefs.h" + +/* void return value because this function will process its own errors */ +typedef void RequestQueueProcessor(Request *); + +/** + * Allocate a new request processing queue and start a worker thread to + * consume and service requests in the queue. + * + * @param queueName the name of the queue and the worker thread + * @param processOne the function the worker will invoke on each request + * @param queuePtr a pointer to receive the new queue + * + * @return UDS_SUCCESS or an error code + **/ +int makeRequestQueue(const char *queueName, + RequestQueueProcessor *processOne, + RequestQueue **queuePtr) + __attribute__((warn_unused_result)); + +/** + * Add a request to the end of the queue for processing by the worker thread. + * If the requeued flag is set on the request, it will be processed before + * any non-requeued requests under most circumstances. + * + * @param queue the request queue that should process the request + * @param request the request to be processed on the queue's worker thread + **/ +void requestQueueEnqueue(RequestQueue *queue, Request *request); + +/** + * Shut down the request queue worker thread, then destroy and free the queue. + * + * @param queue the queue to shut down and free + **/ +void requestQueueFinish(RequestQueue *queue); + +#endif /* REQUEST_QUEUE_H */ diff --git a/uds/requestQueueKernel.c b/uds/requestQueueKernel.c new file mode 100644 index 0000000..a53ff12 --- /dev/null +++ b/uds/requestQueueKernel.c @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/kernelLinux/uds/requestQueueKernel.c#3 $
+ */
+
+#include "requestQueue.h"
+
+#include <linux/wait.h>
+
+#include "atomicDefs.h"
+#include "compiler.h"
+#include "logger.h"
+#include "request.h"
+#include "memoryAlloc.h"
+#include "threads.h"
+#include "util/funnelQueue.h"
+
+/*
+ * Ordering:
+ *
+ * Multiple retry requests or multiple non-retry requests enqueued from
+ * a single producer thread will be processed in the order enqueued.
+ *
+ * Retry requests will generally be processed before normal requests.
+ *
+ * HOWEVER, a producer thread can enqueue a retry request (generally given
+ * higher priority) and then enqueue a normal request, and they can get
+ * processed in the reverse order. The checking of the two internal queues is
+ * very simple and there's a potential race with the producer regarding the
+ * "priority" handling. If an ordering guarantee is needed, it can be added
+ * without much difficulty; it just makes the code a bit more complicated.
+ *
+ * If requests are enqueued while the processing of another request is
+ * happening, and the enqueuing operations complete while the request
+ * processing is still in progress, then the retry request(s) *will*
+ * get processed next. (This is used for testing.)
+ */
+
+/**
+ * Time constants, all in units of nanoseconds.
+ **/
+enum {
+  ONE_NANOSECOND    =    1,
+  ONE_MICROSECOND   = 1000 * ONE_NANOSECOND,
+  ONE_MILLISECOND   = 1000 * ONE_MICROSECOND,
+  ONE_SECOND        = 1000 * ONE_MILLISECOND,
+
+  /** The initial time to wait after waiting with no timeout */
+  DEFAULT_WAIT_TIME = 20 * ONE_MICROSECOND,
+
+  /** The minimum time to wait when waiting with a timeout */
+  MINIMUM_WAIT_TIME = DEFAULT_WAIT_TIME / 2,
+
+  /** The maximum time to wait when waiting with a timeout */
+  MAXIMUM_WAIT_TIME = ONE_MILLISECOND
+};
+
+/**
+ * Batch size tuning constants. These are compared to the number of requests
+ * that have been processed since the worker thread last woke up.
+ **/
+enum {
+  MINIMUM_BATCH = 32, // wait time increases if batches are smaller than this
+  MAXIMUM_BATCH = 64  // wait time decreases if batches are larger than this
+};
+
+struct requestQueue {
+  /* Wait queue for synchronizing producers and consumer */
+  struct wait_queue_head wqhead;
+  /* function to process 1 request */
+  RequestQueueProcessor *processOne;
+  /* new incoming requests */
+  FunnelQueue *mainQueue;
+  /* old requests to retry first */
+  FunnelQueue *retryQueue;
+  /* thread id of the worker thread */
+  Thread thread;
+  /* true if the worker was started */
+  bool started;
+  /* when true, requests can be enqueued */
+  bool alive;
+  /* A flag set when the worker is waiting without a timeout */
+  atomic_t dormant;
+};
+
+/*****************************************************************************/
+/**
+ * Poll the underlying lock-free queues for a request to process. Must only be
+ * called by the worker thread. 
+ * + * @param queue the RequestQueue being serviced + * + * @return a dequeued request, or NULL if no request was available + **/ +static INLINE Request *pollQueues(RequestQueue *queue) +{ + // The retry queue has higher priority. + FunnelQueueEntry *entry = funnelQueuePoll(queue->retryQueue); + if (entry != NULL) { + return container_of(entry, Request, requestQueueLink); + } + + // The main queue has lower priority. + entry = funnelQueuePoll(queue->mainQueue); + if (entry != NULL) { + return container_of(entry, Request, requestQueueLink); + } + + // No entry found. + return NULL; +} + +/*****************************************************************************/ +/** + * Check if the underlying lock-free queues appear not just not to have any + * requests available right now, but also not to be in the intermediate state + * of getting requests added. Must only be called by the worker thread. + * + * @param queue the RequestQueue being serviced + * + * @return true iff both funnel queues are idle + **/ +static INLINE bool areQueuesIdle(RequestQueue *queue) +{ + return (isFunnelQueueIdle(queue->retryQueue) && + isFunnelQueueIdle(queue->mainQueue)); +} + +/*****************************************************************************/ +/** + * Remove the next request to be processed from the queue. Must only be called + * by the worker thread. + * + * @param queue the queue from which to remove an entry + * @param requestPtr the next request is returned here, or a NULL pointer to + * indicate that there will be no more requests + * @param waitedPtr return a boolean to indicate that we need to wait + * + * @return True when there is a next request, or when we know that there will + * never be another request. False when we must wait for a request. + **/ +static INLINE bool dequeueRequest(RequestQueue *queue, + Request **requestPtr, + bool *waitedPtr) +{ + // Because of batching, we expect this to be the most common code path. + Request *request = pollQueues(queue); + if (request != NULL) { + // Return because we found a request + *requestPtr = request; + return true; + } + + if (!READ_ONCE(queue->alive)) { + // Return because we see that shutdown is happening + *requestPtr = NULL; + return true; + } + + // Return indicating that we need to wait. + *requestPtr = NULL; + *waitedPtr = true; + return false; +} + +/*****************************************************************************/ +static void requestQueueWorker(void *arg) +{ + RequestQueue *queue = (RequestQueue *) arg; + unsigned long timeBatch = DEFAULT_WAIT_TIME; + bool dormant = atomic_read(&queue->dormant); + long currentBatch = 0; + + for (;;) { + Request *request; + bool waited = false; + if (dormant) { + /* + * Sleep/wakeup protocol: + * + * The enqueue operation updates "newest" in the + * funnel queue via xchg which is a memory barrier, + * and later checks "dormant" to decide whether to do + * a wakeup of the worker thread. + * + * The worker thread, when deciding to go to sleep, + * sets "dormant" and then examines "newest" to decide + * if the funnel queue is idle. In dormant mode, the + * last examination of "newest" before going to sleep + * is done inside the wait_event_interruptible macro, + * after a point where (one or more) memory barriers + * have been issued. (Preparing to sleep uses spin + * locks.) Even if the "next" field update isn't + * visible yet to make the entry accessible, its + * existence will kick the worker thread out of + * dormant mode and back into timer-based mode. 
+ * + * So the two threads should agree on the ordering of + * the updating of the two fields. + */ + wait_event_interruptible(queue->wqhead, + dequeueRequest(queue, &request, &waited) || + !areQueuesIdle(queue)); + } else { + wait_event_interruptible_hrtimeout(queue->wqhead, + dequeueRequest(queue, &request, + &waited), + ns_to_ktime(timeBatch)); + } + + if (likely(request != NULL)) { + // We got a request. + currentBatch++; + queue->processOne(request); + } else if (!READ_ONCE(queue->alive)) { + // We got no request and we know we are shutting down. + break; + } + + if (dormant) { + // We've been roused from dormancy. Clear the flag so enqueuers can stop + // broadcasting (no fence needed for this transition). + atomic_set(&queue->dormant, false); + dormant = false; + // Reset the timeout back to the default since we don't know how long + // we've been asleep and we also want to be responsive to a new burst. + timeBatch = DEFAULT_WAIT_TIME; + } else if (waited) { + // We waited for this request to show up. Adjust the wait time if the + // last batch of requests was too small or too large.. + if (currentBatch < MINIMUM_BATCH) { + // Adjust the wait time if the last batch of requests was too small. + timeBatch += timeBatch / 4; + if (timeBatch >= MAXIMUM_WAIT_TIME) { + // The timeout is getting long enough that we need to switch into + // dormant mode. + atomic_set(&queue->dormant, true); + dormant = true; + } + } else if (currentBatch > MAXIMUM_BATCH) { + // Adjust the wait time if the last batch of requests was too large. + timeBatch -= timeBatch / 4; + if (timeBatch < MINIMUM_WAIT_TIME) { + // But if the producer is very fast or the scheduler doesn't wake up + // up promptly, waiting for very short times won't make the batches + // smaller. + timeBatch = MINIMUM_WAIT_TIME; + } + } + // And we must now start a new batch count + currentBatch = 0; + } + } + + /* + * Ensure that we see any requests that were guaranteed to have been fully + * enqueued before shutdown was flagged. The corresponding write barrier + * is in requestQueueFinish. + */ + smp_rmb(); + + // Process every request that is still in the queue, and never wait for any + // new requests to show up. 
+ for (;;) { + Request *request = pollQueues(queue); + if (request == NULL) { + break; + } + queue->processOne(request); + } +} + +/**********************************************************************/ +int makeRequestQueue(const char *queueName, + RequestQueueProcessor *processOne, + RequestQueue **queuePtr) +{ + RequestQueue *queue; + int result = ALLOCATE(1, RequestQueue, __func__, &queue); + if (result != UDS_SUCCESS) { + return result; + } + queue->processOne = processOne; + queue->alive = true; + atomic_set(&queue->dormant, false); + init_waitqueue_head(&queue->wqhead); + + result = makeFunnelQueue(&queue->mainQueue); + if (result != UDS_SUCCESS) { + requestQueueFinish(queue); + return result; + } + + result = makeFunnelQueue(&queue->retryQueue); + if (result != UDS_SUCCESS) { + requestQueueFinish(queue); + return result; + } + + result = createThread(requestQueueWorker, queue, queueName, &queue->thread); + if (result != UDS_SUCCESS) { + requestQueueFinish(queue); + return result; + } + + queue->started = true; + smp_mb(); + *queuePtr = queue; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static INLINE void wakeUpWorker(RequestQueue *queue) +{ + // This is the code sequence recommended in + smp_mb(); + if (waitqueue_active(&queue->wqhead)) { + wake_up(&queue->wqhead); + } +} + +/**********************************************************************/ +void requestQueueEnqueue(RequestQueue *queue, Request *request) +{ + bool unbatched = request->unbatched; + funnelQueuePut(request->requeued ? queue->retryQueue : queue->mainQueue, + &request->requestQueueLink); + + /* + * We must wake the worker thread when it is dormant (waiting with no + * timeout). An atomic load (read fence) isn't needed here since we know the + * queue operation acts as one. + */ + if (atomic_read(&queue->dormant) || unbatched) { + wakeUpWorker(queue); + } +} + +/**********************************************************************/ +void requestQueueFinish(RequestQueue *queue) +{ + if (queue == NULL) { + return; + } + + /* + * This memory barrier ensures that any requests we queued will be seen. The + * point is that when dequeueRequest sees the following update to the alive + * flag, it will also be able to see any change we made to a next field in + * the FunnelQueue entry. The corresponding read barrier is in + * requestQueueWorker. + */ + smp_wmb(); + + // Mark the queue as dead. + WRITE_ONCE(queue->alive, false); + + if (queue->started) { + // Wake the worker so it notices that it should exit. + wakeUpWorker(queue); + + // Wait for the worker thread to finish processing any additional pending + // work and exit. + int result = joinThreads(queue->thread); + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "Failed to join worker thread"); + } + } + + freeFunnelQueue(queue->mainQueue); + freeFunnelQueue(queue->retryQueue); + FREE(queue); +} diff --git a/uds/searchList.c b/uds/searchList.c new file mode 100644 index 0000000..ec2ef70 --- /dev/null +++ b/uds/searchList.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/searchList.c#2 $ + */ + +#include "searchList.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" + +/**********************************************************************/ +int makeSearchList(unsigned int capacity, + SearchList **listPtr) +{ + if (capacity == 0) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "search list must have entries"); + } + if (capacity > UINT8_MAX) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "search list capacity must fit in 8 bits"); + } + + // We need three temporary entry arrays for purgeSearchList(). Allocate them + // contiguously with the main array. + unsigned int bytes = (sizeof(SearchList) + (4 * capacity * sizeof(uint8_t))); + SearchList *list; + int result = allocateCacheAligned(bytes, "search list", &list); + if (result != UDS_SUCCESS) { + return result; + } + + list->capacity = capacity; + list->firstDeadEntry = 0; + + // Fill in the indexes of the chapter index cache entries. These will be + // only ever be permuted as the search list is used. + uint8_t i; + for (i = 0; i < capacity; i++) { + list->entries[i] = i; + } + + *listPtr = list; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeSearchList(SearchList **listPtr) +{ + FREE(*listPtr); + *listPtr = NULL; +} + +/**********************************************************************/ +void purgeSearchList(SearchList *searchList, + const CachedChapterIndex chapters[], + uint64_t oldestVirtualChapter) +{ + if (searchList->firstDeadEntry == 0) { + // There are no live entries in the list to purge. + return; + } + + /* + * Partition the previously-alive entries in the list into three temporary + * lists, keeping the current LRU search order within each list. The element + * array was allocated with enough space for all four lists. + */ + uint8_t *entries = &searchList->entries[0]; + uint8_t *alive = &entries[searchList->capacity]; + uint8_t *skipped = &alive[searchList->capacity]; + uint8_t *dead = &skipped[searchList->capacity]; + unsigned int nextAlive = 0; + unsigned int nextSkipped = 0; + unsigned int nextDead = 0; + + int i; + for (i = 0; i < searchList->firstDeadEntry; i++) { + uint8_t entry = entries[i]; + const CachedChapterIndex *chapter = &chapters[entry]; + if ((chapter->virtualChapter < oldestVirtualChapter) + || (chapter->virtualChapter == UINT64_MAX)) { + dead[nextDead++] = entry; + } else if (chapter->skipSearch) { + skipped[nextSkipped++] = entry; + } else { + alive[nextAlive++] = entry; + } + } + + // Copy the temporary lists back to the search list so we wind up with + // [ alive, alive, skippable, new-dead, new-dead, old-dead, old-dead ] + memcpy(entries, alive, nextAlive); + entries += nextAlive; + + memcpy(entries, skipped, nextSkipped); + entries += nextSkipped; + + memcpy(entries, dead, nextDead); + // The first dead entry is now the start of the copied dead list. 
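+  // For example (illustrative): purging [ 3 1 4 0 | 2 ], where "|" marks
+  // firstDeadEntry, with entry 1 expired and entry 4 marked skipSearch
+  // yields [ 3 0 4 | 1 2 ], and firstDeadEntry drops from 4 to 3.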
+ searchList->firstDeadEntry = (nextAlive + nextSkipped); +} diff --git a/uds/searchList.h b/uds/searchList.h new file mode 100644 index 0000000..25d99e9 --- /dev/null +++ b/uds/searchList.h @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/searchList.h#1 $ + */ + +#ifndef SEARCH_LIST_H +#define SEARCH_LIST_H + +#include "cachedChapterIndex.h" +#include "compiler.h" +#include "stringUtils.h" +#include "typeDefs.h" + +/** + * A SearchList represents the permutations of the sparse chapter index cache + * entry array. Those permutations express an ordering on the chapter indexes, + * from most recently accessed to least recently accessed, which is the order + * in which the indexes should be searched and the reverse order in which they + * should be evicted from the cache (LRU cache replacement policy). + * + * Cache entries that are dead (virtualChapter == UINT64_MAX) are kept as a + * suffix of the list, avoiding the need to even iterate over them to search, + * and ensuring that dead entries are replaced before any live entries are + * evicted. + * + * The search list is intended to be instantated for each zone thread, + * avoiding any need for synchronization. The structure is allocated on a + * cache boundary to avoid false sharing of memory cache lines between zone + * threads. + **/ +typedef struct searchList { + /** The number of cached chapter indexes and search list entries */ + uint8_t capacity; + + /** The index in the entries array of the first dead cache entry */ + uint8_t firstDeadEntry; + + /** The chapter array indexes representing the chapter search order */ + uint8_t entries[]; +} SearchList; + +/** + * SearchListIterator captures the fields needed to iterate over the live + * entries in a search list and return the CachedChapterIndex pointers that + * the search code actually wants to deal with. + **/ +typedef struct { + /** The search list defining the chapter search iteration order */ + SearchList *list; + + /** The index of the next entry to return from the search list */ + unsigned int nextEntry; + + /** The cached chapters that are referenced by the search list */ + CachedChapterIndex *chapters; +} SearchListIterator; + +/** + * Allocate and initialize a new chapter cache search list with the same + * capacity as the cache. The index of each entry in the cache will appear + * exactly once in the array. All the chapters in the cache are assumed to be + * initially dead, so firstDeadEntry will be zero and no chapters will be + * returned when the search list is iterated. 
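+ *
+ * Typical use (an illustrative sketch, with error handling elided):
+ *
+ *   SearchList *list;
+ *   int result = makeSearchList(capacity, &list);
+ *   // ... search via iterateSearchList()/getNextChapter(),
+ *   // reorder via rotateSearchList(), then eventually:
+ *   freeSearchList(&list);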
+ * + * @param [in] capacity the number of entries in the search list + * @param [out] listPtr a pointer in which to return the new search list + **/ +int makeSearchList(unsigned int capacity, + SearchList **listPtr) + __attribute__((warn_unused_result)); + +/** + * Free a search list and null out the reference to it. + * + * @param listPtr the reference to the search list to free + **/ +void freeSearchList(SearchList **listPtr); + +/** + * Copy the contents of one search list to another. + * + * @param source the list to copy + * @param target the list to replace + **/ +static INLINE void copySearchList(const SearchList *source, + SearchList *target) +{ + *target = *source; + memcpy(target->entries, source->entries, source->capacity); +} + +/** + * Prepare to iterate over the live cache entries a search list. + * + * @param list the list defining the live chapters and the search order + * @param chapters the chapter index entries to return from getNextChapter() + * + * @return an iterator positioned at the start of the search list + **/ +static INLINE SearchListIterator +iterateSearchList(SearchList *list, CachedChapterIndex chapters[]) +{ + SearchListIterator iterator = { + .list = list, + .nextEntry = 0, + .chapters = chapters, + }; + return iterator; +} + +/** + * Check if the search list iterator has another entry to return. + * + * @param iterator the search list iterator + * + * @return true if getNextChapter() may be called + **/ +static INLINE bool hasNextChapter(const SearchListIterator *iterator) +{ + return (iterator->nextEntry < iterator->list->firstDeadEntry); +} + +/** + * Return a pointer to the next live chapter in the search list iteration and + * advance the iterator. This must only be called when hasNextChapter() + * returns true. + * + * @param iterator the search list iterator + * + * @return a pointer to the next live chapter index in the search list order + **/ +static INLINE CachedChapterIndex *getNextChapter(SearchListIterator *iterator) +{ + return &iterator->chapters[iterator->list->entries[iterator->nextEntry++]]; +} + +/** + * Rotate the pointers in a prefix of a search list downwards by one item, + * pushing elements deeper into the list and moving a new chapter to the start + * of the search list. This is the "make most recent" operation on the search + * list. + * + * If the search list provided is [ 0 1 2 3 4 ] and the prefix + * length is 4, then 3 is being moved to the front. + * The search list after the call will be [ 3 0 1 2 4 ] and the + * function will return 3. + * + * @param searchList the chapter index search list to rotate + * @param prefixLength the length of the prefix of the list to rotate + * + * @return the array index of the chapter cache entry that is now at the front + * of the search list + **/ +static INLINE uint8_t rotateSearchList(SearchList *searchList, + uint8_t prefixLength) +{ + // Grab the value of the last entry in the list prefix. + uint8_t mostRecent = searchList->entries[prefixLength - 1]; + + if (prefixLength > 1) { + // Push the first N-1 entries down by one entry, overwriting the entry + // we just grabbed. + memmove(&searchList->entries[1], + &searchList->entries[0], + prefixLength - 1); + + // We now have a hole at the front of the list in which we can place the + // rotated entry. + searchList->entries[0] = mostRecent; + } + + // This function is also used to move a dead chapter to the front of the + // list, in which case the suffix of dead chapters was pushed down too. 
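+  // In that case the entry now at the front came from the dead suffix and
+  // each dead entry above it was pushed down one slot, so the dead region
+  // now begins one position later.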
+ if (searchList->firstDeadEntry < prefixLength) { + searchList->firstDeadEntry += 1; + } + + return mostRecent; +} + +/** + * Purge invalid cache entries, marking them as dead and moving them to the + * end of the search list, then push any chapters that have skipSearch set + * down so they follow all the remaining live, valid chapters in the search + * list. This effectively sorts the search list into three regions--active, + * skippable, and dead--while maintaining the LRU ordering that already + * existed (a stable sort). + * + * This operation must only be called during the critical section in + * updateSparseCache() since it effectively changes cache membership. + * + * @param searchList the chapter index search list to purge + * @param chapters the chapter index cache entries + * @param oldestVirtualChapter the oldest virtual chapter + **/ +void purgeSearchList(SearchList *searchList, + const CachedChapterIndex chapters[], + uint64_t oldestVirtualChapter); + +#endif /* SEARCH_LIST_H */ diff --git a/uds/sparseCache.c b/uds/sparseCache.c new file mode 100644 index 0000000..f816d12 --- /dev/null +++ b/uds/sparseCache.c @@ -0,0 +1,535 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/sparseCache.c#3 $ + */ + +/** + * The sparse chapter index cache is implemented as a simple array of cache + * entries. Since the cache is small (seven chapters by default), searching + * for a specific virtual chapter is implemented as a linear search. The cache + * replacement policy is least-recently-used (LRU). Again, size of the cache + * allows the LRU order to be maintained by shifting entries in an array list. + * + * The most important property of this cache is the absence of synchronization + * for read operations. Safe concurrent access to the cache by the zone + * threads is controlled by the triage queue and the barrier requests it + * issues to the zone queues. The set of cached chapters does not and must not + * change between the carefully coordinated calls to updateSparseCache() from + * the zone threads. + * + * The critical invariant for that coordination is the cache membership must + * not change between those updates; the calls to sparseCacheContains() from + * the zone threads must all receive the same results for any virtual chapter + * number. To ensure that critical invariant, state changes such as "that + * virtual chapter is no longer in the volume" and "skip searching that + * chapter because it has had too many cache misses" are represented + * separately from the cache membership information (the virtual chapter + * number). 
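+ * (In this implementation those states are, respectively, the comparison of
+ * the entry's virtual chapter number against the zone's oldest virtual
+ * chapter, and the per-entry skipSearch flag.)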
+ * + * As a result of this invariant, we have the guarantee that every zone thread + * will call updateSparseCache() once and exactly once to request a chapter + * that is not in the cache, and the serialization of the barrier requests + * from the triage queue ensures they will all request the same chapter + * number. This means the only synchronization we need can be provided by a + * pair of thread barriers used only in the updateSparseCache() call, + * providing a critical section where a single zone thread can drive the cache + * update while all the other zone threads are known to be blocked, waiting in + * the second barrier. Outside that critical section, all the zone threads + * implicitly hold a shared lock. Inside it, the "captain" (the thread that + * was uniquely flagged when passing through the first barrier) holds an + * exclusive lock. No other threads may access or modify the cache, except for + * accessing cache statistics and similar queries. + * + * Cache statistics must only be modified by a single thread, conventionally + * the zone zero thread. All fields that might be frequently updated by that + * thread are kept in separate cache-aligned structures so they will not cause + * cache contention via "false sharing" with the fields that are frequently + * accessed by all of the zone threads. + * + * LRU order is kept independently by each zone thread, and each zone uses its + * own list for searching and cache membership queries. The zone zero list is + * used to decide which chapter to evict when the cache is updated, and its + * search list is copied to the other threads at that time. + * + * The virtual chapter number field of the cache entry is the single field + * indicating whether a chapter is a member of the cache or not. The value + * UINT64_MAX is used to represent a null, undefined, or wildcard + * chapter number. When present in the virtual chapter number field + * CachedChapterIndex, it indicates that the cache entry is dead, and all + * the other fields of that entry (other than immutable pointers to cache + * memory) are undefined and irrelevant. Any cache entry that is not marked as + * dead is fully defined and a member of the cache--sparseCacheContains() + * must always return true for any virtual chapter number that appears in any + * of the cache entries. + * + * A chapter index that is a member of the cache may be marked for different + * treatment (disabling search) between calls to updateSparseCache() in two + * different ways. When a chapter falls off the end of the volume, its virtual + * chapter number will be less that the oldest virtual chapter number. Since + * that chapter is no longer part of the volume, there's no point in continuing + * to search that chapter index. Once invalidated, that virtual chapter will + * still be considered a member of the cache, but it will no longer be searched + * for matching chunk names. + * + * The second mechanism for disabling search is the heuristic based on keeping + * track of the number of consecutive search misses in a given chapter index. + * Once that count exceeds a threshold, the skipSearch flag will be set to + * true, causing the chapter to be skipped in the fallback search of the + * entire cache, but still allowing it to be found when searching for a hook + * in that specific chapter. Finding a hook will clear the skipSearch flag, + * once again allowing the non-hook searches to use the cache entry. 
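+ * (The consecutive-miss count is kept per chapter and is checked only by
+ * zone zero, so SKIP_SEARCH_THRESHOLD is divided by the zone count when the
+ * cache is initialized.)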
Again, + * regardless of the state of the skipSearch flag, the virtual chapter must + * still considered to be a member of the cache for sparseCacheContains(). + * + * Barrier requests and the sparse chapter index cache are also described in + * + * https://intranet.permabit.com/wiki/Chapter_Index_Cache_supports_concurrent_access + * + * and in a message to the albireo mailing list on 5/28/2011 titled "true + * barriers with a hook resolution queue". + **/ + +#include "sparseCache.h" + +#include "cachedChapterIndex.h" +#include "chapterIndex.h" +#include "common.h" +#include "index.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "searchList.h" +#include "threads.h" +#include "zone.h" + +enum { + /** The number of consecutive search misses that will disable searching */ + SKIP_SEARCH_THRESHOLD = 20000, + + /** a named constant to use when identifying zone zero */ + ZONE_ZERO = 0 +}; + +/** + * These counter values are essentially fields of the SparseCache, but are + * segregated into this structure because they are frequently modified. We + * group them and align them to keep them on different cache lines from the + * cache fields that are accessed far more often than they are updated. + **/ +typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) sparseCacheCounters { + /** the total number of virtual chapter probes that succeeded */ + uint64_t chapterHits; + + /** the total number of virtual chapter probes that failed */ + uint64_t chapterMisses; + + /** the total number of cache searches that found a possible match */ + uint64_t searchHits; + + /** the total number of cache searches that found no matches */ + uint64_t searchMisses; + + /** the number of cache entries that fell off the end of the volume */ + uint64_t invalidations; + + /** the number of cache entries that were evicted while still valid */ + uint64_t evictions; +} SparseCacheCounters; + +/** + * This is the private structure definition of a SparseCache. + **/ +struct sparseCache { + /** the number of cache entries, which is the size of the chapters array */ + unsigned int capacity; + + /** the number of zone threads using the cache */ + unsigned int zoneCount; + + /** the geometry governing the volume */ + const Geometry *geometry; + + /** the number of search misses in zone zero that will disable searching */ + unsigned int skipSearchThreshold; + + /** pointers to the cache-aligned chapter search order for each zone */ + SearchList *searchLists[MAX_ZONES]; + + /** the thread barriers used to synchronize the zone threads for update */ + Barrier beginCacheUpdate; + Barrier endCacheUpdate; + + /** frequently-updated counter fields (cache-aligned) */ + SparseCacheCounters counters; + + /** the counted array of chapter index cache entries (cache-aligned) */ + CachedChapterIndex chapters[]; +}; + +/** + * Initialize a sparse chapter index cache. 
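+ * The caller must have allocated the structure with room for "capacity"
+ * trailing CachedChapterIndex entries (as makeSparseCache does), and should
+ * free the cache with freeSparseCache() if this function fails.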
+ * + * @param cache the sparse cache to initialize + * @param geometry the geometry governing the volume + * @param capacity the number of chapters the cache will hold + * @param zoneCount the number of zone threads using the cache + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int initializeSparseCache(SparseCache *cache, + const Geometry *geometry, + unsigned int capacity, + unsigned int zoneCount) +{ + cache->geometry = geometry; + cache->capacity = capacity; + cache->zoneCount = zoneCount; + + // Scale down the skip threshold by the number of zones since we count the + // chapter search misses only in zone zero. + cache->skipSearchThreshold = (SKIP_SEARCH_THRESHOLD / zoneCount); + + int result = initializeBarrier(&cache->beginCacheUpdate, zoneCount); + if (result != UDS_SUCCESS) { + return result; + } + result = initializeBarrier(&cache->endCacheUpdate, zoneCount); + if (result != UDS_SUCCESS) { + return result; + } + unsigned int i; + for (i = 0; i < capacity; i++) { + result = initializeCachedChapterIndex(&cache->chapters[i], geometry); + if (result != UDS_SUCCESS) { + return result; + } + } + + // Allocate each zone's independent LRU order. + for (i = 0; i < zoneCount; i++) { + result = makeSearchList(capacity, &cache->searchLists[i]); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int makeSparseCache(const Geometry *geometry, + unsigned int capacity, + unsigned int zoneCount, + SparseCache **cachePtr) +{ + unsigned int bytes + = (sizeof(SparseCache) + (capacity * sizeof(CachedChapterIndex))); + + SparseCache *cache; + int result = allocateCacheAligned(bytes, "sparse cache", &cache); + if (result != UDS_SUCCESS) { + return result; + } + + result = initializeSparseCache(cache, geometry, capacity, zoneCount); + if (result != UDS_SUCCESS) { + freeSparseCache(cache); + return result; + } + + *cachePtr = cache; + return UDS_SUCCESS; +} + +/**********************************************************************/ +size_t getSparseCacheMemorySize(const SparseCache *cache) +{ + // Count the DeltaIndexPage as cache memory, but ignore all other overhead. + size_t pageSize = (sizeof(DeltaIndexPage) + cache->geometry->bytesPerPage); + size_t chapterSize = (pageSize * cache->geometry->indexPagesPerChapter); + return (cache->capacity * chapterSize); +} + +/** + * Update counters to reflect a chapter access hit and clear the skipSearch + * flag on the chapter, if set. + * + * @param cache the cache to update + * @param chapter the cache entry to update + **/ +static void scoreChapterHit(SparseCache *cache, + CachedChapterIndex *chapter) +{ + cache->counters.chapterHits += 1; + setSkipSearch(chapter, false); +} + +/** + * Update counters to reflect a chapter access miss. + * + * @param cache the cache to update + **/ +static void scoreChapterMiss(SparseCache *cache) +{ + cache->counters.chapterMisses += 1; +} + +/** + * Check if the cache entry that is about to be replaced is already dead, and + * if it's not, add to tally of evicted or invalidated cache entries. 
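+ * A live entry whose chapter has fallen off the end of the volume is counted
+ * as an invalidation; a live entry whose chapter is still in the volume is
+ * counted as an eviction.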
+ * + * @param zone the zone used to find the oldest chapter + * @param cache the cache to update + * @param chapter the cache entry about to be replaced + **/ +static void scoreEviction(IndexZone *zone, + SparseCache *cache, + CachedChapterIndex *chapter) +{ + if (chapter->virtualChapter == UINT64_MAX) { + return; + } + if (chapter->virtualChapter < zone->oldestVirtualChapter) { + cache->counters.invalidations += 1; + } else { + cache->counters.evictions += 1; + } +} + +/** + * Update counters to reflect a cache search hit. This bumps the hit + * count, clears the miss count, and clears the skipSearch flag. + * + * @param cache the cache to update + * @param chapter the cache entry to update + **/ +static void scoreSearchHit(SparseCache *cache, + CachedChapterIndex *chapter) +{ + cache->counters.searchHits += 1; + chapter->counters.searchHits += 1; + chapter->counters.consecutiveMisses = 0; + setSkipSearch(chapter, false); +} + +/** + * Update counters to reflect a cache search miss. This bumps the consecutive + * miss count, and if it goes over skipSearchThreshold, sets the skipSearch + * flag on the chapter. + * + * @param cache the cache to update + * @param chapter the cache entry to update + **/ +static void scoreSearchMiss(SparseCache *cache, + CachedChapterIndex *chapter) +{ + cache->counters.searchMisses += 1; + chapter->counters.searchMisses += 1; + chapter->counters.consecutiveMisses += 1; + if (chapter->counters.consecutiveMisses > cache->skipSearchThreshold) { + setSkipSearch(chapter, true); + } +} + +/**********************************************************************/ +void freeSparseCache(SparseCache *cache) +{ + if (cache == NULL) { + return; + } + + unsigned int i; + for (i = 0; i < cache->zoneCount; i++) { + freeSearchList(&cache->searchLists[i]); + } + + for (i = 0; i < cache->capacity; i++) { + CachedChapterIndex *chapter = &cache->chapters[i]; + destroyCachedChapterIndex(chapter); + } + + destroyBarrier(&cache->beginCacheUpdate); + destroyBarrier(&cache->endCacheUpdate); + FREE(cache); +} + + +/**********************************************************************/ +bool sparseCacheContains(SparseCache *cache, + uint64_t virtualChapter, + unsigned int zoneNumber) +{ + /* + * The correctness of the barriers depends on the invariant that between + * calls to updateSparseCache(), the answers this function returns must + * never vary--the result for a given chapter must be identical across + * zones. That invariant must be maintained even if the chapter falls off + * the end of the volume, or if searching it is disabled because of too many + * search misses. + */ + + // Get the chapter search order for this zone thread. + SearchListIterator iterator + = iterateSearchList(cache->searchLists[zoneNumber], cache->chapters); + while (hasNextChapter(&iterator)) { + CachedChapterIndex *chapter = getNextChapter(&iterator); + if (virtualChapter == chapter->virtualChapter) { + if (zoneNumber == ZONE_ZERO) { + scoreChapterHit(cache, chapter); + } + + // Move the chapter to the front of the search list. + rotateSearchList(iterator.list, iterator.nextEntry); + return true; + } + } + + // The specified virtual chapter isn't cached. 
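+  // Only zone zero updates the counters, keeping them single-writer.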
+ if (zoneNumber == ZONE_ZERO) { + scoreChapterMiss(cache); + } + return false; +} + +/**********************************************************************/ +int updateSparseCache(IndexZone *zone, uint64_t virtualChapter) +{ + const Index *index = zone->index; + SparseCache *cache = index->volume->sparseCache; + + // If the chapter is already in the cache, we don't need to do a thing + // except update the search list order, which this check does. + if (sparseCacheContains(cache, virtualChapter, zone->id)) { + return UDS_SUCCESS; + } + + // Wait for every zone thread to have reached its corresponding barrier + // request and invoked this function before starting to modify the cache. + enterBarrier(&cache->beginCacheUpdate, NULL); + + /* + * This is the start of the critical section: the zone zero thread is + * captain, effectively holding an exclusive lock on the sparse cache. All + * the other zone threads must do nothing between the two barriers. They + * will wait at the endCacheUpdate barrier for the captain to finish the + * update. + */ + + int result = UDS_SUCCESS; + if (zone->id == ZONE_ZERO) { + // Purge invalid chapters from the LRU search list. + SearchList *zoneZeroList = cache->searchLists[ZONE_ZERO]; + purgeSearchList(zoneZeroList, cache->chapters, zone->oldestVirtualChapter); + + // First check that the desired chapter is still in the volume. If it's + // not, the hook fell out of the index and there's nothing to do for it. + if (virtualChapter >= index->oldestVirtualChapter) { + // Evict the least recently used live chapter, or replace a dead cache + // entry, all by rotating the the last list entry to the front. + CachedChapterIndex *victim + = &cache->chapters[rotateSearchList(zoneZeroList, cache->capacity)]; + + // Check if the victim is already dead, and if it's not, add to the + // tally of evicted or invalidated cache entries. + scoreEviction(zone, cache, victim); + + // Read the index page bytes and initialize the page array. + result = cacheChapterIndex(victim, virtualChapter, index->volume); + } + + // Copy the new search list state to all the other zone threads so they'll + // get the result of pruning and see the new chapter. + unsigned int z; + for (z = 1; z < cache->zoneCount; z++) { + copySearchList(zoneZeroList, cache->searchLists[z]); + } + } + + // This is the end of the critical section. All cache invariants must have + // been restored--it will be shared/read-only again beyond the barrier. + + enterBarrier(&cache->endCacheUpdate, NULL); + return result; +} + + +/**********************************************************************/ +int searchSparseCache(IndexZone *zone, + const UdsChunkName *name, + uint64_t *virtualChapterPtr, + int *recordPagePtr) +{ + Volume *volume = zone->index->volume; + SparseCache *cache = volume->sparseCache; + unsigned int zoneNumber = zone->id; + // If the caller did not specify a virtual chapter, search the entire cache. + bool searchAll = (*virtualChapterPtr == UINT64_MAX); + unsigned int chaptersSearched = 0; + + // Get the chapter search order for this zone thread, searching the chapters + // from most recently hit to least recently hit. + SearchListIterator iterator + = iterateSearchList(cache->searchLists[zoneNumber], cache->chapters); + while (hasNextChapter(&iterator)) { + CachedChapterIndex *chapter = getNextChapter(&iterator); + + // Skip chapters no longer cached, or that have too many search misses. 
+ if (shouldSkipChapterIndex(zone, chapter, *virtualChapterPtr)) { + continue; + } + + int result = searchCachedChapterIndex(chapter, cache->geometry, + volume->indexPageMap, name, + recordPagePtr); + if (result != UDS_SUCCESS) { + return result; + } + chaptersSearched += 1; + + // Did we find an index entry for the name? + if (*recordPagePtr != NO_CHAPTER_INDEX_ENTRY) { + if (zoneNumber == ZONE_ZERO) { + scoreSearchHit(cache, chapter); + } + + // Move the chapter to the front of the search list. + rotateSearchList(iterator.list, iterator.nextEntry); + + // Return a matching entry as soon as it is found. It might be a false + // collision that has a true match in another chapter, but that's a very + // rare case and not worth the extra search cost or complexity. + *virtualChapterPtr = chapter->virtualChapter; + return UDS_SUCCESS; + } + + if (zoneNumber == ZONE_ZERO) { + scoreSearchMiss(cache, chapter); + } + + if (!searchAll) { + // We just searched the virtual chapter the caller specified and there + // was no match, so we're done. + break; + } + } + + // The name was not found in the cache. + *recordPagePtr = NO_CHAPTER_INDEX_ENTRY; + return UDS_SUCCESS; +} diff --git a/uds/sparseCache.h b/uds/sparseCache.h new file mode 100644 index 0000000..09c4a1c --- /dev/null +++ b/uds/sparseCache.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/sparseCache.h#1 $ + */ + +#ifndef SPARSE_CACHE_H +#define SPARSE_CACHE_H + +#include "cacheCounters.h" +#include "geometry.h" +#include "indexZone.h" +#include "typeDefs.h" + +/** + * SparseCache is the cache of entire chapter indexes from sparse chapters + * used for searching for chunks after all other search paths have failed. It + * contains only complete chapter indexes; record pages from sparse chapters + * and single index pages used for resolving hooks are kept in the volume page + * cache. + * + * Searching the cache is an unsynchronized operation. Changing the contents + * of the cache is a coordinated process requiring the coordinated + * participation of all zone threads via the careful use of barrier messages + * sent to all the index zones by the triage queue worker thread. + **/ +typedef struct sparseCache SparseCache; + +// Bare declaration to avoid include dependency loops. +struct index; + +/** + * Allocate and initialize a sparse chapter index cache. 
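+ *
+ * Typical use (an illustrative sketch, with error handling elided):
+ *
+ *   SparseCache *cache;
+ *   int result = makeSparseCache(geometry, capacity, zoneCount, &cache);
+ *   // ... sparseCacheContains()/updateSparseCache()/searchSparseCache() ...
+ *   freeSparseCache(cache);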
+ * + * @param [in] geometry the geometry governing the volume + * @param [in] capacity the number of chapters the cache will hold + * @param [in] zoneCount the number of zone threads using the cache + * @param [out] cachePtr a pointer in which to return the new cache + * + * @return UDS_SUCCESS or an error code + **/ +int makeSparseCache(const Geometry *geometry, + unsigned int capacity, + unsigned int zoneCount, + SparseCache **cachePtr) + __attribute__((warn_unused_result)); + +/** + * Destroy and free a sparse chapter index cache. + * + * @param cache the cache to free + **/ +void freeSparseCache(SparseCache *cache); + +/** + * Get the number of bytes of memory used by a sparse chapter cache. + * + * @param cache the cache to measure + **/ +size_t getSparseCacheMemorySize(const SparseCache *cache); + + +/** + * Check whether a sparse chapter index is present in the chapter cache. This + * is only intended for use by the zone threads. + * + * @param cache the cache to search for the virtual chapter + * @param virtualChapter the virtual chapter number of the chapter index + * @param zoneNumber the zone number of the calling thread + * + * @return true iff the sparse chapter index is cached + **/ +bool sparseCacheContains(SparseCache *cache, + uint64_t virtualChapter, + unsigned int zoneNumber); + +/** + * Update the sparse cache to contain a chapter index. + * + * This function must be called by all the zone threads with the same chapter + * numbers to correctly enter the thread barriers used to synchronize the + * cache updates. + * + * @param zone the index zone + * @param virtualChapter the virtual chapter number of the chapter index + * + * @return UDS_SUCCESS or an error code if the chapter index could not be + * read or decoded + **/ +int updateSparseCache(IndexZone *zone, uint64_t virtualChapter) + __attribute__((warn_unused_result)); + + +/** + * Search the cached sparse chapter indexes for a chunk name, returning a + * virtual chapter number and record page number that may contain the name. + * + * @param [in] zone the zone containing the volume, sparse + * chapter index cache and the index page + * number map + * @param [in] name the chunk name to search for + * @param [in,out] virtualChapterPtr If UINT64_MAX on input, + * search all cached chapters, else search + * the specified virtual chapter, if cached. + * On output, if a match was found, set to + * the virtual chapter number of the match, + * otherwise set to UINT64_MAX on a miss. + * @param [out] recordPagePtr the record page number of a match, else + * NO_CHAPTER_INDEX_ENTRY if nothing matched + * + * @return UDS_SUCCESS or an error code + **/ +int searchSparseCache(IndexZone *zone, + const UdsChunkName *name, + uint64_t *virtualChapterPtr, + int *recordPagePtr) + __attribute__((warn_unused_result)); + +#endif /* SPARSE_CACHE_H */ diff --git a/uds/stringLinuxKernel.c b/uds/stringLinuxKernel.c new file mode 100644 index 0000000..bf0a255 --- /dev/null +++ b/uds/stringLinuxKernel.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/stringLinuxKernel.c#1 $ + */ + +#include + +#include "errors.h" +#include "logger.h" +#include "stringUtils.h" + +/**********************************************************************/ +int stringToSignedLong(const char *nptr, long *num) +{ + while (*nptr == ' ') { + nptr++; + } + return kstrtol(nptr, 10, num) ? UDS_INVALID_ARGUMENT : UDS_SUCCESS; +} + +/**********************************************************************/ +int stringToUnsignedLong(const char *nptr, unsigned long *num) +{ + while (*nptr == ' ') { + nptr++; + } + if (*nptr == '+') { + nptr++; + } + return kstrtoul(nptr, 10, num) ? UDS_INVALID_ARGUMENT : UDS_SUCCESS; +} + +/*****************************************************************************/ +char *nextToken(char *str, const char *delims, char **state) +{ + char *sp = str ? str : *state; + while (*sp && strchr(delims, *sp)) { + ++sp; + } + if (!*sp) { + return NULL; + } + char *ep = sp; + while (*ep && !strchr(delims, *ep)) { + ++ep; + } + if (*ep) { + *ep++ = '\0'; + } + *state = ep; + return sp; +} + +/*****************************************************************************/ +int parseUint64(const char *str, uint64_t *num) +{ + unsigned long value = *num; + int result = stringToUnsignedLong(str, &value); + *num = value; + return result; +} diff --git a/uds/stringUtils.c b/uds/stringUtils.c new file mode 100644 index 0000000..93d7da1 --- /dev/null +++ b/uds/stringUtils.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/stringUtils.c#2 $ + */ + +#include "stringUtils.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "uds.h" + +/*****************************************************************************/ +int allocSprintf(const char *what, char **strp, const char *fmt, ...) +{ + if (strp == NULL) { + return UDS_INVALID_ARGUMENT; + } + va_list args; +#ifdef __KERNEL__ + // We want the memory allocation to use our own ALLOCATE/FREE wrappers. + va_start(args, fmt); + int count = vsnprintf(NULL, 0, fmt, args) + 1; + va_end(args); + int result = ALLOCATE(count, char, what, strp); + if (result == UDS_SUCCESS) { + va_start(args, fmt); + vsnprintf(*strp, count, fmt, args); + va_end(args); + } +#else + va_start(args, fmt); + int result = vasprintf(strp, fmt, args) == -1 ? 
ENOMEM : UDS_SUCCESS; + va_end(args); +#endif + if ((result != UDS_SUCCESS) && (what != NULL)) { + logError("cannot allocate %s", what); + } + return result; +} + +/*****************************************************************************/ +int wrapVsnprintf(const char *what, char *buf, size_t bufSize, + int error, const char *fmt, va_list ap, size_t *needed) +{ + if (buf == NULL) { + static char nobuf[1]; + buf = nobuf; + bufSize = 0; + } + int n = vsnprintf(buf, bufSize, fmt, ap); + if (n < 0) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "%s: vsnprintf failed", what); + } + if (needed) { + *needed = n; + } + if (((size_t) n >= bufSize) && (buf != NULL) && (error != UDS_SUCCESS)) { + return logErrorWithStringError(error, "%s: string too long", what); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int fixedSprintf(const char *what, + char *buf, + size_t bufSize, + int error, + const char *fmt, + ...) +{ + if (buf == NULL) { + return UDS_INVALID_ARGUMENT; + } + va_list args; + va_start(args, fmt); + int result = wrapVsnprintf(what, buf, bufSize, error, fmt, args, NULL); + va_end(args); + return result; +} + +/*****************************************************************************/ +char *vAppendToBuffer(char *buffer, + char *bufEnd, + const char *fmt, + va_list args) +{ + size_t n = vsnprintf(buffer, bufEnd - buffer, fmt, args); + if (n >= (size_t) (bufEnd - buffer)) { + buffer = bufEnd; + } else { + buffer += n; + } + return buffer; +} + +/*****************************************************************************/ +char *appendToBuffer(char *buffer, char *bufEnd, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + char *pos = vAppendToBuffer(buffer, bufEnd, fmt, ap); + va_end(ap); + return pos; +} + +/*****************************************************************************/ +int stringToSignedInt(const char *nptr, int *num) +{ + long value; + int result = stringToSignedLong(nptr, &value); + if (result != UDS_SUCCESS) { + return result; + } + if ((value < INT_MIN) || (value > INT_MAX)) { + return ERANGE; + } + *num = (int) value; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int stringToUnsignedInt(const char *nptr, unsigned int *num) +{ + unsigned long value; + int result = stringToUnsignedLong(nptr, &value); + if (result != UDS_SUCCESS) { + return result; + } + if (value > UINT_MAX) { + return ERANGE; + } + *num = (unsigned int) value; + return UDS_SUCCESS; +} diff --git a/uds/stringUtils.h b/uds/stringUtils.h new file mode 100644 index 0000000..bd685bb --- /dev/null +++ b/uds/stringUtils.h @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/stringUtils.h#2 $ + */ + +#ifndef STRING_UTILS_H +#define STRING_UTILS_H + +#include +#ifdef __KERNEL__ +#include +#include +#else +#include // for vsnprintf +#include // for strtol +#include +#include +#endif + +#include "compiler.h" +#include "typeDefs.h" + +/** + * Convert a boolean value to its corresponding "true" or "false" string. + * + * @param value The boolean value to convert + * + * @return "true" if value is true, "false" otherwise. + **/ +static INLINE const char *boolToString(bool value) +{ + return (value ? "true" : "false"); +} + +/** + * Allocate a string built according to format (our version of asprintf). + * + * @param [in] what A description of what is being allocated, for error + * logging; if NULL doesn't log anything. + * @param [out] strp The pointer in which to store the allocated string. + * @param [in] fmt The sprintf format parameter. + * + * @return UDS_SUCCESS, or the appropriately translated asprintf error + **/ +int allocSprintf(const char *what, char **strp, const char *fmt, ...) + __attribute__((format(printf, 3, 4), warn_unused_result)); + +/** + * Write a printf-style string into a fixed-size buffer, returning + * errors if it would not fit. (our version of snprintf) + * + * @param [in] what A description of what is being written, for error + * logging; if NULL doesn't log anything. + * @param [out] buf The target buffer + * @param [in] bufSize The size of buf + * @param [in] error Error code to return on overflow + * @param [in] fmt The sprintf format parameter. + * @return UDS_SUCCESS or error + **/ +int fixedSprintf(const char *what, char *buf, size_t bufSize, + int error, const char *fmt, ...) + __attribute__((format(printf, 5, 6), warn_unused_result)); + +/** + * Write printf-style string into an existing buffer, returning a specified + * error code if it would not fit, and setting ``needed`` to the amount of + * space that would be required. + * + * @param [in] what A description of what is being written, for logging. + * @param [in] buf The buffer in which to write the string, or NULL to + * merely determine the required space. + * @param [in] bufSize The size of buf. + * @param [in] error The error code to return for exceeding the specified + * space, UDS_SUCCESS if no logging required. + * @param [in] fmt The sprintf format specification. + * @param [in] ap The variable argument pointer (see ). + * @param [out] needed If non-NULL, the actual amount of string space + * required, which may be smaller or larger than bufSize. + * + * @return UDS_SUCCESS if the string fits, the value of the error parameter if + * the string does not fit and a buffer was supplied, or + * UDS_UNEXPECTED_RESULT if vsnprintf fails in some other undocumented + * way. + **/ +int wrapVsnprintf(const char *what, char *buf, size_t bufSize, + int error, const char *fmt, va_list ap, size_t *needed) + __attribute__((format(printf, 5, 0), warn_unused_result)); + +/** + * Helper to append a string to a buffer. + * + * @param buffer the place at which to append the string + * @param bufEnd pointer to the end of the buffer + * @param fmt a printf format string + * + * @return the updated buffer position after the append + * + * if insufficient space is available, the contents are silently truncated + **/ +char *appendToBuffer(char *buffer, char *bufEnd, const char *fmt, ...) + __attribute__((format(printf, 3, 4))); + +/** + * Variable-arglist helper to append a string to a buffer. 
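+ * The fixed-arglist form, appendToBuffer(), is usually more convenient; an
+ * illustrative sketch (the buffer and counter variables are hypothetical):
+ *
+ *   char buf[64];
+ *   char *end = buf + sizeof(buf);
+ *   char *pos = buf;
+ *   pos = appendToBuffer(pos, end, "hits=%u ", hits);
+ *   pos = appendToBuffer(pos, end, "misses=%u", misses);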
+ * + * @param buffer the place at which to append the string + * @param bufEnd pointer to the end of the buffer + * @param fmt a printf format string + * @param args printf arguments + * + * @return the updated buffer position after the append + * + * if insufficient space is available, the contents are silently truncated + **/ +char *vAppendToBuffer(char *buffer, + char *bufEnd, + const char *fmt, + va_list args) + __attribute__((format(printf, 3, 0))); + +/** + * Our version of strtok_r, since some platforma apparently don't define it. + * + * @param str On first call, the string to tokenize. On subsequent + * calls, NULL. + * @param delims The set of delimiter characters. + * @param statePtr The address of a variable which holds the state of + * the tokenization between calls to nextToken. + * + * @return the next token if any, or NULL + **/ +char *nextToken(char *str, const char *delims, char **statePtr); + +/** + * Parse a string representing a decimal uint64_t. + * + * @param str The string. + * @param num Where to put the number. + * + * @return UDS_SUCCESS or the error UDS_INVALID_ARGUMENT if the string + * is not in the correct format. + **/ +int parseUint64(const char *str, uint64_t *num) + __attribute__((warn_unused_result)); + +/** + * Attempt to convert a string to an integer (base 10) + * + * @param nptr Pointer to string to convert + * @param num The resulting integer + * + * @return UDS_SUCCESS or an error code + **/ +int stringToSignedInt(const char *nptr, int *num) + __attribute__((warn_unused_result)); + +/** + * Attempt to convert a string to a long integer (base 10) + * + * @param nptr Pointer to string to convert + * @param num The resulting long integer + * + * @return UDS_SUCCESS or an error code + **/ +int stringToSignedLong(const char *nptr, long *num) + __attribute__((warn_unused_result)); + +/** + * Attempt to convert a string to an unsigned integer (base 10). + * + * @param nptr Pointer to string to convert + * @param num The resulting unsigned integer + * + * @return UDS_SUCCESS or an error code + **/ +int stringToUnsignedInt(const char *nptr, unsigned int *num) + __attribute__((warn_unused_result)); + +/** + * Attempt to convert a string to an unsigned long integer (base 10). + * + * @param nptr Pointer to string to convert + * @param num The resulting long unsigned integer + * + * @return UDS_SUCCESS or an error code + **/ +int stringToUnsignedLong(const char *nptr, unsigned long *num) + __attribute__((warn_unused_result)); + +#endif /* STRING_UTILS_H */ diff --git a/uds/sysfs.c b/uds/sysfs.c new file mode 100644 index 0000000..b2009d7 --- /dev/null +++ b/uds/sysfs.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/sysfs.c#4 $ + */ + +#include "sysfs.h" + +#include +#include +#include + +#include "logger.h" +#include "memoryAlloc.h" +#include "stringUtils.h" +#include "uds.h" + +static struct { + struct kobject kobj; // /sys/uds + struct kobject parameterKobj; // /sys/uds/parameter + // These flags are used to ensure a clean shutdown + bool flag; // /sys/uds + bool parameterFlag; // /sys/uds/parameter +} objectRoot; + +/**********************************************************************/ +static char *bufferToString(const char *buf, size_t length) +{ + char *string; + if (ALLOCATE(length + 1, char, __func__, &string) != UDS_SUCCESS) { + return NULL; + } + memcpy(string, buf, length); + string[length] = '\0'; + if (string[length - 1] == '\n') { + string[length - 1] = '\0'; + } + return string; +} + +/**********************************************************************/ +// This is the code for a directory in the /sys/ tree that +// contains no regular files (only subdirectories). +/**********************************************************************/ + +/**********************************************************************/ +static void emptyRelease(struct kobject *kobj) +{ + // Many of our sysfs share this release function that does nothing. +} + +/**********************************************************************/ +static ssize_t emptyShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return 0; +} + +/**********************************************************************/ +static ssize_t emptyStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + return length; +} + +static struct sysfs_ops emptyOps = { + .show = emptyShow, + .store = emptyStore, +}; + +static struct attribute *emptyAttrs[] = { + NULL, +}; + +static struct kobj_type emptyObjectType = { + .release = emptyRelease, + .sysfs_ops = &emptyOps, + .default_attrs = emptyAttrs, +}; + + +/**********************************************************************/ +// This is the the code for the /sys//parameter directory. +// +//

/log_level UDS_LOG_LEVEL +// +/**********************************************************************/ + +typedef struct { + struct attribute attr; + const char *(*showString)(void); + void (*storeString)(const char *); +} ParameterAttribute; + +/**********************************************************************/ +static ssize_t parameterShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + ParameterAttribute *pa = container_of(attr, ParameterAttribute, attr); + if (pa->showString != NULL) { + return sprintf(buf, "%s\n", pa->showString()); + } else { + return -EINVAL; + } +} + +/**********************************************************************/ +static ssize_t parameterStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + ParameterAttribute *pa = container_of(attr, ParameterAttribute, attr); + char *string = bufferToString(buf, length); + if (string == NULL) { + return -ENOMEM; + } + int result = UDS_SUCCESS; + if (pa->storeString != NULL) { + pa->storeString(string); + } else { + return -EINVAL; + } + FREE(string); + return result == UDS_SUCCESS ? length : result; +} + +/**********************************************************************/ + +static const char *parameterShowLogLevel(void) +{ + return priorityToString(getLogLevel()); +} + +/**********************************************************************/ + +static void parameterStoreLogLevel(const char *string) +{ + setLogLevel(stringToPriority(string)); +} + +/**********************************************************************/ + +static ParameterAttribute logLevelAttr = { + .attr = { .name = "log_level", .mode = 0600 }, + .showString = parameterShowLogLevel, + .storeString = parameterStoreLogLevel, +}; + +static struct attribute *parameterAttrs[] = { + &logLevelAttr.attr, + NULL, +}; + +static struct sysfs_ops parameterOps = { + .show = parameterShow, + .store = parameterStore, +}; + +static struct kobj_type parameterObjectType = { + .release = emptyRelease, + .sysfs_ops = ¶meterOps, + .default_attrs = parameterAttrs, +}; + +/**********************************************************************/ +int initSysfs(void) +{ + memset(&objectRoot, 0, sizeof(objectRoot)); + kobject_init(&objectRoot.kobj, &emptyObjectType); + int result = kobject_add(&objectRoot.kobj, NULL, THIS_MODULE->name); + if (result == 0) { + objectRoot.flag = true; + kobject_init(&objectRoot.parameterKobj, ¶meterObjectType); + result = kobject_add(&objectRoot.parameterKobj, &objectRoot.kobj, + "parameter"); + if (result == 0) { + objectRoot.parameterFlag = true; + } + } + if (result != 0) { + putSysfs(); + } + return result; +} + +/**********************************************************************/ +void putSysfs() +{ + if (objectRoot.parameterFlag) { + kobject_put(&objectRoot.parameterKobj); + } + if (objectRoot.flag) { + kobject_put(&objectRoot.kobj); + } +} diff --git a/uds/sysfs.h b/uds/sysfs.h new file mode 100644 index 0000000..d5f9ccf --- /dev/null +++ b/uds/sysfs.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/sysfs.h#1 $ + */ + +#ifndef SYSFS_H +#define SYSFS_H + +/** + * Called when the module is loaded to initialize the /sys/\ + * tree. + * + * @return 0 on success, or non-zero on error + **/ +int initSysfs(void); + +/** + * Called when the module is being unloaded to terminate the + * /sys/\ tree. + **/ +void putSysfs(void); + +#endif /* SYSFS_H */ diff --git a/uds/threadCondVarLinuxKernel.c b/uds/threadCondVarLinuxKernel.c new file mode 100644 index 0000000..e3c1517 --- /dev/null +++ b/uds/threadCondVarLinuxKernel.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/threadCondVarLinuxKernel.c#2 $ + */ + +#include "threads.h" +#include "timeUtils.h" +#include "uds-error.h" + +/**********************************************************************/ +int initCond(CondVar *cv) +{ + cv->eventCount = NULL; + return makeEventCount(&cv->eventCount); +} + +/**********************************************************************/ +int signalCond(CondVar *cv) +{ + eventCountBroadcast(cv->eventCount); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int broadcastCond(CondVar *cv) +{ + eventCountBroadcast(cv->eventCount); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int waitCond(CondVar *cv, Mutex *mutex) +{ + EventToken token = eventCountPrepare(cv->eventCount); + unlockMutex(mutex); + eventCountWait(cv->eventCount, token, NULL); + lockMutex(mutex); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int timedWaitCond(CondVar *cv, Mutex *mutex, RelTime timeout) +{ + EventToken token = eventCountPrepare(cv->eventCount); + unlockMutex(mutex); + bool happened = eventCountWait(cv->eventCount, token, &timeout); + lockMutex(mutex); + return happened ? UDS_SUCCESS : ETIMEDOUT; +} + +/**********************************************************************/ +int destroyCond(CondVar *cv) +{ + freeEventCount(cv->eventCount); + cv->eventCount = NULL; + return UDS_SUCCESS; +} diff --git a/uds/threadOnce.c b/uds/threadOnce.c new file mode 100644 index 0000000..62149ca --- /dev/null +++ b/uds/threadOnce.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/threadOnce.c#1 $ + */ + +#include "errors.h" +#include "threads.h" + +enum { + ONCE_NOT_DONE = 0, + ONCE_IN_PROGRESS = 1, + ONCE_COMPLETE = 2, +}; + +/*****************************************************************************/ +int performOnce(OnceState *once, void (*function)(void)) +{ + for (;;) { + switch (atomic_cmpxchg(once, ONCE_NOT_DONE, ONCE_IN_PROGRESS)) { + case ONCE_NOT_DONE: + function(); + atomic_set_release(once, ONCE_COMPLETE); + return UDS_SUCCESS; + case ONCE_IN_PROGRESS: + yieldScheduler(); + break; + case ONCE_COMPLETE: + return UDS_SUCCESS; + default: + return UDS_BAD_STATE; + } + } +} diff --git a/uds/threadOnce.h b/uds/threadOnce.h new file mode 100644 index 0000000..58b6da3 --- /dev/null +++ b/uds/threadOnce.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/threadOnce.h#1 $ + */ + +#ifndef THREAD_ONCE_H +#define THREAD_ONCE_H + +#include "atomicDefs.h" + +#define ONCE_STATE_INITIALIZER ATOMIC_INIT(0) + +typedef atomic_t OnceState; + +/** + * Thread safe once only initialization. + * + * @param onceState pointer to object to record that initialization + * has been performed + * @param initFunction called if onceState does not indicate + * initialization has been performed + * + * @return UDS_SUCCESS or error code + * + * @note Generally the following declaration of onceState is performed in + * at file scope: + * + * static OnceState onceState = ONCE_STATE_INITIALIZER; + **/ +int performOnce(OnceState *onceState, void (*initFunction) (void)); + +#endif /* THREAD_ONCE_H */ diff --git a/uds/threadRegistry.c b/uds/threadRegistry.c new file mode 100644 index 0000000..c37e77a --- /dev/null +++ b/uds/threadRegistry.c @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/threadRegistry.c#1 $ + */ + +#include "threadRegistry.h" + +#include +#include + +#include "permassert.h" + +/* + * We need to be careful when using other facilities that may use + * threadRegistry functions in their normal operation. For example, + * we do not want to invoke the logger while holding a lock. + */ + +/*****************************************************************************/ +void registerThread(ThreadRegistry *registry, + RegisteredThread *newThread, + const void *pointer) +{ + INIT_LIST_HEAD(&newThread->links); + newThread->pointer = pointer; + newThread->task = current; + + bool foundIt = false; + RegisteredThread *thread; + write_lock(®istry->lock); + list_for_each_entry(thread, ®istry->links, links) { + if (thread->task == current) { + // This should not have been there. + // We'll complain after releasing the lock. + list_del_init(&thread->links); + foundIt = true; + break; + } + } + list_add_tail(&newThread->links, ®istry->links); + write_unlock(®istry->lock); + ASSERT_LOG_ONLY(!foundIt, "new thread not already in registry"); +} + +/*****************************************************************************/ +void unregisterThread(ThreadRegistry *registry) +{ + bool foundIt = false; + RegisteredThread *thread; + write_lock(®istry->lock); + list_for_each_entry(thread, ®istry->links, links) { + if (thread->task == current) { + list_del_init(&thread->links); + foundIt = true; + break; + } + } + write_unlock(®istry->lock); + ASSERT_LOG_ONLY(foundIt, "thread found in registry"); +} + +/*****************************************************************************/ +void initializeThreadRegistry(ThreadRegistry *registry) +{ + INIT_LIST_HEAD(®istry->links); + rwlock_init(®istry->lock); +} + +/*****************************************************************************/ +const void *lookupThread(ThreadRegistry *registry) +{ + const void *result = NULL; + read_lock(®istry->lock); + RegisteredThread *thread; + list_for_each_entry(thread, ®istry->links, links) { + if (thread->task == current) { + result = thread->pointer; + break; + } + } + read_unlock(®istry->lock); + return result; +} diff --git a/uds/threadRegistry.h b/uds/threadRegistry.h new file mode 100644 index 0000000..ec1832d --- /dev/null +++ b/uds/threadRegistry.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/threadRegistry.h#1 $ + */ + +#ifndef THREAD_REGISTRY_H +#define THREAD_REGISTRY_H 1 + +#include +#include + +/* + * We don't expect this set to ever get really large, so a linked list + * is adequate. + */ + +typedef struct threadRegistry { + struct list_head links; + rwlock_t lock; +} ThreadRegistry; + +typedef struct registeredThread { + struct list_head links; + const void *pointer; + struct task_struct *task; +} RegisteredThread; + +/*****************************************************************************/ + +/** + * Initialize a registry of threads and associated data pointers. + * + * @param registry The registry to initialize + **/ +void initializeThreadRegistry(ThreadRegistry *registry); + +/** + * Register the current thread and associate it with a data pointer. + * + * This call will log messages if the thread is already registered. + * + * @param registry The thread registry + * @param newThread RegisteredThread structure to use for the current thread + * @param pointer The value to associated with the current thread + **/ +void registerThread(ThreadRegistry *registry, + RegisteredThread *newThread, + const void *pointer); + +/** + * Remove the registration for the current thread. + * + * A message may be logged if the thread was not registered. + * + * @param registry The thread registry + **/ +void unregisterThread(ThreadRegistry *registry); + +/** + * Fetch a pointer that may have been registered for the current + * thread. If the thread is not registered, a null pointer is + * returned. + * + * @param registry The thread registry + * + * @return the registered pointer, if any, or NULL + **/ +const void *lookupThread(ThreadRegistry *registry); + +#endif /* THREAD_REGISTRY_H */ diff --git a/uds/threads.h b/uds/threads.h new file mode 100644 index 0000000..793355c --- /dev/null +++ b/uds/threads.h @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/threads.h#4 $ + */ + +#ifndef THREADS_H +#define THREADS_H + +#include "compiler.h" +#include "threadOnce.h" +#include "timeUtils.h" +#include "uds-error.h" + +#ifdef __KERNEL__ +#include +#include +#include +#include "util/eventCount.h" +#else +#include +#include +#include +#endif + +#ifdef __KERNEL__ +typedef struct { EventCount *eventCount; } CondVar; +typedef struct mutex Mutex; +typedef struct semaphore Semaphore; +typedef struct kernelThread *Thread; +typedef pid_t ThreadId; + +typedef struct { + Semaphore mutex; // Mutex for this barrier object + Semaphore wait; // Semaphore for threads waiting at the barrier + int arrived; // Number of threads which have arrived + int threadCount; // Total number of threads using this barrier +} Barrier; +#else +typedef pthread_barrier_t Barrier; +typedef pthread_cond_t CondVar; +typedef pthread_mutex_t Mutex; +typedef sem_t Semaphore; +typedef pthread_t Thread; +typedef pid_t ThreadId; + +#ifndef NDEBUG +#define MUTEX_INITIALIZER PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP +#else +#define MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER +#endif + +extern const bool DO_ASSERTIONS; +#endif + +#ifdef __KERNEL__ +/** + * Apply a function to every thread that we have created. + * + * @param applyFunc The function to apply + * @param argument The first argument to applyFunc + * + **/ +void applyToThreads(void applyFunc(void *, struct task_struct *), + void *argument); +#endif + +/** + * Create a thread, logging any cause of failure. + * + * @param threadFunc function to run in new thread + * @param threadData private data for new thread + * @param name name of the new thread + * @param newThread where to store the new thread id + * + * @return success or failure indication + **/ +int createThread(void (*threadFunc)(void *), + void *threadData, + const char *name, + Thread *newThread) + __attribute__((warn_unused_result)); + +/** + * Retrieve the current numbers of cores. + * + * This is either the total number or the number of cores that this + * process has been limited to. + * + * @return number of cores + **/ +unsigned int getNumCores(void); + +/** + * Return the id of the current thread. + * + * @return the thread id + **/ +ThreadId getThreadId(void) __attribute__((warn_unused_result)); + +#ifndef __KERNEL__ +/** + * Get the name of the current thread. + * + * @param name a buffer of size at least 16 to write the name to + **/ +void getThreadName(char *name); +#endif + +/** + * Wait for termination of another thread. + * + * + * @param th The thread for which to wait. + * + * @return UDS_SUCCESS or error code + **/ +int joinThreads(Thread th); + +#ifdef __KERNEL__ +/** + * Exit the current thread. This is a kernel-only function that is intended to + * be an alternative to using BUG() or BUG_ON(). + **/ +__attribute__((noreturn)) +void exitThread(void); +#endif + +/** + * Initialize a thread synchronization barrier (also known as a rendezvous). + * + * @param barrier the barrier to initialize + * @param threadCount the number of threads that must enter the barrier before + * any threads are permitted to leave it + * + * @return UDS_SUCCESS or an error code + **/ +int initializeBarrier(Barrier *barrier, unsigned int threadCount) + __attribute__((warn_unused_result)); + +/** + * Destroy a thread synchronization barrier. 
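+ *
+ * As an illustrative sketch only (the worker function runPhase() is
+ * hypothetical and error handling is omitted), the barrier calls are
+ * typically paired like this:
+ *
+ *   Barrier barrier;
+ *   initializeBarrier(&barrier, threadCount);
+ *   // ... each of the threadCount worker threads then does:
+ *   runPhase();
+ *   enterBarrier(&barrier, NULL);  // block until every thread has arrived
+ *   // ... and once no thread can touch the barrier again:
+ *   destroyBarrier(&barrier);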
+ * + * @param barrier the barrier to destroy + * + * @return UDS_SUCCESS or an error code + **/ +int destroyBarrier(Barrier *barrier); + +/** + * Enter a thread synchronization barrier, waiting for the configured number + * of threads to have entered before exiting the barrier. Exactly one thread + * will be arbitrarily selected to be flagged as the "winner" of a barrier. + * + * @param barrier the barrier to enter + * @param winner if non-NULL, a pointer to the flag indicating whether the + * calling thread was the unique winner + * + * @return UDS_SUCCESS or an error code + **/ +int enterBarrier(Barrier *barrier, bool *winner); + +/** + * Initialize a condition variable with default attributes. + * + * @param cond condition variable to init + * + * @return UDS_SUCCESS or error code + **/ +int initCond(CondVar *cond) __attribute__((warn_unused_result)); + +/** + * Signal a condition variable. + * + * @param cond condition variable to signal + * + * @return UDS_SUCCESS or error code + **/ +int signalCond(CondVar *cond); + +/** + * Broadcast a condition variable. + * + * @param cond condition variable to broadcast + * + * @return UDS_SUCCESS or error code + **/ +int broadcastCond(CondVar *cond); + +/** + * Wait on a condition variable. + * + * @param cond condition variable to wait on + * @param mutex mutex to release while waiting + * + * @return UDS_SUCCESS or error code + **/ +int waitCond(CondVar *cond, Mutex *mutex); + +/** + * Wait on a condition variable with a timeout. + * + * @param cond condition variable to wait on + * @param mutex mutex to release while waiting + * @param timeout the relative time until the timeout expires + * + * @return error code (ETIMEDOUT if the deadline is hit) + **/ +int timedWaitCond(CondVar *cond, Mutex *mutex, RelTime timeout); + +/** + * Destroy a condition variable. + * + * @param cond condition variable to destroy + * + * @return UDS_SUCCESS or error code + **/ +int destroyCond(CondVar *cond); + +#ifndef __KERNEL__ +/** + * Initialize a mutex, optionally asserting if the mutex initialization fails. + * This function should only be called directly in places where making + * assertions is not safe. + * + * @param mutex the mutex to initialize + * @param assertOnError if true, an error initializing the + * mutex will make an assertion + * + * @return UDS_SUCCESS or an error code + **/ +int initializeMutex(Mutex *mutex, bool assertOnError); +#endif + +/** + * Initialize the default type (error-checking during development) mutex. + * + * @param mutex the mutex to initialize + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +#ifdef __KERNEL__ +static INLINE int initMutex(Mutex *mutex) +{ + mutex_init(mutex); + return UDS_SUCCESS; +} +#else +int initMutex(Mutex *mutex); +#endif + +/** + * Destroy a mutex (with error checking during development). + * + * @param mutex mutex to destroy + * + * @return UDS_SUCCESS or error code + **/ +#ifdef __KERNEL__ +static INLINE int destroyMutex(Mutex *mutex) +{ + return UDS_SUCCESS; +} +#else +int destroyMutex(Mutex *mutex); +#endif + +/** + * Lock a mutex, with optional error checking during development. + * + * @param mutex mutex to lock + **/ +#ifdef __KERNEL__ +static INLINE void lockMutex(Mutex *mutex) +{ + mutex_lock(mutex); +} +#else +void lockMutex(Mutex *mutex); +#endif + +/** + * Unlock a mutex, with optional error checking during development. 
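+ *
+ * As an illustrative sketch only (stateReady and useSharedState() are
+ * hypothetical), a mutex normally brackets a critical section, often
+ * together with a condition variable:
+ *
+ *   lockMutex(&mutex);
+ *   while (!stateReady) {
+ *     waitCond(&cond, &mutex);  // releases the mutex while waiting,
+ *                               // reacquires it before returning
+ *   }
+ *   useSharedState();
+ *   unlockMutex(&mutex);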
+ * + * @param mutex mutex to unlock + **/ +#ifdef __KERNEL__ +static INLINE void unlockMutex(Mutex *mutex) +{ + mutex_unlock(mutex); +} +#else +void unlockMutex(Mutex *mutex); +#endif + +/** + * Initialize a semaphore used among threads in the same process. + * + * @param semaphore the semaphore to initialize + * @param value the initial value of the semaphore + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +#ifdef __KERNEL__ +static INLINE int initializeSemaphore(Semaphore *semaphore, unsigned int value) +{ + sema_init(semaphore, value); + return UDS_SUCCESS; +} +#else +int initializeSemaphore(Semaphore *semaphore, unsigned int value); +#endif + +/** + * Destroy a semaphore used among threads in the same process. + * + * @param semaphore the semaphore to destroy + * + * @return UDS_SUCCESS or an error code + **/ +#ifdef __KERNEL__ +static INLINE int destroySemaphore(Semaphore *semaphore) +{ + return UDS_SUCCESS; +} +#else +int destroySemaphore(Semaphore *semaphore); +#endif + +/** + * Acquire a permit from a semaphore, waiting if none are currently available. + * + * @param semaphore the semaphore to acquire + **/ +#ifdef __KERNEL__ +static INLINE void acquireSemaphore(Semaphore *semaphore) +{ + // Do not use down(semaphore). Instead use down_interruptible so that we do + // not get 120 second stall messages in kern.log. + while (down_interruptible(semaphore) != 0) { + } +} +#else +void acquireSemaphore(Semaphore *semaphore); +#endif + +/** + * Attempt to acquire a permit from a semaphore. + * + * If a permit is available, it is claimed and the function immediately + * returns true. If a timeout is zero or negative, the function immediately + * returns false. Otherwise, this will wait either a permit to become + * available (returning true) or the relative timeout to expire (returning + * false). + * + * @param semaphore the semaphore to decrement + * @param timeout the relative time until the timeout expires + * + * @return true if a permit was acquired, otherwise false + **/ +__attribute__((warn_unused_result)) +#ifdef __KERNEL__ +static INLINE bool attemptSemaphore(Semaphore *semaphore, RelTime timeout) +{ + if (timeout <= 0) { + // No timeout, just try to grab the semaphore. + return down_trylock(semaphore) == 0; + } else { + unsigned int jiffies = usecs_to_jiffies(relTimeToMicroseconds(timeout)); + return down_timeout(semaphore, jiffies) == 0; + } +} +#else +bool attemptSemaphore(Semaphore *semaphore, RelTime timeout); +#endif + +/** + * Release a semaphore, incrementing the number of available permits. + * + * @param semaphore the semaphore to increment + **/ +#ifdef __KERNEL__ +static INLINE void releaseSemaphore(Semaphore *semaphore) +{ + up(semaphore); +} +#else +void releaseSemaphore(Semaphore *semaphore); +#endif + +/** + * Yield the time slice in the given thread. + * + * @return UDS_SUCCESS or an error code + **/ +int yieldScheduler(void); + +#ifndef __KERNEL__ +/** + * Allocate a thread specific key for thread specific data. + * + * @param key points to location for new key + * @param destr_function destructor function called when thread exits + * + * @return UDS_SUCCESS or error code + **/ +int createThreadKey(pthread_key_t *key, void (*destr_function) (void *)); + +/** + * Delete a thread specific key for thread specific data. + * + * @param key key to delete + * + * @return UDS_SUCCESS or error code + **/ +int deleteThreadKey(pthread_key_t key); + +/** + * Set pointer for thread specific data. 
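+ *
+ * As an illustrative sketch only (freeContext() and makeContext() are
+ * hypothetical and error handling is omitted), these user-space
+ * thread-specific data calls are used together like this:
+ *
+ *   static pthread_key_t contextKey;
+ *   createThreadKey(&contextKey, freeContext);      // once, at startup
+ *   setThreadSpecific(contextKey, makeContext());   // in each thread
+ *   void *context = getThreadSpecific(contextKey);  // on later lookups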
+ * + * @param key key to be associated with pointer + * @param pointer data associated with key + * + * @return UDS_SUCCESS or error code + **/ +int setThreadSpecific(pthread_key_t key, const void *pointer); + +/** + * Get pointer for thread specific data. + * + * @param key key identifying the thread specific data + **/ +void *getThreadSpecific(pthread_key_t key); +#endif + +#endif /* THREADS_H */ diff --git a/uds/threadsLinuxKernel.c b/uds/threadsLinuxKernel.c new file mode 100644 index 0000000..7ac972d --- /dev/null +++ b/uds/threadsLinuxKernel.c @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/threadsLinuxKernel.c#4 $ + */ + +#include +#include +#include + +#include "memoryAlloc.h" +#include "logger.h" +#include "threads.h" +#include "uds-error.h" + +static struct hlist_head kernelThreadList; +static struct mutex kernelThreadMutex; +static OnceState kernelThreadOnce; + +typedef struct kernelThread { + void (*threadFunc)(void *); + void *threadData; + struct hlist_node threadLinks; + struct task_struct *threadTask; + struct completion threadDone; +} KernelThread; + +/**********************************************************************/ +static void kernelThreadInit(void) +{ + mutex_init(&kernelThreadMutex); +} + +/**********************************************************************/ +static int threadStarter(void *arg) +{ + KernelThread *kt = arg; + kt->threadTask = current; + performOnce(&kernelThreadOnce, kernelThreadInit); + mutex_lock(&kernelThreadMutex); + hlist_add_head(&kt->threadLinks, &kernelThreadList); + mutex_unlock(&kernelThreadMutex); + RegisteredThread allocatingThread; + registerAllocatingThread(&allocatingThread, NULL); + kt->threadFunc(kt->threadData); + unregisterAllocatingThread(); + complete(&kt->threadDone); + return 0; +} + +/**********************************************************************/ +int createThread(void (*threadFunc)(void *), + void *threadData, + const char *name, + Thread *newThread) +{ + char *nameColon = strchr(name, ':'); + char *myNameColon = strchr(current->comm, ':'); + KernelThread *kt; + int result = ALLOCATE(1, KernelThread, __func__, &kt); + if (result != UDS_SUCCESS) { + logWarning("Error allocating memory for %s", name); + return result; + } + kt->threadFunc = threadFunc; + kt->threadData = threadData; + init_completion(&kt->threadDone); + struct task_struct *thread; + /* + * Start the thread, with an appropriate thread name. + * + * If the name supplied contains a colon character, use that name. This + * causes uds module threads to have names like "uds:callbackW" and the main + * test runner thread to be named "zub:runtest". 
+ * + * Otherwise if the current thread has a name containing a colon character, + * prefix the name supplied with the name of the current thread up to (and + * including) the colon character. Thus when the "kvdo0:dedupeQ" thread + * opens an index session, all the threads associated with that index will + * have names like "kvdo0:foo". + * + * Otherwise just use the name supplied. This should be a rare occurrence. + */ + if ((nameColon == NULL) && (myNameColon != NULL)) { + thread = kthread_run(threadStarter, kt, "%.*s:%s", + (int) (myNameColon - current->comm), current->comm, + name); + } else { + thread = kthread_run(threadStarter, kt, "%s", name); + } + if (IS_ERR(thread)) { + FREE(kt); + return UDS_ENOTHREADS; + } + *newThread = kt; + return UDS_SUCCESS; +} +/**********************************************************************/ +int joinThreads(Thread kt) +{ + while (wait_for_completion_interruptible(&kt->threadDone) != 0) { + } + mutex_lock(&kernelThreadMutex); + hlist_del(&kt->threadLinks); + mutex_unlock(&kernelThreadMutex); + FREE(kt); + return UDS_SUCCESS; +} + +/**********************************************************************/ +void applyToThreads(void applyFunc(void *, struct task_struct *), + void *argument) +{ + KernelThread *kt; + performOnce(&kernelThreadOnce, kernelThreadInit); + mutex_lock(&kernelThreadMutex); + hlist_for_each_entry(kt, &kernelThreadList, threadLinks) { + applyFunc(argument, kt->threadTask); + } + mutex_unlock(&kernelThreadMutex); +} + +/**********************************************************************/ +void exitThread(void) +{ + KernelThread *kt; + struct completion *completion = NULL; + performOnce(&kernelThreadOnce, kernelThreadInit); + mutex_lock(&kernelThreadMutex); + hlist_for_each_entry(kt, &kernelThreadList, threadLinks) { + if (kt->threadTask == current) { + completion = &kt->threadDone; + break; + } + } + mutex_unlock(&kernelThreadMutex); + unregisterAllocatingThread(); + complete_and_exit(completion, 1); +} + +/**********************************************************************/ +ThreadId getThreadId(void) +{ + return current->pid; +} + +/**********************************************************************/ +unsigned int getNumCores(void) +{ + return num_online_cpus(); +} + +/**********************************************************************/ +int initializeBarrier(Barrier *barrier, unsigned int threadCount) +{ + barrier->arrived = 0; + barrier->threadCount = threadCount; + int result = initializeSemaphore(&barrier->mutex, 1); + if (result != UDS_SUCCESS) { + return result; + } + return initializeSemaphore(&barrier->wait, 0); +} + +/**********************************************************************/ +int destroyBarrier(Barrier *barrier) +{ + int result = destroySemaphore(&barrier->mutex); + if (result != UDS_SUCCESS) { + return result; + } + return destroySemaphore(&barrier->wait); +} + +/**********************************************************************/ +int enterBarrier(Barrier *barrier, bool *winner) +{ + acquireSemaphore(&barrier->mutex); + bool lastThread = ++barrier->arrived == barrier->threadCount; + if (lastThread) { + // This is the last thread to arrive, so wake up the others + int i; + for (i = 1; i < barrier->threadCount; i++) { + releaseSemaphore(&barrier->wait); + } + // Then reinitialize for the next cycle + barrier->arrived = 0; + releaseSemaphore(&barrier->mutex); + } else { + // This is NOT the last thread to arrive, so just wait + releaseSemaphore(&barrier->mutex); + 
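+    // The mutex must be released before blocking so that the last thread
+    // to arrive can acquire it and post the wait semaphore; this thread
+    // then sleeps below until that release happens.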
acquireSemaphore(&barrier->wait); + } + if (winner != NULL) { + *winner = lastThread; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int yieldScheduler(void) +{ + yield(); + return UDS_SUCCESS; +} diff --git a/uds/timeUtils.c b/uds/timeUtils.c new file mode 100644 index 0000000..ddf3b2b --- /dev/null +++ b/uds/timeUtils.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/timeUtils.c#4 $ + */ + +#include "stringUtils.h" +#include "timeUtils.h" + +#ifdef __KERNEL__ +#include +#include // for getnstimeofday on Vivid +#else +#include +#endif + +#ifndef __KERNEL__ +static const struct timespec invalidTime = { + .tv_sec = -1, + .tv_nsec = LONG_MAX +}; + +static const long BILLION = 1000 * 1000 * 1000; +#endif + +#ifndef __KERNEL__ +/*****************************************************************************/ +AbsTime currentTime(clockid_t clock) +{ + struct timespec ts; + if (clock_gettime(clock, &ts) != 0) { + ts = invalidTime; + } + return ts; +} +#endif + +#ifndef __KERNEL__ +/*****************************************************************************/ +/** + * Return a time offset from the specified time. + * + * @param time A time. + * @param reltime The relative time + * + * @return the sum of the time and the offset, possibly rounded up to the + * next representable instant. + * + * @note timeDifference(a, deltaTime(a, n)) may only be approx == -n + * depending on the system-specific time resolution + **/ +static AbsTime deltaTime(AbsTime time, RelTime reltime) +{ + if (!isValidTime(time)) { + return time; + } + if ((reltime >= 0) && (reltime < 10 * BILLION)) { + reltime += time.tv_nsec; + while (reltime >= BILLION) { + reltime -= BILLION; + time.tv_sec++; + } + time.tv_nsec = reltime; + return time; + } + // may not be accurate for times before the Epoch... + // (is the ns time positive or negative for negative time_t?) 
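+  // Fall back to flat signed-nanosecond arithmetic, rejecting operands
+  // anywhere near the int64_t range so the addition below cannot overflow.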
+ int64_t ns = time.tv_sec * BILLION + time.tv_nsec; + if ((ns < INT64_MIN / 2) || + (ns > INT64_MAX / 2) || + (reltime < INT64_MIN / 2) || + (reltime > INT64_MAX / 2)) { + return invalidTime; + } + ns += reltime; + return (AbsTime) { .tv_sec = ns / BILLION, .tv_nsec = ns % BILLION }; +} +#endif + +#ifndef __KERNEL__ +/*****************************************************************************/ +AbsTime futureTime(clockid_t clock, RelTime reltime) +{ + return deltaTime(currentTime(clock), reltime); +} +#endif + +#ifndef __KERNEL__ +/*****************************************************************************/ +bool isValidTime(AbsTime time) +{ + if (time.tv_nsec < 0 || time.tv_nsec >= BILLION) { + return false; + } + return true; +} +#endif + +/*****************************************************************************/ +uint64_t nowUsec(void) +{ +#ifdef __KERNEL__ + static const AbsTime epoch = 0; +#else + static const AbsTime epoch = { 0, 0 }; +#endif + return relTimeToMicroseconds(timeDifference(currentTime(CLOCK_REALTIME), + epoch)); +} + + + +#ifndef __KERNEL__ +/*****************************************************************************/ +RelTime timeDifference(AbsTime a, AbsTime b) +{ + if (isValidTime(a) && isValidTime(b)) { + int64_t ans = a.tv_sec * BILLION + a.tv_nsec; + int64_t bns = b.tv_sec * BILLION + b.tv_nsec; + return ans - bns; + } else if (isValidTime(a)) { + return INT64_MAX; + } else if (isValidTime(b)) { + return INT64_MIN; + } else { + return 0; + } +} +#endif diff --git a/uds/timeUtils.h b/uds/timeUtils.h new file mode 100644 index 0000000..8d159f4 --- /dev/null +++ b/uds/timeUtils.h @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/timeUtils.h#5 $ + */ + +#ifndef TIME_UTILS_H +#define TIME_UTILS_H + +#include "compiler.h" +#include "typeDefs.h" + +#ifdef __KERNEL__ +#include +#include +#else +#include +#include +#endif + +// Absolute time. +#ifdef __KERNEL__ +typedef int64_t AbsTime; +#else +typedef struct timespec AbsTime; +#endif + +// Relative time, the length of a time interval, or the difference between +// two times. A signed 64-bit number of nanoseconds. +typedef int64_t RelTime; + +#ifndef __KERNEL__ +/** + * Return true if the time is valid. + * + * @param time a time + * + * @return true if the time is valid + * + * @note an invalid time is generally returned from a failed attempt + * to get the time from the system + **/ +bool isValidTime(AbsTime time); +#endif + +/** + * Return the current time according to the specified clock type. 
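+ *
+ * As an illustrative sketch only (doSomeWork() is hypothetical), elapsed
+ * time is typically measured against the monotonic clock:
+ *
+ *   AbsTime start = currentTime(CLOCK_MONOTONIC);
+ *   doSomeWork();
+ *   RelTime elapsed = timeDifference(currentTime(CLOCK_MONOTONIC), start);
+ *   int64_t millis = relTimeToMilliseconds(elapsed);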
+ * + * @param clock Either CLOCK_REALTIME or CLOCK_MONOTONIC + * + * @return the current time according to the clock in question + * + * @note the precision of the clock is system specific + **/ +#ifdef __KERNEL__ +static INLINE AbsTime currentTime(clockid_t clock) +{ + // clock is always a constant, so gcc reduces this to a single call + return clock == CLOCK_MONOTONIC ? ktime_get_ns() : ktime_get_real_ns(); +} +#else +AbsTime currentTime(clockid_t clock); +#endif + +#ifndef __KERNEL__ +/** + * Return the timestamp a certain number of nanoseconds in the future. + * + * @param clock Either CLOCK_REALTIME or CLOCK_MONOTONIC + * @param reltime The relative time to the clock value + * + * @return the timestamp for that time (potentially rounded to the next + * representable instant for the system in question) + **/ +AbsTime futureTime(clockid_t clock, RelTime reltime); +#endif + +/** + * Return the difference between two timestamps. + * + * @param a A time + * @param b Another time, based on the same clock as a. + * + * @return the relative time between the two timestamps + **/ +#ifdef __KERNEL__ +static INLINE RelTime timeDifference(AbsTime a, AbsTime b) +{ + return a - b; +} +#else +RelTime timeDifference(AbsTime a, AbsTime b); +#endif + + + +/** + * Convert seconds to a RelTime value + * + * @param seconds A number of seconds + * + * @return the equivalent number of seconds as a RelTime + **/ +static INLINE RelTime secondsToRelTime(int64_t seconds) +{ + return (RelTime) seconds * (1000 * 1000 * 1000); +} + +/** + * Convert milliseconds to a RelTime value + * + * @param milliseconds A number of milliseconds + * + * @return the equivalent number of milliseconds as a RelTime + **/ +static INLINE RelTime millisecondsToRelTime(int64_t milliseconds) +{ + return (RelTime) milliseconds * (1000 * 1000); +} + +/** + * Convert microseconds to a RelTime value + * + * @param microseconds A number of microseconds + * + * @return the equivalent number of microseconds as a RelTime + **/ +static INLINE RelTime microsecondsToRelTime(int64_t microseconds) +{ + return (RelTime) microseconds * 1000; +} + +/** + * Convert nanoseconds to a RelTime value + * + * @param nanoseconds A number of nanoseconds + * + * @return the equivalent number of nanoseconds as a RelTime + **/ +static INLINE RelTime nanosecondsToRelTime(int64_t nanoseconds) +{ + return (RelTime) nanoseconds; +} + +/** + * Convert a RelTime value to milliseconds + * + * @param reltime The relative time + * + * @return the equivalent number of milliseconds + **/ +static INLINE int64_t relTimeToSeconds(RelTime reltime) +{ + return reltime / (1000 * 1000 * 1000); +} + +/** + * Convert a RelTime value to milliseconds + * + * @param reltime The relative time + * + * @return the equivalent number of milliseconds + **/ +static INLINE int64_t relTimeToMilliseconds(RelTime reltime) +{ + return reltime / (1000 * 1000); +} + +/** + * Convert a RelTime value to microseconds + * + * @param reltime The relative time + * + * @return the equivalent number of microseconds + **/ +static INLINE int64_t relTimeToMicroseconds(RelTime reltime) +{ + return reltime / 1000; +} + +/** + * Convert a RelTime value to nanoseconds + * + * @param reltime The relative time + * + * @return the equivalent number of nanoseconds + **/ +static INLINE int64_t relTimeToNanoseconds(RelTime reltime) +{ + return reltime; +} + +/** + * Return the wall clock time in microseconds. 
The actual value is time + * since the epoch (see "man gettimeofday"), but the typical use is to call + * this twice and compute the difference, giving the elapsed time between + * the two calls. + * + * @return the time in microseconds + **/ +uint64_t nowUsec(void) __attribute__((warn_unused_result)); + +/** + * Convert from an AbsTime to a time_t + * + * @param time an AbsTime time + * + * @return a time_t time + **/ +static INLINE time_t asTimeT(AbsTime time) +{ +#ifdef __KERNEL__ + return time / 1000000000; +#else + return time.tv_sec; +#endif +} + +/** + * Convert from a time_t to an AbsTime, + * + * @param time a time_t time + * + * @return an AbsTime time + **/ +static INLINE AbsTime fromTimeT(time_t time) +{ +#ifdef __KERNEL__ + return time * 1000000000; +#else + AbsTime abs; + abs.tv_sec = time; + abs.tv_nsec = 0; + return abs; +#endif +} + +#ifndef __KERNEL__ +/** + * Convert from an AbsTime to a struct timespec + * + * @param time an AbsTime time + * + * @return a time_t time + **/ +static INLINE struct timespec asTimeSpec(AbsTime time) +{ + return time; +} +#endif + +#ifndef __KERNEL__ +/** + * Convert from an AbsTime to a struct timeval + * + * @param time an AbsTime time + * + * @return a time_t time + **/ +static INLINE struct timeval asTimeVal(AbsTime time) +{ + struct timeval tv = { time.tv_sec, time.tv_nsec / 1000 }; + return tv; +} +#endif + +#endif /* TIME_UTILS_H */ diff --git a/uds/typeDefs.h b/uds/typeDefs.h new file mode 100644 index 0000000..927bd23 --- /dev/null +++ b/uds/typeDefs.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/typeDefs.h#1 $ + */ + +#ifndef LINUX_KERNEL_TYPE_DEFS_H +#define LINUX_KERNEL_TYPE_DEFS_H + +/* + * General system type definitions. This file is parallel to the other + * typeDefs.h files in this project. We pick up what we can from the system + * include files, and explicitly define the other things we need. + */ + +#include +#include +#include + +#define CHAR_BIT 8 + +#define INT64_MAX (9223372036854775807L) +#define UCHAR_MAX ((unsigned char)~0ul) +#define UINT8_MAX ((uint8_t)~0ul) +#define UINT16_MAX ((uint16_t)~0ul) +#define UINT64_MAX ((uint64_t)~0ul) + +// Some recent versions of define this for us +#ifndef SIZE_MAX +#define SIZE_MAX ((size_t)~0ul) +#endif + +#define PRId64 "lld" +#define PRIu16 "u" +#define PRIu32 "u" +#define PRIu64 "llu" + +typedef unsigned long uintmax_t; +#define PRIuMAX "lu" + +typedef unsigned char byte; + +#endif /* LINUX_KERNEL_TYPE_DEFS_H */ diff --git a/uds/uds-block.h b/uds/uds-block.h new file mode 100644 index 0000000..e1b8e61 --- /dev/null +++ b/uds/uds-block.h @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/uds-block.h#1 $ + */ + +/** + * @file + * @brief Definitions for the UDS block interface + **/ +#ifndef UDS_BLOCK_H +#define UDS_BLOCK_H + +#include "uds.h" + +/** General UDS block constants. */ +enum { + /** The maximum metadata size for a block. */ + UDS_MAX_BLOCK_DATA_SIZE = UDS_MAX_METADATA_SIZE +}; + +/** + * Metadata to associate with a blockName. + **/ +struct udsChunkData { + unsigned char data[UDS_MAX_BLOCK_DATA_SIZE]; +}; + +/** + * Represents a block address on disk. + * + * #UdsBlockAddress objects allow the Application Software and UDS + * to refer to specific disk blocks. It might be, for instance, the + * logical block address divided by the block size. + * + * These objects are stored persistently in the index and are also cached. + * Therefore, make every effort to ensure that these objects are as small as + * possible. + **/ +typedef void *UdsBlockAddress; + +/** @{ */ +/** @name Deduplication */ + +typedef struct udsRequest UdsRequest; + +/** + * Callback function invoked to inform the Application Software that an + * operation started by #udsStartChunkOperation has completed. + * + * @param [in] request The operation that finished. When the callback + * function is called, this UdsRequest structure can be + * reused or freed. + **/ +typedef void UdsChunkCallback(UdsRequest *request); + +/** + * Request structure passed to #udsStartChunkOperation to begin an operation, + * and returned to the Application Software when the callback function is + * invoked. + **/ +struct udsRequest { + /* + * The name of the block. + * Set before starting an operation. + * Unchanged at time of callback. + */ + UdsChunkName chunkName; + /* + * The metadata found in the index that was associated with the block + * (sometimes called the canonical address). + * Set before the callback. + */ + struct udsChunkData oldMetadata; + /* + * The new metadata to associate with the name of the block (sometimes called + * the duplicate address). + * Set before starting a #UDS_POST or #UDS_QUERY operation. + * Unchanged at time of callback. + */ + struct udsChunkData newMetadata; + /* + * The callback method to be invoked when the operation finishes. + * Set before starting an operation. + * Unchanged at time of callback. + */ + UdsChunkCallback *callback; + /* + * The index session. + * Set before starting an operation. + * Unchanged at time of callback. + */ + struct uds_index_session *session; + /* + * The operation type, which is one of #UDS_DELETE, #UDS_POST, #UDS_QUERY or + * #UDS_UPDATE. + * Set before starting an operation. + * Unchanged at time of callback. + */ + UdsCallbackType type; + /* + * The operation status, which is either #UDS_SUCCESS or an error code. + * Set before the callback. 
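+   * (Error codes are typically drawn from the udsStatusCodes enumeration
+   * defined in uds-error.h.)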
+ */ + int status; + /* + * If true, the name of the block was found in the index. + * Set before the callback. + */ + bool found; + /* + * If true, move the entry to the end of the deduplication window. + * Set before starting a #UDS_QUERY operation. + * Unchanged at time of callback. + */ + bool update; + long private[25]; +}; + +/** + * Start a UDS index chunk operation. The request type field must + * be set to the type of operation. This is an asynchronous interface to the + * block-oriented UDS API. The callback is invoked upon completion. + * + * The #UDS_DELETE operation type deletes the mapping for a particular block. + * #UDS_DELETE is typically used when UDS provides invalid advice. + * + * The #UDS_POST operation type indexes a block name and associates it with a + * particular address. The caller provides the block's name. UDS then checks + * this name against its index. + *
    + *
+ *   - If the block is new, it is stored in the index.
+ *   - If the block is a duplicate of an indexed block, UDS returns the
+ *     canonical block address via the callback.
+ *
+ * The #UDS_QUERY operation type checks to see if a block name exists in the
+ * index. The caller provides the block's name. UDS then checks
+ * this name against its index.
+ *
+ *   - If the block is new, no action is taken.
+ *   - If the block is a duplicate of an indexed block, UDS returns the
+ *     canonical block address via the callback. If the update
+ *     field is set, the entry is moved to the end of the deduplication
+ *     window.
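+ *
+ * As an illustrative sketch only (the name, metadata, myCallback, and
+ * session values are hypothetical and error handling is omitted), a
+ * #UDS_POST request is typically issued as:
+ *
+ *   UdsRequest request = {
+ *     .chunkName   = name,
+ *     .newMetadata = metadata,
+ *     .callback    = myCallback,
+ *     .session     = session,
+ *     .type        = UDS_POST,
+ *   };
+ *   int result = udsStartChunkOperation(&request);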
+ * + * The #UDS_UPDATE operation type updates the mapping for a particular block. + * #UDS_UPDATE is typically used if the callback function provides invalid + * advice. + * + * @param [in] request The operation. The type, + * chunkName, newMetadata, + * context, callback, and + * update fields must be set. At callback + * time, the oldMetadata, + * status, and found fields will + * be set. + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsStartChunkOperation(UdsRequest *request); +/** @} */ + +#endif /* UDS_BLOCK_H */ diff --git a/uds/uds-error.h b/uds/uds-error.h new file mode 100644 index 0000000..7658982 --- /dev/null +++ b/uds/uds-error.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/uds-error.h#3 $ + */ + +/** + * @file + * @brief UDS error code definitions + **/ +#ifndef UDS_ERROR_H +#define UDS_ERROR_H + + +/** + * Valid return status codes for API routines. + **/ +enum udsStatusCodes { + /** Successful return */ + UDS_SUCCESS = 0, + + /** Used as a base value for reporting errors */ + UDS_ERROR_CODE_BASE = 1024, + /** The UDS library is not initialized */ + UDS_UNINITIALIZED = UDS_ERROR_CODE_BASE + 0, + /** The UDS library is shutting down */ + UDS_SHUTTINGDOWN = UDS_ERROR_CODE_BASE + 1, + /** Could not load scanner modules */ + UDS_EMODULE_LOAD = UDS_ERROR_CODE_BASE + 2, + /** Could not create a new thread */ + UDS_ENOTHREADS = UDS_ERROR_CODE_BASE + 3, + /** Could not find the specified library context */ + UDS_NOCONTEXT = UDS_ERROR_CODE_BASE + 4, + /** The specified library context is disabled */ + UDS_DISABLED = UDS_ERROR_CODE_BASE + 5, + /** Some saved index component is corrupt */ + UDS_CORRUPT_COMPONENT = UDS_ERROR_CODE_BASE + 6, + UDS_CORRUPT_FILE = UDS_CORRUPT_COMPONENT, + /** Unknown error */ + UDS_UNKNOWN_ERROR = UDS_ERROR_CODE_BASE + 7, + /** Unused */ + UDS_UNUSED_CODE_8 = UDS_ERROR_CODE_BASE + 8, + /** Unused */ + UDS_UNUSED_CODE_9 = UDS_ERROR_CODE_BASE + 9, + /** The index configuration or volume format is no longer supported */ + UDS_UNSUPPORTED_VERSION = UDS_ERROR_CODE_BASE + 10, + /** Index session not available */ + UDS_NO_INDEXSESSION = UDS_ERROR_CODE_BASE + 11, + /** Index data in memory is corrupt */ + UDS_CORRUPT_DATA = UDS_ERROR_CODE_BASE + 12, + /** Short read due to truncated file */ + UDS_SHORT_READ = UDS_ERROR_CODE_BASE + 13, + /** Unused */ + UDS_UNUSED_CODE_14 = UDS_ERROR_CODE_BASE + 14, + /** Internal resource limits exceeded */ + UDS_RESOURCE_LIMIT_EXCEEDED = UDS_ERROR_CODE_BASE + 15, + /** Memory overflow due to storage failure */ + UDS_VOLUME_OVERFLOW = UDS_ERROR_CODE_BASE + 16, + /** Unused */ + UDS_UNUSED_CODE_17 = UDS_ERROR_CODE_BASE + 17, + /** Unused */ + UDS_UNUSED_CODE_18 = UDS_ERROR_CODE_BASE + 18, + /** Unused 
*/ + UDS_UNUSED_CODE_19 = UDS_ERROR_CODE_BASE + 19, + /** Configuration pointer required */ + UDS_CONF_PTR_REQUIRED = UDS_ERROR_CODE_BASE + 20, + /** Index stats pointer required */ + UDS_INDEX_STATS_PTR_REQUIRED = UDS_ERROR_CODE_BASE + 21, + /** Context stats pointer required */ + UDS_CONTEXT_STATS_PTR_REQUIRED = UDS_ERROR_CODE_BASE + 22, + /** Unused */ + UDS_UNUSED_CODE_23 = UDS_ERROR_CODE_BASE + 23, + /** Unused */ + UDS_UNUSED_CODE_24 = UDS_ERROR_CODE_BASE + 24, + /** Unused */ + UDS_UNUSED_CODE_25 = UDS_ERROR_CODE_BASE + 25, + /** Unused */ + UDS_UNUSED_CODE_26 = UDS_ERROR_CODE_BASE + 26, + /** Unused */ + UDS_UNUSED_CODE_27 = UDS_ERROR_CODE_BASE + 27, + /** Memory configuration not supported */ + UDS_INVALID_MEMORY_SIZE = UDS_ERROR_CODE_BASE + 28, + /** Unused */ + UDS_UNUSED_CODE_29 = UDS_ERROR_CODE_BASE + 29, + /** Index name required */ + UDS_INDEX_NAME_REQUIRED = UDS_ERROR_CODE_BASE + 30, + /** Configuration required */ + UDS_CONF_REQUIRED = UDS_ERROR_CODE_BASE + 31, + /** Unused */ + UDS_UNUSED_CODE_32 = UDS_ERROR_CODE_BASE + 32, + /** Unused */ + UDS_UNUSED_CODE_33 = UDS_ERROR_CODE_BASE + 33, + /** Unused */ + UDS_UNUSED_CODE_34 = UDS_ERROR_CODE_BASE + 34, + /** Unused */ + UDS_UNUSED_CODE_35 = UDS_ERROR_CODE_BASE + 35, + /** Unused */ + UDS_UNUSED_CODE_36 = UDS_ERROR_CODE_BASE + 36, + /** Essential files for index not found */ + UDS_NO_INDEX = UDS_ERROR_CODE_BASE + 37, + /** Checkpoint frequency out of range */ + UDS_BAD_CHECKPOINT_FREQUENCY = UDS_ERROR_CODE_BASE + 38, + /** Wrong type of index configuration */ + UDS_WRONG_INDEX_CONFIG = UDS_ERROR_CODE_BASE + 39, + /** Unused */ + UDS_UNUSED_CODE_40 = UDS_ERROR_CODE_BASE + 40, + /** Unused */ + UDS_UNUSED_CODE_41 = UDS_ERROR_CODE_BASE + 41, + /** Unused */ + UDS_UNUSED_CODE_42 = UDS_ERROR_CODE_BASE + 42, + /** Unused */ + UDS_UNUSED_CODE_43 = UDS_ERROR_CODE_BASE + 43, + /** Premature end of file in scanned file */ + UDS_END_OF_FILE = UDS_ERROR_CODE_BASE + 44, + /** Attempt to access unsaved index */ + UDS_INDEX_NOT_SAVED_CLEANLY = UDS_ERROR_CODE_BASE + 45, + /** Unused */ + UDS_UNUSED_CODE_46 = UDS_ERROR_CODE_BASE + 46, + /** There is not sufficient space to create the index */ + UDS_INSUFFICIENT_INDEX_SPACE = UDS_ERROR_CODE_BASE + 47, + /** Unused */ + UDS_UNUSED_CODE_48 = UDS_ERROR_CODE_BASE + 48, + /** Unused */ + UDS_UNUSED_CODE_49 = UDS_ERROR_CODE_BASE + 49, + /** Index is suspended */ + UDS_SUSPENDED = UDS_ERROR_CODE_BASE + 50, + /** Unused */ + UDS_UNUSED_CODE_51 = UDS_ERROR_CODE_BASE + 51, + /** Index session is already initialized */ + UDS_INDEXSESSION_IN_USE = UDS_ERROR_CODE_BASE + 52, + /** Callback required */ + UDS_CALLBACK_REQUIRED = UDS_ERROR_CODE_BASE + 53, + /** Wrong operation type */ + UDS_INVALID_OPERATION_TYPE = UDS_ERROR_CODE_BASE + 54, + /** One more than the last UDS_ERROR_CODE */ + UDS_ERROR_CODE_LAST, + /** One more than this block can use */ + UDS_ERROR_CODE_BLOCK_END = UDS_ERROR_CODE_BASE + 1024 +}; + +#endif /* UDS_ERROR_H */ diff --git a/uds/uds-platform.h b/uds/uds-platform.h new file mode 100644 index 0000000..0df39ef --- /dev/null +++ b/uds/uds-platform.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/uds-platform.h#1 $ + */ + +/** + * @file + * @brief Platform definitions for albireo + **/ +#ifndef UDS_PLATFORM_H +#define UDS_PLATFORM_H + + +#ifdef __KERNEL__ +#include +#else +#include +#include +#include +#include +#include +#endif + +#endif /* UDS_PLATFORM_H */ diff --git a/uds/uds.h b/uds/uds.h new file mode 100644 index 0000000..42e2863 --- /dev/null +++ b/uds/uds.h @@ -0,0 +1,528 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/uds.h#2 $ + */ + +/** + * @mainpage UDS API Reference + *
Copyright (c) 2020 Red Hat, Inc.
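+ *
+ * As a quick orientation, a minimal caller of this API follows the life
+ * cycle sketched below. This is a hedged illustration only: error handling
+ * is omitted and the index name "/dev/vdo_index" is hypothetical.
+ *
+ *   struct uds_index_session *session;
+ *   UdsConfiguration conf;
+ *
+ *   if (udsCreateIndexSession(&session) != UDS_SUCCESS) {
+ *     return;
+ *   }
+ *   if (udsInitializeConfiguration(&conf, 1) == UDS_SUCCESS) {  // 1 GB index
+ *     if (udsOpenIndex(UDS_CREATE, "/dev/vdo_index", NULL, conf,
+ *                      session) == UDS_SUCCESS) {
+ *       // ... issue requests, e.g. with udsStartChunkOperation() ...
+ *       udsCloseIndex(session);
+ *     }
+ *     udsFreeConfiguration(conf);
+ *   }
+ *   udsDestroyIndexSession(session);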
+ **/ + +/** + * @file + * @brief General UDS definitions + **/ +#ifndef UDS_H +#define UDS_H + +#include "uds-platform.h" + +#ifdef UDS_DISABLE_ATTR_WARN_UNUSED_RESULT +#define UDS_ATTR_WARN_UNUSED_RESULT +#else +#define UDS_ATTR_WARN_UNUSED_RESULT __attribute__((warn_unused_result)) +#endif + +/** + * Valid request types as described in callbacks. + **/ +typedef enum { + /** + * Callback type for operations that post mappings to the UDS + * index. When the chunk-hash being added already exists, the + * existing metadata is not overwritten. Regardless, the + * recency of the chunk is updated. + **/ + UDS_POST, + + /** + * Callback type for operations that update mappings in the UDS + * index. If the indicated entry does not have any mapping in the + * index, one is created. In either case, the recency of + * the chunk is updated. + **/ + UDS_UPDATE, + + /** + * Callback type for operations that delete mappings from the + * UDS index. */ + UDS_DELETE, + + /** + * Callback type for operations that query mappings in the UDS + * index. When a mapping is found, the recency of the mapping + * is updated unless it's the no-update call. + **/ + UDS_QUERY +} UdsCallbackType; + +/** + * Valid types for opening an index. + **/ +typedef enum { + /** + * Load an existing index. If the index was not saved cleanly, try to + * recover and rebuild the index. + **/ + UDS_LOAD = 0, + + /** + * Create a new index. + **/ + UDS_CREATE = 1, + + /** + * Load an existing index, but only if it was cleanly saved. + **/ + UDS_NO_REBUILD = 2, +} UdsOpenIndexType; + +/** General UDS constants. */ +enum { + /** The chunk name size in bytes (128 bits = 16 bytes). */ + UDS_CHUNK_NAME_SIZE = 16, + /** The maximum metadata size in bytes. */ + UDS_MAX_METADATA_SIZE = 16, +}; + +/** + * Type representing memory configuration which is either a positive + * integer number of gigabytes or one of the three special constants + * for configurations which are smaller than 1 gigabyte. + **/ +typedef unsigned int UdsMemoryConfigSize; + +extern const UdsMemoryConfigSize UDS_MEMORY_CONFIG_256MB; +extern const UdsMemoryConfigSize UDS_MEMORY_CONFIG_512MB; +extern const UdsMemoryConfigSize UDS_MEMORY_CONFIG_768MB; + +/** + * The maximum configurable amount of memory. + **/ +extern const UdsMemoryConfigSize UDS_MEMORY_CONFIG_MAX; + +/** The name (hash) of a chunk. */ +typedef struct udsChunkName { + /** The name (hash) of a chunk. */ + unsigned char name[UDS_CHUNK_NAME_SIZE]; +} UdsChunkName; + +/** + * An active index session. + **/ +struct uds_index_session; + +/** + * The data used to configure a new index. + **/ +typedef struct udsConfiguration *UdsConfiguration; +typedef uint64_t UdsNonce; + +/** + * The data used to configure a new index session. + **/ +struct uds_parameters { + // Tne number of threads used to process index requests. + int zone_count; + // The number of threads used to read volume pages. + int read_threads; + // The number of chapters to write between checkpoints. + int checkpoint_frequency; +}; +#define UDS_PARAMETERS_INITIALIZER { \ + .zone_count = 0, \ + .read_threads = 2, \ + .checkpoint_frequency = 0, \ + } + +/** + * Index statistics + * + * These statistics capture the current index characteristics, + * including resource usage. 
+ **/ +typedef struct udsIndexStats { + /** The total number of chunk names stored in the index */ + uint64_t entriesIndexed; + /** An estimate of the index's memory usage */ + uint64_t memoryUsed; + /** The number of collisions recorded in the master index */ + uint64_t collisions; + /** The number of entries discarded from the index since index startup */ + uint64_t entriesDiscarded; + /** The number of checkpoints done this session */ + uint64_t checkpoints; +} UdsIndexStats; + +/** + * Context statistics + * + * These statistics capture a library context's characteristics either since + * it was initialized or since its statistics were last reset, whichever + * is more recent. + **/ +typedef struct udsContextStats { + /** The time at which context statistics were last fetched */ + time_t currentTime; + /** + * The number of post calls since context statistics were last reset that + * found an existing entry + **/ + uint64_t postsFound; + /** + * The number of post calls since context statistics were last reset that + * added an entry + **/ + uint64_t postsNotFound; + /** + * The number of post calls since context statistics were last reset that + * found an existing entry is current enough to only exist in memory and not + * have been commited to disk yet. + **/ + uint64_t inMemoryPostsFound; + /** + * The number of post calls since context statistics were last reset that + * found an existing entry in the dense portion of the index. + **/ + uint64_t densePostsFound; + /** + * The number of post calls since context statistics were last reset that + * found an existing entry in the sparse portion of the index (if one + * exists). + **/ + uint64_t sparsePostsFound; + /** + * The number of update calls since context statistics were last reset that + * updated an existing entry + **/ + uint64_t updatesFound; + /** + * The number of update calls since context statistics were last reset that + * added a new entry + **/ + uint64_t updatesNotFound; + /** + * The number of delete requests since context statistics were last reset + * that deleted an existing entry + **/ + uint64_t deletionsFound; + /** + * The number of delete requests since context statistics were last reset + * that did nothing. + **/ + uint64_t deletionsNotFound; + /** + * The number of query calls since context statistics were last reset that + * found existing entry + **/ + uint64_t queriesFound; + /** + * The number of query calls since context statistics were last reset that + * did not find an entry + **/ + uint64_t queriesNotFound; + /** + * The total number of library requests (the sum of posts, updates, + * deletions, and queries) since context + * statistics were last reset + **/ + uint64_t requests; +} UdsContextStats; + +/** + * Initializes an index configuration. + * + * @param [out] conf The new configuration + * @param [in] memGB The maximum memory allocation, in GB + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsInitializeConfiguration(UdsConfiguration *conf, + UdsMemoryConfigSize memGB); + +/** + * Sets or clears an index configuration's sparse indexing settings. + * + * @param [in,out] conf The configuration to change + * @param [in] sparse If true, request a sparse + * index; if false, request + * a default index. + * + **/ +void udsConfigurationSetSparse(UdsConfiguration conf, bool sparse); + +/** + * Tests whether an index configuration specifies sparse indexing. 
+ * + * @param [in] conf The configuration to check + * + * @return Returns true if the configuration + * is sparse, or false if not + **/ +UDS_ATTR_WARN_UNUSED_RESULT +bool udsConfigurationGetSparse(UdsConfiguration conf); + +/** + * Sets an index configuration's nonce. + * + * @param [in,out] conf The configuration to change + * @param [in] nonce The 64 bit nonce. + * + **/ +void udsConfigurationSetNonce(UdsConfiguration conf, UdsNonce nonce); + +/** + * Gets an index configuration's nonce. + * + * @param [in] conf The configuration to check + * + * @return The 64 bit nonce. + **/ +UDS_ATTR_WARN_UNUSED_RESULT +UdsNonce udsConfigurationGetNonce(UdsConfiguration conf); + +/** + * Fetches a configuration's maximum memory allocation. + * + * @param [in] conf The configuration to check + * + * @return The amount of memory allocated, in GB + **/ +UDS_ATTR_WARN_UNUSED_RESULT +UdsMemoryConfigSize udsConfigurationGetMemory(UdsConfiguration conf); + +/** + * Fetches a configuration's chapters per volume value. + * + * @param [in] conf The configuration to check + * + * @return The number of chapters per volume + **/ +UDS_ATTR_WARN_UNUSED_RESULT +unsigned int udsConfigurationGetChaptersPerVolume(UdsConfiguration conf); + +/** + * Frees memory used by a configuration. + * + * @param [in,out] conf The configuration for which memory is being freed + **/ +void udsFreeConfiguration(UdsConfiguration conf); + +/** + * Compute the size required to store the index on persistent storage. This + * size is valid for any index stored in a single file or on a single block + * device. This size should be used when configuring a block device on which + * to store an index. + * + * @param [in] config A UdsConfiguration for an index. + * @param [in] numCheckpoints The maximum number of checkpoints. + * @param [out] indexSize The number of bytes required to store + * the index. + * + * @return UDS_SUCCESS or an error code. + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsComputeIndexSize(const UdsConfiguration config, + unsigned int numCheckpoints, + uint64_t *indexSize); + +/** + * Opens an index session. + * + * Creates a session for an index. #udsOpenIndex must be called before + * the index can be used. + * + * Destroy the session with #udsDestroyIndexSession. + * + * @param [out] session A pointer to the new session + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsCreateIndexSession(struct uds_index_session **session); + +/** + * Fetches the UDS library version. + * + * @return The library version + **/ +UDS_ATTR_WARN_UNUSED_RESULT +const char *udsGetVersion(void); + +#ifdef __KERNEL__ +/** + * The name argument to #udsOpenIndex is a text string that names the index. + * The name should have the form "path", where path is the name of the block + * device. The path should not contain white space. The names can optionally + * contain size and/or offset options which give the number of bytes in the + * index and the byte offset to the start of the index. For example, the name + * "/dev/sda8 offset=409600 size=2048000000" is an index that is stored in + * 2040000000 bytes of /dev/sda8 starting at byte 409600. + **/ +#else +/** + * The name argument to #udsOpenIndex is a text string that names the index. + * The name should have the form "path", where path is the name of the file or + * block device. The path should not contain white space. 
The name can + * optionally contain size and/or offset options which give the number of bytes + * in the index and the byte offset to the start of the index. For example, + * the name "/dev/sda8 offset=409600 size=2048000000" is an index that is + * stored in 2040000000 bytes of /dev/sda8 starting at byte 409600. + **/ +#endif + +/** + * Opens an index with an existing session. This operation will fail if the + * index session is suspended, or if there is already an open index. + * + * The index should be closed with #udsCloseIndex. + * + * @param openType The type of open, which is one of #UDS_LOAD, #UDS_CREATE, + * or #UDS_NO_REBUILD. + * @param name The name of the index + * @param params The index session parameters. If NULL, the default + * session parameters will be used. + * @param conf The index configuration + * @param session The index session + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsOpenIndex(UdsOpenIndexType openType, + const char *name, + const struct uds_parameters *params, + UdsConfiguration conf, + struct uds_index_session *session); + +/** + * Waits until all callbacks for index operations are complete, and prevents + * new index operations from starting. Index operations will return + * UDS_SUSPENDED until #udsResumeIndexSession is called. Optionally saves all + * index data before returning. + * + * @param session The session to suspend + * @param save Whether to save index data + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsSuspendIndexSession(struct uds_index_session *session, bool save); + +/** + * Allows new index operations for an index, whether it was suspended or not. + * + * @param session The session to resume + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsResumeIndexSession(struct uds_index_session *session); + +/** + * Waits until all callbacks for index operations are complete. + * + * @param [in] session The session to flush + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsFlushIndexSession(struct uds_index_session *session); + +/** + * Closes an index. This operation will fail if the index session is + * suspended. + * + * Saves changes to the index so that #udsOpenIndex can re-open it. + * + * @param [in] session The session containing the index to close + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsCloseIndex(struct uds_index_session *session); + +/** + * Destroys an index session. + * + * Saves changes to the index and closes the index if one is open. + * Use #udsDestroyIndexSession for index sessions created by + * #udsCreateIndexSession. + * + * @param [in] session The session to destroy + * + * @return Either #UDS_SUCCESS or an error code + **/ +int udsDestroyIndexSession(struct uds_index_session *session); + +/** + * Returns the configuration for the given index session. + * + * @param [in] session The session + * @param [out] conf The index configuration + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsGetIndexConfiguration(struct uds_index_session *session, + UdsConfiguration *conf); + +/** + * Fetches index statistics for the given index session. 
+ * + * @param [in] session The session + * @param [out] stats The index statistics structure to fill + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsGetIndexStats(struct uds_index_session *session, UdsIndexStats *stats); + +/** + * Fetches index session statistics for the given index session. + * + * @param [in] session The session + * @param [out] stats The context statistics structure to fill + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsGetIndexSessionStats(struct uds_index_session *session, + UdsContextStats *stats); + +/** + * Convert an error code to a string. + * + * @param errnum The error code + * @param buf The buffer to hold the error string + * @param buflen The length of the buffer + * + * @return A pointer to buf + **/ +UDS_ATTR_WARN_UNUSED_RESULT +const char *udsStringError(int errnum, char *buf, size_t buflen); + +/** + * Suggested buffer size for udsStringError. + **/ +enum { + UDS_STRING_ERROR_BUFSIZE = 128 +}; + +#endif /* UDS_H */ diff --git a/uds/udsMain.c b/uds/udsMain.c new file mode 100644 index 0000000..8d4f411 --- /dev/null +++ b/uds/udsMain.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/udsMain.c#12 $ + */ + +#include "uds.h" + +#include "config.h" +#include "geometry.h" +#include "indexLayout.h" +#include "indexRouter.h" +#include "indexSession.h" +#include "loadType.h" +#include "logger.h" +#include "memoryAlloc.h" + +const UdsMemoryConfigSize UDS_MEMORY_CONFIG_MAX = 1024; +const UdsMemoryConfigSize UDS_MEMORY_CONFIG_256MB = (UdsMemoryConfigSize) -256; +const UdsMemoryConfigSize UDS_MEMORY_CONFIG_512MB = (UdsMemoryConfigSize) -512; +const UdsMemoryConfigSize UDS_MEMORY_CONFIG_768MB = (UdsMemoryConfigSize) -768; + +/* + * =========================================================================== + * UDS system management + * =========================================================================== + */ + +/**********************************************************************/ +int udsInitializeConfiguration(UdsConfiguration *userConfig, + UdsMemoryConfigSize memGB) +{ + if (userConfig == NULL) { + return logErrorWithStringError(UDS_CONF_PTR_REQUIRED, + "received a NULL config pointer"); + } + + /* Set the configuration parameters that change with memory size. If you + * change these values, you should also: + * + * Change Configuration_x1, which tests these values and expects to see them + * + * Bump the index configuration version number. This bump ensures that + * the test infrastructure will be forced to test the new configuration. 
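+   *
+   * For orientation, the mapping implemented below works out as follows: the
+   * three sub-gigabyte sizes keep DEFAULT_CHAPTERS_PER_VOLUME and scale the
+   * record pages per chapter to 1x, 2x, or 3x SMALL_RECORD_PAGES_PER_CHAPTER
+   * for 256 MB, 512 MB, and 768 MB respectively, while an N-gigabyte
+   * configuration keeps DEFAULT_RECORD_PAGES_PER_CHAPTER and uses
+   * N * DEFAULT_CHAPTERS_PER_VOLUME chapters.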
+ */ + + unsigned int chaptersPerVolume, recordPagesPerChapter; + if (memGB == UDS_MEMORY_CONFIG_256MB) { + chaptersPerVolume = DEFAULT_CHAPTERS_PER_VOLUME; + recordPagesPerChapter = SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (memGB == UDS_MEMORY_CONFIG_512MB) { + chaptersPerVolume = DEFAULT_CHAPTERS_PER_VOLUME; + recordPagesPerChapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (memGB == UDS_MEMORY_CONFIG_768MB) { + chaptersPerVolume = DEFAULT_CHAPTERS_PER_VOLUME; + recordPagesPerChapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (memGB == 1) { + chaptersPerVolume = DEFAULT_CHAPTERS_PER_VOLUME; + recordPagesPerChapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; + } else if ((memGB > 1) && (memGB <= UDS_MEMORY_CONFIG_MAX)) { + chaptersPerVolume = memGB * DEFAULT_CHAPTERS_PER_VOLUME; + recordPagesPerChapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; + } else { + return UDS_INVALID_MEMORY_SIZE; + } + + int result = ALLOCATE(1, struct udsConfiguration, "udsConfiguration", + userConfig); + if (result != UDS_SUCCESS) { + return result; + } + + (*userConfig)->recordPagesPerChapter = recordPagesPerChapter; + (*userConfig)->chaptersPerVolume = chaptersPerVolume; + (*userConfig)->sparseChaptersPerVolume = DEFAULT_SPARSE_CHAPTERS_PER_VOLUME; + (*userConfig)->cacheChapters = DEFAULT_CACHE_CHAPTERS; + (*userConfig)->checkpointFrequency = DEFAULT_CHECKPOINT_FREQUENCY; + (*userConfig)->masterIndexMeanDelta = DEFAULT_MASTER_INDEX_MEAN_DELTA; + (*userConfig)->bytesPerPage = DEFAULT_BYTES_PER_PAGE; + (*userConfig)->sparseSampleRate = DEFAULT_SPARSE_SAMPLE_RATE; + (*userConfig)->nonce = 0; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void udsConfigurationSetSparse(UdsConfiguration userConfig, bool sparse) +{ + bool prevSparse = (userConfig->sparseChaptersPerVolume != 0); + if (sparse == prevSparse) { + // nothing to do + return; + } + + unsigned int prevChaptersPerVolume = userConfig->chaptersPerVolume; + if (sparse) { + // Index 10TB with 4K blocks, 95% sparse, fit in dense (1TB) footprint + userConfig->chaptersPerVolume = 10 * prevChaptersPerVolume; + userConfig->sparseChaptersPerVolume = 9 * prevChaptersPerVolume + + prevChaptersPerVolume / 2; + userConfig->sparseSampleRate = 32; + } else { + userConfig->chaptersPerVolume = prevChaptersPerVolume / 10; + userConfig->sparseChaptersPerVolume = 0; + userConfig->sparseSampleRate = 0; + } +} + +/**********************************************************************/ +bool udsConfigurationGetSparse(UdsConfiguration userConfig) +{ + return userConfig->sparseChaptersPerVolume > 0; +} + +/**********************************************************************/ +void udsConfigurationSetNonce(UdsConfiguration userConfig, UdsNonce nonce) +{ + userConfig->nonce = nonce; +} + +/**********************************************************************/ +UdsNonce udsConfigurationGetNonce(UdsConfiguration userConfig) +{ + return userConfig->nonce; +} + +/**********************************************************************/ +unsigned int udsConfigurationGetMemory(UdsConfiguration userConfig) +{ + enum { + CHAPTERS = DEFAULT_CHAPTERS_PER_VOLUME, + SMALL_PAGES = CHAPTERS * SMALL_RECORD_PAGES_PER_CHAPTER, + LARGE_PAGES = CHAPTERS * DEFAULT_RECORD_PAGES_PER_CHAPTER + }; + unsigned int pages = (userConfig->chaptersPerVolume + * userConfig->recordPagesPerChapter); + if (userConfig->sparseChaptersPerVolume != 0) { + pages /= 10; + } + switch (pages) { + case SMALL_PAGES: return UDS_MEMORY_CONFIG_256MB; + case 2 * SMALL_PAGES: 
return UDS_MEMORY_CONFIG_512MB; + case 3 * SMALL_PAGES: return UDS_MEMORY_CONFIG_768MB; + default: return pages / LARGE_PAGES; + } +} + +/**********************************************************************/ +unsigned int +udsConfigurationGetChaptersPerVolume(UdsConfiguration userConfig) +{ + return userConfig->chaptersPerVolume; +} + +/**********************************************************************/ +void udsFreeConfiguration(UdsConfiguration userConfig) +{ + FREE(userConfig); +} + +/**********************************************************************/ +int udsCreateIndexSession(struct uds_index_session **session) +{ + if (session == NULL) { + return UDS_NO_INDEXSESSION; + } + + struct uds_index_session *indexSession = NULL; + int result = makeEmptyIndexSession(&indexSession); + if (result != UDS_SUCCESS) { + return result; + } + + *session = indexSession; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static +int initializeIndexSessionWithLayout(struct uds_index_session *indexSession, + IndexLayout *layout, + const struct uds_parameters *userParams, + LoadType loadType) +{ + int result = ((loadType == LOAD_CREATE) + ? writeIndexConfig(layout, &indexSession->userConfig) + : verifyIndexConfig(layout, &indexSession->userConfig)); + if (result != UDS_SUCCESS) { + return result; + } + + Configuration *indexConfig; + result = makeConfiguration(&indexSession->userConfig, &indexConfig); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Failed to allocate config"); + return result; + } + + // Zero the stats for the new index. + memset(&indexSession->stats, 0, sizeof(indexSession->stats)); + + result = makeIndexRouter(layout, indexConfig, userParams, loadType, + &indexSession->loadContext, enterCallbackStage, + &indexSession->router); + freeConfiguration(indexConfig); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Failed to make router"); + return result; + } + + logUdsConfiguration(&indexSession->userConfig); + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int initializeIndexSession(struct uds_index_session *indexSession, + const char *name, + const struct uds_parameters *userParams, + LoadType loadType) +{ + IndexLayout *layout; + int result = makeIndexLayout(name, loadType == LOAD_CREATE, + &indexSession->userConfig, &layout); + if (result != UDS_SUCCESS) { + return result; + } + + result = initializeIndexSessionWithLayout(indexSession, layout, userParams, + loadType); + putIndexLayout(&layout); + return result; +} + +/**********************************************************************/ +int udsOpenIndex(UdsOpenIndexType openType, + const char *name, + const struct uds_parameters *userParams, + UdsConfiguration userConfig, + struct uds_index_session *session) +{ + if (name == NULL) { + return UDS_INDEX_NAME_REQUIRED; + } + if (userConfig == NULL) { + return UDS_CONF_REQUIRED; + } + if (session == NULL) { + return UDS_NO_INDEXSESSION; + } + + int result = startLoadingIndexSession(session); + if (result != UDS_SUCCESS) { + return result; + } + + session->userConfig = *userConfig; + + // Map the external openType to the internal loadType + LoadType loadType = openType == UDS_CREATE ? LOAD_CREATE + : openType == UDS_NO_REBUILD ? 
LOAD_LOAD + : LOAD_REBUILD; + logNotice("%s: %s", getLoadType(loadType), name); + + result = initializeIndexSession(session, name, userParams, loadType); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Failed %s", getLoadType(loadType)); + saveAndFreeIndex(session); + } + + finishLoadingIndexSession(session, result); + return sansUnrecoverable(result); +} + +/**********************************************************************/ +const char *udsGetVersion(void) +{ +#ifdef UDS_VERSION + return UDS_VERSION; +#else + return "internal version"; +#endif +} + +/**********************************************************************/ +const char *udsStringError(int errnum, char *buf, size_t buflen) +{ + if (buf == NULL) { + return NULL; + } + + return stringError(errnum, buf, buflen); +} diff --git a/uds/udsModule.c b/uds/udsModule.c new file mode 100644 index 0000000..007f1a8 --- /dev/null +++ b/uds/udsModule.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/udsModule.c#32 $ + */ + +#include + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "murmur/MurmurHash3.h" +#include "sysfs.h" +#include "timeUtils.h" +#include "uds.h" +#include "uds-block.h" +#include "util/funnelQueue.h" + +/**********************************************************************/ +static int __init dedupeInit(void) +{ + memoryInit(); + logInfo("loaded version %s", UDS_VERSION); + initSysfs(); + return 0; +} + +/**********************************************************************/ +static void __exit dedupeExit(void) +{ + putSysfs(); + memoryExit(); + logInfo("unloaded version %s", UDS_VERSION); +} + +/**********************************************************************/ +module_init(dedupeInit); +module_exit(dedupeExit); + +EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_256MB); +EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_512MB); +EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_768MB); +EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_MAX); +EXPORT_SYMBOL_GPL(udsInitializeConfiguration); +EXPORT_SYMBOL_GPL(udsComputeIndexSize); +EXPORT_SYMBOL_GPL(udsConfigurationSetNonce); +EXPORT_SYMBOL_GPL(udsConfigurationGetNonce); +EXPORT_SYMBOL_GPL(udsConfigurationSetSparse); +EXPORT_SYMBOL_GPL(udsConfigurationGetSparse); +EXPORT_SYMBOL_GPL(udsConfigurationGetMemory); +EXPORT_SYMBOL_GPL(udsConfigurationGetChaptersPerVolume); +EXPORT_SYMBOL_GPL(udsFreeConfiguration); +EXPORT_SYMBOL_GPL(udsGetVersion); +EXPORT_SYMBOL_GPL(udsCreateIndexSession); +EXPORT_SYMBOL_GPL(udsOpenIndex); +EXPORT_SYMBOL_GPL(udsSuspendIndexSession); +EXPORT_SYMBOL_GPL(udsResumeIndexSession); +EXPORT_SYMBOL_GPL(udsCloseIndex); +EXPORT_SYMBOL_GPL(udsDestroyIndexSession); +EXPORT_SYMBOL_GPL(udsFlushIndexSession); +EXPORT_SYMBOL_GPL(udsGetIndexConfiguration); 
+EXPORT_SYMBOL_GPL(udsGetIndexStats); +EXPORT_SYMBOL_GPL(udsGetIndexSessionStats); +EXPORT_SYMBOL_GPL(udsStringError); +EXPORT_SYMBOL_GPL(udsStartChunkOperation); + +EXPORT_SYMBOL_GPL(allocSprintf); +EXPORT_SYMBOL_GPL(allocateMemory); +EXPORT_SYMBOL_GPL(allocateMemoryNowait); +EXPORT_SYMBOL_GPL(assertionFailed); +EXPORT_SYMBOL_GPL(assertionFailedLogOnly); +EXPORT_SYMBOL_GPL(availableSpace); +EXPORT_SYMBOL_GPL(bufferLength); +EXPORT_SYMBOL_GPL(bufferUsed); +EXPORT_SYMBOL_GPL(clearBuffer); +EXPORT_SYMBOL_GPL(compactBuffer); +EXPORT_SYMBOL_GPL(contentLength); +EXPORT_SYMBOL_GPL(copyBytes); +EXPORT_SYMBOL_GPL(currentTime); +EXPORT_SYMBOL_GPL(duplicateString); +EXPORT_SYMBOL_GPL(ensureAvailableSpace); +EXPORT_SYMBOL_GPL(equalBuffers); +EXPORT_SYMBOL_GPL(fixedSprintf); +EXPORT_SYMBOL_GPL(freeBuffer); +EXPORT_SYMBOL_GPL(freeFunnelQueue); +EXPORT_SYMBOL_GPL(freeMemory); +EXPORT_SYMBOL_GPL(funnelQueuePoll); +EXPORT_SYMBOL_GPL(getBoolean); +EXPORT_SYMBOL_GPL(getBufferContents); +EXPORT_SYMBOL_GPL(getByte); +EXPORT_SYMBOL_GPL(getBytesFromBuffer); +EXPORT_SYMBOL_GPL(getMemoryStats); +EXPORT_SYMBOL_GPL(getUInt16BEFromBuffer); +EXPORT_SYMBOL_GPL(getUInt16LEFromBuffer); +EXPORT_SYMBOL_GPL(getUInt16LEsFromBuffer); +EXPORT_SYMBOL_GPL(getUInt32BEFromBuffer); +EXPORT_SYMBOL_GPL(getUInt32BEsFromBuffer); +EXPORT_SYMBOL_GPL(getUInt32LEFromBuffer); +EXPORT_SYMBOL_GPL(getUInt64BEsFromBuffer); +EXPORT_SYMBOL_GPL(getUInt64LEFromBuffer); +EXPORT_SYMBOL_GPL(getUInt64LEsFromBuffer); +EXPORT_SYMBOL_GPL(growBuffer); +EXPORT_SYMBOL_GPL(hasSameBytes); +EXPORT_SYMBOL_GPL(isFunnelQueueEmpty); +EXPORT_SYMBOL_GPL(makeBuffer); +EXPORT_SYMBOL_GPL(makeFunnelQueue); +EXPORT_SYMBOL_GPL(MurmurHash3_x64_128); +EXPORT_SYMBOL_GPL(nowUsec); +EXPORT_SYMBOL_GPL(peekByte); +EXPORT_SYMBOL_GPL(putBoolean); +EXPORT_SYMBOL_GPL(putBuffer); +EXPORT_SYMBOL_GPL(putByte); +EXPORT_SYMBOL_GPL(putBytes); +EXPORT_SYMBOL_GPL(putInt64LEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt16BEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt16LEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt16LEsIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt32BEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt32BEsIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt32LEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt64BEsIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt64LEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt64LEsIntoBuffer); +EXPORT_SYMBOL_GPL(reallocateMemory); +EXPORT_SYMBOL_GPL(registerAllocatingThread); +EXPORT_SYMBOL_GPL(reportMemoryUsage); +EXPORT_SYMBOL_GPL(resetBufferEnd); +EXPORT_SYMBOL_GPL(rewindBuffer); +EXPORT_SYMBOL_GPL(skipForward); +EXPORT_SYMBOL_GPL(uncompactedAmount); +EXPORT_SYMBOL_GPL(unregisterAllocatingThread); +EXPORT_SYMBOL_GPL(wrapBuffer); +EXPORT_SYMBOL_GPL(zeroBytes); + +/**********************************************************************/ + + +/**********************************************************************/ + +MODULE_DESCRIPTION("deduplication engine"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); +MODULE_VERSION(UDS_VERSION); diff --git a/uds/util/eventCount.c b/uds/util/eventCount.c new file mode 100644 index 0000000..7efeac6 --- /dev/null +++ b/uds/util/eventCount.c @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/eventCount.c#2 $ + */ + +/** + * This EventCount implementation uses a posix semaphore for portability, + * although a futex would be slightly superior to use and easy to substitute. + * It is designed to make signalling as cheap as possible, since that is the + * code path likely triggered on most updates to a lock-free data structure. + * Waiters are likely going to sleep, so optimizing for that case isn't + * necessary. + * + * The critical field is the state, which is really two fields that can be + * atomically updated in unison: an event counter and a waiter count. Every + * call to eventCountPrepare() issues a wait token by atomically incrementing + * the waiter count. The key invariant is a strict accounting of the number of + * tokens issued. Every token returned by eventCountPrepare() is a contract + * that the caller will call acquireSemaphore() and a signaller will call + * releaseSemaphore(), each exactly once. Atomic updates to the state field + * ensure that each token is counted once and that tokens are not lost. + * Cancelling a token attempts to take a fast-path by simply decrementing the + * waiters field, but if the token has already been claimed by a signaller, + * the canceller must still wait on the semaphore to consume the transferred + * token. + * + * The state field is 64 bits, partitioned into a 16-bit waiter field and a + * 48-bit counter. We are unlikely to have 2^16 threads, much less 2^16 + * threads waiting on any single event transition. 2^48 microseconds is + * several years, so a token holder would have to wait that long for the + * counter to wrap around, and then call eventCountWait() at the exact right + * time to see the re-used counter, in order to lose a wakeup due to counter + * wrap-around. Using a 32-bit state field would greatly increase that chance, + * but if forced to do so, the implementation could likely tolerate it since + * callers are supposed to hold tokens for miniscule periods of time. + * Fortunately, x64 has 64-bit compare-and-swap, and the performance of + * interlocked 64-bit operations appears to be about the same as for 32-bit + * ones, so being paranoid and using 64 bits costs us nothing. + * + * Here are some sequences of calls and state transitions: + * + * action postcondition + * counter waiters semaphore + * initialized 0 0 0 + * prepare 0 1 0 + * wait (blocks) 0 1 0 + * signal 1 0 1 + * wait (unblocks) 1 0 0 + * + * signal (fast-path) 1 0 0 + * signal (fast-path) 1 0 0 + * + * prepare A 1 1 0 + * prepare B 1 2 0 + * signal 2 0 2 + * wait B (fast-path) 2 0 1 + * wait A (fast-path) 2 0 0 + * + * prepare 2 1 0 + * cancel (fast-path) 2 0 0 + * + * prepare 2 1 0 + * signal 3 0 1 + * cancel (must wait) 3 0 0 + * + * The EventCount structure is aligned, sized, and allocated to cache line + * boundaries to avoid any false sharing between the EventCount and other + * shared state. The state field and semaphore should fit on a single cache + * line. 
The instrumentation counters increase the size of the structure so it + * rounds up to use two (64-byte x86) cache lines. + * + * XXX Need interface to access or display instrumentation counters. + **/ + +#include "eventCount.h" + +#include "atomicDefs.h" +#include "common.h" +#include "compiler.h" +#include "cpu.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "threads.h" + +enum { + ONE_WAITER = 1, // value used to increment the waiters field + ONE_EVENT = (1 << 16), // value used to increment the event counter + WAITERS_MASK = (ONE_EVENT - 1), // bit mask to access the waiters field + EVENTS_MASK = ~WAITERS_MASK, // bit mask to access the event counter +}; + +struct eventCount { + // Atomically mutable state: + // low 16 bits: the number of wait tokens not posted to the semaphore + // high 48 bits: current event counter + atomic64_t state; + + // Semaphore used to block threads when waiting is required. + Semaphore semaphore; + + // Instrumentation counters. + + // Declare alignment so we don't share a cache line. +} __attribute__((aligned(CACHE_LINE_BYTES))); + +/** + * Test the event field in two tokens for equality. + * + * @return true iff the tokens contain the same event field value + **/ +static INLINE bool sameEvent(EventToken token1, EventToken token2) +{ + return ((token1 & EVENTS_MASK) == (token2 & EVENTS_MASK)); +} + +/**********************************************************************/ +void eventCountBroadcast(EventCount *ec) +{ + + // Even if there are no waiters (yet), we will need a memory barrier. + smp_mb(); + + uint64_t waiters; + uint64_t state = atomic64_read(&ec->state); + uint64_t oldState = state; + do { + // Check if there are any tokens that have not yet been been transferred + // to the semaphore. This is the fast no-waiters path. + waiters = (state & WAITERS_MASK); + if (waiters == 0) { + // Fast path first time through--no need to signal or post if there are + // no observers. + return; + } + + /* + * Attempt to atomically claim all the wait tokens and bump the event count + * using an atomic compare-and-swap. This operation contains a memory + * barrier. + */ + EventToken newState = ((state & ~WAITERS_MASK) + ONE_EVENT); + oldState = state; + state = atomic64_cmpxchg(&ec->state, oldState, newState); + // The cmpxchg fails when we lose a race with a new waiter or another + // signaller, so try again. + } while (unlikely(state != oldState)); + + + /* + * Wake the waiters by posting to the semaphore. This effectively transfers + * the wait tokens to the semaphore. There's sadly no bulk post for posix + * semaphores, so we've got to loop to do them all. + */ + while (waiters-- > 0) { + releaseSemaphore(&ec->semaphore); + } +} + +/** + * Attempt to cancel a prepared wait token by decrementing the + * number of waiters in the current state. This can only be done + * safely if the event count hasn't been bumped. + * + * @param ec the event count on which the wait token was issued + * @param token the wait to cancel + * + * @return true if the wait was cancelled, false if the caller must + * still wait on the semaphore + **/ +static INLINE bool fastCancel(EventCount *ec, EventToken token) +{ + EventToken currentToken = atomic64_read(&ec->state); + while (sameEvent(currentToken, token)) { + // Try to decrement the waiter count via compare-and-swap as if we had + // never prepared to wait. 
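+    //
+    // Subtracting one touches only the low-order waiters field: sameEvent()
+    // succeeding means no signaller has claimed our token by bumping the
+    // event counter, so the waiters field still counts us and is non-zero,
+    // and the subtraction cannot borrow into the counter bits.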
+ EventToken et = atomic64_cmpxchg(&ec->state, currentToken, + currentToken - 1); + if (et == currentToken) { + return true; + } + currentToken = et; + } + return false; +} + +/** + * Consume a token from the semaphore, waiting (with an optional timeout) if + * one is not currently available. Also attempts to count the number of times + * we'll actually have to wait because there are no tokens (permits) available + * in the semaphore, and the number of times the wait times out. + * + * @param ec the event count instance + * @param timeout an optional timeout value to pass to attemptSemaphore() + * + * @return true if a token was consumed, otherwise false only if a timeout + * was specified and we timed out + **/ +static bool consumeWaitToken(EventCount *ec, const RelTime *timeout) +{ + // Try to grab a token without waiting. + if (attemptSemaphore(&ec->semaphore, 0)) { + return true; + } + + + if (timeout == NULL) { + acquireSemaphore(&ec->semaphore); + } else if (!attemptSemaphore(&ec->semaphore, *timeout)) { + return false; + } + return true; +} + +/**********************************************************************/ +int makeEventCount(EventCount **ecPtr) +{ + // The event count will be allocated on a cache line boundary so there will + // not be false sharing of the line with any other data structure. + EventCount *ec = NULL; + int result = ALLOCATE(1, EventCount, "event count", &ec); + if (result != UDS_SUCCESS) { + return result; + } + + atomic64_set(&ec->state, 0); + result = initializeSemaphore(&ec->semaphore, 0); + if (result != UDS_SUCCESS) { + FREE(ec); + return result; + } + + *ecPtr = ec; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeEventCount(EventCount *ec) +{ + if (ec == NULL) { + return; + } + destroySemaphore(&ec->semaphore); + FREE(ec); +} + +/**********************************************************************/ +EventToken eventCountPrepare(EventCount *ec) +{ + return atomic64_add_return(ONE_WAITER, &ec->state); +} + +/**********************************************************************/ +void eventCountCancel(EventCount *ec, EventToken token) +{ + // Decrement the waiter count if the event hasn't been signalled. + if (fastCancel(ec, token)) { + return; + } + // A signaller has already transferred (or promised to transfer) our token + // to the semaphore, so we must consume it from the semaphore by waiting. + eventCountWait(ec, token, NULL); +} + +/**********************************************************************/ +bool eventCountWait(EventCount *ec, EventToken token, const RelTime *timeout) +{ + + for (;;) { + // Wait for a signaller to transfer our wait token to the semaphore. + if (!consumeWaitToken(ec, timeout)) { + // The wait timed out, so we must cancel the token instead. Try to + // decrement the waiter count if the event hasn't been signalled. + if (fastCancel(ec, token)) { + return false; + } + /* + * We timed out, but a signaller came in before we could cancel the + * wait. We have no choice but to wait for the semaphore to be posted. + * Since signaller has promised to do it, the wait will be short. The + * timeout and the signal happened at about the same time, so either + * outcome could be returned. It's simpler to ignore the timeout. + */ + timeout = NULL; + continue; + } + + // A wait token has now been consumed from the semaphore. + + // Stop waiting if the count has changed since the token was acquired. 
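+    //
+    // A different event field means eventCountBroadcast() ran after this
+    // token was prepared, so the condition the caller is polling may have
+    // changed and the wait is over.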
+ if (!sameEvent(token, atomic64_read(&ec->state))) { + return true; + } + + // We consumed someone else's wait token. Put it back in the semaphore, + // which will wake another waiter, hopefully one who can stop waiting. + releaseSemaphore(&ec->semaphore); + + // Attempt to give an earlier waiter a shot at the semaphore. + yieldScheduler(); + } +} diff --git a/uds/util/eventCount.h b/uds/util/eventCount.h new file mode 100644 index 0000000..e3f2a33 --- /dev/null +++ b/uds/util/eventCount.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/eventCount.h#1 $ + */ + +#ifndef EVENT_COUNT_H +#define EVENT_COUNT_H + +#include "timeUtils.h" +#include "typeDefs.h" + +/** + * An EventCount is a lock-free equivalent of a condition variable. + * + * Using an EventCount, a lock-free producer/consumer can wait for a state + * change (adding an item to an empty queue, for example) without spinning or + * falling back on the use of mutex-based locks. Signalling is cheap when + * there are no waiters (a memory fence), and preparing to wait is + * also inexpensive (an atomic add instruction). + * + * A lock-free producer should call eventCountBroadcast() after any mutation + * to the lock-free data structure that a consumer might be waiting on. The + * consumers should poll for work like this: + * + * for (;;) { + * // Fast path--no additional cost to consumer. + * if (lockFreeDequeue(&item)) { + * return item; + * } + * // Two-step wait: get current token and poll state, either cancelling + * // the wait or waiting for the token to be signalled. + * EventToken token = eventCountPrepare(ec); + * if (lockFreeDequeue(&item)) { + * eventCountCancel(ec, token); + * return item; + * } + * eventCountWait(ec, token, NULL); + * // State has changed, but must check condition again, so loop. + * } + * + * Once eventCountPrepare() is called, the caller should neither dally, sleep, + * nor perform long-running or blocking actions before passing the token to + * eventCountCancel() or eventCountWait(). The implementation is optimized for + * a short polling window, and will not perform well if there are outstanding + * tokens that have been signalled but not waited upon. + **/ + +typedef struct eventCount EventCount; + +typedef unsigned int EventToken; + +/** + * Allocate and initialize an EventCount. + * + * @param ecPtr a pointer to hold the new EventCount + **/ +__attribute__((warn_unused_result)) +int makeEventCount(EventCount **ecPtr); + +/** + * Free an EventCount. It must no longer be in use. + * + * @param ec the EventCount to free + **/ +void freeEventCount(EventCount *ec); + +/** + * Wake all threads that are waiting for the next event. 
+ * + * @param ec the EventCount to signal + **/ +void eventCountBroadcast(EventCount *ec); + +/** + * Prepare to wait for the EventCount to change by capturing a token of its + * current state. The caller MUST eventually either call eventCountWait() or + * eventCountCancel() exactly once for each token obtained. + * + * @param ec the EventCount on which to prepare to wait + * + * @return an EventToken to be passed to the next eventCountWait() call + **/ +EventToken eventCountPrepare(EventCount *ec) + __attribute__((warn_unused_result)); + +/** + * Cancel a wait token that has been prepared but not waited upon. This must + * be called after eventCountPrepare() when eventCountWait() is not going to + * be invoked on the token. + * + * @param ec the EventCount from which a wait token was obtained + * @param token the wait token that will never be passed to eventCountWait() + **/ +void eventCountCancel(EventCount *ec, EventToken token); + +/** + * Check if the current event count state corresponds to the provided token, + * and if it is, wait for a signal that the state has changed. If an optional + * timeout is provided, the wait will terminate after the timeout has elapsed. + * Timing out automatically cancels the wait token, so callers must not + * attempt to cancel the token on timeout. + * + * @param ec the EventCount on which to wait + * @param token the EventToken returned by eventCountPrepare() + * @param timeout either NULL or a relative timeout for the wait operation + * + * @return true if the state has already changed or if signalled, otherwise + * false if a timeout was provided and the wait timed out + **/ +bool eventCountWait(EventCount *ec, EventToken token, const RelTime *timeout); + +#endif /* EVENT_COUNT_H */ diff --git a/uds/util/funnelQueue.c b/uds/util/funnelQueue.c new file mode 100644 index 0000000..017e405 --- /dev/null +++ b/uds/util/funnelQueue.c @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/funnelQueue.c#2 $ + */ + +#include "funnelQueue.h" + +#include "memoryAlloc.h" +#include "permassert.h" +#include "uds.h" + +/**********************************************************************/ +int makeFunnelQueue(FunnelQueue **queuePtr) +{ + // Allocate the queue on a cache line boundary so the producer and consumer + // fields in the structure will land on separate cache lines. + FunnelQueue *queue; + int result = ALLOCATE(1, FunnelQueue, "funnel queue", &queue); + if (result != UDS_SUCCESS) { + return result; + } + + // Initialize the stub entry and put it in the queue, establishing the + // invariant that queue->newest and queue->oldest are never null. 
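+  //
+  // Keeping a permanently allocated stub entry means funnelQueuePut() can
+  // unconditionally link onto the previous newest entry, and getOldest()
+  // can treat "only the stub is present" as the empty case instead of
+  // handling NULL pointers.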
+ queue->stub.next = NULL; + queue->newest = &queue->stub; + queue->oldest = &queue->stub; + + *queuePtr = queue; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeFunnelQueue(FunnelQueue *queue) +{ + FREE(queue); +} + +/**********************************************************************/ +static FunnelQueueEntry *getOldest(FunnelQueue *queue) +{ + /* + * Barrier requirements: We need a read barrier between reading a "next" + * field pointer value and reading anything it points to. There's an + * accompanying barrier in funnelQueuePut between its caller setting up the + * entry and making it visible. + */ + FunnelQueueEntry *oldest = queue->oldest; + FunnelQueueEntry *next = oldest->next; + + if (oldest == &queue->stub) { + // When the oldest entry is the stub and it has no successor, the queue is + // logically empty. + if (next == NULL) { + return NULL; + } + // The stub entry has a successor, so the stub can be dequeued and ignored + // without breaking the queue invariants. + oldest = next; + queue->oldest = oldest; + smp_read_barrier_depends(); + next = oldest->next; + } + + // We have a non-stub candidate to dequeue. If it lacks a successor, we'll + // need to put the stub entry back on the queue first. + if (next == NULL) { + FunnelQueueEntry *newest = queue->newest; + if (oldest != newest) { + // Another thread has already swung queue->newest atomically, but not + // yet assigned previous->next. The queue is really still empty. + return NULL; + } + + // Put the stub entry back on the queue, ensuring a successor will + // eventually be seen. + funnelQueuePut(queue, &queue->stub); + + // Check again for a successor. + next = oldest->next; + if (next == NULL) { + // We lost a race with a producer who swapped queue->newest before we + // did, but who hasn't yet updated previous->next. Try again later. + return NULL; + } + } + return oldest; +} + +/**********************************************************************/ +FunnelQueueEntry *funnelQueuePoll(FunnelQueue *queue) +{ + FunnelQueueEntry *oldest = getOldest(queue); + if (oldest == NULL) { + return oldest; + } + + /* + * Dequeue the oldest entry and return it. Only one consumer thread may call + * this function, so no locking, atomic operations, or fences are needed; + * queue->oldest is owned by the consumer and oldest->next is never used by + * a producer thread after it is swung from NULL to non-NULL. + */ + queue->oldest = oldest->next; + /* + * Make sure the caller sees the proper stored data for this entry. + * + * Since we've already fetched the entry pointer we stored in + * "queue->oldest", this also ensures that on entry to the next call we'll + * properly see the dependent data. + */ + smp_rmb(); + /* + * If "oldest" is a very light-weight work item, we'll be looking + * for the next one very soon, so prefetch it now. + */ + prefetchAddress(queue->oldest, true); + oldest->next = NULL; + return oldest; +} + +/**********************************************************************/ +bool isFunnelQueueEmpty(FunnelQueue *queue) +{ + return getOldest(queue) == NULL; +} + +/**********************************************************************/ +bool isFunnelQueueIdle(FunnelQueue *queue) +{ + /* + * Oldest is not the stub, so there's another entry, though if next is + * NULL we can't retrieve it yet. 
+ */ + if (queue->oldest != &queue->stub) { + return false; + } + + /* + * Oldest is the stub, but newest has been updated by _put(); either + * there's another, retrievable entry in the list, or the list is + * officially empty but in the intermediate state of having an entry + * added. + * + * Whether anything is retrievable depends on whether stub.next has + * been updated and become visible to us, but for idleness we don't + * care. And due to memory ordering in _put(), the update to newest + * would be visible to us at the same time or sooner. + */ + if (queue->newest != &queue->stub) { + return false; + } + + // Otherwise, we're idle. + return true; +} diff --git a/uds/util/funnelQueue.h b/uds/util/funnelQueue.h new file mode 100644 index 0000000..083d00b --- /dev/null +++ b/uds/util/funnelQueue.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/funnelQueue.h#2 $ + */ + +#ifndef FUNNEL_QUEUE_H +#define FUNNEL_QUEUE_H + +#include "atomicDefs.h" +#include "compiler.h" +#include "cpu.h" +#include "typeDefs.h" + +/** + * A FunnelQueue is a simple lock-free (almost) queue that accepts entries + * from multiple threads (multi-producer) and delivers them to a single thread + * (single-consumer). "Funnel" is an attempt to evoke the image of requests + * from more than one producer being "funneled down" to a single consumer. + * + * This is an unsynchronized but thread-safe data structure when used as + * intended. There is no mechanism to ensure that only one thread is consuming + * from the queue, so if that is done mistakenly, it will not be trapped, and + * the resulting behavior is undefined. Clients must not directly access or + * manipulate the internals, which are only exposed for the purpose of + * allowing the very simple enqueue operation to be in-lined. + * + * The implementation requires that a FunnelQueueEntry structure (a link + * pointer) be embedded in the queue entries, and pointers to those structures + * are used exclusively by the queue. No macros are defined to template the + * queue, so the offset of the FunnelQueueEntry in the records placed in the + * queue must all have a fixed offset so the client can derive their structure + * pointer from the entry pointer returned by funnelQueuePoll(). + * + * Callers are wholly responsible for allocating and freeing the entries. + * Entries may be freed as soon as they are returned since this queue is not + * susceptible to the "ABA problem" present in many lock-free data structures. + * The queue is dynamically allocated to ensure cache-line alignment, but no + * other dynamic allocation is used. + * + * The algorithm is not actually 100% lock-free. 
There is a single point in + * funnelQueuePut() at which a pre-empted producer will prevent the consumers + * from seeing items added to the queue by later producers, and only if the + * queue is short enough or the consumer fast enough for it to reach what was + * the end of the queue at the time of the pre-empt. + * + * The consumer function, funnelQueuePoll(), will return NULL when the queue + * is empty. To wait for data to consume, spin (if safe) or combine the queue + * with an EventCount to signal the presence of new entries. + **/ + +/** + * The queue link structure that must be embedded in client entries. + **/ +typedef struct funnelQueueEntry { + // The next (newer) entry in the queue. + struct funnelQueueEntry * volatile next; +} FunnelQueueEntry; + +/** + * The dynamically allocated queue structure, which is aligned to a cache line + * boundary when allocated. This should be consider opaque; it is exposed here + * so funnelQueuePut() can be in-lined. + **/ +typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) funnelQueue { + // The producers' end of the queue--an atomically exchanged pointer that + // will never be NULL. + FunnelQueueEntry * volatile newest; + + // The consumer's end of the queue. Owned by the consumer and never NULL. + FunnelQueueEntry *oldest __attribute__((aligned(CACHE_LINE_BYTES))); + + // A re-usable dummy entry used to provide the non-NULL invariants above. + FunnelQueueEntry stub; +} FunnelQueue; + +/** + * Construct and initialize a new, empty queue. + * + * @param queuePtr a pointer in which to store the queue + * + * @return UDS_SUCCESS or an error code + **/ +int makeFunnelQueue(FunnelQueue **queuePtr) + __attribute__((warn_unused_result)); + +/** + * Free a queue. + * + * This will not free any entries in the queue. The caller must ensure that + * either the queue will be empty or that any entries in the queue will not be + * leaked by dropping the references from queue. + * + * @param queue the queue to free + **/ +void freeFunnelQueue(FunnelQueue *queue); + +/** + * Put an entry on the end of the queue. + * + * The entry pointer must be to the FunnelQueueEntry embedded in the caller's + * data structure. The caller must be able to derive the address of the start + * of their data structure from the pointer that passed in here, so every + * entry in the queue must have the FunnelQueueEntry at the same offset within + * the client's structure. + * + * @param queue the queue on which to place the entry + * @param entry the entry to be added to the queue + **/ +static INLINE void funnelQueuePut(FunnelQueue *queue, FunnelQueueEntry *entry) +{ + /* + * Barrier requirements: All stores relating to the entry ("next" pointer, + * containing data structure fields) must happen before the previous->next + * store making it visible to the consumer. Also, the entry's "next" field + * initialization to NULL must happen before any other producer threads can + * see the entry (the xchg) and try to update the "next" field. + * + * xchg implements a full barrier. + */ + entry->next = NULL; + /* + * The xchg macro in the PPC kernel calls a function that takes a void* + * argument, triggering a warning about dropping the volatile qualifier. 
+ */ +#pragma GCC diagnostic push +#if __GNUC__ >= 5 +#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers" +#endif + FunnelQueueEntry *previous = xchg(&queue->newest, entry); +#pragma GCC diagnostic pop + // Pre-empts between these two statements hide the rest of the queue from + // the consumer, preventing consumption until the following assignment runs. + previous->next = entry; +} + +/** + * Poll a queue, removing the oldest entry if the queue is not empty. This + * function must only be called from a single consumer thread. + * + * @param queue the queue from which to remove an entry + * + * @return the oldest entry in the queue, or NULL if the queue is empty. + **/ +FunnelQueueEntry *funnelQueuePoll(FunnelQueue *queue) + __attribute__((warn_unused_result)); + +/** + * Check whether the funnel queue is empty or not. This function must only be + * called from a single consumer thread, as with funnelQueuePoll. + * + * If the queue is in a transition state with one or more entries being added + * such that the list view is incomplete, it may not be possible to retrieve an + * entry with the funnelQueuePoll() function. In such states this function will + * report an empty indication. + * + * @param queue the queue which to check for entries. + * + * @return true iff queue contains no entry which can be retrieved + **/ +bool isFunnelQueueEmpty(FunnelQueue *queue) + __attribute__((warn_unused_result)); + +/** + * Check whether the funnel queue is idle or not. This function must only be + * called from a single consumer thread, as with funnel_queue_poll. + * + * If the queue has entries available to be retrieved, it is not idle. If the + * queue is in a transition state with one or more entries being added such + * that the list view is incomplete, it may not be possible to retrieve an + * entry with the funnel_queue_poll() function, but the queue will not be + * considered idle. + * + * @param queue the queue which to check for entries. + * + * @return true iff queue contains no entry which can be retrieved nor is + * known to be having an entry added + **/ +bool isFunnelQueueIdle(FunnelQueue *queue) + __attribute__((warn_unused_result)); + +#endif /* FUNNEL_QUEUE_H */ diff --git a/uds/util/radixSort.c b/uds/util/radixSort.c new file mode 100644 index 0000000..cae4f90 --- /dev/null +++ b/uds/util/radixSort.c @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/radixSort.c#2 $ + */ + +/* + * Radix sort is implemented using an American Flag sort, an unstable, + * in-place 8-bit radix exchange sort. + * + * Adapted from the algorithm in the paper by Peter M. McIlroy, Keith Bostic, + * and M. Douglas McIlroy, "Engineering Radix Sort". 
+ * http://www.usenix.org/publications/compsystems/1993/win_mcilroy.pdf + */ + +#include "radixSort.h" + +#include "compiler.h" +#include "memoryAlloc.h" +#include "stringUtils.h" +#include "typeDefs.h" +#include "uds.h" + +enum { + // Piles smaller than this are handled with a simple insertion sort. + INSERTION_SORT_THRESHOLD = 12 +}; + +// Sort keys are pointers to immutable fixed-length arrays of bytes. +typedef const uint8_t * Key; + +/** + * The keys are separated into piles based on the byte in each + * keys at the current offset, so the number of keys with each + * byte must be counted. + **/ +typedef struct { + uint16_t used; // number of non-empty bins + uint16_t first; // index (key byte) of the first non-empty bin + uint16_t last; // index (key byte) of the last non-empty bin + uint32_t size[256]; // size[byte] == # of occurrences of byte +} Histogram; + +/** + * Sub-tasks are manually managed on a stack, both for performance + * and to put a logarithmic bound on the stack space needed. + **/ +typedef struct { + Key *firstKey; // Pointers to first and last keys to sort, inclusive. + Key *lastKey; + uint16_t offset; // The offset into the key at which to continue sorting. + uint16_t length; // The number of bytes remaining in the sort keys. +} Task; + +struct radixSorter { + unsigned int count; + Histogram bins; + Key *pile[256]; + Task *endOfStack; + Task isList[256]; + Task stack[]; +}; + +/** + * Compare a segment of two fixed-length keys starting an offset. + * + * @param key1 the first key + * @param key2 the second key + * @param offset the offset into the keys of the first byte to compare + * @param length the number of bytes remaining in each key + **/ +static INLINE int compare(Key key1, Key key2, uint16_t offset, uint16_t length) +{ + return memcmp(&key1[offset], &key2[offset], length); +} + +/** + * Insert the next unsorted key into an array of sorted keys. + * + * @param task the description of the keys being sorted + * @param next the pointer to the unsorted key to insert into + * the array of sorted key pointers preceding it + **/ +static INLINE void insertKey(const Task task, Key *next) +{ + // Pull the unsorted key out, freeing up the array slot. + Key unsorted = *next; + // Compare the key to the preceding sorted entries, shifting + // down the ones that are larger. + while ((--next >= task.firstKey) + && (compare(unsorted, next[0], task.offset, task.length) < 0)) { + next[1] = next[0]; + } + // Insert the key into the last slot that was cleared, sorting it. + next[1] = unsorted; +} + +/** + * Sort a range of key segments using an insertion sort. This simple sort is + * faster than the 256-way radix sort when the number of keys to sort is + * small. + * + * @param task the description of the keys to sort + **/ +static INLINE void insertionSort(const Task task) +{ + // (firstKey .. firstKey) is trivially sorted. Repeatedly insert the next + // key into the sorted list of keys preceding it, and voila! + Key *next; + for (next = task.firstKey + 1; next <= task.lastKey; next++) { + insertKey(task, next); + } +} + +/** + * Push a sorting task onto the task stack, increasing the stack pointer. 
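+ *
+ * @param stackPointer pointer to the top of the task stack
+ * @param firstKey the first key in the pile of keys to be sorted
+ * @param count the number of keys in the pile
+ * @param offset the offset into the keys at which to continue sorting
+ * @param length the number of bytes remaining in the sort keys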
+ **/ +static INLINE void pushTask(Task **stackPointer, + Key *firstKey, + uint32_t count, + uint16_t offset, + uint16_t length) +{ + Task *task = (*stackPointer)++; + task->firstKey = firstKey; + task->lastKey = &firstKey[count - 1]; + task->offset = offset; + task->length = length; +} + +/**********************************************************************/ +static INLINE void swapKeys(Key *a, Key *b) +{ + Key c = *a; + *a = *b; + *b = c; +} + +/** + * Count the number of times each byte value appears in in the arrays of keys + * to sort at the current offset, keeping track of the number of non-empty + * bins, and the index of the first and last non-empty bin. + * + * @param task the description of the keys to sort + * @param bins the histogram bins receiving the counts + **/ +static INLINE void measureBins(const Task task, Histogram *bins) +{ + // Set bogus values that will will be replaced by min and max, respectively. + bins->first = UINT8_MAX; + bins->last = 0; + + // Subtle invariant: bins->used and bins->size[] are zero because the + // sorting code clears it all out as it goes. Even though this structure is + // re-used, we don't need to pay to zero it before starting a new tally. + + Key *keyPtr; + for (keyPtr = task.firstKey; keyPtr <= task.lastKey; keyPtr++) { + // Increment the count for the byte in the key at the current offset. + uint8_t bin = (*keyPtr)[task.offset]; + uint32_t size = ++bins->size[bin]; + + // Track non-empty bins when the count transitions from zero to one. + if (size == 1) { + bins->used += 1; + if (bin < bins->first) { + bins->first = bin; + } + if (bin > bins->last) { + bins->last = bin; + } + } + } +} + +/** + * Convert the bin sizes to pointers to where each pile goes. + * + * pile[0] = firstKey + bin->size[0], + * pile[1] = pile[0] + bin->size[1], etc. + * + * After the keys are moved to the appropriate pile, we'll need to sort + * each of the piles by the next radix position. A new task is put on the + * stack for each pile containing lots of keys, or a new task is is put on + * the list for each pile containing few keys. + * + * @param stack pointer the top of the stack + * @param endOfStack the end of the stack + * @param list pointer the head of the list + * @param pile array that will be filled pointers to the end of each pile + * @param bins the histogram of the sizes of each pile + * @param firstKey the first key of the stack + * @param offset the next radix position to sort by + * @param length the number of bytes remaining in the sort keys + * + * @return UDS_SUCCESS or an error code + **/ +static INLINE int pushBins(Task **stack, + Task *endOfStack, + Task **list, + Key *pile[], + Histogram *bins, + Key *firstKey, + uint16_t offset, + uint16_t length) +{ + Key *pileStart = firstKey; + int bin; + for (bin = bins->first; ; bin++) { + uint32_t size = bins->size[bin]; + // Skip empty piles. + if (size == 0) { + continue; + } + // There's no need to sort empty keys. 
+ if (length > 0) { + if (size > INSERTION_SORT_THRESHOLD) { + if (*stack >= endOfStack) { + return UDS_BAD_STATE; + } + pushTask(stack, pileStart, size, offset, length); + } else if (size > 1) { + pushTask(list, pileStart, size, offset, length); + } + } + pileStart += size; + pile[bin] = pileStart; + if (--bins->used == 0) { + break; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int makeRadixSorter(unsigned int count, RadixSorter **sorter) +{ + unsigned int stackSize = count / INSERTION_SORT_THRESHOLD; + RadixSorter *radixSorter; + int result = ALLOCATE_EXTENDED(RadixSorter, stackSize, Task, __func__, + &radixSorter); + if (result != UDS_SUCCESS) { + return result; + } + radixSorter->count = count; + radixSorter->endOfStack = radixSorter->stack + stackSize; + *sorter = radixSorter; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeRadixSorter(RadixSorter *sorter) +{ + FREE(sorter); +} + +/**********************************************************************/ +int radixSort(RadixSorter *sorter, + const unsigned char *keys[], + unsigned int count, + unsigned short length) +{ + // All zero-length keys are identical and therefore already sorted. + if ((count == 0) || (length == 0)) { + return UDS_SUCCESS; + } + + // The initial task is to sort the entire length of all the keys. + Task start = { + .firstKey = keys, + .lastKey = &keys[count - 1], + .offset = 0, + .length = length, + }; + + if (count <= INSERTION_SORT_THRESHOLD) { + insertionSort(start); + return UDS_SUCCESS; + } + + if (count > sorter->count) { + return UDS_INVALID_ARGUMENT; + } + + Histogram *bins = &sorter->bins; + Key **pile = sorter->pile; + Task *sp = sorter->stack; + + /* + * Repeatedly consume a sorting task from the stack and process it, pushing + * new sub-tasks onto to the stack for each radix-sorted pile. When all + * tasks and sub-tasks have been processed, the stack will be empty and all + * the keys in the starting task will be fully sorted. + */ + for (*sp = start; sp >= sorter->stack; sp--) { + const Task task = *sp; + measureBins(task, bins); + + // Now that we know how large each bin is, generate pointers for each of + // the piles and push a new task to sort each pile by the next radix byte. + Task *lp = sorter->isList; + int result = pushBins(&sp, sorter->endOfStack, &lp, pile, bins, + task.firstKey, task.offset + 1, task.length - 1); + if (result != UDS_SUCCESS) { + memset(bins, 0, sizeof(*bins)); + return result; + } + // Now bins->used is zero again. + + // Don't bother processing the last pile--when piles 0..N-1 are all in + // place, then pile N must also be in place. + Key *end = task.lastKey - bins->size[bins->last]; + bins->size[bins->last] = 0; + + Key *fence; + for (fence = task.firstKey; fence <= end; ) { + uint8_t bin; + Key key = *fence; + // The radix byte of the key tells us which pile it belongs in. Swap it + // for an unprocessed item just below that pile, and repeat. + while (--pile[bin = key[task.offset]] > fence) { + swapKeys(pile[bin], &key); + } + // The pile reached the fence. Put the key at the bottom of that pile. + // completing it, and advance the fence to the next pile. + *fence = key; + fence += bins->size[bin]; + bins->size[bin] = 0; + } + // Now bins->size[] is all zero again. + + // When the number of keys in a task gets small enough, its faster to use + // an insertion sort than to keep subdividing into tiny piles. 
+ while (--lp >= sorter->isList) { + insertionSort(*lp); + } + } + return UDS_SUCCESS; +} diff --git a/uds/util/radixSort.h b/uds/util/radixSort.h new file mode 100644 index 0000000..55f19ba --- /dev/null +++ b/uds/util/radixSort.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/radixSort.h#1 $ + */ + +#ifndef RADIX_SORT_H +#define RADIX_SORT_H + +/* + * The implementation uses one large object allocated on the heap. This + * large object can be reused as many times as desired. There is no + * further heap usage by the sorting. + */ +typedef struct radixSorter RadixSorter; + +/** + * Reserve the heap storage needed by the radixSort routine. The amount of + * heap space is logarithmically proportional to the number of keys. + * + * @param count The maximum number of keys to be sorted + * @param sorter The RadixSorter object is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int makeRadixSorter(unsigned int count, RadixSorter **sorter) + __attribute__((warn_unused_result)); + +/** + * Free the heap storage needed by the radixSort routine. + * + * @param sorter The RadixSorter object to free + **/ +void freeRadixSorter(RadixSorter *sorter); + +/** + * Sort pointers to fixed-length keys (arrays of bytes) using a radix sort. + * + * The sort implementation is unstable--relative ordering of equal keys is not + * preserved. The implementation does not use any heap allocation. + * + * @param [in] sorter the heap storage used by the sorting + * @param keys the array of key pointers to sort (modified in place) + * @param [in] count the number of keys + * @param [in] length the length of every key, in bytes + * + * @return UDS_SUCCESS or an error code + **/ +int radixSort(RadixSorter *sorter, + const unsigned char *keys[], + unsigned int count, + unsigned short length) + __attribute__((warn_unused_result)); + +#endif /* RADIX_SORT_H */ diff --git a/uds/volume.c b/uds/volume.c new file mode 100644 index 0000000..4f320c5 --- /dev/null +++ b/uds/volume.c @@ -0,0 +1,1383 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/volume.c#23 $ + */ + +#include "volume.h" + +#include "cacheCounters.h" +#include "chapterIndex.h" +#include "compiler.h" +#include "errors.h" +#include "geometry.h" +#include "hashUtils.h" +#include "indexConfig.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "recordPage.h" +#include "request.h" +#include "sparseCache.h" +#include "stringUtils.h" +#include "threads.h" + +enum { + MAX_BAD_CHAPTERS = 100, // max number of contiguous bad chapters + DEFAULT_VOLUME_READ_THREADS = 2, // Default number of reader threads + MAX_VOLUME_READ_THREADS = 16, // Maximum number of reader threads +}; + +/**********************************************************************/ +static unsigned int getReadThreads(const struct uds_parameters *userParams) +{ + unsigned int readThreads = (userParams == NULL + ? DEFAULT_VOLUME_READ_THREADS + : userParams->read_threads); + if (readThreads < 1) { + readThreads = 1; + } + if (readThreads > MAX_VOLUME_READ_THREADS) { + readThreads = MAX_VOLUME_READ_THREADS; + } + return readThreads; +} + +/**********************************************************************/ +static INLINE unsigned int mapToPageNumber(Geometry *geometry, + unsigned int physicalPage) +{ + return ((physicalPage - 1) % geometry->pagesPerChapter); +} + +/**********************************************************************/ +static INLINE unsigned int mapToChapterNumber(Geometry *geometry, + unsigned int physicalPage) +{ + return ((physicalPage - 1) / geometry->pagesPerChapter); +} + +/**********************************************************************/ +static INLINE bool isRecordPage(Geometry *geometry, unsigned int physicalPage) +{ + return (((physicalPage - 1) % geometry->pagesPerChapter) + >= geometry->indexPagesPerChapter); +} + +/**********************************************************************/ +static INLINE unsigned int getZoneNumber(Request *request) +{ + return (request == NULL) ? 0 : request->zoneNumber; +} + +/**********************************************************************/ +int mapToPhysicalPage(const Geometry *geometry, int chapter, int page) +{ + // Page zero is the header page, so the first index page in the + // first chapter is physical page one. + return (1 + (geometry->pagesPerChapter * chapter) + page); +} + +/**********************************************************************/ +static void waitForReadQueueNotFull(Volume *volume, Request *request) +{ + unsigned int zoneNumber = getZoneNumber(request); + InvalidateCounter invalidateCounter = getInvalidateCounter(volume->pageCache, + zoneNumber); + if (searchPending(invalidateCounter)) { + // Increment the invalidate counter to avoid deadlock where the reader + // threads cannot make progress because they are waiting on the counter + // and the index thread cannot because the read queue is full. + endPendingSearch(volume->pageCache, zoneNumber); + } + + while (readQueueIsFull(volume->pageCache)) { + logDebug("Waiting until read queue not full"); + signalCond(&volume->readThreadsCond); + waitCond(&volume->readThreadsReadDoneCond, &volume->readThreadsMutex); + } + + if (searchPending(invalidateCounter)) { + // Increment again so we get back to an odd value. 
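+    // (The per-zone invalidate counter is odd while a search is pending and
+    // even otherwise; this pairs with the endPendingSearch() call above.)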
+ beginPendingSearch(volume->pageCache, pageBeingSearched(invalidateCounter), + zoneNumber); + } +} + +/**********************************************************************/ +int enqueuePageRead(Volume *volume, Request *request, int physicalPage) +{ + // Don't allow new requests if we are shutting down, but make sure + // to process any requests that are still in the pipeline. + if ((volume->readerState & READER_STATE_EXIT) != 0) { + logInfo("failed to queue read while shutting down"); + return UDS_SHUTTINGDOWN; + } + + // Mark the page as queued in the volume cache, for chapter invalidation to + // be able to cancel a read. + // If we are unable to do this because the queues are full, flush them first + int result; + while ((result = enqueueRead(volume->pageCache, request, physicalPage)) + == UDS_SUCCESS) { + logDebug("Read queues full, waiting for reads to finish"); + waitForReadQueueNotFull(volume, request); + } + + if (result == UDS_QUEUED) { + /* signal a read thread */ + signalCond(&volume->readThreadsCond); + } + + return result; +} + +/**********************************************************************/ +static INLINE void waitToReserveReadQueueEntry(Volume *volume, + unsigned int *queuePos, + Request **requestList, + unsigned int *physicalPage, + bool *invalid) +{ + while (((volume->readerState & READER_STATE_EXIT) == 0) + && (((volume->readerState & READER_STATE_STOP) != 0) + || !reserveReadQueueEntry(volume->pageCache, queuePos, + requestList, physicalPage, invalid))) { + waitCond(&volume->readThreadsCond, &volume->readThreadsMutex); + } +} + +/**********************************************************************/ +static int initChapterIndexPage(const Volume *volume, + byte *indexPage, + unsigned int chapter, + unsigned int indexPageNumber, + DeltaIndexPage *chapterIndexPage) +{ + Geometry *geometry = volume->geometry; + + int result = initializeChapterIndexPage(chapterIndexPage, geometry, + indexPage, volume->nonce); + if (volume->lookupMode == LOOKUP_FOR_REBUILD) { + return result; + } + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "Reading chapter index page for chapter %u" + " page %u", + chapter, indexPageNumber); + } + + IndexPageBounds bounds; + result = getListNumberBounds(volume->indexPageMap, chapter, + indexPageNumber, &bounds); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t ciVirtual = chapterIndexPage->virtualChapterNumber; + unsigned int ciChapter = mapToPhysicalChapter(geometry, ciVirtual); + if ((chapter == ciChapter) + && (bounds.lowestList == chapterIndexPage->lowestListNumber) + && (bounds.highestList == chapterIndexPage->highestListNumber)) { + return UDS_SUCCESS; + } + + logWarning("Index page map updated to %llu", + getLastUpdate(volume->indexPageMap)); + logWarning("Page map expects that chapter %u page %u has range %u to %u, " + "but chapter index page has chapter %" PRIu64 + " with range %u to %u", + chapter, indexPageNumber, bounds.lowestList, bounds.highestList, + ciVirtual, chapterIndexPage->lowestListNumber, + chapterIndexPage->highestListNumber); + return ASSERT_WITH_ERROR_CODE(false, + UDS_CORRUPT_DATA, + "index page map mismatch with chapter index"); +} + +/**********************************************************************/ +static int initializeIndexPage(const Volume *volume, + unsigned int physicalPage, + CachedPage *page) +{ + unsigned int chapter = mapToChapterNumber(volume->geometry, physicalPage); + unsigned int indexPageNumber = mapToPageNumber(volume->geometry, + physicalPage); + int 
result = initChapterIndexPage(volume, getPageData(&page->cp_pageData), + chapter, indexPageNumber, + &page->cp_indexPage); + return result; +} + +/**********************************************************************/ +static void readThreadFunction(void *arg) +{ + Volume *volume = arg; + unsigned int queuePos; + Request *requestList; + unsigned int physicalPage; + bool invalid = false; + + logDebug("reader starting"); + lockMutex(&volume->readThreadsMutex); + while (true) { + waitToReserveReadQueueEntry(volume, &queuePos, &requestList, &physicalPage, + &invalid); + if ((volume->readerState & READER_STATE_EXIT) != 0) { + break; + } + + volume->busyReaderThreads++; + + bool recordPage = isRecordPage(volume->geometry, physicalPage); + + CachedPage *page = NULL; + int result = UDS_SUCCESS; + if (!invalid) { + // Find a place to put the read queue page we reserved above. + result = selectVictimInCache(volume->pageCache, &page); + if (result == UDS_SUCCESS) { + unlockMutex(&volume->readThreadsMutex); + result = readVolumePage(&volume->volumeStore, physicalPage, + &page->cp_pageData); + if (result != UDS_SUCCESS) { + logWarning("Error reading page %u from volume", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + } + lockMutex(&volume->readThreadsMutex); + } else { + logWarning("Error selecting cache victim for page read"); + } + + if (result == UDS_SUCCESS) { + if (!volume->pageCache->readQueue[queuePos].invalid) { + if (!recordPage) { + result = initializeIndexPage(volume, physicalPage, page); + if (result != UDS_SUCCESS) { + logWarning("Error initializing chapter index page"); + cancelPageInCache(volume->pageCache, physicalPage, page); + } + } + + if (result == UDS_SUCCESS) { + result = putPageInCache(volume->pageCache, physicalPage, page); + if (result != UDS_SUCCESS) { + logWarning("Error putting page %u in cache", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + } + } + } else { + logWarning("Page %u invalidated after read", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + invalid = true; + } + } + } else { + logDebug("Requeuing requests for invalid page"); + } + + if (invalid) { + result = UDS_SUCCESS; + page = NULL; + } + + while (requestList != NULL) { + Request *request = requestList; + requestList = request->nextRequest; + + /* + * If we've read in a record page, we're going to do an immediate search, + * in an attempt to speed up processing when we requeue the request, so + * that it doesn't have to go back into the getRecordFromZone code again. + * However, if we've just read in an index page, we don't want to search. + * We want the request to be processed again and getRecordFromZone to be + * run. We have added new fields in request to allow the index code to + * know whether it can stop processing before getRecordFromZone is called + * again. 
+ */ + if ((result == UDS_SUCCESS) && (page != NULL) && recordPage) { + if (searchRecordPage(getPageData(&page->cp_pageData), + &request->chunkName, volume->geometry, + &request->oldMetadata)) { + request->slLocation = LOC_IN_DENSE; + } else { + request->slLocation = LOC_UNAVAILABLE; + } + request->slLocationKnown = true; + } + + // reflect any read failures in the request status + request->status = result; + restartRequest(request); + } + + releaseReadQueueEntry(volume->pageCache, queuePos); + + volume->busyReaderThreads--; + broadcastCond(&volume->readThreadsReadDoneCond); + } + unlockMutex(&volume->readThreadsMutex); + logDebug("reader done"); +} + +/**********************************************************************/ +static int readPageLocked(Volume *volume, + Request *request, + unsigned int physicalPage, + bool syncRead, + CachedPage **pagePtr) +{ + syncRead |= ((volume->lookupMode == LOOKUP_FOR_REBUILD) + || (request == NULL) + || (request->session == NULL)); + + int result = UDS_SUCCESS; + + CachedPage *page = NULL; + if (syncRead) { + // Find a place to put the page. + result = selectVictimInCache(volume->pageCache, &page); + if (result != UDS_SUCCESS) { + logWarning("Error selecting cache victim for page read"); + return result; + } + result = readVolumePage(&volume->volumeStore, physicalPage, + &page->cp_pageData); + if (result != UDS_SUCCESS) { + logWarning("Error reading page %u from volume", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + return result; + } + if (!isRecordPage(volume->geometry, physicalPage)) { + result = initializeIndexPage(volume, physicalPage, page); + if (result != UDS_SUCCESS) { + if (volume->lookupMode != LOOKUP_FOR_REBUILD) { + logWarning("Corrupt index page %u", physicalPage); + } + cancelPageInCache(volume->pageCache, physicalPage, page); + return result; + } + } + result = putPageInCache(volume->pageCache, physicalPage, page); + if (result != UDS_SUCCESS) { + logWarning("Error putting page %u in cache", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + return result; + } + } else { + result = enqueuePageRead(volume, request, physicalPage); + if (result != UDS_SUCCESS) { + return result; + } + } + + *pagePtr = page; + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getPageLocked(Volume *volume, + Request *request, + unsigned int physicalPage, + CacheProbeType probeType, + CachedPage **pagePtr) +{ + CachedPage *page = NULL; + int result = getPageFromCache(volume->pageCache, physicalPage, probeType, + &page); + if (result != UDS_SUCCESS) { + return result; + } + if (page == NULL) { + result = readPageLocked(volume, request, physicalPage, true, &page); + if (result != UDS_SUCCESS) { + return result; + } + } else if (getZoneNumber(request) == 0) { + // Only 1 zone is responsible for updating LRU + makePageMostRecent(volume->pageCache, page); + } + + *pagePtr = page; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getPageProtected(Volume *volume, + Request *request, + unsigned int physicalPage, + CacheProbeType probeType, + CachedPage **pagePtr) +{ + CachedPage *page = NULL; + int result = getPageFromCache(volume->pageCache, physicalPage, + probeType | CACHE_PROBE_IGNORE_FAILURE, + &page); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int zoneNumber = getZoneNumber(request); + // If we didn't find a page we need to enqueue a read for it, in which + // case we need 
to grab the mutex. + if (page == NULL) { + endPendingSearch(volume->pageCache, zoneNumber); + lockMutex(&volume->readThreadsMutex); + + /* + * Do the lookup again while holding the read mutex (no longer the fast + * case so this should be ok to repeat). We need to do this because an + * page may have been added to the page map by the reader thread between + * the time searched above and the time we went to actually try to enqueue + * it below. This could result in us enqueuing another read for an page + * which is already in the cache, which would mean we end up with two + * entries in the cache for the same page. + */ + result + = getPageFromCache(volume->pageCache, physicalPage, probeType, &page); + if (result != UDS_SUCCESS) { + /* + * In non-success cases (anything not UDS_SUCCESS, meaning both + * UDS_QUEUED and "real" errors), the caller doesn't get a + * handle on a cache page, so it can't continue the search, and + * we don't need to prevent other threads from messing with the + * cache. + * + * However, we do need to set the "search pending" flag because + * the callers expect it to always be set on return, even if + * they can't actually do the search. + * + * Doing the calls in this order ought to be faster, since we + * let other threads have the reader thread mutex (which can + * require a syscall) ASAP, and set the "search pending" state + * that can block the reader thread as the last thing. + */ + unlockMutex(&volume->readThreadsMutex); + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + return result; + } + + // If we found the page now, we can release the mutex and proceed + // as if this were the fast case. + if (page != NULL) { + /* + * If we found a page (*pagePtr != NULL and return + * UDS_SUCCESS), then we're telling the caller where to look for + * the cache page, and need to switch to "reader thread + * unlocked" and "search pending" state in careful order so no + * other thread can mess with the data before our caller gets to + * look at it. + */ + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + unlockMutex(&volume->readThreadsMutex); + } + } + + if (page == NULL) { + result = readPageLocked(volume, request, physicalPage, false, &page); + if (result != UDS_SUCCESS) { + /* + * This code path is used frequently in the UDS_QUEUED case, so + * the performance gain from unlocking first, while "search + * pending" mode is off, turns out to be significant in some + * cases. + */ + unlockMutex(&volume->readThreadsMutex); + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + return result; + } + + // See above re: ordering requirement. + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + unlockMutex(&volume->readThreadsMutex); + } else { + if (getZoneNumber(request) == 0 ) { + // Only 1 zone is responsible for updating LRU + makePageMostRecent(volume->pageCache, page); + } + } + + *pagePtr = page; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getPage(Volume *volume, + unsigned int chapter, + unsigned int pageNumber, + CacheProbeType probeType, + byte **dataPtr, + DeltaIndexPage **indexPagePtr) +{ + unsigned int physicalPage + = mapToPhysicalPage(volume->geometry, chapter, pageNumber); + + lockMutex(&volume->readThreadsMutex); + CachedPage *page = NULL; + int result = getPageLocked(volume, NULL, physicalPage, probeType, &page); + unlockMutex(&volume->readThreadsMutex); + + if (dataPtr != NULL) { + *dataPtr = (page != NULL) ? 
getPageData(&page->cp_pageData) : NULL; + } + if (indexPagePtr != NULL) { + *indexPagePtr = (page != NULL) ? &page->cp_indexPage : NULL; + } + return result; +} + +/** + * Search for a chunk name in a cached index page or chapter index, returning + * the record page number from a chapter index match. + * + * @param volume the volume containing the index page to search + * @param request the request originating the search (may be NULL for + * a direct query from volume replay) + * @param name the name of the block or chunk + * @param chapter the chapter to search + * @param indexPageNumber the index page number of the page to search + * @param recordPageNumber pointer to return the chapter record page number + * (value will be NO_CHAPTER_INDEX_ENTRY if the name + * was not found) + * + * @return UDS_SUCCESS or an error code + **/ +static int searchCachedIndexPage(Volume *volume, + Request *request, + const UdsChunkName *name, + unsigned int chapter, + unsigned int indexPageNumber, + int *recordPageNumber) +{ + unsigned int zoneNumber = getZoneNumber(request); + unsigned int physicalPage + = mapToPhysicalPage(volume->geometry, chapter, indexPageNumber); + + /* + * Make sure the invalidate counter is updated before we try and read from + * the page map. This prevents this thread from reading a page in the + * page map which has already been marked for invalidation by the reader + * thread, before the reader thread has noticed that the invalidateCounter + * has been incremented. + */ + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + + CachedPage *page = NULL; + int result = getPageProtected(volume, request, physicalPage, + cacheProbeType(request, true), &page); + if (result != UDS_SUCCESS) { + endPendingSearch(volume->pageCache, zoneNumber); + return result; + } + + result + = ASSERT_LOG_ONLY(searchPending(getInvalidateCounter(volume->pageCache, + zoneNumber)), + "Search is pending for zone %u", zoneNumber); + if (result != UDS_SUCCESS) { + return result; + } + + result = searchChapterIndexPage(&page->cp_indexPage, volume->geometry, name, + recordPageNumber); + endPendingSearch(volume->pageCache, zoneNumber); + return result; +} + +/**********************************************************************/ +int searchCachedRecordPage(Volume *volume, + Request *request, + const UdsChunkName *name, + unsigned int chapter, + int recordPageNumber, + UdsChunkData *duplicate, + bool *found) +{ + *found = false; + + if (recordPageNumber == NO_CHAPTER_INDEX_ENTRY) { + // No record for that name can exist in the chapter. + return UDS_SUCCESS; + } + + Geometry *geometry = volume->geometry; + int result = ASSERT(((recordPageNumber >= 0) + && ((unsigned int) recordPageNumber + < geometry->recordPagesPerChapter)), + "0 <= %d <= %u", + recordPageNumber, geometry->recordPagesPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int pageNumber = geometry->indexPagesPerChapter + recordPageNumber; + + unsigned int zoneNumber = getZoneNumber(request); + int physicalPage + = mapToPhysicalPage(volume->geometry, chapter, pageNumber); + + /* + * Make sure the invalidate counter is updated before we try and read from + * the page map. This prevents this thread from reading a page in the page + * map which has already been marked for invalidation by the reader thread, + * before the reader thread has noticed that the invalidateCounter has been + * incremented. 
+ */ + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + + CachedPage *recordPage; + result = getPageProtected(volume, request, physicalPage, + cacheProbeType(request, false), &recordPage); + if (result != UDS_SUCCESS) { + endPendingSearch(volume->pageCache, zoneNumber); + return result; + } + + if (searchRecordPage(getPageData(&recordPage->cp_pageData), name, geometry, + duplicate)) { + *found = true; + } + endPendingSearch(volume->pageCache, zoneNumber); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int readChapterIndexFromVolume(const Volume *volume, + uint64_t virtualChapter, + struct volume_page volumePages[], + DeltaIndexPage indexPages[]) +{ + const Geometry *geometry = volume->geometry; + unsigned int physicalChapter = mapToPhysicalChapter(geometry, + virtualChapter); + int physicalPage = mapToPhysicalPage(geometry, physicalChapter, 0); + prefetchVolumePages(&volume->volumeStore, physicalPage, + geometry->indexPagesPerChapter); + + unsigned int i; + struct volume_page volumePage; + int result = initializeVolumePage(geometry, &volumePage); + for (i = 0; i < geometry->indexPagesPerChapter; i++) { + int result = readVolumePage(&volume->volumeStore, physicalPage + i, + &volumePages[i]); + if (result != UDS_SUCCESS) { + break; + } + byte *indexPage = getPageData(&volumePages[i]); + result = initChapterIndexPage(volume, indexPage, physicalChapter, i, + &indexPages[i]); + if (result != UDS_SUCCESS) { + break; + } + } + destroyVolumePage(&volumePage); + return result; +} + +/**********************************************************************/ +int searchVolumePageCache(Volume *volume, + Request *request, + const UdsChunkName *name, + uint64_t virtualChapter, + UdsChunkData *metadata, + bool *found) +{ + unsigned int physicalChapter + = mapToPhysicalChapter(volume->geometry, virtualChapter); + unsigned int indexPageNumber; + int result = findIndexPageNumber(volume->indexPageMap, name, physicalChapter, + &indexPageNumber); + if (result != UDS_SUCCESS) { + return result; + } + + int recordPageNumber; + result = searchCachedIndexPage(volume, request, name, physicalChapter, + indexPageNumber, &recordPageNumber); + if (result == UDS_SUCCESS) { + result = searchCachedRecordPage(volume, request, name, physicalChapter, + recordPageNumber, metadata, found); + } + + return result; +} + +/**********************************************************************/ +int forgetChapter(Volume *volume, + uint64_t virtualChapter, + InvalidationReason reason) +{ + logDebug("forgetting chapter %llu", virtualChapter); + unsigned int physicalChapter + = mapToPhysicalChapter(volume->geometry, virtualChapter); + lockMutex(&volume->readThreadsMutex); + int result + = invalidatePageCacheForChapter(volume->pageCache, physicalChapter, + volume->geometry->pagesPerChapter, + reason); + unlockMutex(&volume->readThreadsMutex); + return result; +} + +/** + * Donate index page data to the page cache for an index page that was just + * written to the volume. The caller must already hold the reader thread + * mutex. 
+ * + * @param volume the volume + * @param physicalChapter the physical chapter number of the index page + * @param indexPageNumber the chapter page number of the index page + * @param scratchPage the index page data + **/ +static int donateIndexPageLocked(Volume *volume, + unsigned int physicalChapter, + unsigned int indexPageNumber, + struct volume_page *scratchPage) +{ + unsigned int physicalPage + = mapToPhysicalPage(volume->geometry, physicalChapter, indexPageNumber); + + // Find a place to put the page. + CachedPage *page = NULL; + int result = selectVictimInCache(volume->pageCache, &page); + if (result != UDS_SUCCESS) { + return result; + } + + // Exchange the scratch page with the cache page + swapVolumePages(&page->cp_pageData, scratchPage); + + result = initChapterIndexPage(volume, getPageData(&page->cp_pageData), + physicalChapter, indexPageNumber, + &page->cp_indexPage); + if (result != UDS_SUCCESS) { + logWarning("Error initialize chapter index page"); + cancelPageInCache(volume->pageCache, physicalPage, page); + return result; + } + + result = putPageInCache(volume->pageCache, physicalPage, page); + if (result != UDS_SUCCESS) { + logWarning("Error putting page %u in cache", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + return result; + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int writeIndexPages(Volume *volume, + int physicalPage, + OpenChapterIndex *chapterIndex, + byte **pages) +{ + Geometry *geometry = volume->geometry; + unsigned int physicalChapterNumber + = mapToPhysicalChapter(geometry, chapterIndex->virtualChapterNumber); + unsigned int deltaListNumber = 0; + + unsigned int indexPageNumber; + for (indexPageNumber = 0; + indexPageNumber < geometry->indexPagesPerChapter; + indexPageNumber++) { + int result = prepareToWriteVolumePage(&volume->volumeStore, + physicalPage + indexPageNumber, + &volume->scratchPage); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, "failed to prepare index page"); + } + + // Pack as many delta lists into the index page as will fit. + unsigned int listsPacked; + bool lastPage = ((indexPageNumber + 1) == geometry->indexPagesPerChapter); + result = packOpenChapterIndexPage(chapterIndex, + getPageData(&volume->scratchPage), + deltaListNumber, lastPage, &listsPacked); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, "failed to pack index page"); + } + + result = writeVolumePage(&volume->volumeStore, + physicalPage + indexPageNumber, + &volume->scratchPage); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write chapter index page"); + } + + if (pages != NULL) { + memcpy(pages[indexPageNumber], getPageData(&volume->scratchPage), + geometry->bytesPerPage); + } + + // Tell the index page map the list number of the last delta list that was + // packed into the index page. + if (listsPacked == 0) { + logDebug("no delta lists packed on chapter %u page %u", + physicalChapterNumber, indexPageNumber); + } else { + deltaListNumber += listsPacked; + } + result = updateIndexPageMap(volume->indexPageMap, + chapterIndex->virtualChapterNumber, + physicalChapterNumber, + indexPageNumber, deltaListNumber - 1); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "failed to update index page map"); + } + + // Donate the page data for the index page to the page cache. 
+ lockMutex(&volume->readThreadsMutex); + result = donateIndexPageLocked(volume, physicalChapterNumber, + indexPageNumber, &volume->scratchPage); + unlockMutex(&volume->readThreadsMutex); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int writeRecordPages(Volume *volume, + int physicalPage, + const UdsChunkRecord records[], + byte **pages) +{ + Geometry *geometry = volume->geometry; + // Skip over the index pages, which come before the record pages + physicalPage += geometry->indexPagesPerChapter; + // The record array from the open chapter is 1-based. + const UdsChunkRecord *nextRecord = &records[1]; + + unsigned int recordPageNumber; + for (recordPageNumber = 0; + recordPageNumber < geometry->recordPagesPerChapter; + recordPageNumber++) { + int result = prepareToWriteVolumePage(&volume->volumeStore, + physicalPage + recordPageNumber, + &volume->scratchPage); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to prepare record page"); + } + + // Sort the next page of records and copy them to the record page as a + // binary tree stored in heap order. + result = encodeRecordPage(volume, nextRecord, + getPageData(&volume->scratchPage)); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to encode record page %u", + recordPageNumber); + } + nextRecord += geometry->recordsPerPage; + + result = writeVolumePage(&volume->volumeStore, + physicalPage + recordPageNumber, + &volume->scratchPage); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write chapter record page"); + } + + if (pages != NULL) { + memcpy(pages[recordPageNumber], getPageData(&volume->scratchPage), + geometry->bytesPerPage); + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int writeChapter(Volume *volume, + OpenChapterIndex *chapterIndex, + const UdsChunkRecord records[]) +{ + // Determine the position of the virtual chapter in the volume file. + Geometry *geometry = volume->geometry; + unsigned int physicalChapterNumber + = mapToPhysicalChapter(geometry, chapterIndex->virtualChapterNumber); + int physicalPage = mapToPhysicalPage(geometry, physicalChapterNumber, 0); + + // Pack and write the delta chapter index pages to the volume. + int result = writeIndexPages(volume, physicalPage, chapterIndex, NULL); + if (result != UDS_SUCCESS) { + return result; + } + // Sort and write the record pages to the volume. + result = writeRecordPages(volume, physicalPage, records, NULL); + if (result != UDS_SUCCESS) { + return result; + } + releaseVolumePage(&volume->scratchPage); + // Flush the data to permanent storage. 
+ return syncVolumeStore(&volume->volumeStore); +} + +/**********************************************************************/ +size_t getCacheSize(Volume *volume) +{ + size_t size = getPageCacheSize(volume->pageCache); + if (isSparse(volume->geometry)) { + size += getSparseCacheMemorySize(volume->sparseCache); + } + return size; +} + +/**********************************************************************/ +static int probeChapter(Volume *volume, + unsigned int chapterNumber, + uint64_t *virtualChapterNumber) +{ + const Geometry *geometry = volume->geometry; + unsigned int expectedListNumber = 0; + uint64_t lastVCN = UINT64_MAX; + + prefetchVolumePages(&volume->volumeStore, + mapToPhysicalPage(geometry, chapterNumber, 0), + geometry->indexPagesPerChapter); + + unsigned int i; + for (i = 0; i < geometry->indexPagesPerChapter; ++i) { + DeltaIndexPage *page; + int result = getPage(volume, chapterNumber, i, CACHE_PROBE_INDEX_FIRST, + NULL, &page); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t vcn = page->virtualChapterNumber; + if (lastVCN == UINT64_MAX) { + lastVCN = vcn; + } else if (vcn != lastVCN) { + logError("inconsistent chapter %u index page %u: expected vcn %" + PRIu64 ", got vcn %llu", + chapterNumber, i, lastVCN, vcn); + return UDS_CORRUPT_COMPONENT; + } + + if (expectedListNumber != page->lowestListNumber) { + logError("inconsistent chapter %u index page %u: expected list number %u" + ", got list number %u", + chapterNumber, i, expectedListNumber, page->lowestListNumber); + return UDS_CORRUPT_COMPONENT; + } + expectedListNumber = page->highestListNumber + 1; + + result = validateChapterIndexPage(page, geometry); + if (result != UDS_SUCCESS) { + return result; + } + } + + if (lastVCN == UINT64_MAX) { + logError("no chapter %u virtual chapter number determined", chapterNumber); + return UDS_CORRUPT_COMPONENT; + } + if (chapterNumber != lastVCN % geometry->chaptersPerVolume) { + logError("chapter %u vcn %llu is out of phase (%u)", + chapterNumber, lastVCN, geometry->chaptersPerVolume); + return UDS_CORRUPT_COMPONENT; + } + *virtualChapterNumber = lastVCN; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int probeWrapper(void *aux, + unsigned int chapterNumber, + uint64_t *virtualChapterNumber) +{ + Volume *volume = aux; + int result = probeChapter(volume, chapterNumber, virtualChapterNumber); + if ((result == UDS_CORRUPT_COMPONENT) || (result == UDS_CORRUPT_DATA)) { + *virtualChapterNumber = UINT64_MAX; + return UDS_SUCCESS; + } + return result; +} + +/**********************************************************************/ +static int findRealEndOfVolume(Volume *volume, + unsigned int limit, + unsigned int *limitPtr) +{ + /* + * Start checking from the end of the volume. As long as we hit corrupt + * data, start skipping larger and larger amounts until we find real data. + * If we find real data, reduce the span and try again until we find + * the exact boundary. + */ + unsigned int span = 1; + unsigned int tries = 0; + while (limit > 0) { + unsigned int chapter = (span > limit) ? 
0 : limit - span; + uint64_t vcn = 0; + int result = probeChapter(volume, chapter, &vcn); + if (result == UDS_SUCCESS) { + if (span == 1) { + break; + } + span /= 2; + tries = 0; + } else if (result == UDS_CORRUPT_COMPONENT) { + limit = chapter; + if (++tries > 1) { + span *= 2; + } + } else { + return logErrorWithStringError(result, "cannot determine end of volume"); + } + } + + if (limitPtr != NULL) { + *limitPtr = limit; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int findVolumeChapterBoundaries(Volume *volume, + uint64_t *lowestVCN, + uint64_t *highestVCN, + bool *isEmpty) +{ + unsigned int chapterLimit = volume->geometry->chaptersPerVolume; + + int result = findRealEndOfVolume(volume, chapterLimit, &chapterLimit); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot find end of volume"); + } + + if (chapterLimit == 0) { + *lowestVCN = 0; + *highestVCN = 0; + *isEmpty = true; + return UDS_SUCCESS; + } + + *isEmpty = false; + return findVolumeChapterBoundariesImpl(chapterLimit, MAX_BAD_CHAPTERS, + lowestVCN, highestVCN, probeWrapper, + volume); +} + +/**********************************************************************/ +int findVolumeChapterBoundariesImpl(unsigned int chapterLimit, + unsigned int maxBadChapters, + uint64_t *lowestVCN, + uint64_t *highestVCN, + int (*probeFunc)(void *aux, + unsigned int chapter, + uint64_t *vcn), + void *aux) +{ + if (chapterLimit == 0) { + *lowestVCN = 0; + *highestVCN = 0; + return UDS_SUCCESS; + } + + /* + * This method assumes there is at most one run of contiguous bad chapters + * caused by unflushed writes. Either the bad spot is at the beginning and + * end, or somewhere in the middle. Wherever it is, the highest and lowest + * VCNs are adjacent to it. Otherwise the volume is cleanly saved and + * somewhere in the middle of it the highest VCN immediately preceeds the + * lowest one. + */ + + uint64_t firstVCN = UINT64_MAX; + + // doesn't matter if this results in a bad spot (UINT64_MAX) + int result = (*probeFunc)(aux, 0, &firstVCN); + if (result != UDS_SUCCESS) { + return UDS_SUCCESS; + } + + /* + * Binary search for end of the discontinuity in the monotonically + * increasing virtual chapter numbers; bad spots are treated as a span of + * UINT64_MAX values. In effect we're searching for the index of the + * smallest value less than firstVCN. In the case we go off the end it means + * that chapter 0 has the lowest vcn. + */ + + unsigned int leftChapter = 0; + unsigned int rightChapter = chapterLimit; + + while (leftChapter < rightChapter) { + unsigned int chapter = (leftChapter + rightChapter) / 2; + uint64_t probeVCN; + + result = (*probeFunc)(aux, chapter, &probeVCN); + if (result != UDS_SUCCESS) { + return result; + } + if (firstVCN <= probeVCN) { + leftChapter = chapter + 1; + } else { + rightChapter = chapter; + } + } + + uint64_t lowest = UINT64_MAX; + uint64_t highest = UINT64_MAX; + + result = ASSERT(leftChapter == rightChapter, "leftChapter == rightChapter"); + if (result != UDS_SUCCESS) { + return result; + } + + leftChapter %= chapterLimit; // in case we're at the end + + // At this point, leftChapter is the chapter with the lowest virtual chapter + // number. 
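+  //
+  // (Editor's illustration with made-up numbers: if chapterLimit is 8 and
+  // physical chapters 0..7 hold VCNs 10 11 4 5 6 7 8 9, then firstVCN is 10,
+  // the binary search above narrows to leftChapter == 2 (VCN 4, the lowest),
+  // and the backward scan below stops at chapter 1 (VCN 11, the highest).)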
+ + result = (*probeFunc)(aux, leftChapter, &lowest); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT((lowest != UINT64_MAX), "invalid lowest chapter"); + if (result != UDS_SUCCESS) { + return result; + } + + // We now circularly scan backwards, moving over any bad chapters until we + // find the chapter with the highest vcn (the first good chapter we + // encounter). + + unsigned int badChapters = 0; + + for (;;) { + rightChapter = (rightChapter + chapterLimit - 1) % chapterLimit; + result = (*probeFunc)(aux, rightChapter, &highest); + if (result != UDS_SUCCESS) { + return result; + } + if (highest != UINT64_MAX) { + break; + } + if (++badChapters >= maxBadChapters) { + logError("too many bad chapters in volume: %u", badChapters); + return UDS_CORRUPT_COMPONENT; + } + } + + *lowestVCN = lowest; + *highestVCN = highest; + return UDS_SUCCESS; +} + +/** + * Allocate a volume. + * + * @param config The configuration to use + * @param layout The index layout + * @param readQueueMaxSize The maximum size of the read queue + * @param zoneCount The number of zones to use + * @param newVolume A pointer to hold the new volume + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int allocateVolume(const Configuration *config, + IndexLayout *layout, + unsigned int readQueueMaxSize, + unsigned int zoneCount, + Volume **newVolume) +{ + Volume *volume; + int result = ALLOCATE(1, Volume, "volume", &volume); + if (result != UDS_SUCCESS) { + return result; + } + volume->nonce = getVolumeNonce(layout); + // It is safe to call freeVolume now to clean up and close the volume + + result = copyGeometry(config->geometry, &volume->geometry); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return logWarningWithStringError(result, + "failed to allocate geometry: error"); + } + + // Need a buffer for each entry in the page cache + unsigned int reservedBuffers + = config->cacheChapters * config->geometry->recordPagesPerChapter; + // And a buffer for the chapter writer + reservedBuffers += 1; + // And a buffer for each entry in the sparse cache + if (isSparse(volume->geometry)) { + reservedBuffers + += config->cacheChapters * config->geometry->indexPagesPerChapter; + } + result = openVolumeStore(&volume->volumeStore, layout, reservedBuffers, + config->geometry->bytesPerPage); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + result = initializeVolumePage(config->geometry, &volume->scratchPage); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + + result = makeRadixSorter(config->geometry->recordsPerPage, + &volume->radixSorter); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + + result = ALLOCATE(config->geometry->recordsPerPage, const UdsChunkRecord *, + "record pointers", &volume->recordPointers); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + + if (isSparse(volume->geometry)) { + result = makeSparseCache(volume->geometry, config->cacheChapters, + zoneCount, &volume->sparseCache); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + } + result = makePageCache(volume->geometry, config->cacheChapters, + readQueueMaxSize, zoneCount, &volume->pageCache); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + result = makeIndexPageMap(volume->geometry, &volume->indexPageMap); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + + *newVolume = volume; + return UDS_SUCCESS; +} + 
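+
+/*
+ * Editor's note on the buffer reservation in allocateVolume(), with purely
+ * illustrative numbers: caching 3 chapters of a geometry that has 256 record
+ * pages and 6 index pages per chapter reserves 3 * 256 = 768 buffers for the
+ * page cache plus 1 for the chapter writer; a sparse index reserves another
+ * 3 * 6 = 18 for the sparse cache, for 787 buffers in all.
+ */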
+/**********************************************************************/ +int makeVolume(const Configuration *config, + IndexLayout *layout, + const struct uds_parameters *userParams, + unsigned int readQueueMaxSize, + unsigned int zoneCount, + Volume **newVolume) +{ + unsigned int volumeReadThreads = getReadThreads(userParams); + + if (readQueueMaxSize <= volumeReadThreads) { + logError("Number of read threads must be smaller than read queue"); + return UDS_INVALID_ARGUMENT; + } + + Volume *volume = NULL; + int result = allocateVolume(config, layout, readQueueMaxSize, zoneCount, + &volume); + if (result != UDS_SUCCESS) { + return result; + } + result = initMutex(&volume->readThreadsMutex); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + result = initCond(&volume->readThreadsReadDoneCond); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + result = initCond(&volume->readThreadsCond); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + + // Start the reader threads. If this allocation succeeds, freeVolume knows + // that it needs to try and stop those threads. + result = ALLOCATE(volumeReadThreads, Thread, "reader threads", + &volume->readerThreads); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + unsigned int i; + for (i = 0; i < volumeReadThreads; i++) { + result = createThread(readThreadFunction, (void *) volume, "reader", + &volume->readerThreads[i]); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + // We only stop as many threads as actually got started. + volume->numReadThreads = i + 1; + } + + *newVolume = volume; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeVolume(Volume *volume) +{ + if (volume == NULL) { + return; + } + + // If readerThreads is NULL, then we haven't set up the reader threads. + if (volume->readerThreads != NULL) { + // Stop the reader threads. It is ok if there aren't any of them. + lockMutex(&volume->readThreadsMutex); + volume->readerState |= READER_STATE_EXIT; + broadcastCond(&volume->readThreadsCond); + unlockMutex(&volume->readThreadsMutex); + unsigned int i; + for (i = 0; i < volume->numReadThreads; i++) { + joinThreads(volume->readerThreads[i]); + } + FREE(volume->readerThreads); + volume->readerThreads = NULL; + } + + // Must close the volume store AFTER freeing the scratch page and the caches + destroyVolumePage(&volume->scratchPage); + freePageCache(volume->pageCache); + freeSparseCache(volume->sparseCache); + closeVolumeStore(&volume->volumeStore); + + destroyCond(&volume->readThreadsCond); + destroyCond(&volume->readThreadsReadDoneCond); + destroyMutex(&volume->readThreadsMutex); + freeIndexPageMap(volume->indexPageMap); + freeRadixSorter(volume->radixSorter); + FREE(volume->geometry); + FREE(volume->recordPointers); + FREE(volume); +} diff --git a/uds/volume.h b/uds/volume.h new file mode 100644 index 0000000..82aef00 --- /dev/null +++ b/uds/volume.h @@ -0,0 +1,426 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/volume.h#14 $ + */ + +#ifndef VOLUME_H +#define VOLUME_H + +#include "cacheCounters.h" +#include "common.h" +#include "chapterIndex.h" +#include "indexConfig.h" +#include "indexLayout.h" +#include "indexPageMap.h" +#include "pageCache.h" +#include "request.h" +#include "sparseCache.h" +#include "uds.h" +#include "util/radixSort.h" +#include "volumeStore.h" + +typedef enum { + READER_STATE_RUN = 1, + READER_STATE_EXIT = 2, + READER_STATE_STOP = 4 +} ReaderState; + +typedef enum indexLookupMode { + /* Always do lookups in all chapters normally. */ + LOOKUP_NORMAL, + /* + * Don't do lookups in closed chapters; assume records not in the + * open chapter are always new. You don't want this normally; it's + * for programs like albfill. (Even then, with multiple runs using + * the same tag, we may actually duplicate older records, but if + * it's in a separate chapter it won't really matter.) + */ + LOOKUP_CURRENT_CHAPTER_ONLY, + /* + * Only do a subset of lookups needed when rebuilding an index. + * This cannot be set externally. + */ + LOOKUP_FOR_REBUILD +} IndexLookupMode; + +typedef struct volume { + /* The layout of the volume */ + Geometry *geometry; + /* The configuration of the volume */ + Configuration *config; + /* The access to the volume's backing store */ + struct volume_store volumeStore; + /* A single page used for writing to the volume */ + struct volume_page scratchPage; + /* The nonce used to save the volume */ + uint64_t nonce; + /* A single page's records, for sorting */ + const UdsChunkRecord **recordPointers; + /* For sorting record pages */ + RadixSorter *radixSorter; + /* The sparse chapter index cache */ + SparseCache *sparseCache; + /* The page cache */ + PageCache *pageCache; + /* The index page map maps delta list numbers to index page numbers */ + IndexPageMap *indexPageMap; + /* Mutex to sync between read threads and index thread */ + Mutex readThreadsMutex; + /* Condvar to indicate when read threads should start working */ + CondVar readThreadsCond; + /* Condvar to indicate when a read thread has finished a read */ + CondVar readThreadsReadDoneCond; + /* Threads to read data from disk */ + Thread *readerThreads; + /* Number of threads busy with reads */ + unsigned int busyReaderThreads; + /* The state of the reader threads */ + ReaderState readerState; + /* The lookup mode for the index */ + IndexLookupMode lookupMode; + /* Number of read threads to use (run-time parameter) */ + unsigned int numReadThreads; +} Volume; + +/** + * Create a volume. + * + * @param config The configuration to use. + * @param layout The index layout + * @param userParams The index session parameters. If NULL, the default + * session parameters will be used. + * @param readQueueMaxSize The maximum size of the read queue. + * @param zoneCount The number of zones to use. + * @param newVolume A pointer to hold a pointer to the new volume. + * + * @return UDS_SUCCESS or an error code + **/ +int makeVolume(const Configuration *config, + IndexLayout *layout, + const struct uds_parameters *userParams, + unsigned int readQueueMaxSize, + unsigned int zoneCount, + Volume **newVolume) + __attribute__((warn_unused_result)); + +/** + * Clean up a volume and its memory. 
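+ *
+ * A minimal lifecycle sketch (error handling abbreviated; the caller's
+ * variables are illustrative, not part of this API):
+ *
+ *   Volume *volume = NULL;
+ *   // readQueueMaxSize must be larger than the number of reader threads.
+ *   int result = makeVolume(config, layout, userParams, readQueueMaxSize,
+ *                           zoneCount, &volume);
+ *   if (result != UDS_SUCCESS) {
+ *     return result;
+ *   }
+ *   // ... use the volume ...
+ *   freeVolume(volume);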
+ * + * @param volume The volume to destroy. + **/ +void freeVolume(Volume *volume); + +/** + * Enqueue a page read. + * + * @param volume the volume + * @param request the request to waiting on the read + * @param physicalPage the page number to read + * + * @return UDS_QUEUED if successful, or an error code + **/ +int enqueuePageRead(Volume *volume, Request *request, int physicalPage) + __attribute__((warn_unused_result)); + +/** + * Find the lowest and highest contiguous chapters and determine their + * virtual chapter numbers. + * + * @param [in] volume The volume to probe. + * @param [out] lowestVCN Pointer for lowest virtual chapter number. + * @param [out] highestVCN Pointer for highest virtual chapter number. + * @param [out] isEmpty Pointer to a bool indicating whether or not the + * volume is empty. + * + * @return UDS_SUCCESS, or an error code. + * + * @note This routine does something similar to a binary search to find + * the location in the volume file where the discontinuity of + * chapter numbers occurs. In a good save, the discontinuity is + * a sharp cliff, but if write failures occured during saving + * there may be one or more chapters which are partially written. + * + * @note This method takes advantage of the fact that the physical + * chapter number in which the index pages are found should have + * headers which state that the virtual chapter number are all + * identical and maintain the invariant that + * pcn == vcn % chaptersPerVolume. + **/ +int findVolumeChapterBoundaries(Volume *volume, + uint64_t *lowestVCN, + uint64_t *highestVCN, + bool *isEmpty) + __attribute__((warn_unused_result)); + +/** + * Find any matching metadata for the given name within a given physical + * chapter. + * + * @param volume The volume. + * @param request The request originating the search. + * @param name The block name of interest. + * @param virtualChapter The number of the chapter to search. + * @param metadata The old metadata for the name. + * @param found A pointer which will be set to + * true if a match was found. + * + * @return UDS_SUCCESS or an error + **/ +int searchVolumePageCache(Volume *volume, + Request *request, + const UdsChunkName *name, + uint64_t virtualChapter, + UdsChunkData *metadata, + bool *found) + __attribute__((warn_unused_result)); + +/** + * Fetch a record page from the cache or read it from the volume and search it + * for a chunk name. + * + * If a match is found, optionally returns the metadata from the stored + * record. If the requested record page is not cached, the page fetch may be + * asynchronously completed on the slow lane, in which case UDS_QUEUED will be + * returned and the request will be requeued for continued processing after + * the page is read and added to the cache. 
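+ *
+ * For illustration, a caller might handle the return codes as follows (the
+ * surrounding variables and dispatch logic are hypothetical):
+ *
+ *   int result = searchCachedRecordPage(volume, request, &name, chapter,
+ *                                       recordPageNumber, &duplicate, &found);
+ *   if (result == UDS_QUEUED) {
+ *     // The request now belongs to a reader thread and will be requeued
+ *     // automatically once the page has been added to the cache.
+ *     return UDS_SUCCESS;
+ *   }
+ *   if ((result == UDS_SUCCESS) && found) {
+ *     // "duplicate" holds the metadata of the matching record.
+ *   }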
+ * + * @param volume the volume containing the record page to search + * @param request the request originating the search (may be NULL for + * a direct query from volume replay) + * @param name the name of the block or chunk + * @param chapter the chapter to search + * @param recordPageNumber the record page number of the page to search + * @param duplicate an array in which to place the metadata of the + * duplicate, if one was found + * @param found a (bool *) which will be set to true if the chunk + * was found + * + * @return UDS_SUCCESS, UDS_QUEUED, or an error code + **/ +int searchCachedRecordPage(Volume *volume, + Request *request, + const UdsChunkName *name, + unsigned int chapter, + int recordPageNumber, + UdsChunkData *duplicate, + bool *found) + __attribute__((warn_unused_result)); + +/** + * Forget the contents of a chapter. Invalidates any cached state for the + * specified chapter. + * + * @param volume the volume containing the chapter + * @param chapter the virtual chapter number + * @param reason the reason for invalidation + * + * @return UDS_SUCCESS or an error code + **/ +int forgetChapter(Volume *volume, + uint64_t chapter, + InvalidationReason reason) + __attribute__((warn_unused_result)); + +/** + * Write a chapter's worth of index pages to a volume + * + * @param volume the volume containing the chapter + * @param physicalPage the page number in the volume for the chapter + * @param chapterIndex the populated delta chapter index + * @param pages pointer to array of page pointers. Used only in testing + * to return what data has been written to disk. + * + * @return UDS_SUCCESS or an error code + **/ +int writeIndexPages(Volume *volume, + int physicalPage, + OpenChapterIndex *chapterIndex, + byte **pages) +__attribute__((warn_unused_result)); + +/** + * Write a chapter's worth of record pages to a volume + * + * @param volume the volume containing the chapter + * @param physicalPage the page number in the volume for the chapter + * @param records a 1-based array of chunk records in the chapter + * @param pages pointer to array of page pointers. Used only in testing + * to return what data has been written to disk. + * + * @return UDS_SUCCESS or an error code + **/ +int writeRecordPages(Volume *volume, + int physicalPage, + const UdsChunkRecord records[], + byte **pages) +__attribute__((warn_unused_result)); + +/** + * Write the index and records from the most recently filled chapter to the + * volume. + * + * @param volume the volume containing the chapter + * @param chapterIndex the populated delta chapter index + * @param records a 1-based array of chunk records in the chapter + * + * @return UDS_SUCCESS or an error code + **/ +int writeChapter(Volume *volume, + OpenChapterIndex *chapterIndex, + const UdsChunkRecord records[]) + __attribute__((warn_unused_result)); + +/** + * Read all the index pages for a chapter from the volume and initialize an + * array of ChapterIndexPages to represent them. 
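+ *
+ * For illustration (assuming the geometry exposes the number of index pages
+ * in a chapter as indexPagesPerChapter; the array setup is the caller's):
+ *
+ *   unsigned int pages = volume->geometry->indexPagesPerChapter;
+ *   // volumePages[] and indexPages[] must each have "pages" entries.
+ *   int result = readChapterIndexFromVolume(volume, virtualChapter,
+ *                                           volumePages, indexPages);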
+ * + * @param [in] volume the volume containing the chapter + * @param [in] virtualChapter the virtual chapter number of the index to read + * @param [out] volumePages an array to receive the raw index page data + * @param [out] indexPages an array of ChapterIndexPages to initialize + * + * @return UDS_SUCCESS or an error code + **/ +int readChapterIndexFromVolume(const Volume *volume, + uint64_t virtualChapter, + struct volume_page volumePages[], + DeltaIndexPage indexPages[]) + __attribute__((warn_unused_result)); + +/** + * Retrieve a page either from the cache (if we can) or from disk. If a read + * from disk is required, this is done immediately in the same thread and the + * page is then returned. + * + * The caller of this function must be holding the volume read mutex before + * calling this function. + * + * As a side-effect, the retrieved page will become the most recent page in + * the cache. + * + * This function is only exposed for the use of unit tests. + * + * @param volume The volume containing the page + * @param request The request originating the search + * @param physicalPage The physical page number + * @param probeType The type of cache access being done + * @param entryPtr A pointer to hold the retrieved cached entry + * + * @return UDS_SUCCESS or an error code + **/ +int getPageLocked(Volume *volume, + Request *request, + unsigned int physicalPage, + CacheProbeType probeType, + CachedPage **entryPtr) + __attribute__((warn_unused_result)); + +/** + * Retrieve a page either from the cache (if we can) or from disk. If a read + * from disk is required, the read request is enqueued for later processing + * by another thread. When that thread finally reads the page into the cache, + * a callback function is called to inform the caller the read is complete. + * + * The caller of this function should not be holding the volume read lock. + * Instead, the caller must call beingPendingSearch() for the given zone + * the request is being processed in. That state will be maintained or + * restored when the call returns, at which point the caller should call + * endPendingSearch(). + * + * As a side-effect, the retrieved page will become the most recent page in + * the cache. + * + * This function is only exposed for the use of unit tests. + * + * @param volume The volume containing the page + * @param request The request originating the search + * @param physicalPage The physical page number + * @param probeType The type of cache access being done + * @param entryPtr A pointer to hold the retrieved cached entry + * + * @return UDS_SUCCESS or an error code + **/ +int getPageProtected(Volume *volume, + Request *request, + unsigned int physicalPage, + CacheProbeType probeType, + CachedPage **entryPtr) + __attribute__((warn_unused_result)); + +/** + * Retrieve a page either from the cache (if we can) or from disk. If a read + * from disk is required, this is done immediately in the same thread and the + * page is then returned. + * + * The caller of this function must not be holding the volume read lock before + * calling this function. This method will grab that lock and release it + * when it returns. + * + * As a side-effect, the retrieved page will become the most recent page in + * the cache. + * + * This function should only be called by areas of the code that do not use + * multi-threading to access the volume. These include rebuild, volume + * explorer, and certain unit tests. 
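+ *
+ * A minimal single-threaded sketch (the caller's variables and the chosen
+ * probe type are illustrative):
+ *
+ *   byte *pageData = NULL;
+ *   int result = getPage(volume, chapter, pageNumber, probeType,
+ *                        &pageData, NULL);
+ *   if (result == UDS_SUCCESS) {
+ *     // pageData may be read directly; no other thread uses the cache here.
+ *   }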
+ * + * @param volume The volume containing the page + * @param chapter The number of the chapter containing the page + * @param pageNumber The number of the page + * @param probeType The type of cache access being done + * @param dataPtr Pointer to hold the retrieved page, NULL if not wanted + * @param indexPagePtr Pointer to hold the retrieved chapter index page, or + * NULL if not wanted + * + * @return UDS_SUCCESS or an error code + **/ +int getPage(Volume *volume, + unsigned int chapter, + unsigned int pageNumber, + CacheProbeType probeType, + byte **dataPtr, + DeltaIndexPage **indexPagePtr) + __attribute__((warn_unused_result)); + +/**********************************************************************/ +size_t getCacheSize(Volume *volume) __attribute__((warn_unused_result)); + +/**********************************************************************/ +int findVolumeChapterBoundariesImpl(unsigned int chapterLimit, + unsigned int maxBadChapters, + uint64_t *lowestVCN, + uint64_t *highestVCN, + int (*probeFunc)(void *aux, + unsigned int chapter, + uint64_t *vcn), + void *aux) + __attribute__((warn_unused_result)); + +/** + * Map a chapter number and page number to a phsical volume page number. + * + * @param geometry the layout of the volume + * @param chapter the chapter number of the desired page + * @param page the chapter page number of the desired page + * + * @return the physical page number + **/ +int mapToPhysicalPage(const Geometry *geometry, int chapter, int page) + __attribute__((warn_unused_result)); + +#endif /* VOLUME_H */ diff --git a/uds/volumeStore.c b/uds/volumeStore.c new file mode 100644 index 0000000..8b9f820 --- /dev/null +++ b/uds/volumeStore.c @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/volumeStore.c#2 $ + */ + +#include "geometry.h" +#include "indexLayout.h" +#include "logger.h" +#include "uds-error.h" +#include "volumeStore.h" + + +/*****************************************************************************/ +void closeVolumeStore(struct volume_store *volumeStore) +{ +#ifdef __KERNEL__ + if (volumeStore->vs_client != NULL) { + dm_bufio_client_destroy(volumeStore->vs_client); + volumeStore->vs_client = NULL; + } +#else + if (volumeStore->vs_region != NULL) { + putIORegion(volumeStore->vs_region); + volumeStore->vs_region = NULL; + } +#endif +} + +/*****************************************************************************/ +void destroyVolumePage(struct volume_page *volumePage) +{ +#ifdef __KERNEL__ + releaseVolumePage(volumePage); +#else + FREE(volumePage->vp_data); + volumePage->vp_data = NULL; +#endif +} + +/*****************************************************************************/ +int initializeVolumePage(const struct geometry *geometry, + struct volume_page *volumePage) +{ +#ifdef __KERNEL__ + volumePage->vp_buffer = NULL; + return UDS_SUCCESS; +#else + return ALLOCATE_IO_ALIGNED(geometry->bytesPerPage, byte, __func__, + &volumePage->vp_data); +#endif +} + +/*****************************************************************************/ +int openVolumeStore(struct volume_store *volumeStore, + IndexLayout *layout, + unsigned int reservedBuffers __attribute__((unused)), + size_t bytesPerPage) +{ +#ifdef __KERNEL__ + return openVolumeBufio(layout, bytesPerPage, reservedBuffers, + &volumeStore->vs_client); +#else + volumeStore->vs_bytesPerPage = bytesPerPage; + return openVolumeRegion(layout, &volumeStore->vs_region); +#endif +} + +/*****************************************************************************/ +void prefetchVolumePages(const struct volume_store *vs __attribute__((unused)), + unsigned int physicalPage __attribute__((unused)), + unsigned int pageCount __attribute__((unused))) +{ +#ifdef __KERNEL__ + dm_bufio_prefetch(vs->vs_client, physicalPage, pageCount); +#else + // Nothing to do in user mode +#endif +} + +/*****************************************************************************/ +int prepareToWriteVolumePage(const struct volume_store *volumeStore + __attribute__((unused)), + unsigned int physicalPage + __attribute__((unused)), + struct volume_page *volumePage + __attribute__((unused))) +{ +#ifdef __KERNEL__ + releaseVolumePage(volumePage); + struct dm_buffer *buffer = NULL; + byte *data = dm_bufio_new(volumeStore->vs_client, physicalPage, &buffer); + if (IS_ERR(data)) { + return -PTR_ERR(data); + } + volumePage->vp_buffer = buffer; +#else + // Nothing to do in user mode +#endif + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int readVolumePage(const struct volume_store *volumeStore, + unsigned int physicalPage, + struct volume_page *volumePage) +{ +#ifdef __KERNEL__ + releaseVolumePage(volumePage); + byte *data = dm_bufio_read(volumeStore->vs_client, physicalPage, + &volumePage->vp_buffer); + if (IS_ERR(data)) { + return logWarningWithStringError(-PTR_ERR(data), + "error reading physical page %u", + physicalPage); + } +#else + off_t offset = (off_t) physicalPage * volumeStore->vs_bytesPerPage; + int result = readFromRegion(volumeStore->vs_region, offset, + getPageData(volumePage), + volumeStore->vs_bytesPerPage, NULL); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "error reading physical page %u", + 
physicalPage); + } +#endif + return UDS_SUCCESS; +} + +/*****************************************************************************/ +void releaseVolumePage(struct volume_page *volumePage __attribute__((unused))) +{ +#ifdef __KERNEL__ + if (volumePage->vp_buffer != NULL) { + dm_bufio_release(volumePage->vp_buffer); + volumePage->vp_buffer = NULL; + } +#else + // Nothing to do in user mode +#endif +} + +/*****************************************************************************/ +void swapVolumePages(struct volume_page *volumePage1, + struct volume_page *volumePage2) +{ + struct volume_page temp = *volumePage1; + *volumePage1 = *volumePage2; + *volumePage2 = temp; +} + +/*****************************************************************************/ +int syncVolumeStore(const struct volume_store *volumeStore) +{ +#ifdef __KERNEL__ + int result = -dm_bufio_write_dirty_buffers(volumeStore->vs_client); +#else + int result = syncRegionContents(volumeStore->vs_region); +#endif + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot sync chapter to volume"); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int writeVolumePage(const struct volume_store *volumeStore, + unsigned int physicalPage, + struct volume_page *volumePage) +{ +#ifdef __KERNEL__ + dm_bufio_mark_buffer_dirty(volumePage->vp_buffer); + return UDS_SUCCESS; +#else + off_t offset = (off_t) physicalPage * volumeStore->vs_bytesPerPage; + return writeToRegion(volumeStore->vs_region, offset, getPageData(volumePage), + volumeStore->vs_bytesPerPage, + volumeStore->vs_bytesPerPage); +#endif +} diff --git a/uds/volumeStore.h b/uds/volumeStore.h new file mode 100644 index 0000000..f475427 --- /dev/null +++ b/uds/volumeStore.h @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/volumeStore.h#2 $ + */ + +#ifndef VOLUME_STORE_H +#define VOLUME_STORE_H + +#include "common.h" +#include "compiler.h" +#include "memoryAlloc.h" + +#ifdef __KERNEL__ +#include +#else +#include "ioRegion.h" +#endif + +struct geometry; +struct indexLayout; + + +struct volume_store { +#ifdef __KERNEL__ + struct dm_bufio_client *vs_client; +#else + IORegion *vs_region; + size_t vs_bytesPerPage; +#endif +}; + + +struct volume_page { +#ifdef __KERNEL__ + struct dm_buffer *vp_buffer; +#else + byte *vp_data; +#endif +}; + +/** + * Close a volume store. + * + * @param volumeStore The volume store + **/ +void closeVolumeStore(struct volume_store *volumeStore); + +/** + * Uninitialize a volume page buffer. + * + * @param volumePage The volume page buffer + **/ +void destroyVolumePage(struct volume_page *volumePage); + +/** + * Get a pointer to the data contained in a volume page buffer. 
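+ *
+ * For illustration, a typical read path (error handling abbreviated;
+ * "geometry", "volumeStore", and "physicalPage" come from the caller):
+ *
+ *   struct volume_page page;
+ *   int result = initializeVolumePage(geometry, &page);
+ *   if (result == UDS_SUCCESS) {
+ *     result = readVolumePage(volumeStore, physicalPage, &page);
+ *   }
+ *   if (result == UDS_SUCCESS) {
+ *     const byte *data = getPageData(&page);
+ *     // ... examine the page contents ...
+ *   }
+ *   destroyVolumePage(&page);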
+ * + * @param volumePage The volume page buffer + * + * @return the address of the data + **/ +__attribute__((warn_unused_result)) +static INLINE byte *getPageData(const struct volume_page *volumePage) +{ +#ifdef __KERNEL__ + return dm_bufio_get_block_data(volumePage->vp_buffer); +#else + return volumePage->vp_data; +#endif +} + +/** + * Initialize a volume page buffer. + * + * @param geometry The volume geometry + * @param volumePage The volume page buffer + * + * @return UDS_SUCCESS or an error status + **/ +int initializeVolumePage(const struct geometry *geometry, + struct volume_page *volumePage) + __attribute__((warn_unused_result)); + +/** + * Open a volume store. + * + * @param volumeStore The volume store + * @param layout The index layout + * @param reservedBuffers The number of buffers that can be reserved + * @param bytesPerPage The number of bytes in a volume page + **/ +int openVolumeStore(struct volume_store *volumeStore, + struct indexLayout *layout, + unsigned int reservedBuffers, + size_t bytesPerPage) + __attribute__((warn_unused_result)); + +/** + * Prefetch volume pages into memory. + * + * @param volumeStore The volume store + * @param physicalPage The volume page number of the first desired page + * @param pageCount The number of volume pages to prefetch + **/ +void prefetchVolumePages(const struct volume_store *volumeStore, + unsigned int physicalPage, + unsigned int pageCount); + +/** + * Prepare a buffer to write a page to the volume. + * + * @param volumeStore The volume store + * @param physicalPage The volume page number of the desired page + * @param volumePage The volume page buffer + * + * @return UDS_SUCCESS or an error code + **/ +int prepareToWriteVolumePage(const struct volume_store *volumeStore, + unsigned int physicalPage, + struct volume_page *volumePage) + __attribute__((warn_unused_result)); + +/** + * Read a page from a volume store. + * + * @param volumeStore The volume store + * @param physicalPage The volume page number of the desired page + * @param volumePage The volume page buffer + * + * @return UDS_SUCCESS or an error code + **/ +int readVolumePage(const struct volume_store *volumeStore, + unsigned int physicalPage, + struct volume_page *volumePage) + __attribute__((warn_unused_result)); + +/** + * Release a volume page buffer, because it will no longer be accessed before a + * call to readVolumePage or prepareToWriteVolumePage. + * + * @param volumePage The volume page buffer + **/ +void releaseVolumePage(struct volume_page *volumePage); + +/** + * Swap volume pages. This is used to put the contents of a newly written + * index page (in the scratch page) into the page cache. + * + * @param volumePage1 The volume page buffer + * @param volumePage2 The volume page buffer + **/ +void swapVolumePages(struct volume_page *volumePage1, + struct volume_page *volumePage2); + +/** + * Sync the volume store to storage. + * + * @param volumeStore The volume store + * + * @return UDS_SUCCESS or an error code + **/ +int syncVolumeStore(const struct volume_store *volumeStore) + __attribute__((warn_unused_result)); + +/** + * Write a page to a volume store. 
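+ *
+ * For illustration, a typical write sequence (error handling abbreviated;
+ * the new page contents and sizes are the caller's):
+ *
+ *   int result = prepareToWriteVolumePage(volumeStore, physicalPage, &page);
+ *   if (result == UDS_SUCCESS) {
+ *     memcpy(getPageData(&page), newContents, bytesPerPage);
+ *     result = writeVolumePage(volumeStore, physicalPage, &page);
+ *   }
+ *   if (result == UDS_SUCCESS) {
+ *     result = syncVolumeStore(volumeStore);
+ *   }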
+ * + * @param volumeStore The volume store + * @param physicalPage The volume page number of the desired page + * @param volumePage The volume page buffer + * + * @return UDS_SUCCESS or an error code + **/ +int writeVolumePage(const struct volume_store *volumeStore, + unsigned int physicalPage, + struct volume_page *volumePage) + __attribute__((warn_unused_result)); + +#endif /* VOLUME_STORE_H */ diff --git a/uds/zone.c b/uds/zone.c new file mode 100644 index 0000000..cc07674 --- /dev/null +++ b/uds/zone.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/zone.c#4 $ + */ + +#include "zone.h" + +#include "logger.h" +#include "threads.h" + +/**********************************************************************/ +unsigned int getZoneCount(const struct uds_parameters *userParams) +{ + unsigned int zoneCount = (userParams == NULL) ? 0 : userParams->zone_count; + if (zoneCount == 0) { + zoneCount = getNumCores() / 2; + } + if (zoneCount < 1) { + zoneCount = 1; + } + if (zoneCount > MAX_ZONES) { + zoneCount = MAX_ZONES; + } + logInfo("Using %u indexing zone%s for concurrency.", zoneCount, + zoneCount == 1 ? "" : "s"); + return zoneCount; +} diff --git a/uds/zone.h b/uds/zone.h new file mode 100644 index 0000000..99daf40 --- /dev/null +++ b/uds/zone.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/zone.h#2 $ + */ + +#ifndef ZONE_H +#define ZONE_H + +#include "uds.h" + +enum { + MAX_ZONES = 16, +}; + +/** + * Return the number of zones. + * + * @param userParams the index session parameters. If NULL, the default + * session parameters will be used. 
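+ *                     (With zone_count left at zero, the count defaults to
+ *                     half the number of cores, clamped to the range
+ *                     [1, MAX_ZONES]; on an 8-core machine that is 4 zones.)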
+ * + * @return the number of zones + **/ +unsigned int getZoneCount(const struct uds_parameters *userParams) + __attribute__((warn_unused_result)); + +#endif /* ZONE_H */ diff --git a/vdo/Makefile b/vdo/Makefile new file mode 100644 index 0000000..816c219 --- /dev/null +++ b/vdo/Makefile @@ -0,0 +1,31 @@ +VDO_VERSION = 6.2.4.26 + +VDO_VERSION_MAJOR = $(word 1,$(subst ., ,$(VDO_VERSION))) +VDO_VERSION_MINOR = $(word 2,$(subst ., ,$(VDO_VERSION))) +VDO_VERSION_MICRO = $(word 3,$(subst ., ,$(VDO_VERSION))) + +SOURCES = $(addprefix base/,$(notdir $(wildcard $(src)/base/*.c))) +SOURCES += $(addprefix kernel/,$(notdir $(wildcard $(src)/kernel/*.c))) +OBJECTS = $(SOURCES:%.c=%.o) +INCLUDES = -I$(src)/base -I$(src)/kernel -I$(src)/../uds + +EXTRA_CFLAGS = -std=gnu99 \ + -fno-builtin-memset \ + -Werror \ + -Wframe-larger-than=400 \ + -Wno-declaration-after-statement \ + -DVDO_VERSION_MAJOR=$(VDO_VERSION_MAJOR) \ + -DVDO_VERSION_MINOR=$(VDO_VERSION_MINOR) \ + -DVDO_VERSION_MICRO=$(VDO_VERSION_MICRO) \ + -DCURRENT_VERSION=\"$(VDO_VERSION)\" \ + $(INCLUDES) + +CFLAGS_REMOVE_vdoPageCache.o= -std=gnu99 +CFLAGS_REMOVE_vio.o= -std=gnu99 + +CFLAGS_vdoPageCache.o= -std=gnu89 +CFLAGS_vio.o= -std=gnu89 + +obj-m += kvdo.o + +kvdo-objs = $(OBJECTS) diff --git a/vdo/base/actionManager.c b/vdo/base/actionManager.c new file mode 100644 index 0000000..664131d --- /dev/null +++ b/vdo/base/actionManager.c @@ -0,0 +1,399 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/actionManager.c#9 $ + */ + +#include "actionManager.h" + +#include "memoryAlloc.h" + +#include "adminState.h" +#include "completion.h" +#include "types.h" + +/** An action to be performed in each of a set of zones */ +typedef struct action Action; +struct action { + /** Whether this structure is in use */ + bool inUse; + /** The admin operation associated with this action */ + AdminStateCode operation; + /** + * The method to run on the initiator thread before the action is applied to + * each zone. 
+ **/ + ActionPreamble *preamble; + /** The action to be performed in each zone */ + ZoneAction *zoneAction; + /** + * The method to run on the initiator thread after the action has been + * applied to each zone + **/ + ActionConclusion *conclusion; + /** The object to notify when the action is complete */ + VDOCompletion *parent; + /** The action specific context */ + void *context; + /** The action to perform after this one */ + Action *next; +}; + +struct actionManager { + /** The completion for performing actions */ + VDOCompletion completion; + /** The state of this action manager */ + AdminState state; + /** The two action slots*/ + Action actions[2]; + /** The current action slot */ + Action *currentAction; + /** The number of zones in which an action is to be applied */ + ZoneCount zones; + /** A function to schedule a default next action */ + ActionScheduler *scheduler; + /** + * A function to get the id of the thread on which to apply an action to a + * zone + **/ + ZoneThreadGetter *getZoneThreadID; + /** The ID of the thread on which actions may be initiated */ + ThreadID initiatorThreadID; + /** Opaque data associated with this action manager */ + void *context; + /** The zone currently being acted upon */ + ZoneCount actingZone; +}; + +/** + * Convert a generic VDOCompletion to a ActionManager. + * + * @param completion The completion to convert + * + * @return The completion as a ActionManager + **/ +static inline ActionManager *asActionManager(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(ActionManager, completion) == 0); + assertCompletionType(completion->type, ACTION_COMPLETION); + return (ActionManager *) completion; +} + +/** + * An action scheduler which does not schedule an action. + * + *

Implements ActionScheduler. + **/ +static bool noDefaultAction(void *context __attribute__((unused))) +{ + return false; +} + +/** + * A default preamble which does nothing. + * + *

Implements ActionPreamble + **/ +static void noPreamble(void *context __attribute__((unused)), + VDOCompletion *completion) +{ + completeCompletion(completion); +} + +/** + * A default conclusion which does nothing. + * + *

Implements ActionConclusion. + **/ +static int noConclusion(void *context __attribute__((unused))) { + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeActionManager(ZoneCount zones, + ZoneThreadGetter *getZoneThreadID, + ThreadID initiatorThreadID, + void *context, + ActionScheduler *scheduler, + PhysicalLayer *layer, + ActionManager **managerPtr) +{ + ActionManager *manager; + int result = ALLOCATE(1, ActionManager, __func__, &manager); + if (result != VDO_SUCCESS) { + return result; + } + + *manager = (ActionManager) { + .zones = zones, + .scheduler = ((scheduler == NULL) ? noDefaultAction : scheduler), + .getZoneThreadID = getZoneThreadID, + .initiatorThreadID = initiatorThreadID, + .context = context, + }; + + manager->actions[0].next = &manager->actions[1]; + manager->currentAction = manager->actions[1].next = &manager->actions[0]; + + result = initializeEnqueueableCompletion(&manager->completion, + ACTION_COMPLETION, layer); + if (result != VDO_SUCCESS) { + freeActionManager(&manager); + return result; + } + + *managerPtr = manager; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeActionManager(ActionManager **managerPtr) +{ + ActionManager *manager = *managerPtr; + if (manager == NULL) { + return; + } + + destroyEnqueueable(&manager->completion); + FREE(manager); + *managerPtr = NULL; +} + +/**********************************************************************/ +AdminStateCode getCurrentManagerOperation(ActionManager *manager) +{ + return manager->state.state; +} + +/**********************************************************************/ +void *getCurrentActionContext(ActionManager *manager) +{ + return (manager->currentAction->inUse + ? manager->currentAction->context : NULL); +} + +/**********************************************************************/ +static void finishActionCallback(VDOCompletion *completion); +static void applyToZone(VDOCompletion *completion); + +/** + * Get the thread ID for the current zone. + * + * @param manager The action manager + * + * @return The ID of the thread on which to run actions for the current zone + **/ +static ThreadID getActingZoneThreadID(ActionManager *manager) +{ + return manager->getZoneThreadID(manager->context, manager->actingZone); +} + +/** + * Prepare the manager's completion to run on the next zone. + * + * @param manager The action manager + **/ +static void prepareForNextZone(ActionManager *manager) +{ + prepareForRequeue(&manager->completion, applyToZone, + preserveErrorAndContinue, getActingZoneThreadID(manager), + manager->currentAction->parent); +} + +/** + * Prepare the manager's completion to run the conclusion on the initiator + * thread. + * + * @param manager The action manager + **/ +static void prepareForConclusion(ActionManager *manager) +{ + prepareForRequeue(&manager->completion, finishActionCallback, + preserveErrorAndContinue, manager->initiatorThreadID, + manager->currentAction->parent); +} + +/** + * Perform an action on the next zone if there is one. + * + * @param completion The action completion + **/ +static void applyToZone(VDOCompletion *completion) +{ + ActionManager *manager = asActionManager(completion); + ASSERT_LOG_ONLY((getCallbackThreadID() == getActingZoneThreadID(manager)), + "applyToZone() called on acting zones's thread"); + + ZoneCount zone = manager->actingZone++; + if (manager->actingZone == manager->zones) { + // We are about to apply to the last zone. 
Once that is finished, + // we're done, so go back to the initiator thread and finish up. + prepareForConclusion(manager); + } else { + // Prepare to come back on the next zone + prepareForNextZone(manager); + } + + manager->currentAction->zoneAction(manager->context, zone, completion); +} + +/** + * The error handler for preamble errors. + * + * @param completion The manager completion + **/ +static void handlePreambleError(VDOCompletion *completion) +{ + // Skip the zone actions since the preamble failed. + completion->callback = finishActionCallback; + preserveErrorAndContinue(completion); +} + +/** + * Launch the current action. + * + * @param manager The action manager + **/ +static void launchCurrentAction(ActionManager *manager) +{ + Action *action = manager->currentAction; + int result = startOperation(&manager->state, action->operation); + if (result != VDO_SUCCESS) { + if (action->parent != NULL) { + setCompletionResult(action->parent, result); + } + + // We aren't going to run the preamble, so don't run the conclusion + action->conclusion = noConclusion; + finishActionCallback(&manager->completion); + return; + } + + if (action->zoneAction == NULL) { + prepareForConclusion(manager); + } else { + manager->actingZone = 0; + prepareForRequeue(&manager->completion, applyToZone, handlePreambleError, + getActingZoneThreadID(manager), + manager->currentAction->parent); + } + + action->preamble(manager->context, &manager->completion); +} + +/**********************************************************************/ +bool scheduleDefaultAction(ActionManager *manager) +{ + // Don't schedule a default action if we are operating or not in normal + // operation. + return ((manager->state.state == ADMIN_STATE_NORMAL_OPERATION) + && manager->scheduler(manager->context)); +} + +/** + * Finish an action now that it has been applied to all zones. This + * callback is registered in applyToZone(). + * + * @param completion The action manager completion + **/ +static void finishActionCallback(VDOCompletion *completion) +{ + ActionManager *manager = asActionManager(completion); + Action action = *(manager->currentAction); + manager->currentAction->inUse = false; + manager->currentAction = manager->currentAction->next; + + // We need to check this now to avoid use-after-free issues if running the + // conclusion or notifying the parent results in the manager being freed. 
+ bool hasNextAction = (manager->currentAction->inUse + || scheduleDefaultAction(manager)); + int result = action.conclusion(manager->context); + finishOperation(&manager->state); + if (action.parent != NULL) { + finishCompletion(action.parent, result); + } + + if (hasNextAction) { + launchCurrentAction(manager); + } +} + +/**********************************************************************/ +bool scheduleAction(ActionManager *manager, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + VDOCompletion *parent) +{ + return scheduleOperation(manager, ADMIN_STATE_OPERATING, preamble, + zoneAction, conclusion, parent); +} + +/**********************************************************************/ +bool scheduleOperation(ActionManager *manager, + AdminStateCode operation, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + VDOCompletion *parent) +{ + return scheduleOperationWithContext(manager, operation, preamble, zoneAction, + conclusion, NULL, parent); +} + +/**********************************************************************/ +bool scheduleOperationWithContext(ActionManager *manager, + AdminStateCode operation, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + void *context, + VDOCompletion *parent) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == manager->initiatorThreadID), + "action initiated from correct thread"); + Action *action; + if (!manager->currentAction->inUse) { + action = manager->currentAction; + } else if (!manager->currentAction->next->inUse) { + action = manager->currentAction->next; + } else { + if (parent != NULL) { + finishCompletion(parent, VDO_COMPONENT_BUSY); + } + + return false; + } + + *action = (Action) { + .inUse = true, + .operation = operation, + .preamble = (preamble == NULL) ? noPreamble : preamble, + .zoneAction = zoneAction, + .conclusion = (conclusion == NULL) ? noConclusion : conclusion, + .context = context, + .parent = parent, + .next = action->next, + }; + + if (action == manager->currentAction) { + launchCurrentAction(manager); + } + + return true; +} diff --git a/vdo/base/actionManager.h b/vdo/base/actionManager.h new file mode 100644 index 0000000..2e0ef13 --- /dev/null +++ b/vdo/base/actionManager.h @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/actionManager.h#6 $ + */ + +#ifndef ACTION_MANAGER_H +#define ACTION_MANAGER_H + +#include "adminState.h" +#include "completion.h" +#include "types.h" + +/** + * ActionManager provides a generic mechanism for applying actions to + * multi-zone entities (such as the block map or slab depot). Each action + * manager is tied to a specific context for which it manages actions. 
The + * manager ensures that only one action is active on that context at a time, + * and supports at most one pending action. Calls to schedule an action when + * there is already a pending action will result in VDO_COMPONENT_BUSY errors. + * Actions may only be submitted to the action manager from a single thread + * (which thread is determined when the action manager is constructed). + * + * A scheduled action consists of four components: + * preamble: an optional method to be run on the initator thread before + * applying the action to all zones + * zoneAction: an optional method to be applied to each of the zones + * conclusion: an optional method to be run on the initiator thread once the + * per-zone method has been applied to all zones + * parent: an optional completion to be finished once the conclusion + * is done + * + * At least one of the three methods must be provided. + **/ + +/** + * A function which is to be applied asynchronously to a set of zones. + * + * @param context The object which holds the per-zone context for the + * action + * @param zoneNumber The number of zone to which the action is being applied + * @param parent The object to notify when the action is complete + **/ +typedef void ZoneAction(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * A function which is to be applied asynchronously on an action manager's + * initiator thread as the preamble of an action. + * + * @param context The object which holds the per-zone context for the action + * @param parent The object to notify when the action is complete + **/ +typedef void ActionPreamble(void *context, VDOCompletion *parent); + +/** + * A function which will run on the action manager's initiator thread as the + * conclusion of an action. + * + * @param context The object which holds the per-zone context for the action + * + * @return VDO_SUCCESS or an error + **/ +typedef int ActionConclusion(void *context); + +/** + * A function to schedule an action. + * + * @param context The object which holds the per-zone context for the action + * + * @return true if an action was scheduled + **/ +typedef bool ActionScheduler(void *context); + +/** + * Get the id of the thread associated with a given zone. + * + * @param context The action context + * @param zoneNumber The number of the zone for which the thread ID is desired + **/ +typedef ThreadID ZoneThreadGetter(void *context, ZoneCount zoneNumber); + +/** + * Make an action manager. + * + * @param [in] zones The number of zones to which actions will be + * applied + * @param [in] getZoneThreadID A function to get the thread id associated + * with a zone + * @param [in] initiatorThreadID The thread on which actions may initiated + * @param [in] context The object which holds the per-zone context + * for the action + * @param [in] scheduler A function to schedule a next action after an + * action concludes if there is no pending + * action (may be NULL) + * @param [in] layer The layer used to make completions + * @param [out] managerPtr A pointer to hold the new action manager + * + * @return VDO_SUCCESS or an error code + **/ +int makeActionManager(ZoneCount zones, + ZoneThreadGetter *getZoneThreadID, + ThreadID initiatorThreadID, + void *context, + ActionScheduler *scheduler, + PhysicalLayer *layer, + ActionManager **managerPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy an action manager and null out the reference to it. 
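+ *
+ * A minimal lifecycle sketch (the zone-thread getter, zone action, context,
+ * layer, and parent are the caller's; error handling abbreviated):
+ *
+ *   ActionManager *manager = NULL;
+ *   int result = makeActionManager(zones, getZoneThread, initiatorThreadID,
+ *                                  context, NULL, layer, &manager);
+ *   if (result != VDO_SUCCESS) {
+ *     return result;
+ *   }
+ *   // Apply an action to every zone, then notify "parent" when it is done.
+ *   scheduleAction(manager, NULL, applyToEachZone, NULL, parent);
+ *   // ... once all scheduled actions have completed ...
+ *   freeActionManager(&manager);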
+ * + * @param managerPtr The reference to the manager to destroy + **/ +void freeActionManager(ActionManager **managerPtr); + +/** + * Get the current operation an action manager is performing. + * + * @param manager The manager to query + * + * @return The manager's current operation + **/ +AdminStateCode getCurrentManagerOperation(ActionManager *manager) + __attribute__((warn_unused_result)); + +/** + * Get the action-specific context for the operation an action manager is + * currently performing. + * + * @param manager The manager to query + * + * @return The action-specific context for the manager's current action or + * NULL if there is no context or no current action + **/ +void *getCurrentActionContext(ActionManager *manager) + __attribute__((warn_unused_result)); + +/** + * Attempt to schedule the default action. If the manager is not operating + * normally, the action will not be scheduled. + * + * @param manager The action manager + * + * @return true if an action was scheduled. + **/ +bool scheduleDefaultAction(ActionManager *manager); + +/** + * Schedule an action to be applied to all zones. The action will be launched + * immediately if there is no current action, or as soon as the current action + * completes. If there is already a pending action, this action will not be + * scheduled, and, if it has a parent, that parent will be notified. At least + * one of the preamble, zoneAction, or conclusion must not be NULL. + * + * @param manager The action manager to schedule the action on + * @param preamble A method to be invoked on the initiator thread once this + * action is started but before applying to each zone; may + * be NULL + * @param zoneAction The action to apply to each zone; may be NULL + * @param conclusion A method to be invoked back on the initiator thread once + * the action has been applied to all zones; may be NULL + * @param parent The object to notify once the action is complete or if + * the action can not be scheduled; may be NULL + * + * @return true if the action was scheduled + **/ +bool scheduleAction(ActionManager *manager, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + VDOCompletion *parent); + +/** + * Schedule an operation to be applied to all zones. The operation's action + * will be launched immediately if there is no current action, or as soon as + * the current action completes. If there is already a pending action, this + * operation will not be scheduled, and, if it has a parent, that parent will + * be notified. At least one of the preamble, zoneAction, or conclusion must + * not be NULL. + * + * @param manager The action manager to schedule the action on + * @param operation The operation this action will perform + * @param preamble A method to be invoked on the initiator thread once this + * action is started but before applying to each zone; may + * be NULL + * @param zoneAction The action to apply to each zone; may be NULL + * @param conclusion A method to be invoked back on the initiator thread once + * the action has been applied to all zones; may be NULL + * @param parent The object to notify once the action is complete or if + * the action can not be scheduled; may be NULL + * + * @return true if the action was scheduled + **/ +bool scheduleOperation(ActionManager *manager, + AdminStateCode operation, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + VDOCompletion *parent); + +/** + * Schedule an operation to be applied to all zones. 
The operation's action + * will be launched immediately if there is no current action, or as soon as + * the current action completes. If there is already a pending action, this + * operation will not be scheduled, and, if it has a parent, that parent will + * be notified. At least one of the preamble, zoneAction, or conclusion must + * not be NULL. + * + * @param manager The action manager to schedule the action on + * @param operation The operation this action will perform + * @param preamble A method to be invoked on the initiator thread once this + * action is started but before applying to each zone; may + * be NULL + * @param zoneAction The action to apply to each zone; may be NULL + * @param conclusion A method to be invoked back on the initiator thread once + * the action has been applied to all zones; may be NULL + * @param context An action-specific context which may be retrieved via + * getCurrentActionContext(); may be NULL + * @param parent The object to notify once the action is complete or if + * the action can not be scheduled; may be NULL + * + * @return true if the action was scheduled + **/ +bool scheduleOperationWithContext(ActionManager *manager, + AdminStateCode operation, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + void *context, + VDOCompletion *parent); + +#endif // ACTION_MANAGER_H diff --git a/vdo/base/adminCompletion.c b/vdo/base/adminCompletion.c new file mode 100644 index 0000000..5c5ed26 --- /dev/null +++ b/vdo/base/adminCompletion.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/adminCompletion.c#4 $ + */ + +#include "adminCompletion.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "atomic.h" +#include "completion.h" +#include "types.h" +#include "vdoInternal.h" + +/**********************************************************************/ +void assertAdminOperationType(AdminCompletion *completion, + AdminOperationType expected) +{ + ASSERT_LOG_ONLY(completion->type == expected, + "admin operation type is %u instead of %u", + completion->type, expected); +} + +/**********************************************************************/ +AdminCompletion *adminCompletionFromSubTask(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(AdminCompletion, completion) == 0); + assertCompletionType(completion->type, SUB_TASK_COMPLETION); + VDOCompletion *parent = completion->parent; + assertCompletionType(parent->type, ADMIN_COMPLETION); + return (AdminCompletion *) parent; +} + +/**********************************************************************/ +void assertAdminPhaseThread(AdminCompletion *adminCompletion, + const char *what, + const char *phaseNames[]) +{ + ThreadID expected = adminCompletion->getThreadID(adminCompletion); + ASSERT_LOG_ONLY((getCallbackThreadID() == expected), + "%s on correct thread for %s", + what, phaseNames[adminCompletion->phase]); +} + +/**********************************************************************/ +VDO *vdoFromAdminSubTask(VDOCompletion *completion, + AdminOperationType expected) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + assertAdminOperationType(adminCompletion, expected); + return adminCompletion->completion.parent; +} + +/**********************************************************************/ +int initializeAdminCompletion(VDO *vdo, AdminCompletion *adminCompletion) +{ + int result = initializeEnqueueableCompletion(&adminCompletion->completion, + ADMIN_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + return result; + } + + result = initializeEnqueueableCompletion(&adminCompletion->subTaskCompletion, + SUB_TASK_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + uninitializeAdminCompletion(adminCompletion); + return result; + } + + atomicStoreBool(&adminCompletion->busy, false); + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void uninitializeAdminCompletion(AdminCompletion *adminCompletion) +{ + destroyEnqueueable(&adminCompletion->subTaskCompletion); + destroyEnqueueable(&adminCompletion->completion); +} + +/**********************************************************************/ +VDOCompletion *resetAdminSubTask(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + resetCompletion(completion); + completion->callbackThreadID = adminCompletion->getThreadID(adminCompletion); + return completion; +} + +/**********************************************************************/ +void prepareAdminSubTaskOnThread(VDO *vdo, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID) +{ + prepareForRequeue(&vdo->adminCompletion.subTaskCompletion, callback, + errorHandler, threadID, &vdo->adminCompletion); +} + +/**********************************************************************/ +void prepareAdminSubTask(VDO *vdo, + VDOAction *callback, + VDOAction *errorHandler) +{ + AdminCompletion *adminCompletion = &vdo->adminCompletion; + prepareAdminSubTaskOnThread(vdo, callback, 
errorHandler, + adminCompletion->completion.callbackThreadID); +} + +/** + * Callback for admin operations which will notify the layer that the operation + * is complete. + * + * @param completion The admin completion + **/ +static void adminOperationCallback(VDOCompletion *completion) +{ + completion->layer->completeAdminOperation(completion->layer); +} + +/**********************************************************************/ +int performAdminOperation(VDO *vdo, + AdminOperationType type, + ThreadIDGetterForPhase *threadIDGetter, + VDOAction *action, + VDOAction *errorHandler) +{ + AdminCompletion *adminCompletion = &vdo->adminCompletion; + if (!compareAndSwapBool(&adminCompletion->busy, false, true)) { + return logErrorWithStringError(VDO_COMPONENT_BUSY, + "Can't start admin operation of type %u, " + "another operation is already in progress", + type); + } + + prepareCompletion(&adminCompletion->completion, adminOperationCallback, + adminOperationCallback, + getAdminThread(getThreadConfig(vdo)), vdo); + adminCompletion->type = type; + adminCompletion->getThreadID = threadIDGetter; + adminCompletion->phase = 0; + prepareAdminSubTask(vdo, action, errorHandler); + + PhysicalLayer *layer = vdo->layer; + layer->enqueue(adminCompletion->subTaskCompletion.enqueueable); + layer->waitForAdminOperation(layer); + int result = adminCompletion->completion.result; + atomicStoreBool(&adminCompletion->busy, false); + return result; +} diff --git a/vdo/base/adminCompletion.h b/vdo/base/adminCompletion.h new file mode 100644 index 0000000..50eeecd --- /dev/null +++ b/vdo/base/adminCompletion.h @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/adminCompletion.h#4 $ + */ + +#ifndef ADMIN_COMPLETION_H +#define ADMIN_COMPLETION_H + +#include "atomic.h" +#include "completion.h" +#include "types.h" + +typedef enum adminOperationType { + ADMIN_OPERATION_UNKNOWN = 0, + ADMIN_OPERATION_GROW_LOGICAL, + ADMIN_OPERATION_GROW_PHYSICAL, + ADMIN_OPERATION_PREPARE_GROW_PHYSICAL, + ADMIN_OPERATION_LOAD, + ADMIN_OPERATION_RESUME, + ADMIN_OPERATION_SAVE, + ADMIN_OPERATION_SUSPEND, +} AdminOperationType; + +typedef struct adminCompletion AdminCompletion; + +/** + * A function which gets the ID of the thread on which the current phase of an + * admin operation should be run. 
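+ *
+ * For illustration, a getter which runs every phase on the admin thread (the
+ * function name is hypothetical; the completion's parent is the VDO, as set
+ * up by performAdminOperation()):
+ *
+ *   static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion)
+ *   {
+ *     VDO *vdo = adminCompletion->completion.parent;
+ *     return getAdminThread(getThreadConfig(vdo));
+ *   }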
+ * + * @param adminCompletion The AdminCompletion + * + * @return The ID of the thread on which the current phase should be performed + **/ +typedef ThreadID ThreadIDGetterForPhase(AdminCompletion *adminCompletion); + +struct adminCompletion { + /** The completion */ + VDOCompletion completion; + /** The sub-task completion */ + VDOCompletion subTaskCompletion; + /** Whether this completion is in use */ + AtomicBool busy; + /** The operation type */ + AdminOperationType type; + /** Method to get the ThreadID for the current phase */ + ThreadIDGetterForPhase *getThreadID; + /** The current phase of the operation */ + uint32_t phase; +}; + +/** + * Check that an AdminCompletion's type is as expected. + * + * @param completion The AdminCompletion to check + * @param expected The expected type + **/ +void assertAdminOperationType(AdminCompletion *completion, + AdminOperationType expected); + +/** + * Convert the sub-task completion of an AdminCompletion to an AdminCompletion. + * + * @param completion the AdminCompletion's sub-task completion + * + * @return The sub-task completion as its enclosing AdminCompletion + **/ +AdminCompletion *adminCompletionFromSubTask(VDOCompletion *completion) + __attribute__((warn_unused_result)); + +/** + * Assert that we are operating on the correct thread for the current phase. + * + * @param adminCompletion The AdminCompletion to check + * @param what The method doing the phase check + * @param phaseNames The names of the phases of the current operation + **/ +void assertAdminPhaseThread(AdminCompletion *adminCompletion, + const char *what, + const char *phaseNames[]); + +/** + * Get the VDO from the sub-task completion of its AdminCompletion. + * + * @param completion the sub-task completion + * @param expected the expected operation type of the AdminCompletion + * + * @return The VDO + **/ +VDO *vdoFromAdminSubTask(VDOCompletion *completion, + AdminOperationType expected) + __attribute__((warn_unused_result)); + +/** + * Initialize an admin completion. + * + * @param vdo The VDO which owns the completion + * @param adminCompletion The AdminCompletion to initialize + * + * @return VDO_SUCCESS or an error + **/ +int initializeAdminCompletion(VDO *vdo, AdminCompletion *adminCompletion) + __attribute__((warn_unused_result)); + +/** + * Clean up an admin completion's resources. + * + * @param adminCompletion The AdminCompletion to uninitialize + **/ +void uninitializeAdminCompletion(AdminCompletion *adminCompletion); + +/** + * Reset an AdminCompletion's sub-task completion. + * + * @param completion The AdminCompletion's sub-task completion + * + * @return The sub-task completion for the convenience of callers + **/ +VDOCompletion *resetAdminSubTask(VDOCompletion *completion); + +/** + * Prepare the sub-task completion of a VDO's AdminCompletion + * + * @param vdo The VDO + * @param callback The callback for the sub-task + * @param errorHandler The error handler for the sub-task + * @param threadID The ID of the thread on which to run the callback + **/ +void prepareAdminSubTaskOnThread(VDO *vdo, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID); + +/** + * Prepare the sub-task completion of a VDO's AdminCompletion to run on the + * same thread as the AdminCompletion's main completion. 
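+ *
+ * A minimal sketch of how an operation's phases typically chain through
+ * this function (the phase and error-handler names here are hypothetical):
+ *
+ *   static void suspendPhase(VDOCompletion *completion)
+ *   {
+ *     VDO *vdo = vdoFromAdminSubTask(completion, ADMIN_OPERATION_SUSPEND);
+ *     prepareAdminSubTask(vdo, nextSuspendPhase, handleSuspendError);
+ *     // ... launch the asynchronous work; finishing the sub-task will
+ *     // eventually invoke nextSuspendPhase on the chosen thread ...
+ *   }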
+ * + * @param vdo The VDO + * @param callback The callback for the sub-task + * @param errorHandler The error handler for the sub-task + **/ +void prepareAdminSubTask(VDO *vdo, + VDOAction *callback, + VDOAction *errorHandler); + +/** + * Perform an administrative operation (load, suspend, grow logical, or grow + * physical). This method should not be called from base threads unless it is + * certain the calling thread won't be needed to perform the operation. It may + * (and should) be called from non-base threads. + * + * @param vdo The VDO on which to perform the operation + * @param type The type of operation to perform + * @param threadIDGetter A function for getting the ID of the thread on which + * a given phase should be run + * @param action The action which starts the operation + * @param errorHandler The error handler for the operation + * + * @return The result of the operation + **/ +int performAdminOperation(VDO *vdo, + AdminOperationType type, + ThreadIDGetterForPhase *threadIDGetter, + VDOAction *action, + VDOAction *errorHandler) + __attribute__((warn_unused_result)); + +#endif /* ADMIN_COMPLETION_H */ diff --git a/vdo/base/adminState.c b/vdo/base/adminState.c new file mode 100644 index 0000000..6b30315 --- /dev/null +++ b/vdo/base/adminState.c @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/adminState.c#14 $ + */ + +#include "adminState.h" + +#include "logger.h" +#include "permassert.h" + +#include "completion.h" +#include "types.h" + +/**********************************************************************/ +const char *getAdminStateCodeName(AdminStateCode code) +{ + switch (code) { + case ADMIN_STATE_NORMAL_OPERATION: + return "ADMIN_STATE_NORMAL_OPERATION"; + + case ADMIN_STATE_OPERATING: + return "ADMIN_STATE_OPERATING"; + + case ADMIN_STATE_FORMATTING: + return "ADMIN_STATE_FORMATTING"; + + case ADMIN_STATE_LOADING: + return "ADMIN_STATE_LOADING"; + + case ADMIN_STATE_LOADING_FOR_RECOVERY: + return "ADMIN_STATE_LOADING_FOR_RECOVERY"; + + case ADMIN_STATE_LOADING_FOR_REBUILD: + return "ADMIN_STATE_LOADING_FOR_REBUILD"; + + case ADMIN_STATE_NEW: + return "ADMIN_STATE_NEW"; + + case ADMIN_STATE_WAITING_FOR_RECOVERY: + return "ADMIN_STATE_WAITING_FOR_RECOVERY"; + + case ADMIN_STATE_RECOVERING: + return "ADMIN_STATE_RECOVERING"; + + case ADMIN_STATE_REBUILDING: + return "ADMIN_STATE_REBUILDING"; + + case ADMIN_STATE_SAVING: + return "ADMIN_STATE_SAVING"; + + case ADMIN_STATE_SAVED: + return "ADMIN_STATE_SAVED"; + + case ADMIN_STATE_SCRUBBING: + return "ADMIN_STATE_SCRUBBING"; + + case ADMIN_STATE_SAVE_FOR_SCRUBBING: + return "ADMIN_STATE_SAVE_FOR_SCRUBBING"; + + case ADMIN_STATE_SUSPENDING: + return "ADMIN_STATE_SUSPENDING"; + + case ADMIN_STATE_SUSPENDED: + return "ADMIN_STATE_SUSPENDED"; + + case ADMIN_STATE_SUSPENDED_OPERATION: + return "ADMIN_STATE_SUSPENDED_OPERATION"; + + case ADMIN_STATE_RESUMING: + return "ADMIN_STATE_RESUMING"; + + default: + return "INVALID ADMIN_STATE"; + } +} + +/**********************************************************************/ +const char *getAdminStateName(const AdminState *state) +{ + return getAdminStateCodeName(state->state); +} + +/**********************************************************************/ +static AdminStateCode getNextState(AdminStateCode previousState, + AdminStateCode operation) +{ + if (isQuiescingCode(operation)) { + return ((operation & ADMIN_TYPE_MASK) | ADMIN_FLAG_QUIESCENT); + } + + if (operation == ADMIN_STATE_SUSPENDED_OPERATION) { + return previousState; + } + + return ADMIN_STATE_NORMAL_OPERATION; +} + +/** + * Finish an operation if one is in progress. If there is a waiter, it will be + * notified. + * + * @param state The AdminState + * @param result The result of the operation + * + * @return true if an operation was in progress and has been + * finished. + **/ +static bool endOperation(AdminState *state, int result) +{ + if (!isOperating(state)) { + return false; + } + + if (state->starting) { + state->complete = true; + if (state->waiter != NULL) { + setCompletionResult(state->waiter, result); + } + } else { + state->complete = false; + state->state = state->nextState; + releaseCompletionWithResult(&state->waiter, result); + } + + return true; +} + +/** + * Begin an operation if it may be started given the current state. 
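+ *
+ * As a worked example of the state arithmetic above: beginning
+ * ADMIN_STATE_SUSPENDING from ADMIN_STATE_NORMAL_OPERATION records a next
+ * state of (ADMIN_TYPE_SUSPEND | ADMIN_FLAG_QUIESCENT), which is
+ * ADMIN_STATE_SUSPENDED, so the state becomes quiescent when the drain
+ * ends. ADMIN_STATE_SUSPENDED_OPERATION instead restores the previous
+ * state when it finishes, and any other non-quiescing operation returns
+ * to ADMIN_STATE_NORMAL_OPERATION.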
+ * + * @param state The AdminState + * @param operation The operation to begin + * @param waiter A completion to notify when the operation is complete; may + * be NULL + * @param initiator The AdminInitiator to call if the operation may begin; may + * be NULL + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int beginOperation(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator) +{ + int result; + if (isOperating(state) + || (isQuiescent(state) != isQuiescentOperation(operation))) { + result = logErrorWithStringError(VDO_INVALID_ADMIN_STATE, + "Can't start %s from %s", + getAdminStateCodeName(operation), + getAdminStateName(state)); + } else if (state->waiter != NULL) { + result = logErrorWithStringError(VDO_COMPONENT_BUSY, + "Can't start %s with extant waiter", + getAdminStateCodeName(operation)); + } else { + state->waiter = waiter; + state->nextState = getNextState(state->state, operation); + state->state = operation; + if (initiator != NULL) { + state->starting = true; + initiator(state); + state->starting = false; + if (state->complete) { + endOperation(state, VDO_SUCCESS); + } + } + + return VDO_SUCCESS; + } + + if (waiter != NULL) { + finishCompletion(waiter, result); + } + + return result; +} + +/** + * Check the result of a state validation. If the result failed, log an invalid + * state error and, if there is a waiter, notify it. + * + * @param valid true if the code is of an appropriate type + * @param code The code which failed to be of the correct type + * @param what What the code failed to be, for logging + * @param waiter The completion to notify of the error; may be NULL + * + * @return The result of the check + **/ +static bool checkCode(bool valid, + AdminStateCode code, + const char *what, + VDOCompletion *waiter) +{ + if (valid) { + return true; + } + + int result = logErrorWithStringError(VDO_INVALID_ADMIN_STATE, + "%s is not a %s", + getAdminStateCodeName(code), what); + if (waiter != NULL) { + finishCompletion(waiter, result); + } + + return false; +} + +/**********************************************************************/ +bool assertDrainOperation(AdminStateCode operation, VDOCompletion *waiter) +{ + return checkCode(isDrainOperation(operation), operation, "drain operation", + waiter); +} + +/**********************************************************************/ +bool startDraining(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator) +{ + return (assertDrainOperation(operation, waiter) + && (beginOperation(state, operation, waiter, initiator) + == VDO_SUCCESS)); +} + +/**********************************************************************/ +bool finishDraining(AdminState *state) +{ + return finishDrainingWithResult(state, VDO_SUCCESS); +} + +/**********************************************************************/ +bool finishDrainingWithResult(AdminState *state, int result) +{ + return (isDraining(state) && endOperation(state, result)); +} + +/**********************************************************************/ +bool assertLoadOperation(AdminStateCode operation, VDOCompletion *waiter) +{ + return checkCode(isLoadOperation(operation), operation, "load operation", + waiter); +} + +/**********************************************************************/ +bool startLoading(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator) +{ + return (assertLoadOperation(operation, waiter) + 
&& (beginOperation(state, operation, waiter, initiator) + == VDO_SUCCESS)); +} + +/**********************************************************************/ +bool finishLoading(AdminState *state) +{ + return finishLoadingWithResult(state, VDO_SUCCESS); +} + +/**********************************************************************/ +bool finishLoadingWithResult(AdminState *state, int result) +{ + return (isLoading(state) && endOperation(state, result)); +} + +/**********************************************************************/ +bool assertResumeOperation(AdminStateCode operation, VDOCompletion *waiter) +{ + return checkCode(isResumeOperation(operation), operation, "resume operation", + waiter); +} + +/**********************************************************************/ +bool startResuming(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator) +{ + return (assertResumeOperation(operation, waiter) + && (beginOperation(state, operation, waiter, initiator) + == VDO_SUCCESS)); +} + +/**********************************************************************/ +bool finishResuming(AdminState *state) +{ + return finishResumingWithResult(state, VDO_SUCCESS); +} + +/**********************************************************************/ +bool finishResumingWithResult(AdminState *state, int result) +{ + return (isResuming(state) && endOperation(state, result)); +} + +/**********************************************************************/ +int resumeIfQuiescent(AdminState *state) +{ + if (!isQuiescent(state)) { + return VDO_INVALID_ADMIN_STATE; + } + + state->state = ADMIN_STATE_NORMAL_OPERATION; + return VDO_SUCCESS; +} + +/** + * Check whether an AdminStateCode is an operation. + * + * @param code The operation to check + * @param waiter The completion to notify if the code is not an operation; may + * be NULL + * + * @return true if the code is an operation + **/ +static bool assertOperation(AdminStateCode code, VDOCompletion *waiter) +{ + return checkCode(isOperation(code), code, "operation", waiter); +} + +/**********************************************************************/ +int startOperation(AdminState *state, AdminStateCode operation) +{ + return (assertOperation(operation, NULL) + ? beginOperation(state, operation, NULL, NULL) + : VDO_INVALID_ADMIN_STATE); +} + +/**********************************************************************/ +bool startOperationWithWaiter(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator) +{ + return (assertOperation(operation, waiter) + && (beginOperation(state, operation, waiter, initiator) + == VDO_SUCCESS)); +} + +/**********************************************************************/ +bool finishOperation(AdminState *state) +{ + return finishOperationWithResult(state, VDO_SUCCESS); +} + +/**********************************************************************/ +bool finishOperationWithResult(AdminState *state, int result) +{ + return endOperation(state, result); +} diff --git a/vdo/base/adminState.h b/vdo/base/adminState.h new file mode 100644 index 0000000..5ab13cb --- /dev/null +++ b/vdo/base/adminState.h @@ -0,0 +1,666 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/adminState.h#17 $ + */ + +#ifndef ADMIN_STATE_H +#define ADMIN_STATE_H + +#include "completion.h" +#include "types.h" + +/** + * The list of state types. + **/ +typedef enum { + /** Normal operation, DataVIOs may be active */ + ADMIN_TYPE_NORMAL = 0, + /** + * Format: an operation for formatting a new VDO. + **/ + ADMIN_TYPE_FORMAT, + /** + * Recover: a recovery operation. + **/ + ADMIN_TYPE_RECOVER, + /** + * Rebuild: write data necessary for a full rebuild, drain outstanding I/O, + * and return to normal operation. + **/ + ADMIN_TYPE_REBUILD, + /** + * Save: write all dirty metadata thereby restoring the VDO to a clean state, + * drain outstanding I/O, and become quiescent. + **/ + ADMIN_TYPE_SAVE, + /** + * Scrub: load and/or save state necessary to scrub a slab. + **/ + ADMIN_TYPE_SCRUB, + /** + * Suspend: write enough dirty metadata to perform resize transactions, + * drain outstanding I/O, and become quiescent. + **/ + ADMIN_TYPE_SUSPEND, + /** + * Resume: return to normal from a quiescent state + **/ + ADMIN_TYPE_RESUME, + /** The mask for extracting the AdminType from and AdminStateCode */ + ADMIN_TYPE_MASK = 0xff, +} AdminType; + + +/** + * The bit position of flags used to categorize states. + **/ +typedef enum { + ADMIN_FLAG_BIT_START = 8, + /** Flag indicating that I/O is draining */ + ADMIN_FLAG_BIT_DRAINING = ADMIN_FLAG_BIT_START, + /** Flag indicating a load operation */ + ADMIN_FLAG_BIT_LOADING, + /** Flag indicating that the next state will be a quiescent state */ + ADMIN_FLAG_BIT_QUIESCING, + /** Flag indicating that the state is quiescent */ + ADMIN_FLAG_BIT_QUIESCENT, + /** + * Flag indicating that an operation is in progress and so no other + * operation may be started. + **/ + ADMIN_FLAG_BIT_OPERATING, +} AdminFlagBit; + +/** + * The flags themselves. + **/ +typedef enum { + ADMIN_FLAG_DRAINING = (uint32_t) (1 << ADMIN_FLAG_BIT_DRAINING), + ADMIN_FLAG_LOADING = (uint32_t) (1 << ADMIN_FLAG_BIT_LOADING), + ADMIN_FLAG_QUIESCING = (uint32_t) (1 << ADMIN_FLAG_BIT_QUIESCING), + ADMIN_FLAG_QUIESCENT = (uint32_t) (1 << ADMIN_FLAG_BIT_QUIESCENT), + ADMIN_FLAG_OPERATING = (uint32_t) (1 << ADMIN_FLAG_BIT_OPERATING), +} AdminFlag; + +/** + * The state codes. 
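+ *
+ * Each code is the bitwise OR of an AdminType (in the low byte, extracted
+ * with ADMIN_TYPE_MASK) and zero or more AdminFlags. For example,
+ * ADMIN_STATE_SAVING below is
+ * (ADMIN_TYPE_SAVE | ADMIN_FLAG_OPERATING | ADMIN_FLAG_DRAINING
+ *  | ADMIN_FLAG_QUIESCING), while the quiescent state it leads to,
+ * ADMIN_STATE_SAVED, is just (ADMIN_TYPE_SAVE | ADMIN_FLAG_QUIESCENT).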
+ **/ +typedef enum { + ADMIN_STATE_NORMAL_OPERATION = ADMIN_TYPE_NORMAL, + ADMIN_STATE_OPERATING = (ADMIN_TYPE_NORMAL + | ADMIN_FLAG_OPERATING), + ADMIN_STATE_FORMATTING = (ADMIN_TYPE_FORMAT + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_LOADING), + ADMIN_STATE_LOADING = (ADMIN_TYPE_NORMAL + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_LOADING), + ADMIN_STATE_LOADING_FOR_RECOVERY = (ADMIN_TYPE_RECOVER + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_LOADING), + ADMIN_STATE_LOADING_FOR_REBUILD = (ADMIN_TYPE_REBUILD + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_LOADING), + ADMIN_STATE_WAITING_FOR_RECOVERY = (ADMIN_TYPE_RECOVER + | ADMIN_FLAG_OPERATING), + ADMIN_STATE_NEW = (ADMIN_TYPE_NORMAL + | ADMIN_FLAG_QUIESCENT), + ADMIN_STATE_RECOVERING = (ADMIN_TYPE_RECOVER + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING), + ADMIN_STATE_REBUILDING = (ADMIN_TYPE_REBUILD + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING), + ADMIN_STATE_SAVING = (ADMIN_TYPE_SAVE + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING + | ADMIN_FLAG_QUIESCING), + ADMIN_STATE_SAVED = (ADMIN_TYPE_SAVE + | ADMIN_FLAG_QUIESCENT), + ADMIN_STATE_SCRUBBING = (ADMIN_TYPE_SCRUB + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING + | ADMIN_FLAG_LOADING), + ADMIN_STATE_SAVE_FOR_SCRUBBING = (ADMIN_TYPE_SCRUB + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING), + ADMIN_STATE_SUSPENDING = (ADMIN_TYPE_SUSPEND + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING + | ADMIN_FLAG_QUIESCING), + ADMIN_STATE_SUSPENDED = (ADMIN_TYPE_SUSPEND + | ADMIN_FLAG_QUIESCENT), + ADMIN_STATE_SUSPENDED_OPERATION = (ADMIN_TYPE_SUSPEND + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_QUIESCENT), + ADMIN_STATE_RESUMING = (ADMIN_TYPE_RESUME + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_QUIESCENT), +} AdminStateCode; + +typedef struct { + /** The current administrative state */ + AdminStateCode state; + /** The next administrative state (when the current operation finishes */ + AdminStateCode nextState; + /** A completion waiting on a state change */ + VDOCompletion *waiter; + /** Whether an operation is being initiated */ + bool starting; + /** Whether an operation has completed in the initiator */ + bool complete; +} AdminState; + +/** + * A method to be called once an admin operation may be initiated. + **/ +typedef void AdminInitiator(AdminState *state); + +/** + * Get the name of an AdminStateCode for logging purposes. + * + * @param code The AdminStateCode + * + * @return The name of the state's code + **/ +const char *getAdminStateCodeName(AdminStateCode code) + __attribute__((warn_unused_result)); + +/** + * Get the name of an AdminState's code for logging purposes. + * + * @param state The AdminState + * + * @return The name of the state's code + **/ +const char *getAdminStateName(const AdminState *state) + __attribute__((warn_unused_result)); + +/** + * Check whether an AdminState is in normal operation. + * + * @param state The AdminState to query + * + * @return true if the state is normal + **/ +__attribute__((warn_unused_result)) +static inline bool isNormal(AdminState *state) +{ + return ((state->state & ADMIN_TYPE_MASK) == ADMIN_TYPE_NORMAL); +} + +/** + * Check whether an AdminStateCode is an operation. + * + * @param code The code to check + * + * @return true if the code is an operation + **/ +__attribute__((warn_unused_result)) +static inline bool isOperation(AdminStateCode code) +{ + return ((code & ADMIN_FLAG_OPERATING) == ADMIN_FLAG_OPERATING); +} + +/** + * Check whether an AdminState is operating. 
+ * + * @param state The AdminState to query + * + * @return true if the state is operating + **/ +__attribute__((warn_unused_result)) +static inline bool isOperating(AdminState *state) +{ + return isOperation(state->state); +} + +/** + * Check whether an AdminState is suspending. + * + * @param state The AdminState to query + * + * @return true if the state is suspending + **/ +__attribute__((warn_unused_result)) +static inline bool isSuspending(AdminState *state) +{ + return (state->state == ADMIN_STATE_SUSPENDING); +} + +/** + * Check whether an AdminState is suspended. + * + * @param state The AdminState to query + * + * @return true if the state is suspended + **/ +__attribute__((warn_unused_result)) +static inline bool isSuspended(AdminState *state) +{ + return (state->state == ADMIN_STATE_SUSPENDED); +} + +/** + * Check whether an AdminState is saving. + * + * @param state The AdminState to query + * + * @return true if the state is saving + **/ +__attribute__((warn_unused_result)) +static inline bool isSaving(AdminState *state) +{ + return (state->state == ADMIN_STATE_SAVING); +} + +/** + * Check whether an AdminState is saved. + * + * @param state The AdminState to query + * + * @return true if the state is saved + **/ +__attribute__((warn_unused_result)) +static inline bool isSaved(AdminState *state) +{ + return (state->state == ADMIN_STATE_SAVED); +} + +/** + * Check whether an AdminStateCode is a drain operation. + * + * @param code The AdminStateCode to check + * + * @return true if the code is for a drain operation + **/ +__attribute__((warn_unused_result)) +static inline bool isDrainOperation(AdminStateCode code) +{ + return ((code & ADMIN_FLAG_DRAINING) == ADMIN_FLAG_DRAINING); +} + +/** + * Check whether an AdminState is draining. + * + * @param state The AdminState to query + * + * @return true if the state is draining + **/ +__attribute__((warn_unused_result)) +static inline bool isDraining(AdminState *state) +{ + return isDrainOperation(state->state); +} + +/** + * Check whether an AdminStateCode is a load operation. + * + * @param code The AdminStateCode to check + * + * @return true if the code is for a load operation + **/ +__attribute__((warn_unused_result)) +static inline bool isLoadOperation(AdminStateCode code) +{ + return ((code & ADMIN_FLAG_LOADING) == ADMIN_FLAG_LOADING); +} + +/** + * Check whether an AdminState is loading. + * + * @param state The AdminState to query + * + * @return true if the state is loading + **/ +__attribute__((warn_unused_result)) +static inline bool isLoading(AdminState *state) +{ + return isLoadOperation(state->state); +} + +/** + * Check whether an AdminStateCode is a resume operation. + * + * @param code The AdminStateCode to check + * + * @return true if the code is for a resume operation + **/ +__attribute__((warn_unused_result)) +static inline bool isResumeOperation(AdminStateCode code) +{ + return ((code & ADMIN_TYPE_MASK) == ADMIN_TYPE_RESUME); +} + +/** + * Check whether an AdminState is resumeing. + * + * @param state The AdminState to query + * + * @return true if the state is resumeing + **/ +__attribute__((warn_unused_result)) +static inline bool isResuming(AdminState *state) +{ + return isResumeOperation(state->state); +} + +/** + * Check whether an AdminState is doing a clean load. 
+ * + * @param state The AdminState to query + * + * @return true if the state is a clean load + **/ +__attribute__((warn_unused_result)) +static inline bool isCleanLoad(AdminState *state) +{ + return ((state->state == ADMIN_STATE_FORMATTING) + || (state->state == ADMIN_STATE_LOADING)); +} + +/** + * Check whether an AdminStateCode is quiescing. + * + * param code The AdminStateCode to check + * + * @return true is the state is quiescing + **/ +__attribute__((warn_unused_result)) +static inline bool isQuiescingCode(AdminStateCode code) +{ + return ((code & ADMIN_FLAG_QUIESCING) == ADMIN_FLAG_QUIESCING); +} + +/** + * Check whether an AdminState is quiescing. + * + * @param state The AdminState to check + * + * @return true if the state is quiescing + **/ +__attribute__((warn_unused_result)) +static inline bool isQuiescing(AdminState *state) +{ + return isQuiescingCode(state->state); +} + +/** + * Check where an AdminStateCode is quiescent. + * + * param code The AdminStateCode to check + * + * @return true is the state is quiescent + **/ +__attribute__((warn_unused_result)) +static inline bool isQuiescentCode(AdminStateCode code) +{ + return ((code & ADMIN_FLAG_QUIESCENT) == ADMIN_FLAG_QUIESCENT); +} + +/** + * Check whether an AdminState is quiescent. + * + * @param state The AdminState to query + * + * @return true is the state is quiescent + **/ +__attribute__((warn_unused_result)) +static inline bool isQuiescent(AdminState *state) +{ + return isQuiescentCode(state->state); +} + +/** + * Check whether an AdminStateCode is a quiescent operation. + * + * @param code The code to check + * + * @return true if the code is a quiescent operation + **/ +__attribute__((warn_unused_result)) +static inline bool isQuiescentOperation(AdminStateCode code) +{ + return (isQuiescentCode(code) && isOperation(code)); +} + +/** + * Check that an operation is a drain. + * + * @param operation The operation to check + * @param waiter The completion to finish with an error if the operation is + * not a drain + * + * @return true if the specified operation is a drain + **/ +bool assertDrainOperation(AdminStateCode operation, VDOCompletion *waiter) + __attribute__((warn_unused_result)); + +/** + * Initiate a drain operation if the current state permits it. + * + * @param state The AdminState + * @param operation The type of drain to initiate + * @param waiter The completion to notify when the drain is complete; may + * be NULL + * @param initiator The AdminInitiator to call if the operation may begin; may + * be NULL + * + * @return true if the drain was initiated, if not the waiter + * will be notified + **/ +bool startDraining(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator); + +/** + * Finish a drain operation if one was in progress. + * + * @param state The AdminState to query + * + * @return true if the state was draining; will notify the waiter + * if so + **/ +bool finishDraining(AdminState *state); + +/** + * Finish a drain operation with a status code. + * + * @param state The AdminState to query + * @param result The result of the drain operation + * + * @return true if the state was draining; will notify the + * waiter if so + **/ +bool finishDrainingWithResult(AdminState *state, int result); + +/** + * Check that an operation is a load. 
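+ *
+ * The load helpers declared below are typically driven in a pattern like
+ * this illustrative sketch ('component', 'parent', and 'initiateLoad' are
+ * hypothetical names):
+ *
+ *   if (startLoading(&component->state, ADMIN_STATE_LOADING, parent,
+ *                    initiateLoad)) {
+ *     // initiateLoad() has launched the asynchronous load; when it is
+ *     // done, a callback calls:
+ *     //   finishLoadingWithResult(&component->state, result);
+ *   }
+ *   // If the load could not start, the parent has already been notified.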
+ * + * @param operation The operation to check + * @param waiter The completion to finish with an error if the operation is + * not a load + * + * @return true if the specified operation is a load + **/ +bool assertLoadOperation(AdminStateCode operation, VDOCompletion *waiter) + __attribute__((warn_unused_result)); + +/** + * Initiate a load operation if the current state permits it. + * + * @param state The AdminState + * @param operation The type of load to initiate + * @param waiter The completion to notify when the load is complete; may be + * NULL + * @param initiator The AdminInitiator to call if the operation may begin; may + * be NULL + * + * @return true if the load was initiated, if not the waiter + * will be notified + **/ +bool startLoading(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator); + +/** + * Finish a load operation if one was in progress. + * + * @param state The AdminState to query + * + * @return true if the state was loading; will notify the waiter + * if so + **/ +bool finishLoading(AdminState *state); + +/** + * Finish a load operation with a status code. + * + * @param state The AdminState to query + * @param result The result of the load operation + * + * @return true if the state was loading; will notify the + * waiter if so + **/ +bool finishLoadingWithResult(AdminState *state, int result); + +/** + * Check whether an AdminStateCode is a resume operation. + * + * @param operation The operation to check + * @param waiter The completion to notify if the operation is not a resume + * operation; may be NULL + * + * @return true if the code is a resume operation + **/ +bool assertResumeOperation(AdminStateCode operation, VDOCompletion *waiter); + +/** + * Initiate a resume operation if the current state permits it. + * + * @param state The AdminState + * @param operation The type of resume to start + * @param waiter The completion to notify when the resume is complete; may + * be NULL + * @param initiator The AdminInitiator to call if the operation may begin; may + * be NULL + * + * @return true if the resume was initiated, if not the waiter + * will be notified + **/ +bool startResuming(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator); + +/** + * Finish a resume operation if one was in progress. + * + * @param state The AdminState to query + * + * @return true if the state was resuming; will notify the waiter + * if so + **/ +bool finishResuming(AdminState *state); + +/** + * Finish a resume operation with a status code. + * + * @param state The AdminState to query + * @param result The result of the resume operation + * + * @return true if the state was resuming; will notify the + * waiter if so + **/ +bool finishResumingWithResult(AdminState *state, int result); + +/** + * Change the state to normal operation if the current state is quiescent. + * + * @param state The AdminState to resume + * + * @return VDO_SUCCESS if the state resumed, VDO_INVALID_ADMIN_STATE otherwise + **/ +int resumeIfQuiescent(AdminState *state); + +/** + * Attempt to start an operation. + * + * @param state the AdminState + * @param operation the operation to start + * + * @return VDO_SUCCESS if the operation was started + * VDO_INVALID_ADMIN_STATE if not + **/ +int startOperation(AdminState *state, AdminStateCode operation); + +/** + * Attempt to start an operation. 
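+ *
+ * For illustration, the waiterless variant declared above is commonly used
+ * to bracket work which is only legal while quiescent (finishOperation()
+ * is declared below):
+ *
+ *   if (startOperation(&state, ADMIN_STATE_SUSPENDED_OPERATION)
+ *       == VDO_SUCCESS) {
+ *     // ... work which requires the component to remain suspended ...
+ *     finishOperation(&state);
+ *   }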
+ * + * @param state the AdminState + * @param operation the operation to start + * @param waiter the completion to notify when the operation completes or + * fails to start; may be NULL + * @param initiator The AdminInitiator to call if the operation may begin; may + * be NULL + * + * @return true if the operation was started + **/ +bool startOperationWithWaiter(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator); + +/** + * Finish the current operation. Will notify the operation waiter if there is + * one. This method should be used for operations started with + * startOperation(). For operations which were started with startDraining(), + * use finishDraining() instead. + * + * @param state The state whose operation is to be finished + * + * @return true if there was an operation to finish + **/ +bool finishOperation(AdminState *state); + +/** + * Finish the current operation with a status code. Will notify the operation + * waiter if there is one. + * + * @param state The state whose operation is to be finished + * @param result The result of the operation + **/ +bool finishOperationWithResult(AdminState *state, int result); + +/** + * Set a result for the current operation. + * + * @param state the AdminState + * @param result the result to set; if there is no waiter, this is a no-op + **/ +static inline void setOperationResult(AdminState *state, int result) +{ + if (state->waiter != NULL) { + setCompletionResult(state->waiter, result); + } +} + +#endif // ADMIN_STATE_H diff --git a/vdo/base/allocatingVIO.c b/vdo/base/allocatingVIO.c new file mode 100644 index 0000000..4e0ffa8 --- /dev/null +++ b/vdo/base/allocatingVIO.c @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocatingVIO.c#4 $ + */ + +#include "allocatingVIO.h" + +#include "logger.h" + +#include "allocationSelector.h" +#include "blockAllocator.h" +#include "dataVIO.h" +#include "pbnLock.h" +#include "slabDepot.h" +#include "types.h" +#include "vdoInternal.h" +#include "vioWrite.h" + +/** + * Make a single attempt to acquire a write lock on a newly-allocated PBN. 
+ * + * @param allocatingVIO The AllocatingVIO that wants a write lock for its + * newly allocated block + * + * @return VDO_SUCCESS or an error code + **/ +static int attemptPBNWriteLock(AllocatingVIO *allocatingVIO) +{ + assertInPhysicalZone(allocatingVIO); + + ASSERT_LOG_ONLY(allocatingVIO->allocationLock == NULL, + "must not acquire a lock while already referencing one"); + + PBNLock *lock; + int result = attemptPBNLock(allocatingVIO->zone, allocatingVIO->allocation, + allocatingVIO->writeLockType, &lock); + if (result != VDO_SUCCESS) { + return result; + } + + if (lock->holderCount > 0) { + // This block is already locked, which should be impossible. + return logErrorWithStringError(VDO_LOCK_ERROR, + "Newly allocated block %" PRIu64 + " was spuriously locked (holderCount=%u)", + allocatingVIO->allocation, + lock->holderCount); + } + + // We've successfully acquired a new lock, so mark it as ours. + lock->holderCount += 1; + allocatingVIO->allocationLock = lock; + assignProvisionalReference(lock); + return VDO_SUCCESS; +} + +/** + * Attempt to allocate and lock a physical block. If successful, continue + * along the write path. + * + * @param allocatingVIO The AllocatingVIO which needs an allocation + * + * @return VDO_SUCCESS or an error if a block could not be allocated + **/ +static int allocateAndLockBlock(AllocatingVIO *allocatingVIO) +{ + BlockAllocator *allocator = getBlockAllocator(allocatingVIO->zone); + int result = allocateBlock(allocator, &allocatingVIO->allocation); + if (result != VDO_SUCCESS) { + return result; + } + + result = attemptPBNWriteLock(allocatingVIO); + if (result != VDO_SUCCESS) { + return result; + } + + // We got a block! + VIO *vio = allocatingVIOAsVIO(allocatingVIO); + vio->physical = allocatingVIO->allocation; + allocatingVIO->allocationCallback(allocatingVIO); + return VDO_SUCCESS; +} + +static void allocateBlockForWrite(VDOCompletion *completion); + +/** + * Retry allocating a block for write. + * + * @param waiter The AllocatingVIO that was waiting to allocate + * @param context The context (unused) + **/ +static void +retryAllocateBlockForWrite(Waiter *waiter, + void *context __attribute__((unused))) +{ + AllocatingVIO *allocatingVIO = waiterAsAllocatingVIO(waiter); + allocateBlockForWrite(allocatingVIOAsCompletion(allocatingVIO)); +} + +/** + * Attempt to enqueue an AllocatingVIO to wait for a slab to be scrubbed in the + * current allocation zone. + * + * @param allocatingVIO The AllocatingVIO which wants to allocate a block + * + * @return VDO_SUCCESS if the AllocatingVIO was queued, VDO_NO_SPACE if there + * are no slabs to be scrubbed in the current zone, or some other + * error + **/ +static int waitForCleanSlab(AllocatingVIO *allocatingVIO) +{ + Waiter *waiter = allocatingVIOAsWaiter(allocatingVIO); + waiter->callback = retryAllocateBlockForWrite; + + BlockAllocator *allocator = getBlockAllocator(allocatingVIO->zone); + int result = enqueueForCleanSlab(allocator, waiter); + if (result != VDO_SUCCESS) { + return result; + } + + // We've successfully enqueued, when we come back, pretend like we've + // never tried this allocation before. + allocatingVIO->waitForCleanSlab = false; + allocatingVIO->allocationAttempts = 0; + return VDO_SUCCESS; +} + +/** + * Attempt to allocate a block in an AllocatingVIO's current allocation zone. 
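+ *
+ * In outline, the retry policy implemented here is: each physical zone is
+ * tried once for a free block. If every zone reports VDO_NO_SPACE,
+ * waitForCleanSlab is set and a second pass begins in which each zone is
+ * asked again and, failing that, the VIO attempts to queue for a slab
+ * awaiting scrubbing in that zone. Only when the second pass also visits
+ * every zone without success is the allocation callback invoked with no
+ * allocation (the allocation field still holds ZERO_BLOCK).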
+ * + * @param allocatingVIO The AllocatingVIO + * + * @return VDO_SUCCESS or an error + **/ +static int allocateBlockInZone(AllocatingVIO *allocatingVIO) +{ + allocatingVIO->allocationAttempts++; + int result = allocateAndLockBlock(allocatingVIO); + if (result != VDO_NO_SPACE) { + return result; + } + + if (allocatingVIO->waitForCleanSlab) { + result = waitForCleanSlab(allocatingVIO); + if (result != VDO_NO_SPACE) { + return result; + } + } + + VDO *vdo = getVDOFromAllocatingVIO(allocatingVIO); + const ThreadConfig *threadConfig = getThreadConfig(vdo); + if (allocatingVIO->allocationAttempts >= threadConfig->physicalZoneCount) { + if (allocatingVIO->waitForCleanSlab) { + // There were no free blocks in any zone, and no zone had slabs to + // scrub. + allocatingVIO->allocationCallback(allocatingVIO); + return VDO_SUCCESS; + } + + allocatingVIO->waitForCleanSlab = true; + allocatingVIO->allocationAttempts = 0; + } + + // Try the next zone + ZoneCount zoneNumber = getPhysicalZoneNumber(allocatingVIO->zone) + 1; + if (zoneNumber == threadConfig->physicalZoneCount) { + zoneNumber = 0; + } + allocatingVIO->zone = vdo->physicalZones[zoneNumber]; + launchPhysicalZoneCallback(allocatingVIO, allocateBlockForWrite, + THIS_LOCATION("$F;cb=allocBlockInZone")); + return VDO_SUCCESS; +} + +/** + * Attempt to allocate a block. This callback is registered in + * allocateDataBlock() and allocateBlockInZone(). + * + * @param completion The AllocatingVIO needing an allocation + **/ +static void allocateBlockForWrite(VDOCompletion *completion) +{ + AllocatingVIO *allocatingVIO = asAllocatingVIO(completion); + assertInPhysicalZone(allocatingVIO); + allocatingVIOAddTraceRecord(allocatingVIO, THIS_LOCATION(NULL)); + int result = allocateBlockInZone(allocatingVIO); + if (result != VDO_SUCCESS) { + setCompletionResult(completion, result); + allocatingVIO->allocationCallback(allocatingVIO); + } +} + +/**********************************************************************/ +void allocateDataBlock(AllocatingVIO *allocatingVIO, + AllocationSelector *selector, + PBNLockType writeLockType, + AllocationCallback *callback) +{ + allocatingVIO->writeLockType = writeLockType; + allocatingVIO->allocationCallback = callback; + allocatingVIO->allocationAttempts = 0; + allocatingVIO->allocation = ZERO_BLOCK; + + VIO *vio = allocatingVIOAsVIO(allocatingVIO); + allocatingVIO->zone + = vio->vdo->physicalZones[getNextAllocationZone(selector)]; + + launchPhysicalZoneCallback(allocatingVIO, allocateBlockForWrite, + THIS_LOCATION("$F;cb=allocDataBlock")); +} + +/**********************************************************************/ +void releaseAllocationLock(AllocatingVIO *allocatingVIO) +{ + assertInPhysicalZone(allocatingVIO); + PhysicalBlockNumber lockedPBN = allocatingVIO->allocation; + if (hasProvisionalReference(allocatingVIO->allocationLock)) { + allocatingVIO->allocation = ZERO_BLOCK; + } + + releasePBNLock(allocatingVIO->zone, lockedPBN, + &allocatingVIO->allocationLock); +} + +/**********************************************************************/ +void resetAllocation(AllocatingVIO *allocatingVIO) +{ + ASSERT_LOG_ONLY(allocatingVIO->allocationLock == NULL, + "must not reset allocation while holding a PBN lock"); + + allocatingVIOAsVIO(allocatingVIO)->physical = ZERO_BLOCK; + allocatingVIO->zone = NULL; + allocatingVIO->allocation = ZERO_BLOCK; + allocatingVIO->allocationAttempts = 0; + allocatingVIO->waitForCleanSlab = false; +} diff --git a/vdo/base/allocatingVIO.h b/vdo/base/allocatingVIO.h new file mode 100644 index 
0000000..a2f2b7b --- /dev/null +++ b/vdo/base/allocatingVIO.h @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocatingVIO.h#4 $ + */ + +#ifndef ALLOCATING_VIO_H +#define ALLOCATING_VIO_H + +#include "atomic.h" +#include "pbnLock.h" +#include "physicalZone.h" +#include "types.h" +#include "vio.h" +#include "waitQueue.h" + +typedef void AllocationCallback(AllocatingVIO *allocationVIO); + +/** + * A VIO which can receive an allocation from the block allocator. Currently, + * these are used both for servicing external data requests and for compressed + * block writes. + **/ +struct allocatingVIO { + /** The underlying VIO */ + VIO vio; + + /** The WaitQueue entry structure */ + Waiter waiter; + + /** The physical zone in which to allocate a physical block */ + PhysicalZone *zone; + + /** The block allocated to this VIO */ + PhysicalBlockNumber allocation; + + /** + * If non-NULL, the pooled PBN lock held on the allocated block. Must be a + * write lock until the block has been written, after which it will become a + * read lock. + **/ + PBNLock *allocationLock; + + /** The type of write lock to obtain on the allocated block */ + PBNLockType writeLockType; + + /** The number of zones in which this VIO has attempted an allocation */ + ZoneCount allocationAttempts; + + /** Whether this VIO should wait for a clean slab */ + bool waitForCleanSlab; + + /** The function to call once allocation is complete */ + AllocationCallback *allocationCallback; +}; + +/** + * Convert a VIO to an AllocatingVIO. + * + * @param vio The VIO to convert + * + * @return The VIO as an AllocatingVIO + **/ +static inline AllocatingVIO *vioAsAllocatingVIO(VIO *vio) +{ + STATIC_ASSERT(offsetof(AllocatingVIO, vio) == 0); + ASSERT_LOG_ONLY(((vio->type == VIO_TYPE_DATA) + || (vio->type == VIO_TYPE_COMPRESSED_BLOCK)), + "VIO is an AllocatingVIO"); + return (AllocatingVIO *) vio; +} + +/** + * Convert an AllocatingVIO to a VIO. + * + * @param allocatingVIO The AllocatingVIO to convert + * + * @return The AllocatingVIO as a VIO + **/ +static inline VIO *allocatingVIOAsVIO(AllocatingVIO *allocatingVIO) +{ + return &allocatingVIO->vio; +} + +/** + * Convert a generic VDOCompletion to an AllocatingVIO. + * + * @param completion The completion to convert + * + * @return The completion as an AllocatingVIO + **/ +static inline AllocatingVIO *asAllocatingVIO(VDOCompletion *completion) +{ + return vioAsAllocatingVIO(asVIO(completion)); +} + +/** + * Convert an AllocatingVIO to a generic completion. 
+ * + * @param allocatingVIO The AllocatingVIO to convert + * + * @return The AllocatingVIO as a completion + **/ +static inline +VDOCompletion *allocatingVIOAsCompletion(AllocatingVIO *allocatingVIO) +{ + return vioAsCompletion(allocatingVIOAsVIO(allocatingVIO)); +} + +/** + * Convert an AllocatingVIO to a generic wait queue entry. + * + * @param allocatingVIO The AllocatingVIO to convert + * + * @return The AllocatingVIO as a wait queue entry + **/ +static inline Waiter *allocatingVIOAsWaiter(AllocatingVIO *allocatingVIO) +{ + return &allocatingVIO->waiter; +} + +/** + * Convert an AllocatingVIO's generic wait queue entry back to the + * AllocatingVIO. + * + * @param waiter The wait queue entry to convert + * + * @return The wait queue entry as an AllocatingVIO + **/ +static inline AllocatingVIO *waiterAsAllocatingVIO(Waiter *waiter) +{ + if (waiter == NULL) { + return NULL; + } + + return + (AllocatingVIO *) ((uintptr_t) waiter - offsetof(AllocatingVIO, waiter)); +} + +/** + * Check whether an AllocatingVIO is a compressed block write. + * + * @param allocatingVIO The AllocatingVIO to check + * + * @return true if the AllocatingVIO is a compressed block write + **/ +static inline bool isCompressedWriteAllocatingVIO(AllocatingVIO *allocatingVIO) +{ + return isCompressedWriteVIO(allocatingVIOAsVIO(allocatingVIO)); +} + +/** + * Add a trace record for the current source location. + * + * @param allocatingVIO The AllocatingVIO structure to be updated + * @param location The source-location descriptor to be recorded + **/ +static inline void allocatingVIOAddTraceRecord(AllocatingVIO *allocatingVIO, + TraceLocation location) +{ + vioAddTraceRecord(allocatingVIOAsVIO(allocatingVIO), location); +} + +/** + * Get the VDO from an AllocatingVIO. + * + * @param allocatingVIO The AllocatingVIO from which to get the VDO + * + * @return The VDO to which an AllocatingVIO belongs + **/ +static inline VDO *getVDOFromAllocatingVIO(AllocatingVIO *allocatingVIO) +{ + return allocatingVIOAsVIO(allocatingVIO)->vdo; +} + +/** + * Check that an AllocatingVIO is running on the physical zone thread in + * which it did its allocation. + * + * @param allocatingVIO The AllocatingVIO in question + **/ +static inline void assertInPhysicalZone(AllocatingVIO *allocatingVIO) +{ + ThreadID expected = getPhysicalZoneThreadID(allocatingVIO->zone); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "AllocatingVIO for allocated physical block %" PRIu64 + " on thread %u, should be on thread %u", + allocatingVIO->allocation, threadID, expected); +} + +/** + * Set a callback as a physical block operation in an AllocatingVIO's allocated + * zone. + * + * @param allocatingVIO The AllocatingVIO + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setPhysicalZoneCallback(AllocatingVIO *allocatingVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(allocatingVIOAsCompletion(allocatingVIO), callback, + getPhysicalZoneThreadID(allocatingVIO->zone)); + allocatingVIOAddTraceRecord(allocatingVIO, location); +} + +/** + * Set a callback as a physical block operation in an AllocatingVIO's allocated + * zone and invoke it immediately. 
+ * + * @param allocatingVIO The AllocatingVIO + * @param callback The callback to invoke + * @param location The tracing info for the call site + **/ +static inline void launchPhysicalZoneCallback(AllocatingVIO *allocatingVIO, + VDOAction *callback, + TraceLocation location) +{ + setPhysicalZoneCallback(allocatingVIO, callback, location); + invokeCallback(allocatingVIOAsCompletion(allocatingVIO)); +} + +/** + * Allocate a data block to an AllocatingVIO. + * + * @param allocatingVIO The AllocatingVIO which needs an allocation + * @param selector The allocation selector for deciding which physical + * zone to allocate from + * @param writeLockType The type of write lock to obtain on the block + * @param callback The function to call once the allocation is complete + **/ +void allocateDataBlock(AllocatingVIO *allocatingVIO, + AllocationSelector *selector, + PBNLockType writeLockType, + AllocationCallback *callback); + +/** + * Release the PBN lock on the allocated block. If the reference to the locked + * block is still provisional, it will be released as well. + * + * @param allocatingVIO The lock holder + **/ +void releaseAllocationLock(AllocatingVIO *allocatingVIO); + +/** + * Reset an AllocatingVIO after it has done an allocation. + * + * @param allocatingVIO The AllocatingVIO + **/ +void resetAllocation(AllocatingVIO *allocatingVIO); + +#endif // ALLOCATING_VIO_H diff --git a/vdo/base/allocationSelector.c b/vdo/base/allocationSelector.c new file mode 100644 index 0000000..e703d09 --- /dev/null +++ b/vdo/base/allocationSelector.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocationSelector.c#1 $ + */ + +#include "allocationSelector.h" +#include "allocationSelectorInternals.h" + +#include "memoryAlloc.h" + +#include "types.h" + +enum { + ALLOCATIONS_PER_ZONE = 128, +}; + +/**********************************************************************/ +int makeAllocationSelector(ZoneCount physicalZoneCount, + ThreadID threadID, + AllocationSelector **selectorPtr) +{ + AllocationSelector *selector; + int result = ALLOCATE(1, AllocationSelector, __func__, &selector); + if (result != VDO_SUCCESS) { + return result; + } + + *selector = (AllocationSelector) { + .nextAllocationZone = threadID % physicalZoneCount, + .lastPhysicalZone = physicalZoneCount - 1, + }; + + *selectorPtr = selector; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeAllocationSelector(AllocationSelector **selectorPtr) +{ + AllocationSelector *selector = *selectorPtr; + if (selector == NULL) { + return; + } + + FREE(selector); + *selectorPtr = NULL; +} + +/**********************************************************************/ +ZoneCount getNextAllocationZone(AllocationSelector *selector) +{ + if (selector->lastPhysicalZone > 0) { + if (selector->allocationCount < ALLOCATIONS_PER_ZONE) { + selector->allocationCount++; + } else { + selector->allocationCount = 1; + if (selector->nextAllocationZone < selector->lastPhysicalZone) { + selector->nextAllocationZone++; + } else { + selector->nextAllocationZone = 0; + } + } + } + + return selector->nextAllocationZone; +} diff --git a/vdo/base/allocationSelector.h b/vdo/base/allocationSelector.h new file mode 100644 index 0000000..7b922e9 --- /dev/null +++ b/vdo/base/allocationSelector.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocationSelector.h#1 $ + */ + +#ifndef ALLOCATION_SELECTOR_H +#define ALLOCATION_SELECTOR_H + +#include "completion.h" + +/** + * An AllocationSelector is used by any zone which does data block allocations. + * The selector is used to round-robin allocation requests to different + * physical zones. Currently, 128 allocations will be made to a given physical + * zone before switching to the next. + **/ + +/** + * Make a new allocation selector. + * + * @param [in] physicalZoneCount The number of physical zones + * @param [in] threadID The ID of the thread using this selector + * @param [out] selectorPtr A pointer to receive the new selector + * + * @return VDO_SUCCESS or an error + **/ +int makeAllocationSelector(ZoneCount physicalZoneCount, + ThreadID threadID, + AllocationSelector **selectorPtr) + __attribute__((warn_unused_result)); + +/** + * Free an AllocationSelector and null out the reference to it. 
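+ *
+ * Illustrative lifecycle sketch (the surrounding variables are
+ * hypothetical):
+ *
+ *   AllocationSelector *selector;
+ *   int result = makeAllocationSelector(physicalZoneCount, threadID,
+ *                                       &selector);
+ *   if (result != VDO_SUCCESS) {
+ *     return result;
+ *   }
+ *   ZoneCount zone = getNextAllocationZone(selector);  // round-robin pick
+ *   // ... allocate from physical zone 'zone' ...
+ *   freeAllocationSelector(&selector);                 // also nulls selector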
+ * + * @param selectorPtr A reference to the selector to free + **/ +void freeAllocationSelector(AllocationSelector **selectorPtr); + +/** + * Get number of the physical zone from which to allocate next. + * + * @param selector The selector to query + * + * @return The number of the physical zone from which to allocate + **/ +ZoneCount getNextAllocationZone(AllocationSelector *selector) + __attribute__((warn_unused_result)); + +#endif /* ALLOCATION_SELECTOR_H */ diff --git a/vdo/base/allocationSelectorInternals.h b/vdo/base/allocationSelectorInternals.h new file mode 100644 index 0000000..13df50f --- /dev/null +++ b/vdo/base/allocationSelectorInternals.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocationSelectorInternals.h#1 $ + */ + +#ifndef ALLOCATION_SELECTOR_INTERNALS_H +#define ALLOCATION_SELECTOR_INTERNALS_H + +#include "types.h" + +/** Structure used to select which physical zone to allocate from */ +struct allocationSelector { + /** The number of allocations done in the current zone */ + BlockCount allocationCount; + /** The physical zone to allocate from next */ + ZoneCount nextAllocationZone; + /** The number of the last physical zone */ + ZoneCount lastPhysicalZone; +}; + +#endif /* ALLOCATION_SELECTOR_INTERNALS_H */ diff --git a/vdo/base/atomic.h b/vdo/base/atomic.h new file mode 100644 index 0000000..93b7318 --- /dev/null +++ b/vdo/base/atomic.h @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/atomic.h#2 $ + */ + +#ifndef ATOMIC_H +#define ATOMIC_H + +#include "atomicDefs.h" +#include "compiler.h" +#include "typeDefs.h" + +#define ATOMIC_INITIALIZER(value) { (value) } + +typedef struct { + atomic_t value; +} __attribute__((aligned(4))) Atomic32; + +typedef struct { + atomic64_t value; +} __attribute__((aligned(8))) Atomic64; + +typedef struct { + Atomic32 value; +} __attribute__((aligned(4))) AtomicBool; + +/** + * Memory load operations that precede this fence will be prevented from + * changing order with any that follow this fence, by either the compiler or + * the CPU. This can be used to ensure that the load operations accessing + * the fields of a structure are not re-ordered so they actually take effect + * before a pointer to the structure is resolved. + **/ +static INLINE void loadFence(void) +{ + smp_rmb(); +} + +/** + * Memory store operations that precede this fence will be prevented from + * changing order with any that follow this fence, by either the compiler or + * the CPU. This can be used to ensure that the store operations initializing + * the fields of a structure are not re-ordered so they actually take effect + * after a pointer to the structure is published. + **/ +static INLINE void storeFence(void) +{ + smp_wmb(); +} + +/** + * Generate a full memory fence for the compiler and CPU. Load and store + * operations issued before the fence will not be re-ordered with operations + * issued after the fence. + **/ +static INLINE void memoryFence(void) +{ + smp_mb(); +} + +/** + * Access the value of a 32-bit atomic variable, ensuring that the load is not + * re-ordered by the compiler or CPU with any subsequent load operations. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE uint32_t atomicLoad32(const Atomic32 *atom) +{ + uint32_t value = atomic_read(&atom->value); + loadFence(); + return value; +} + +/** + * Access the value of a 64-bit atomic variable, ensuring that the memory load + * is not re-ordered by the compiler or CPU with any subsequent load + * operations. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE uint64_t atomicLoad64(const Atomic64 *atom) +{ + uint64_t value = atomic64_read(&atom->value); + loadFence(); + return value; +} + +/** + * Access the value of a boolean atomic variable, ensuring that the load is not + * re-ordered by the compiler or CPU with any subsequent load operations. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE bool atomicLoadBool(const AtomicBool *atom) +{ + return (atomicLoad32(&atom->value) > 0); +} + +/** + * Set the value of a 32-bit atomic variable, ensuring that the memory store + * operation is not re-ordered by the compiler or CPU with any preceding store + * operations. + * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void atomicStore32(Atomic32 *atom, uint32_t newValue) +{ + storeFence(); + atomic_set(&atom->value, newValue); +} + +/** + * Set the value of a 64-bit atomic variable, ensuring that the memory store + * operation is not re-ordered by the compiler or CPU with any preceding store + * operations. 
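+ *
+ * As an illustrative pairing (assumed, not taken from the original sources;
+ * 'stats' and 'count' are placeholder names): a writer publishing a counter
+ * with this call is typically matched by a reader using atomicLoad64(), so
+ * the store fence here pairs with the load fence there:
+ *
+ *   atomicStore64(&stats.blocksWritten, count);          // writer thread
+ *   uint64_t seen = atomicLoad64(&stats.blocksWritten);  // reader thread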
+ * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void atomicStore64(Atomic64 *atom, uint64_t newValue) +{ + storeFence(); + atomic64_set(&atom->value, newValue); +} + +/** + * Set the value of a boolean atomic variable, ensuring that the memory store + * operation is not re-ordered by the compiler or CPU with any preceding store + * operations. + * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void atomicStoreBool(AtomicBool *atom, bool newValue) +{ + atomicStore32(&atom->value, (newValue ? 1 : 0)); +} + +/** + * Add a 32-bit signed delta to a 32-bit atomic variable. + * + * @param atom a pointer to the atomic variable + * @param delta the value to be added (or subtracted) from the variable + * + * @return the new value of the atom after the add operation + **/ +static INLINE uint32_t atomicAdd32(Atomic32 *atom, int32_t delta) +{ + return atomic_add_return(delta, &atom->value); +} + +/** + * Add a 64-bit signed delta to a 64-bit atomic variable. + * + * @param atom a pointer to the atomic variable + * @param delta the value to be added (or subtracted) from the variable + * + * @return the new value of the atom after the add operation + **/ +static INLINE uint64_t atomicAdd64(Atomic64 *atom, int64_t delta) +{ + return atomic64_add_return(delta, &atom->value); +} + +/** + * Atomic 32-bit compare-and-swap. If the atom is identical to a required + * value, atomically replace it with the new value and return true, otherwise + * do nothing and return false. + * + * @param atom a pointer to the atomic variable + * @param requiredValue the value that must be present to perform the swap + * @param newValue the value to be swapped for the required value + * + * @return true if the atom was changed, false otherwise + **/ +static INLINE bool compareAndSwap32(Atomic32 *atom, + uint32_t requiredValue, + uint32_t newValue) +{ + /* + * Our initial implementation, for x86, effectively got a full + * memory barrier because of how "lock cmpxchg" operates. The + * atomic_cmpxchg interface provides for a full barrier *if* the + * exchange is done, but not necessarily if it is not. + * + * Do we need the full barrier always? We need to investigate that, + * as part of (eventually) converting to using that API directly. + * For now, play it safe, and ensure the same behavior on other + * architectures too. + */ +#ifndef __x86_64__ + smp_mb(); +#endif + int oldValue = atomic_cmpxchg(&atom->value, requiredValue, newValue); +#ifndef __x86_64__ + smp_mb(); +#endif + return requiredValue == (uint32_t) oldValue; +} + +/** + * Atomic 64-bit compare-and-swap. If the atom is identical to a required + * value, atomically replace it with the new value and return true, otherwise + * do nothing and return false. + * + * @param atom a pointer to the atomic variable + * @param requiredValue the value that must be present to perform the swap + * @param newValue the value to be swapped for the required value + * + * @return true if the atom was changed, false otherwise + **/ +static INLINE bool compareAndSwap64(Atomic64 *atom, + uint64_t requiredValue, + uint64_t newValue) +{ +#ifndef __x86_64__ + smp_mb(); +#endif + long oldValue = atomic64_cmpxchg(&atom->value, requiredValue, newValue); +#ifndef __x86_64__ + smp_mb(); +#endif + return requiredValue == (uint64_t) oldValue; +} + +/** + * Atomic boolean compare-and-swap. 
If the atom is identical to a required + * value, atomically replace it with the new value and return true, otherwise + * do nothing and return false. + * + * @param atom a pointer to the atomic variable + * @param requiredValue the value that must be present to perform the swap + * @param newValue the value to be swapped for the required value + * + * @return true if the atom was changed, false otherwise + **/ +static INLINE bool compareAndSwapBool(AtomicBool *atom, + bool requiredValue, + bool newValue) +{ + return compareAndSwap32(&atom->value, (requiredValue ? 1 : 0), + (newValue ? 1 : 0)); +} + +/** + * Access the value of a 32-bit atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE uint32_t relaxedLoad32(const Atomic32 *atom) +{ + return atomic_read(&atom->value); +} + +/** + * Access the value of a 64-bit atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE uint64_t relaxedLoad64(const Atomic64 *atom) +{ + return atomic64_read(&atom->value); +} + +/** + * Access the value of a boolean atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE bool relaxedLoadBool(const AtomicBool *atom) +{ + return (relaxedLoad32(&atom->value) > 0); +} + +/** + * Set the value of a 32-bit atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void relaxedStore32(Atomic32 *atom, uint32_t newValue) +{ + atomic_set(&atom->value, newValue); +} + +/** + * Set the value of a 64-bit atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void relaxedStore64(Atomic64 *atom, uint64_t newValue) +{ + atomic64_set(&atom->value, newValue); +} + +/** + * Set the value of a boolean atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void relaxedStoreBool(AtomicBool *atom, bool newValue) +{ + relaxedStore32(&atom->value, (newValue ? 1 : 0)); +} + +/** + * Non-atomically add a 32-bit signed delta to a 32-bit atomic variable, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable + * @param delta the value to be added (or subtracted) from the variable + * + * @return the new value of the atom after the add operation + **/ +static INLINE uint32_t relaxedAdd32(Atomic32 *atom, int32_t delta) +{ + uint32_t newValue = (relaxedLoad32(atom) + delta); + relaxedStore32(atom, newValue); + return newValue; +} + +/** + * Non-atomically add a 64-bit signed delta to a 64-bit atomic variable, + * without any compiler or CPU fences. 
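+ *
+ * Because this is a plain load-add-store, it is only safe when a single
+ * thread ever modifies the atom; other threads may still observe the value
+ * with relaxedLoad64(). The block allocator statistics in this change use
+ * exactly that pattern, e.g.:
+ *
+ *   relaxedAdd64(&allocator->statistics.allocatedBlocks, -freeBlocks);
+ *
+ * on the physical zone thread, with read-only queries from other threads.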
+ * + * @param atom a pointer to the atomic variable + * @param delta the value to be added (or subtracted) from the variable + * + * @return the new value of the atom after the add operation + **/ +static INLINE uint64_t relaxedAdd64(Atomic64 *atom, int64_t delta) +{ + uint64_t newValue = (relaxedLoad64(atom) + delta); + relaxedStore64(atom, newValue); + return newValue; +} + +#endif /* ATOMIC_H */ diff --git a/vdo/base/blockAllocator.c b/vdo/base/blockAllocator.c new file mode 100644 index 0000000..a1eaae4 --- /dev/null +++ b/vdo/base/blockAllocator.c @@ -0,0 +1,952 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockAllocator.c#22 $ + */ + +#include "blockAllocatorInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminState.h" +#include "heap.h" +#include "numUtils.h" +#include "priorityTable.h" +#include "readOnlyNotifier.h" +#include "refCounts.h" +#include "slab.h" +#include "slabDepotInternals.h" +#include "slabIterator.h" +#include "slabJournalEraser.h" +#include "slabJournalInternals.h" +#include "slabScrubber.h" +#include "slabSummary.h" +#include "vdoRecovery.h" +#include "vio.h" +#include "vioPool.h" + +/** + * Assert that a block allocator function was called from the correct thread. + * + * @param threadID The allocator's thread id + * @param functionName The name of the function + **/ +static inline void assertOnAllocatorThread(ThreadID threadID, + const char *functionName) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == threadID), + "%s called on correct thread", functionName); +} + +/** + * Get the priority for a slab in the allocator's slab queue. Slabs are + * essentially prioritized by an approximation of the number of free blocks in + * the slab so slabs with lots of free blocks with be opened for allocation + * before slabs that have few free blocks. + * + * @param slab The slab whose queue priority is desired + * + * @return the queue priority of the slab + **/ +static unsigned int calculateSlabPriority(Slab *slab) +{ + BlockCount freeBlocks = getSlabFreeBlockCount(slab); + + // Slabs that are completely full must be the only ones with the lowest + // priority: zero. + if (freeBlocks == 0) { + return 0; + } + + /* + * Slabs that have never been opened (empty, newly initialized, never been + * written to) have lower priority than previously opened slabs that have a + * signficant number of free blocks. This ranking causes VDO to avoid + * writing physical blocks for the first time until there are very few free + * blocks that have been previously written to. That policy makes VDO a + * better client of any underlying storage that is thinly-provisioned + * [VDOSTORY-123]. 
+ */ + unsigned int unopenedSlabPriority = slab->allocator->unopenedSlabPriority; + if (isSlabJournalBlank(slab->journal)) { + return unopenedSlabPriority; + } + + /* + * For all other slabs, the priority is derived from the logarithm of the + * number of free blocks. Slabs with the same order of magnitude of free + * blocks have the same priority. With 2^23 blocks, the priority will range + * from 1 to 25. The reserved unopenedSlabPriority divides the range and is + * skipped by the logarithmic mapping. + */ + unsigned int priority = (1 + logBaseTwo(freeBlocks)); + return ((priority < unopenedSlabPriority) ? priority : priority + 1); +} + +/** + * Add a slab to the priority queue of slabs available for allocation. + * + * @param slab The slab to prioritize + **/ +static void prioritizeSlab(Slab *slab) +{ + ASSERT_LOG_ONLY(isRingEmpty(&slab->ringNode), + "a slab must not already be on a ring when prioritizing"); + slab->priority = calculateSlabPriority(slab); + priorityTableEnqueue(slab->allocator->prioritizedSlabs, slab->priority, + &slab->ringNode); +} + +/**********************************************************************/ +void registerSlabWithAllocator(BlockAllocator *allocator, Slab *slab) +{ + allocator->slabCount++; + allocator->lastSlab = slab->slabNumber; +} + +/** + * Get an iterator over all the slabs in the allocator. + * + * @param allocator The allocator + * + * @return An iterator over the allocator's slabs + **/ +static SlabIterator getSlabIterator(const BlockAllocator *allocator) +{ + return iterateSlabs(allocator->depot->slabs, allocator->lastSlab, + allocator->zoneNumber, allocator->depot->zoneCount); +} + +/** + * Notify a block allocator that the VDO has entered read-only mode. + * + * Implements ReadOnlyNotification. + * + * @param listener The block allocator + * @param parent The completion to notify in order to acknowledge the + * notification + **/ +static void notifyBlockAllocatorOfReadOnlyMode(void *listener, + VDOCompletion *parent) +{ + BlockAllocator *allocator = listener; + assertOnAllocatorThread(allocator->threadID, __func__); + SlabIterator iterator = getSlabIterator(allocator); + while (hasNextSlab(&iterator)) { + Slab *slab = nextSlab(&iterator); + abortSlabJournalWaiters(slab->journal); + } + + completeCompletion(parent); +} + +/**********************************************************************/ +int makeAllocatorPoolVIOs(PhysicalLayer *layer, + void *parent, + void *buffer, + VIO **vioPtr) +{ + return createVIO(layer, VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, parent, + buffer, vioPtr); +} + +/** + * Allocate those component of the block allocator which are needed only at + * load time, not at format time. + * + * @param allocator The allocator + * @param layer The physical layer below this allocator + * @param vioPoolSize The VIO pool size + * + * @return VDO_SUCCESS or an error + **/ +static int allocateComponents(BlockAllocator *allocator, + PhysicalLayer *layer, + BlockCount vioPoolSize) +{ + /* + * If createVIO is NULL, the block allocator is only being used to format + * or audit the VDO. These only require the SuperBlock component, so we can + * just skip allocating all the memory needed for runtime components. 
+ */ + if (layer->createMetadataVIO == NULL) { + return VDO_SUCCESS; + } + + int result = registerReadOnlyListener(allocator->readOnlyNotifier, + allocator, + notifyBlockAllocatorOfReadOnlyMode, + allocator->threadID); + if (result != VDO_SUCCESS) { + return result; + } + + SlabDepot *depot = allocator->depot; + result = initializeEnqueueableCompletion(&allocator->completion, + BLOCK_ALLOCATOR_COMPLETION, layer); + if (result != VDO_SUCCESS) { + return result; + } + + allocator->summary = getSlabSummaryForZone(depot, allocator->zoneNumber); + + result = makeVIOPool(layer, vioPoolSize, allocator->threadID, + makeAllocatorPoolVIOs, NULL, &allocator->vioPool); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount slabJournalSize = depot->slabConfig.slabJournalBlocks; + result = makeSlabScrubber(layer, slabJournalSize, + allocator->readOnlyNotifier, + &allocator->slabScrubber); + if (result != VDO_SUCCESS) { + return result; + } + + // The number of data blocks is the maximum number of free blocks that could + // be used in calculateSlabPriority(). + BlockCount maxFreeBlocks = depot->slabConfig.dataBlocks; + unsigned int maxPriority = (2 + logBaseTwo(maxFreeBlocks)); + result = makePriorityTable(maxPriority, &allocator->prioritizedSlabs); + if (result != VDO_SUCCESS) { + return result; + } + + /* + * VDOSTORY-123 requires that we try to open slabs that already have + * allocated blocks in preference to slabs that have never been opened. For + * reasons we have not been able to fully understand, performance tests on + * SSD harvards have been very sensitive (50% reduction in test throughput) + * to very slight differences in the timing and locality of block + * allocation. Assigning a low priority to unopened slabs (maxPriority/2, + * say) would be ideal for the story, but anything less than a very high + * threshold (maxPriority - 1) hurts PMI results. + * + * This sets the free block threshold for preferring to open an unopened + * slab to the binary floor of 3/4ths the total number of datablocks in a + * slab, which will generally evaluate to about half the slab size, but + * avoids degenerate behavior in unit tests where the number of data blocks + * is artificially constrained to a power of two. 
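+   *
+   * As a concrete illustration (assuming logBaseTwo() is an integer floor of
+   * log2, as the priority comments above imply): with 2^16 data blocks per
+   * slab, maxPriority is 2 + 16 = 18 and the threshold computed below is
+   * 1 + logBaseTwo(49152) = 1 + 15 = 16.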
+ */ + allocator->unopenedSlabPriority = (1 + logBaseTwo((maxFreeBlocks * 3) / 4)); + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeBlockAllocator(SlabDepot *depot, + ZoneCount zoneNumber, + ThreadID threadID, + Nonce nonce, + BlockCount vioPoolSize, + PhysicalLayer *layer, + ReadOnlyNotifier *readOnlyNotifier, + BlockAllocator **allocatorPtr) +{ + + BlockAllocator *allocator; + int result = ALLOCATE(1, BlockAllocator, __func__, &allocator); + if (result != VDO_SUCCESS) { + return result; + } + + allocator->depot = depot; + allocator->zoneNumber = zoneNumber; + allocator->threadID = threadID; + allocator->nonce = nonce; + allocator->readOnlyNotifier = readOnlyNotifier; + initializeRing(&allocator->dirtySlabJournals); + + result = allocateComponents(allocator, layer, vioPoolSize); + if (result != VDO_SUCCESS) { + freeBlockAllocator(&allocator); + return result; + } + + *allocatorPtr = allocator; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeBlockAllocator(BlockAllocator **blockAllocatorPtr) +{ + BlockAllocator *allocator = *blockAllocatorPtr; + if (allocator == NULL) { + return; + } + + freeSlabScrubber(&allocator->slabScrubber); + freeVIOPool(&allocator->vioPool); + freePriorityTable(&allocator->prioritizedSlabs); + destroyEnqueueable(&allocator->completion); + FREE(allocator); + *blockAllocatorPtr = NULL; +} + +/**********************************************************************/ +int replaceVIOPool(BlockAllocator *allocator, + size_t size, + PhysicalLayer *layer) +{ + freeVIOPool(&allocator->vioPool); + return makeVIOPool(layer, size, allocator->threadID, makeAllocatorPoolVIOs, + NULL, &allocator->vioPool); +} + +/** + * Get the maximum number of data blocks that can be allocated. + * + * @param allocator The block allocator to query + * + * @return The number of data blocks that can be allocated + **/ +__attribute__((warn_unused_result)) +static inline BlockCount getDataBlockCount(const BlockAllocator *allocator) +{ + return (allocator->slabCount * allocator->depot->slabConfig.dataBlocks); +} + +/**********************************************************************/ +BlockCount getAllocatedBlocks(const BlockAllocator *allocator) +{ + return relaxedLoad64(&allocator->statistics.allocatedBlocks); +} + +/**********************************************************************/ +BlockCount getUnrecoveredSlabCount(const BlockAllocator *allocator) +{ + return getScrubberSlabCount(allocator->slabScrubber); +} + +/**********************************************************************/ +void queueSlab(Slab *slab) +{ + ASSERT_LOG_ONLY(isRingEmpty(&slab->ringNode), + "a requeued slab must not already be on a ring"); + BlockAllocator *allocator = slab->allocator; + BlockCount freeBlocks = getSlabFreeBlockCount(slab); + int result = ASSERT((freeBlocks <= allocator->depot->slabConfig.dataBlocks), + "rebuilt slab %u must have a valid free block count" + " (has %llu, expected maximum %llu)", + slab->slabNumber, freeBlocks, + allocator->depot->slabConfig.dataBlocks); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(allocator->readOnlyNotifier, result); + return; + } + + if (isUnrecoveredSlab(slab)) { + registerSlabForScrubbing(allocator->slabScrubber, slab, false); + return; + } + + if (!isSlabResuming(slab)) { + // If the slab is resuming, we've already accounted for it here, so don't + // do it again. 
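+    // The allocated-block statistic is seeded with the total data block count
+    // (see prepareSlabsForAllocation()), so subtracting this slab's free
+    // blocks leaves only the blocks that are actually in use.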
+ relaxedAdd64(&allocator->statistics.allocatedBlocks, -freeBlocks); + if (!isSlabJournalBlank(slab->journal)) { + relaxedAdd64(&allocator->statistics.slabsOpened, 1); + } + } + + // All slabs are kept in a priority queue for allocation. + prioritizeSlab(slab); +} + +/**********************************************************************/ +void adjustFreeBlockCount(Slab *slab, bool increment) +{ + BlockAllocator *allocator = slab->allocator; + // The sense of increment is reversed since allocations are being counted. + relaxedAdd64(&allocator->statistics.allocatedBlocks, (increment ? -1 : 1)); + + // The open slab doesn't need to be reprioritized until it is closed. + if (slab == allocator->openSlab) { + return; + } + + // The slab priority rarely changes; if no change, then don't requeue it. + if (slab->priority == calculateSlabPriority(slab)) { + return; + } + + // Reprioritize the slab to reflect the new free block count by removing it + // from the table and re-enqueuing it with the new priority. + priorityTableRemove(allocator->prioritizedSlabs, &slab->ringNode); + prioritizeSlab(slab); +} + +/** + * Allocate the next free physical block in a slab. + * + * The block allocated will have a provisional reference and the + * reference must be either confirmed with a subsequent call to + * incrementReferenceCount() or vacated with a subsequent call to + * decrementReferenceCount(). + * + * @param [in] slab The slab + * @param [out] blockNumberPtr A pointer to receive the allocated block number + * + * @return UDS_SUCCESS or an error code + **/ +static int allocateSlabBlock(Slab *slab, PhysicalBlockNumber *blockNumberPtr) +{ + PhysicalBlockNumber pbn; + int result = allocateUnreferencedBlock(slab->referenceCounts, &pbn); + if (result != VDO_SUCCESS) { + return result; + } + + adjustFreeBlockCount(slab, false); + + *blockNumberPtr = pbn; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int allocateBlock(BlockAllocator *allocator, + PhysicalBlockNumber *blockNumberPtr) +{ + if (allocator->openSlab != NULL) { + // Try to allocate the next block in the currently open slab. + int result = allocateSlabBlock(allocator->openSlab, blockNumberPtr); + if ((result == VDO_SUCCESS) || (result != VDO_NO_SPACE)) { + return result; + } + + // Put the exhausted open slab back into the priority table. + prioritizeSlab(allocator->openSlab); + } + + // Remove the highest priority slab from the priority table and make it + // the open slab. + allocator->openSlab + = slabFromRingNode(priorityTableDequeue(allocator->prioritizedSlabs)); + + if (isSlabJournalBlank(allocator->openSlab->journal)) { + relaxedAdd64(&allocator->statistics.slabsOpened, 1); + dirtyAllReferenceBlocks(allocator->openSlab->referenceCounts); + } else { + relaxedAdd64(&allocator->statistics.slabsReopened, 1); + } + + // Try allocating again. If we're out of space immediately after opening a + // slab, then every slab must be fully allocated. 
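+  // On success the caller holds a provisional reference to the returned PBN;
+  // per the contract in blockAllocator.h it must later be confirmed with
+  // incrementReferenceCount() or vacated with decrementReferenceCount().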
+ return allocateSlabBlock(allocator->openSlab, blockNumberPtr); +} + +/**********************************************************************/ +void releaseBlockReference(BlockAllocator *allocator, + PhysicalBlockNumber pbn, + const char *why) +{ + if (pbn == ZERO_BLOCK) { + return; + } + + Slab *slab = getSlab(allocator->depot, pbn); + ReferenceOperation operation = { + .type = DATA_DECREMENT, + .pbn = pbn, + }; + int result = modifySlabReferenceCount(slab, NULL, operation); + if (result != VDO_SUCCESS) { + logErrorWithStringError(result, + "Failed to release reference to %s " + "physical block %llu", + why, pbn); + } +} + +/** + * This is a HeapComparator function that orders SlabStatuses using the + * 'isClean' field as the primary key and the 'emptiness' field as the + * secondary key. + * + * Slabs need to be pushed onto the rings in the same order they are to be + * popped off. Popping should always get the most empty first, so pushing + * should be from most empty to least empty. Thus, the comparator order is + * the usual sense since Heap returns larger elements before smaller ones. + * + * @param item1 The first item to compare + * @param item2 The second item to compare + * + * @return 1 if the first item is cleaner or emptier than the second; + * 0 if the two items are equally clean and empty; + -1 otherwise + **/ +static int compareSlabStatuses(const void *item1, const void *item2) +{ + const SlabStatus *info1 = (const SlabStatus *) item1; + const SlabStatus *info2 = (const SlabStatus *) item2; + + if (info1->isClean != info2->isClean) { + return (info1->isClean ? 1 : -1); + } + if (info1->emptiness != info2->emptiness) { + return ((info1->emptiness > info2->emptiness) ? 1 : -1); + } + return ((info1->slabNumber < info2->slabNumber) ? 1 : -1); +} + +/** + * Swap two SlabStatus structures. Implements HeapSwapper. + **/ +static void swapSlabStatuses(void *item1, void *item2) +{ + SlabStatus *info1 = item1; + SlabStatus *info2 = item2; + SlabStatus temp = *info1; + *info1 = *info2; + *info2 = temp; +} + +/** + * Inform the allocator that a slab action has finished on some slab. This + * callback is registered in applyToSlabs(). + * + * @param completion The allocator completion + **/ +static void slabActionCallback(VDOCompletion *completion) +{ + BlockAllocator *allocator = container_of(completion, BlockAllocator, + completion); + SlabActor *actor = &allocator->slabActor; + if (--actor->slabActionCount == 0) { + actor->callback(completion); + return; + } + + resetCompletion(completion); +} + +/** + * Preserve the error from part of an administrative action and continue. + * + * @param completion The allocator completion + **/ +static void handleOperationError(VDOCompletion *completion) +{ + BlockAllocator *allocator = (BlockAllocator *) completion; + setOperationResult(&allocator->state, completion->result); + completion->callback(completion); +} + +/** + * Perform an administrative action on each of an allocator's slabs in + * parallel. + * + * @param allocator The allocator + * @param callback The method to call when the action is complete on every + * slab + **/ +static void applyToSlabs(BlockAllocator *allocator, VDOAction *callback) +{ + prepareCompletion(&allocator->completion, slabActionCallback, + handleOperationError, allocator->threadID, NULL); + allocator->completion.requeue = false; + + // Since we are going to dequeue all of the slabs, the open slab will become + // invalid, so clear it. 
+ allocator->openSlab = NULL; + + // Ensure that we don't finish before we're done starting. + allocator->slabActor = (SlabActor) { + .slabActionCount = 1, + .callback = callback, + }; + + SlabIterator iterator = getSlabIterator(allocator); + while (hasNextSlab(&iterator)) { + Slab *slab = nextSlab(&iterator); + unspliceRingNode(&slab->ringNode); + allocator->slabActor.slabActionCount++; + startSlabAction(slab, allocator->state.state, &allocator->completion); + } + + slabActionCallback(&allocator->completion); +} + +/** + * Inform the allocator that all load I/O has finished. + * + * @param completion The allocator completion + **/ +static void finishLoadingAllocator(VDOCompletion *completion) +{ + BlockAllocator *allocator = (BlockAllocator *) completion; + if (allocator->state.state == ADMIN_STATE_LOADING_FOR_RECOVERY) { + void *context = getCurrentActionContext(allocator->depot->actionManager); + replayIntoSlabJournals(allocator, completion, context); + return; + } + + finishLoading(&allocator->state); +} + +/** + * Initiate a load. + * + * Implements AdminInitiator. + **/ +static void initiateLoad(AdminState *state) +{ + BlockAllocator *allocator = container_of(state, BlockAllocator, state); + if (state->state == ADMIN_STATE_LOADING_FOR_REBUILD) { + prepareCompletion(&allocator->completion, finishLoadingAllocator, + handleOperationError, allocator->threadID, NULL); + eraseSlabJournals(allocator->depot, getSlabIterator(allocator), + &allocator->completion); + return; + } + + applyToSlabs(allocator, finishLoadingAllocator); +} + +/**********************************************************************/ +void loadBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); + startLoading(&allocator->state, + getCurrentManagerOperation(allocator->depot->actionManager), + parent, initiateLoad); +} + +/**********************************************************************/ +void notifySlabJournalsAreRecovered(BlockAllocator *allocator, int result) +{ + finishLoadingWithResult(&allocator->state, result); +} + +/**********************************************************************/ +int prepareSlabsForAllocation(BlockAllocator *allocator) +{ + relaxedStore64(&allocator->statistics.allocatedBlocks, + getDataBlockCount(allocator)); + + SlabDepot *depot = allocator->depot; + SlabCount slabCount = depot->slabCount; + + SlabStatus *slabStatuses; + int result = ALLOCATE(slabCount, SlabStatus, __func__, &slabStatuses); + if (result != VDO_SUCCESS) { + return result; + } + + getSummarizedSlabStatuses(allocator->summary, slabCount, slabStatuses); + + // Sort the slabs by cleanliness, then by emptiness hint. 
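+  // Cleaner and emptier slabs sort higher (see compareSlabStatuses()), so
+  // they are popped from the heap, and therefore queued or registered for
+  // scrubbing, before dirtier and fuller ones.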
+  Heap heap;
+  initializeHeap(&heap, compareSlabStatuses, swapSlabStatuses,
+                 slabStatuses, slabCount, sizeof(SlabStatus));
+  buildHeap(&heap, slabCount);
+
+  SlabStatus currentSlabStatus;
+  while (popMaxHeapElement(&heap, &currentSlabStatus)) {
+    Slab *slab = depot->slabs[currentSlabStatus.slabNumber];
+    if (slab->allocator != allocator) {
+      continue;
+    }
+
+    if ((depot->loadType == REBUILD_LOAD)
+        || (!mustLoadRefCounts(allocator->summary, slab->slabNumber)
+            && currentSlabStatus.isClean)) {
+      queueSlab(slab);
+      continue;
+    }
+
+    markSlabUnrecovered(slab);
+    bool highPriority
+      = ((currentSlabStatus.isClean && (depot->loadType == NORMAL_LOAD))
+         || requiresScrubbing(slab->journal));
+    registerSlabForScrubbing(allocator->slabScrubber, slab, highPriority);
+  }
+  FREE(slabStatuses);
+
+  return VDO_SUCCESS;
+}
+
+/**********************************************************************/
+void prepareAllocatorToAllocate(void *context,
+                                ZoneCount zoneNumber,
+                                VDOCompletion *parent)
+{
+  BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber);
+  int result = prepareSlabsForAllocation(allocator);
+  if (result != VDO_SUCCESS) {
+    finishCompletion(parent, result);
+    return;
+  }
+
+  scrubHighPrioritySlabs(allocator->slabScrubber,
+                         isPriorityTableEmpty(allocator->prioritizedSlabs),
+                         parent, finishParentCallback, finishParentCallback);
+}
+
+/**********************************************************************/
+void registerNewSlabsForAllocator(void *context,
+                                  ZoneCount zoneNumber,
+                                  VDOCompletion *parent)
+{
+  BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber);
+  SlabDepot *depot = allocator->depot;
+  for (SlabCount i = depot->slabCount; i < depot->newSlabCount; i++) {
+    Slab *slab = depot->newSlabs[i];
+    if (slab->allocator == allocator) {
+      registerSlabWithAllocator(allocator, slab);
+    }
+  }
+  completeCompletion(parent);
+}
+
+/**
+ * Perform a step in draining the allocator. This method is its own callback.
+ *
+ * @param completion The allocator's completion
+ **/
+static void doDrainStep(VDOCompletion *completion)
+{
+  BlockAllocator *allocator = (BlockAllocator *) completion;
+  prepareForRequeue(&allocator->completion, doDrainStep, handleOperationError,
+                    allocator->threadID, NULL);
+  switch (++allocator->drainStep) {
+  case DRAIN_ALLOCATOR_STEP_SCRUBBER:
+    stopScrubbing(allocator->slabScrubber, completion);
+    return;
+
+  case DRAIN_ALLOCATOR_STEP_SLABS:
+    applyToSlabs(allocator, doDrainStep);
+    return;
+
+  case DRAIN_ALLOCATOR_STEP_SUMMARY:
+    drainSlabSummaryZone(allocator->summary, allocator->state.state,
+                         completion);
+    return;
+
+  case DRAIN_ALLOCATOR_STEP_FINISHED:
+    ASSERT_LOG_ONLY(!isVIOPoolBusy(allocator->vioPool), "VIO Pool not busy");
+    finishDrainingWithResult(&allocator->state, completion->result);
+    return;
+
+  default:
+    finishDrainingWithResult(&allocator->state, UDS_BAD_STATE);
+  }
+}
+
+/**
+ * Initiate a drain.
+ *
+ * Implements AdminInitiator.
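+ *
+ * The drain then proceeds through the steps in doDrainStep() in order:
+ * slab scrubber, slabs, slab summary, finished.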
+ **/ +static void initiateDrain(AdminState *state) +{ + BlockAllocator *allocator = container_of(state, BlockAllocator, state); + allocator->drainStep = DRAIN_ALLOCATOR_START; + doDrainStep(&allocator->completion); +} + +/**********************************************************************/ +void drainBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); + startDraining(&allocator->state, + getCurrentManagerOperation(allocator->depot->actionManager), + parent, initiateDrain); +} + +/** + * Perform a step in resuming a quiescent allocator. This method is its own + * callback. + * + * @param completion The allocator's completion + **/ +static void doResumeStep(VDOCompletion *completion) +{ + BlockAllocator *allocator = (BlockAllocator *) completion; + prepareForRequeue(&allocator->completion, doResumeStep, handleOperationError, + allocator->threadID, NULL); + switch (--allocator->drainStep) { + case DRAIN_ALLOCATOR_STEP_SUMMARY: + resumeSlabSummaryZone(allocator->summary, completion); + return; + + case DRAIN_ALLOCATOR_STEP_SLABS: + applyToSlabs(allocator, doResumeStep); + return; + + case DRAIN_ALLOCATOR_STEP_SCRUBBER: + resumeScrubbing(allocator->slabScrubber, completion); + return; + + case DRAIN_ALLOCATOR_START: + finishResumingWithResult(&allocator->state, completion->result); + return; + + default: + finishResumingWithResult(&allocator->state, UDS_BAD_STATE); + } +} + +/** + * Initiate a resume. + * + * Implements AdminInitiator. + **/ +static void initiateResume(AdminState *state) +{ + BlockAllocator *allocator = container_of(state, BlockAllocator, state); + allocator->drainStep = DRAIN_ALLOCATOR_STEP_FINISHED; + doResumeStep(&allocator->completion); +} + +/**********************************************************************/ +void resumeBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); + startResuming(&allocator->state, + getCurrentManagerOperation(allocator->depot->actionManager), + parent, initiateResume); +} + +/**********************************************************************/ +void releaseTailBlockLocks(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); + RingNode *ring = &allocator->dirtySlabJournals; + while (!isRingEmpty(ring)) { + if (!releaseRecoveryJournalLock(slabJournalFromDirtyNode(ring->next), + allocator->depot->activeReleaseRequest)) { + break; + } + } + completeCompletion(parent); +} + +/**********************************************************************/ +SlabSummaryZone *getSlabSummaryZone(const BlockAllocator *allocator) +{ + return allocator->summary; +} + +/**********************************************************************/ +int acquireVIO(BlockAllocator *allocator, Waiter *waiter) +{ + return acquireVIOFromPool(allocator->vioPool, waiter); +} + +/**********************************************************************/ +void returnVIO(BlockAllocator *allocator, VIOPoolEntry *entry) +{ + returnVIOToPool(allocator->vioPool, entry); +} + +/**********************************************************************/ +void scrubAllUnrecoveredSlabsInZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); + scrubSlabs(allocator->slabScrubber, allocator->depot, + 
notifyZoneFinishedScrubbing, noopCallback); + completeCompletion(parent); +} + +/**********************************************************************/ +int enqueueForCleanSlab(BlockAllocator *allocator, Waiter *waiter) +{ + return enqueueCleanSlabWaiter(allocator->slabScrubber, waiter); +} + +/**********************************************************************/ +void increaseScrubbingPriority(Slab *slab) +{ + registerSlabForScrubbing(slab->allocator->slabScrubber, slab, true); +} + +/**********************************************************************/ +void allocateFromAllocatorLastSlab(BlockAllocator *allocator) +{ + ASSERT_LOG_ONLY(allocator->openSlab == NULL, "mustn't have an open slab"); + Slab *lastSlab = allocator->depot->slabs[allocator->lastSlab]; + priorityTableRemove(allocator->prioritizedSlabs, &lastSlab->ringNode); + allocator->openSlab = lastSlab; +} + +/**********************************************************************/ +BlockAllocatorStatistics +getBlockAllocatorStatistics(const BlockAllocator *allocator) +{ + const AtomicAllocatorStatistics *atoms = &allocator->statistics; + return (BlockAllocatorStatistics) { + .slabCount = allocator->slabCount, + .slabsOpened = relaxedLoad64(&atoms->slabsOpened), + .slabsReopened = relaxedLoad64(&atoms->slabsReopened), + }; +} + +/**********************************************************************/ +SlabJournalStatistics getSlabJournalStatistics(const BlockAllocator *allocator) +{ + const AtomicSlabJournalStatistics *atoms = &allocator->slabJournalStatistics; + return (SlabJournalStatistics) { + .diskFullCount = atomicLoad64(&atoms->diskFullCount), + .flushCount = atomicLoad64(&atoms->flushCount), + .blockedCount = atomicLoad64(&atoms->blockedCount), + .blocksWritten = atomicLoad64(&atoms->blocksWritten), + .tailBusyCount = atomicLoad64(&atoms->tailBusyCount), + }; +} + +/**********************************************************************/ +RefCountsStatistics getRefCountsStatistics(const BlockAllocator *allocator) +{ + const AtomicRefCountStatistics *atoms = &allocator->refCountStatistics; + return (RefCountsStatistics) { + .blocksWritten = atomicLoad64(&atoms->blocksWritten), + }; +} + +/**********************************************************************/ +void dumpBlockAllocator(const BlockAllocator *allocator) +{ + unsigned int pauseCounter = 0; + logInfo("BlockAllocator zone %u", allocator->zoneNumber); + SlabIterator iterator = getSlabIterator(allocator); + while (hasNextSlab(&iterator)) { + dumpSlab(nextSlab(&iterator)); + + // Wait for a while after each batch of 32 slabs dumped, allowing the + // kernel log a chance to be flushed instead of being overrun. + if (pauseCounter++ == 31) { + pauseCounter = 0; + pauseForLogger(); + } + } + + dumpSlabScrubber(allocator->slabScrubber); +} diff --git a/vdo/base/blockAllocator.h b/vdo/base/blockAllocator.h new file mode 100644 index 0000000..cd8eb39 --- /dev/null +++ b/vdo/base/blockAllocator.h @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockAllocator.h#12 $ + */ + +#ifndef BLOCK_ALLOCATOR_H +#define BLOCK_ALLOCATOR_H + +#include "completion.h" +#include "fixedLayout.h" +#include "statistics.h" +#include "types.h" +#include "vioPool.h" +#include "waitQueue.h" + +/** + * Create a block allocator. + * + * @param [in] depot The slab depot for this allocator + * @param [in] zoneNumber The physical zone number for this allocator + * @param [in] threadID The thread ID for this allocator's zone + * @param [in] nonce The nonce of the VDO + * @param [in] vioPoolSize The size of the VIO pool + * @param [in] layer The physical layer below this allocator + * @param [in] readOnlyNotifier The context for entering read-only mode + * @param [out] allocatorPtr A pointer to hold the allocator + * + * @return A success or error code + **/ +int makeBlockAllocator(SlabDepot *depot, + ZoneCount zoneNumber, + ThreadID threadID, + Nonce nonce, + BlockCount vioPoolSize, + PhysicalLayer *layer, + ReadOnlyNotifier *readOnlyNotifier, + BlockAllocator **allocatorPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a block allocator and null out the reference to it. + * + * @param blockAllocatorPtr The reference to the allocator to destroy + **/ +void freeBlockAllocator(BlockAllocator **blockAllocatorPtr); + +/** + * Queue a slab for allocation or scrubbing. + * + * @param slab The slab to queue + **/ +void queueSlab(Slab *slab); + +/** + * Update the block allocator to reflect an increment or decrement of the free + * block count in a slab. This adjusts the allocated block count and + * reprioritizes the slab when appropriate. + * + * @param slab The slab whose free block count changed + * @param increment True if the free block count went up by one, + * false if it went down by one + **/ +void adjustFreeBlockCount(Slab *slab, bool increment); + +/** + * Allocate a physical block. + * + * The block allocated will have a provisional reference and the + * reference must be either confirmed with a subsequent call to + * incrementReferenceCount() or vacated with a subsequent call to + * decrementReferenceCount(). + * + * @param [in] allocator The block allocator + * @param [out] blockNumberPtr A pointer to receive the allocated block number + * + * @return UDS_SUCCESS or an error code + **/ +int allocateBlock(BlockAllocator *allocator, + PhysicalBlockNumber *blockNumberPtr) + __attribute__((warn_unused_result)); + +/** + * Release an unused provisional reference. + * + * @param allocator The block allocator + * @param pbn The block to dereference + * @param why Why the block was referenced (for logging) + **/ +void releaseBlockReference(BlockAllocator *allocator, + PhysicalBlockNumber pbn, + const char *why); + +/** + * Get the number of allocated blocks, which is the total number of + * blocks in all slabs that have a non-zero reference count. + * + * @param allocator The block allocator + * + * @return The number of blocks with a non-zero reference count + **/ +BlockCount getAllocatedBlocks(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Get the number of unrecovered slabs. 
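+ * That is, the count of slabs still registered with the allocator's slab
+ * scrubber.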
+ * + * @param allocator The block allocator + * + * @return The number of slabs that are unrecovered + **/ +BlockCount getUnrecoveredSlabCount(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Load the state of an allocator from disk. + * + *

Implements ZoneAction. + **/ +void loadBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Inform a block allocator that its slab journals have been recovered from the + * recovery journal. + * + * @param allocator The allocator to inform + * @param result The result of the recovery operation + **/ +void notifySlabJournalsAreRecovered(BlockAllocator *allocator, int result); + +/** + * Prepare the block allocator to come online and start allocating blocks. + * + *
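+ * This queues each clean slab for allocation, registers unrecovered slabs
+ * with the slab scrubber, and then starts scrubbing the high-priority ones
+ * (see prepareSlabsForAllocation()).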

Implements ZoneAction. + **/ +void prepareAllocatorToAllocate(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Register a slab with the allocator, ready for use. + * + * @param allocator The allocator to use + * @param slab The slab in question + **/ +void registerSlabWithAllocator(BlockAllocator *allocator, Slab *slab); + +/** + * Register the new slabs belonging to this allocator. + * + *

Implements ZoneAction. + **/ +void registerNewSlabsForAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Drain all allocator I/O. Depending upon the type of drain, some or all + * dirty metadata may be written to disk. The type of drain will be determined + * from the state of the allocator's depot. + * + *

Implements ZoneAction. + **/ +void drainBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Resume a quiescent allocator. + * + *

Implements ZoneAction. + **/ +void resumeBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Request a commit of all dirty tail blocks which are locking a given recovery + * journal block. + * + *

Implements ZoneAction. + **/ +void releaseTailBlockLocks(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Get the slab summary zone for an allocator. + * + * @param allocator The allocator + * + * @return The SlabSummaryZone for that allocator + **/ +SlabSummaryZone *getSlabSummaryZone(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Acquire a VIO from a block allocator's VIO pool (asynchronous). + * + * @param allocator The allocator from which to get a VIO + * @param waiter The object requesting the VIO + * + * @return VDO_SUCCESS or an error + **/ +int acquireVIO(BlockAllocator *allocator, Waiter *waiter) + __attribute__((warn_unused_result)); + +/** + * Return a VIO to a block allocator's VIO pool + * + * @param allocator The block allocator which owns the VIO + * @param entry The VIO being returned + **/ +void returnVIO(BlockAllocator *allocator, VIOPoolEntry *entry); + +/** + * Initiate scrubbing all unrecovered slabs. + * + *

Implements ZoneAction. + **/ +void scrubAllUnrecoveredSlabsInZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Queue a waiter for a clean slab. + * + * @param allocator The allocator to wait on + * @param waiter The waiter + * + * @return VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no + * slabs to scrub, and some other error otherwise + **/ +int enqueueForCleanSlab(BlockAllocator *allocator, Waiter *waiter) + __attribute__((warn_unused_result)); + +/** + * Increase the scrubbing priority of a slab. + * + * @param slab The slab + **/ +void increaseScrubbingPriority(Slab *slab); + +/** + * Get the statistics for this allocator. + * + * @param allocator The allocator to query + * + * @return A copy of the current statistics for the allocator + **/ +BlockAllocatorStatistics +getBlockAllocatorStatistics(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Get the aggregated slab journal statistics for the slabs in this allocator. + * + * @param allocator The allocator to query + * + * @return A copy of the current statistics for the allocator + **/ +SlabJournalStatistics getSlabJournalStatistics(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Get the cumulative RefCounts statistics for the slabs in this allocator. + * + * @param allocator The allocator to query + * + * @return A copy of the current statistics for the allocator + **/ +RefCountsStatistics getRefCountsStatistics(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Dump information about a block allocator to the log for debugging. + * + * @param allocator The allocator to dump + **/ +void dumpBlockAllocator(const BlockAllocator *allocator); + +#endif // BLOCK_ALLOCATOR_H diff --git a/vdo/base/blockAllocatorInternals.h b/vdo/base/blockAllocatorInternals.h new file mode 100644 index 0000000..83db684 --- /dev/null +++ b/vdo/base/blockAllocatorInternals.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockAllocatorInternals.h#11 $ + */ + +#ifndef BLOCK_ALLOCATOR_INTERNALS_H +#define BLOCK_ALLOCATOR_INTERNALS_H + +#include "adminState.h" +#include "atomic.h" +#include "blockAllocator.h" +#include "priorityTable.h" +#include "ringNode.h" +#include "slabScrubber.h" +#include "vioPool.h" + +enum { + /* + * The number of VIOs in the VIO pool is proportional to the throughput of + * the VDO. 
+ */ + VIO_POOL_SIZE = 128, +}; + +typedef enum { + DRAIN_ALLOCATOR_START = 0, + DRAIN_ALLOCATOR_STEP_SCRUBBER, + DRAIN_ALLOCATOR_STEP_SLABS, + DRAIN_ALLOCATOR_STEP_SUMMARY, + DRAIN_ALLOCATOR_STEP_FINISHED, +} BlockAllocatorDrainStep; + +/** + * A sub-structure for applying actions in parallel to all an allocator's + * slabs. + **/ +typedef struct { + /** The number of slabs performing a slab action */ + SlabCount slabActionCount; + /** The method to call when a slab action has been completed by all slabs */ + VDOAction *callback; +} SlabActor; + +/** + * These fields are only modified by the physical zone thread, but are queried + * by other threads. + **/ +typedef struct atomicAllocatorStatistics { + /** The count of allocated blocks in this zone */ + Atomic64 allocatedBlocks; + /** The number of slabs from which blocks have ever been allocated */ + Atomic64 slabsOpened; + /** The number of times since loading that a slab been re-opened */ + Atomic64 slabsReopened; +} AtomicAllocatorStatistics; + +/** + * The statistics for all the slab journals in the slabs owned by this + * allocator. These fields are all mutated only by the physical zone thread, + * but are read by other threads when gathering statistics for the entire + * depot. + **/ +typedef struct atomicSlabJournalStatistics { + /** Number of times the on-disk journal was full */ + Atomic64 diskFullCount; + /** Number of times an entry was added over the flush threshold */ + Atomic64 flushCount; + /** Number of times an entry was added over the block threshold */ + Atomic64 blockedCount; + /** Number of times the tail block was written */ + Atomic64 blocksWritten; + /** Number of times we had to wait for the tail block commit */ + Atomic64 tailBusyCount; +} AtomicSlabJournalStatistics; + +/** + * The statistics for all the RefCounts in the slabs owned by this + * allocator. These fields are all mutated only by the physical zone thread, + * but are read by other threads when gathering statistics for the entire + * depot. 
+ **/ +typedef struct atomicRefCountStatistics { + /** Number of blocks written */ + Atomic64 blocksWritten; +} AtomicRefCountStatistics; + +struct blockAllocator { + VDOCompletion completion; + /** The slab depot for this allocator */ + SlabDepot *depot; + /** The slab summary zone for this allocator */ + SlabSummaryZone *summary; + /** The notifier for entering read-only mode */ + ReadOnlyNotifier *readOnlyNotifier; + /** The nonce of the VDO */ + Nonce nonce; + /** The physical zone number of this allocator */ + ZoneCount zoneNumber; + /** The thread ID for this allocator's physical zone */ + ThreadID threadID; + /** The number of slabs in this allocator */ + SlabCount slabCount; + /** The number of the last slab owned by this allocator */ + SlabCount lastSlab; + /** The reduced priority level used to preserve unopened slabs */ + unsigned int unopenedSlabPriority; + /** The state of this allocator */ + AdminState state; + /** The actor for applying an action to all slabs */ + SlabActor slabActor; + + /** The slab from which blocks are currently being allocated */ + Slab *openSlab; + /** A priority queue containing all slabs available for allocation */ + PriorityTable *prioritizedSlabs; + /** The slab scrubber */ + SlabScrubber *slabScrubber; + /** What phase of the close operation the allocator is to perform */ + BlockAllocatorDrainStep drainStep; + /** Statistics for this block allocator */ + AtomicAllocatorStatistics statistics; + /** Cumulative statistics for the slab journals in this zone */ + AtomicSlabJournalStatistics slabJournalStatistics; + /** Cumulative statistics for the RefCounts in this zone */ + AtomicRefCountStatistics refCountStatistics; + + /** + * This is the head of a queue of slab journals which have entries in their + * tail blocks which have not yet started to commit. When the recovery + * journal is under space pressure, slab journals which have uncommitted + * entries holding a lock on the recovery journal head are forced to commit + * their blocks early. This list is kept in order, with the tail containing + * the slab journal holding the most recent recovery journal lock. + **/ + RingNode dirtySlabJournals; + + /** The VIO pool for reading and writing block allocator metadata */ + VIOPool *vioPool; +}; + +/** + * Construct allocator metadata VIOs. Exposed for unit tests. + * + * Implements VIOConstructor + **/ +int makeAllocatorPoolVIOs(PhysicalLayer *layer, + void *parent, + void *buffer, + VIO **vioPtr) + __attribute__((warn_unused_result)); + +/** + * Replace the VIO pool in a block allocator. This method exists for unit + * tests. + * + * @param allocator The block allocator + * @param size The number of entries in the pool + * @param layer The physical layer from which to allocate VIOs + * + * @return VDO_SUCCESS or an error + **/ +int replaceVIOPool(BlockAllocator *allocator, + size_t size, + PhysicalLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Prepare slabs for allocation or scrubbing. This method is exposed for + * testing. + * + * @param allocator The allocator to prepare + * + * @return VDO_SUCCESS or an error code + **/ +int prepareSlabsForAllocation(BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Start allocating from the highest numbered slab. 
+ * + * @param allocator The allocator + **/ +void allocateFromAllocatorLastSlab(BlockAllocator *allocator); + +#endif // BLOCK_ALLOCATOR_INTERNALS_H diff --git a/vdo/base/blockMap.c b/vdo/base/blockMap.c new file mode 100644 index 0000000..9a13c30 --- /dev/null +++ b/vdo/base/blockMap.c @@ -0,0 +1,861 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMap.c#24 $ + */ + +#include "blockMap.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "actionManager.h" +#include "adminState.h" +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "blockMapTree.h" +#include "constants.h" +#include "dataVIO.h" +#include "forest.h" +#include "numUtils.h" +#include "recoveryJournal.h" +#include "statusCodes.h" +#include "types.h" +#include "vdoInternal.h" +#include "vdoPageCache.h" + +typedef struct { + PhysicalBlockNumber flatPageOrigin; + BlockCount flatPageCount; + PhysicalBlockNumber rootOrigin; + BlockCount rootCount; +} __attribute__((packed)) BlockMapState2_0; + +static const Header BLOCK_MAP_HEADER_2_0 = { + .id = BLOCK_MAP, + .version = { + .majorVersion = 2, + .minorVersion = 0, + }, + .size = sizeof(BlockMapState2_0), +}; + +/** + * State associated which each block map page while it is in the VDO page + * cache. + **/ +typedef struct { + /** + * The earliest recovery journal block containing uncommitted updates to the + * block map page associated with this context. A reference (lock) is held + * on that block to prevent it from being reaped. When this value changes, + * the reference on the old value must be released and a reference on the + * new value must be acquired. + **/ + SequenceNumber recoveryLock; +} BlockMapPageContext; + +/** + * Implements VDOPageReadFunction. + **/ +static int validatePageOnRead(void *buffer, + PhysicalBlockNumber pbn, + BlockMapZone *zone, + void *pageContext) +{ + BlockMapPage *page = buffer; + BlockMapPageContext *context = pageContext; + Nonce nonce = zone->blockMap->nonce; + + BlockMapPageValidity validity = validateBlockMapPage(page, nonce, pbn); + if (validity == BLOCK_MAP_PAGE_BAD) { + return logErrorWithStringError(VDO_BAD_PAGE, + "Expected page %" PRIu64 + " but got page %llu instead", + pbn, getBlockMapPagePBN(page)); + } + + if (validity == BLOCK_MAP_PAGE_INVALID) { + formatBlockMapPage(page, nonce, pbn, false); + } + + context->recoveryLock = 0; + return VDO_SUCCESS; +} + +/** + * Handle journal updates and torn write protection. + * + * Implements VDOPageWriteFunction. 
+ **/ +static bool handlePageWrite(void *rawPage, + BlockMapZone *zone, + void *pageContext) +{ + BlockMapPage *page = rawPage; + BlockMapPageContext *context = pageContext; + + if (markBlockMapPageInitialized(page, true)) { + // Cause the page to be re-written. + return true; + } + + // Release the page's references on the recovery journal. + releaseRecoveryJournalBlockReference(zone->blockMap->journal, + context->recoveryLock, + ZONE_TYPE_LOGICAL, zone->zoneNumber); + context->recoveryLock = 0; + return false; +} + +/**********************************************************************/ +PageCount computeBlockMapPageCount(BlockCount entries) +{ + return computeBucketCount(entries, BLOCK_MAP_ENTRIES_PER_PAGE); +} + +/**********************************************************************/ +int makeBlockMap(BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockCount flatPageCount, + PhysicalBlockNumber rootOrigin, + BlockCount rootCount, + BlockMap **mapPtr) +{ + STATIC_ASSERT(BLOCK_MAP_ENTRIES_PER_PAGE + == ((VDO_BLOCK_SIZE - sizeof(BlockMapPage)) + / sizeof(BlockMapEntry))); + + BlockMap *map; + int result = ALLOCATE_EXTENDED(BlockMap, threadConfig->logicalZoneCount, + BlockMapZone, __func__, &map); + if (result != UDS_SUCCESS) { + return result; + } + + map->flatPageCount = flatPageCount; + map->rootOrigin = rootOrigin; + map->rootCount = rootCount; + map->entryCount = logicalBlocks; + + ZoneCount zoneCount = threadConfig->logicalZoneCount; + for (ZoneCount zone = 0; zone < zoneCount; zone++) { + BlockMapZone *blockMapZone = &map->zones[zone]; + blockMapZone->zoneNumber = zone; + blockMapZone->threadID = getLogicalZoneThread(threadConfig, zone); + blockMapZone->blockMap = map; + map->zoneCount++; + } + + *mapPtr = map; + return VDO_SUCCESS; +} + +/** + * Decode block map component state version 2.0 from a buffer. 
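Before the parameter list of this decode routine, it may help to see the layout it walks: version 2.0 block map state is four 64-bit values stored little-endian, so the encoded component is exactly 32 bytes. The following standalone sketch mirrors that layout with plain byte arithmetic rather than VDO's Buffer helpers; the field values are arbitrary examples.

/* Illustrative only: the 32-byte little-endian encoding of
 * BlockMapState2_0 (flatPageOrigin, flatPageCount, rootOrigin, rootCount). */
#include <stdint.h>
#include <stdio.h>

static void examplePutU64LE(uint8_t *out, uint64_t value)
{
  for (int i = 0; i < 8; i++) {
    out[i] = (uint8_t) (value >> (8 * i));
  }
}

static uint64_t exampleGetU64LE(const uint8_t *in)
{
  uint64_t value = 0;
  for (int i = 0; i < 8; i++) {
    value |= ((uint64_t) in[i]) << (8 * i);
  }
  return value;
}

int main(void)
{
  // Encode the four fields in declaration order (values are arbitrary).
  uint8_t  encoded[32];
  uint64_t fields[4] = { 1, 0, 2, 60 };
  for (int i = 0; i < 4; i++) {
    examplePutU64LE(encoded + (8 * i), fields[i]);
  }

  // Decoding reads them back in the same order; rootCount is the last field.
  printf("rootCount = %llu\n",
         (unsigned long long) exampleGetU64LE(encoded + 24));
  return 0;
}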
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param state The state structure to receive the decoded values + * + * @return UDS_SUCCESS or an error code + **/ +static int decodeBlockMapState_2_0(Buffer *buffer, BlockMapState2_0 *state) +{ + size_t initialLength = contentLength(buffer); + + PhysicalBlockNumber flatPageOrigin; + int result = getUInt64LEFromBuffer(buffer, &flatPageOrigin); + if (result != UDS_SUCCESS) { + return result; + } + + BlockCount flatPageCount; + result = getUInt64LEFromBuffer(buffer, &flatPageCount); + if (result != UDS_SUCCESS) { + return result; + } + + PhysicalBlockNumber rootOrigin; + result = getUInt64LEFromBuffer(buffer, &rootOrigin); + if (result != UDS_SUCCESS) { + return result; + } + + BlockCount rootCount; + result = getUInt64LEFromBuffer(buffer, &rootCount); + if (result != UDS_SUCCESS) { + return result; + } + + *state = (BlockMapState2_0) { + .flatPageOrigin = flatPageOrigin, + .flatPageCount = flatPageCount, + .rootOrigin = rootOrigin, + .rootCount = rootCount, + }; + + size_t decodedSize = initialLength - contentLength(buffer); + return ASSERT(BLOCK_MAP_HEADER_2_0.size == decodedSize, + "decoded block map component size must match header size"); +} + +/**********************************************************************/ +int decodeBlockMap(Buffer *buffer, + BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockMap **mapPtr) +{ + Header header; + int result = decodeHeader(buffer, &header); + if (result != VDO_SUCCESS) { + return result; + } + + result = validateHeader(&BLOCK_MAP_HEADER_2_0, &header, true, __func__); + if (result != VDO_SUCCESS) { + return result; + } + + BlockMapState2_0 state; + result = decodeBlockMapState_2_0(buffer, &state); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(state.flatPageOrigin == BLOCK_MAP_FLAT_PAGE_ORIGIN, + "Flat page origin must be %u (recorded as %llu)", + BLOCK_MAP_FLAT_PAGE_ORIGIN, state.flatPageOrigin); + if (result != UDS_SUCCESS) { + return result; + } + + BlockMap *map; + result = makeBlockMap(logicalBlocks, threadConfig, + state.flatPageCount, state.rootOrigin, + state.rootCount, &map); + if (result != VDO_SUCCESS) { + return result; + } + + *mapPtr = map; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int decodeSodiumBlockMap(Buffer *buffer, + BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockMap **mapPtr) +{ + // Sodium uses state version 2.0. + return decodeBlockMap(buffer, logicalBlocks, threadConfig, mapPtr); +} + +/** + * Initialize the per-zone portions of the block map. 
+ * + * @param zone The zone to initialize + * @param layer The physical layer on which the zone resides + * @param readOnlyNotifier The read-only context for the VDO + * @param cacheSize The size of the page cache for the zone + * @param maximumAge The number of journal blocks before a dirtied page + * is considered old and must be written out + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int initializeBlockMapZone(BlockMapZone *zone, + PhysicalLayer *layer, + ReadOnlyNotifier *readOnlyNotifier, + PageCount cacheSize, + BlockCount maximumAge) +{ + zone->readOnlyNotifier = readOnlyNotifier; + int result = initializeTreeZone(zone, layer, maximumAge); + if (result != VDO_SUCCESS) { + return result; + } + + return makeVDOPageCache(layer, cacheSize, validatePageOnRead, + handlePageWrite, sizeof(BlockMapPageContext), + maximumAge, zone, &zone->pageCache); +} + +/**********************************************************************/ +BlockMapZone *getBlockMapZone(BlockMap *map, ZoneCount zoneNumber) +{ + return &map->zones[zoneNumber]; +} + +/** + * Get the ID of the thread on which a given block map zone operates. + * + *

Implements ZoneThreadGetter. + **/ +static ThreadID getBlockMapZoneThreadID(void *context, ZoneCount zoneNumber) +{ + return getBlockMapZone(context, zoneNumber)->threadID; +} + +/** + * Prepare for an era advance. + * + *

Implements ActionPreamble. + **/ +static void prepareForEraAdvance(void *context, VDOCompletion *parent) +{ + BlockMap *map = context; + map->currentEraPoint = map->pendingEraPoint; + completeCompletion(parent); +} + +/** + * Update the progress of the era in a zone. + * + *

Implements ZoneAction. + **/ +static void advanceBlockMapZoneEra(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockMapZone *zone = getBlockMapZone(context, zoneNumber); + advanceVDOPageCachePeriod(zone->pageCache, zone->blockMap->currentEraPoint); + advanceZoneTreePeriod(&zone->treeZone, zone->blockMap->currentEraPoint); + finishCompletion(parent, VDO_SUCCESS); +} + +/** + * Schedule an era advance if necessary. This method should not be called + * directly. Rather, call scheduleDefaultAction() on the block map's action + * manager. + * + *
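The era bookkeeping above reduces to two sequence numbers: advanceBlockMapEra() (further below) records the newest recovery journal block as pendingEraPoint, an action is scheduled only when that differs from currentEraPoint, and the action preamble copies the pending value before the per-zone advance. A minimal standalone model of that check, with illustrative names and no threading:

/* Illustrative only: a standalone model of the era bookkeeping. The real
 * code distributes the advance as an action across all logical zone threads. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
  uint64_t currentEraPoint;  /* what the zones have already been told */
  uint64_t pendingEraPoint;  /* newest value not yet distributed */
} ExampleEraTracker;

/* Mirrors scheduleEraAdvance(): nothing to schedule unless the era moved. */
static bool exampleNeedsEraAdvance(const ExampleEraTracker *tracker)
{
  return (tracker->currentEraPoint != tracker->pendingEraPoint);
}

/* Mirrors prepareForEraAdvance(): adopt the pending value before fanning
 * the new era point out to each zone. */
static void examplePrepareForEraAdvance(ExampleEraTracker *tracker)
{
  tracker->currentEraPoint = tracker->pendingEraPoint;
}

int main(void)
{
  ExampleEraTracker tracker = { .currentEraPoint = 5, .pendingEraPoint = 5 };
  tracker.pendingEraPoint = 7;           /* like advanceBlockMapEra(map, 7) */
  if (exampleNeedsEraAdvance(&tracker)) {
    examplePrepareForEraAdvance(&tracker);
  }
  printf("current era point is now %llu\n",
         (unsigned long long) tracker.currentEraPoint);
  return 0;
}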

Implements ActionScheduler. + **/ +static bool scheduleEraAdvance(void *context) +{ + BlockMap *map = context; + if (map->currentEraPoint == map->pendingEraPoint) { + return false; + } + + return scheduleAction(map->actionManager, prepareForEraAdvance, + advanceBlockMapZoneEra, NULL, NULL); +} + +/**********************************************************************/ +int makeBlockMapCaches(BlockMap *map, + PhysicalLayer *layer, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *journal, + Nonce nonce, + PageCount cacheSize, + BlockCount maximumAge) +{ + int result = ASSERT(cacheSize > 0, "block map cache size is specified"); + if (result != UDS_SUCCESS) { + return result; + } + + map->journal = journal; + map->nonce = nonce; + + result = makeForest(map, map->entryCount); + if (result != VDO_SUCCESS) { + return result; + } + + replaceForest(map); + for (ZoneCount zone = 0; zone < map->zoneCount; zone++) { + result = initializeBlockMapZone(&map->zones[zone], layer, readOnlyNotifier, + cacheSize / map->zoneCount, maximumAge); + if (result != VDO_SUCCESS) { + return result; + } + } + + return makeActionManager(map->zoneCount, getBlockMapZoneThreadID, + getRecoveryJournalThreadID(journal), map, + scheduleEraAdvance, layer, + &map->actionManager); +} + +/** + * Clean up a BlockMapZone. + * + * @param zone The zone to uninitialize + **/ +static void uninitializeBlockMapZone(BlockMapZone *zone) +{ + uninitializeBlockMapTreeZone(&zone->treeZone); + freeVDOPageCache(&zone->pageCache); +} + +/**********************************************************************/ +void freeBlockMap(BlockMap **mapPtr) +{ + BlockMap *map = *mapPtr; + if (map == NULL) { + return; + } + + for (ZoneCount zone = 0; zone < map->zoneCount; zone++) { + uninitializeBlockMapZone(&map->zones[zone]); + } + + abandonBlockMapGrowth(map); + freeForest(&map->forest); + freeActionManager(&map->actionManager); + + FREE(map); + *mapPtr = NULL; +} + +/**********************************************************************/ +size_t getBlockMapEncodedSize(void) +{ + return ENCODED_HEADER_SIZE + sizeof(BlockMapState2_0); +} + +/**********************************************************************/ +int encodeBlockMap(const BlockMap *map, Buffer *buffer) +{ + int result = encodeHeader(&BLOCK_MAP_HEADER_2_0, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + size_t initialLength = contentLength(buffer); + + result = putUInt64LEIntoBuffer(buffer, BLOCK_MAP_FLAT_PAGE_ORIGIN); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, map->flatPageCount); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, map->rootOrigin); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, map->rootCount); + if (result != UDS_SUCCESS) { + return result; + } + + size_t encodedSize = contentLength(buffer) - initialLength; + return ASSERT(BLOCK_MAP_HEADER_2_0.size == encodedSize, + "encoded block map component size must match header size"); +} + +/**********************************************************************/ +void initializeBlockMapFromJournal(BlockMap *map, RecoveryJournal *journal) +{ + map->currentEraPoint = getCurrentJournalSequenceNumber(journal); + map->pendingEraPoint = map->currentEraPoint; + + for (ZoneCount zone = 0; zone < map->zoneCount; zone++) { + setTreeZoneInitialPeriod(&map->zones[zone].treeZone, map->currentEraPoint); + setVDOPageCacheInitialPeriod(map->zones[zone].pageCache, + 
map->currentEraPoint); + } +} + +/**********************************************************************/ +ZoneCount computeLogicalZone(DataVIO *dataVIO) +{ + BlockMap *map = getBlockMap(getVDOFromDataVIO(dataVIO)); + TreeLock *treeLock = &dataVIO->treeLock; + PageNumber pageNumber = computePageNumber(dataVIO->logical.lbn); + treeLock->treeSlots[0].pageIndex = pageNumber; + treeLock->rootIndex = pageNumber % map->rootCount; + return (treeLock->rootIndex % map->zoneCount); +} + +/**********************************************************************/ +void findBlockMapSlotAsync(DataVIO *dataVIO, + VDOAction *callback, + ThreadID threadID) +{ + BlockMap *map = getBlockMap(getVDOFromDataVIO(dataVIO)); + if (dataVIO->logical.lbn >= map->entryCount) { + finishDataVIO(dataVIO, VDO_OUT_OF_RANGE); + return; + } + + TreeLock *treeLock = &dataVIO->treeLock; + BlockMapTreeSlot *slot = &treeLock->treeSlots[0]; + slot->blockMapSlot.slot = computeSlot(dataVIO->logical.lbn); + if (slot->pageIndex < map->flatPageCount) { + slot->blockMapSlot.pbn = slot->pageIndex + BLOCK_MAP_FLAT_PAGE_ORIGIN; + launchCallback(dataVIOAsCompletion(dataVIO), callback, threadID); + return; + } + + treeLock->callback = callback; + treeLock->threadID = threadID; + lookupBlockMapPBN(dataVIO); +} + +/**********************************************************************/ +PageCount getNumberOfFixedBlockMapPages(const BlockMap *map) +{ + return (map->flatPageCount + map->rootCount); +} + +/**********************************************************************/ +BlockCount getNumberOfBlockMapEntries(const BlockMap *map) +{ + return map->entryCount; +} + +/**********************************************************************/ +void advanceBlockMapEra(BlockMap *map, SequenceNumber recoveryBlockNumber) +{ + if (map == NULL) { + return; + } + + map->pendingEraPoint = recoveryBlockNumber; + scheduleDefaultAction(map->actionManager); +} + +/**********************************************************************/ +void checkForDrainComplete(BlockMapZone *zone) +{ + if (isDraining(&zone->state) + && !isTreeZoneActive(&zone->treeZone) + && !isPageCacheActive(zone->pageCache)) { + finishDrainingWithResult(&zone->state, + (isReadOnly(zone->readOnlyNotifier) + ? VDO_READ_ONLY : VDO_SUCCESS)); + } +} + +/** + * Initiate a drain of the trees and page cache of a block map zone. + * + * Implements AdminInitiator + **/ +static void initiateDrain(AdminState *state) +{ + BlockMapZone *zone = container_of(state, BlockMapZone, state); + drainZoneTrees(&zone->treeZone); + drainVDOPageCache(zone->pageCache); + checkForDrainComplete(zone); +} + +/** + * Drain a zone of the block map. + * + *
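The routing in computeLogicalZone() and computeSlot() above is pure arithmetic: the page number selects a tree root, and the root selects a logical zone. The sketch below restates it standalone; the entries-per-page value of 812 is an assumption for a 4 KB block, not taken from the real headers (the authoritative value is pinned by the STATIC_ASSERT in makeBlockMap()).

/* Illustrative only: the zone-routing arithmetic from computeLogicalZone()
 * and computeSlot(), with an assumed entries-per-page constant. */
#include <stdint.h>
#include <stdio.h>

enum { EXAMPLE_ENTRIES_PER_PAGE = 812 };  /* assumed, see note above */

typedef struct {
  uint64_t rootCount;  /* number of block map tree roots */
  uint64_t zoneCount;  /* number of logical zones */
} ExampleMapShape;

static uint64_t exampleZoneForLBN(const ExampleMapShape *shape, uint64_t lbn)
{
  uint64_t pageNumber = lbn / EXAMPLE_ENTRIES_PER_PAGE;  /* computePageNumber */
  uint64_t rootIndex  = pageNumber % shape->rootCount;   /* which tree root */
  return rootIndex % shape->zoneCount;                   /* which logical zone */
}

int main(void)
{
  ExampleMapShape shape = { .rootCount = 60, .zoneCount = 3 };
  uint64_t lbn = 100000;
  printf("lbn %llu -> slot %llu in zone %llu\n",
         (unsigned long long) lbn,
         (unsigned long long) (lbn % EXAMPLE_ENTRIES_PER_PAGE), /* computeSlot */
         (unsigned long long) exampleZoneForLBN(&shape, lbn));
  return 0;
}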

Implements ZoneAction. + **/ +static void drainZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockMapZone *zone = getBlockMapZone(context, zoneNumber); + startDraining(&zone->state, + getCurrentManagerOperation(zone->blockMap->actionManager), + parent, initiateDrain); +} + +/**********************************************************************/ +void drainBlockMap(BlockMap *map, + AdminStateCode operation, + VDOCompletion *parent) +{ + scheduleOperation(map->actionManager, operation, NULL, drainZone, NULL, + parent); +} + +/** + * Resume a zone of the block map. + * + *

Implements ZoneAction. + **/ +static void resumeBlockMapZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockMapZone *zone = getBlockMapZone(context, zoneNumber); + finishCompletion(parent, resumeIfQuiescent(&zone->state)); +} + +/**********************************************************************/ +void resumeBlockMap(BlockMap *map, VDOCompletion *parent) +{ + scheduleOperation(map->actionManager, ADMIN_STATE_RESUMING, NULL, + resumeBlockMapZone, NULL, parent); +} + +/**********************************************************************/ +int prepareToGrowBlockMap(BlockMap *map, BlockCount newLogicalBlocks) +{ + if (map->nextEntryCount == newLogicalBlocks) { + return VDO_SUCCESS; + } + + if (map->nextEntryCount > 0) { + abandonBlockMapGrowth(map); + } + + if (newLogicalBlocks < map->entryCount) { + map->nextEntryCount = map->entryCount; + return VDO_SUCCESS; + } + + return makeForest(map, newLogicalBlocks); +} + +/**********************************************************************/ +BlockCount getNewEntryCount(BlockMap *map) +{ + return map->nextEntryCount; +} + +/** + * Grow the block map by replacing the forest with the one which was prepared. + * + * Implements ActionPreamble + **/ +static void growForest(void *context, VDOCompletion *completion) +{ + replaceForest(context); + completeCompletion(completion); +} + +/**********************************************************************/ +void growBlockMap(BlockMap *map, VDOCompletion *parent) +{ + scheduleOperation(map->actionManager, ADMIN_STATE_SUSPENDED_OPERATION, + growForest, NULL, NULL, parent); +} + +/**********************************************************************/ +void abandonBlockMapGrowth(BlockMap *map) +{ + abandonForest(map); +} + +/** + * Finish processing a block map get or put operation. This function releases + * the page completion and then continues the requester. + * + * @param completion The completion for the page fetch + * @param result The result of the block map operation + **/ +static inline void finishProcessingPage(VDOCompletion *completion, int result) +{ + VDOCompletion *parent = completion->parent; + releaseVDOPageCompletion(completion); + continueCompletion(parent, result); +} + +/** + * Handle an error fetching a page from the cache. This error handler is + * registered in setupMappedBlock(). + * + * @param completion The page completion which got an error + **/ +static void handlePageError(VDOCompletion *completion) +{ + finishProcessingPage(completion, completion->result); +} + +/** + * Get the mapping page for a get/put mapped block operation and dispatch to + * the appropriate handler. + * + * @param dataVIO The dataVIO + * @param modifiable Whether we intend to modify the mapping + * @param action The handler to process the mapping page + **/ +static void setupMappedBlock(DataVIO *dataVIO, + bool modifiable, + VDOAction *action) +{ + BlockMapZone *zone = getBlockMapForZone(dataVIO->logical.zone); + if (isDraining(&zone->state)) { + finishDataVIO(dataVIO, VDO_SHUTTING_DOWN); + return; + } + + initVDOPageCompletion(&dataVIO->pageCompletion, zone->pageCache, + dataVIO->treeLock.treeSlots[0].blockMapSlot.pbn, + modifiable, dataVIOAsCompletion(dataVIO), action, + handlePageError); + getVDOPageAsync(&dataVIO->pageCompletion.completion); +} + +/** + * Decode and validate a block map entry and attempt to use it to set the + * mapped location of a DataVIO. 
+ * + * @param dataVIO The DataVIO to update with the map entry + * @param entry The block map entry for the logical block + * + * @return VDO_SUCCESS or VDO_BAD_MAPPING if the map entry is invalid + * or an error code for any other failure + **/ +__attribute__((warn_unused_result)) +static int setMappedEntry(DataVIO *dataVIO, const BlockMapEntry *entry) +{ + // Unpack the PBN for logging purposes even if the entry is invalid. + DataLocation mapped = unpackBlockMapEntry(entry); + + if (isValidLocation(&mapped)) { + int result = setMappedLocation(dataVIO, mapped.pbn, mapped.state); + /* + * Return success and all errors not specifically known to be errors from + * validating the location. Yes, this expression is redundant; it is + * intentional. + */ + if ((result == VDO_SUCCESS) + || ((result != VDO_OUT_OF_RANGE) && (result != VDO_BAD_MAPPING))) { + return result; + } + } + + // Log the corruption even if we wind up ignoring it for write VIOs, + // converting all cases to VDO_BAD_MAPPING. + logErrorWithStringError(VDO_BAD_MAPPING, "PBN %" PRIu64 + " with state %u read from the block map was invalid", + mapped.pbn, mapped.state); + + // A read VIO has no option but to report the bad mapping--reading + // zeros would be hiding known data loss. + if (isReadDataVIO(dataVIO)) { + return VDO_BAD_MAPPING; + } + + // A write VIO only reads this mapping to decref the old block. Treat + // this as an unmapped entry rather than fail the write. + clearMappedLocation(dataVIO); + return VDO_SUCCESS; +} + +/** + * This callback is registered in getMappedBlockAsync(). + **/ +static void getMappingFromFetchedPage(VDOCompletion *completion) +{ + if (completion->result != VDO_SUCCESS) { + finishProcessingPage(completion, completion->result); + return; + } + + const BlockMapPage *page = dereferenceReadableVDOPage(completion); + int result = ASSERT(page != NULL, "page available"); + if (result != VDO_SUCCESS) { + finishProcessingPage(completion, result); + return; + } + + DataVIO *dataVIO = asDataVIO(completion->parent); + BlockMapTreeSlot *treeSlot = &dataVIO->treeLock.treeSlots[0]; + const BlockMapEntry *entry = &page->entries[treeSlot->blockMapSlot.slot]; + + result = setMappedEntry(dataVIO, entry); + finishProcessingPage(completion, result); +} + +/** + * This callback is registered in putMappedBlockAsync(). + **/ +static void putMappingInFetchedPage(VDOCompletion *completion) +{ + if (completion->result != VDO_SUCCESS) { + finishProcessingPage(completion, completion->result); + return; + } + + BlockMapPage *page = dereferenceWritableVDOPage(completion); + int result = ASSERT(page != NULL, "page available"); + if (result != VDO_SUCCESS) { + finishProcessingPage(completion, result); + return; + } + + DataVIO *dataVIO = asDataVIO(completion->parent); + BlockMapPageContext *context = getVDOPageCompletionContext(completion); + SequenceNumber oldLock = context->recoveryLock; + updateBlockMapPage(page, dataVIO, dataVIO->newMapped.pbn, + dataVIO->newMapped.state, &context->recoveryLock); + markCompletedVDOPageDirty(completion, oldLock, context->recoveryLock); + finishProcessingPage(completion, VDO_SUCCESS); +} + +/**********************************************************************/ +void getMappedBlockAsync(DataVIO *dataVIO) +{ + if (dataVIO->treeLock.treeSlots[0].blockMapSlot.pbn == ZERO_BLOCK) { + // We know that the block map page for this LBN has not been allocated, + // so the block must be unmapped. 
+ clearMappedLocation(dataVIO); + continueDataVIO(dataVIO, VDO_SUCCESS); + return; + } + + setupMappedBlock(dataVIO, false, getMappingFromFetchedPage); +} + +/**********************************************************************/ +void putMappedBlockAsync(DataVIO *dataVIO) +{ + setupMappedBlock(dataVIO, true, putMappingInFetchedPage); +} + +/**********************************************************************/ +BlockMapStatistics getBlockMapStatistics(BlockMap *map) +{ + BlockMapStatistics stats; + memset(&stats, 0, sizeof(BlockMapStatistics)); + + for (ZoneCount zone = 0; zone < map->zoneCount; zone++) { + const AtomicPageCacheStatistics *atoms + = getVDOPageCacheStatistics(map->zones[zone].pageCache); + stats.dirtyPages += atomicLoad64(&atoms->counts.dirtyPages); + stats.cleanPages += atomicLoad64(&atoms->counts.cleanPages); + stats.freePages += atomicLoad64(&atoms->counts.freePages); + stats.failedPages += atomicLoad64(&atoms->counts.failedPages); + stats.incomingPages += atomicLoad64(&atoms->counts.incomingPages); + stats.outgoingPages += atomicLoad64(&atoms->counts.outgoingPages); + + stats.cachePressure += atomicLoad64(&atoms->cachePressure); + stats.readCount += atomicLoad64(&atoms->readCount); + stats.writeCount += atomicLoad64(&atoms->writeCount); + stats.failedReads += atomicLoad64(&atoms->failedReads); + stats.failedWrites += atomicLoad64(&atoms->failedWrites); + stats.reclaimed += atomicLoad64(&atoms->reclaimed); + stats.readOutgoing += atomicLoad64(&atoms->readOutgoing); + stats.foundInCache += atomicLoad64(&atoms->foundInCache); + stats.discardRequired += atomicLoad64(&atoms->discardRequired); + stats.waitForPage += atomicLoad64(&atoms->waitForPage); + stats.fetchRequired += atomicLoad64(&atoms->fetchRequired); + stats.pagesLoaded += atomicLoad64(&atoms->pagesLoaded); + stats.pagesSaved += atomicLoad64(&atoms->pagesSaved); + stats.flushCount += atomicLoad64(&atoms->flushCount); + } + + return stats; +} diff --git a/vdo/base/blockMap.h b/vdo/base/blockMap.h new file mode 100644 index 0000000..48073a9 --- /dev/null +++ b/vdo/base/blockMap.h @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMap.h#4 $ + */ + +#ifndef BLOCK_MAP_H +#define BLOCK_MAP_H + +#include "adminState.h" +#include "blockMapEntry.h" +#include "completion.h" +#include "fixedLayout.h" +#include "statistics.h" +#include "types.h" + +/** + * Create a block map. 
+ * + * @param [in] logicalBlocks The number of logical blocks for the VDO + * @param [in] threadConfig The thread configuration of the VDO + * @param [in] flatPageCount The number of flat pages + * @param [in] rootOrigin The absolute PBN of the first root page + * @param [in] rootCount The number of tree roots + * @param [out] mapPtr The pointer to hold the new block map + * + * @return VDO_SUCCESS or an error code + **/ +int makeBlockMap(BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockCount flatPageCount, + PhysicalBlockNumber rootOrigin, + BlockCount rootCount, + BlockMap **mapPtr) + __attribute__((warn_unused_result)); + +/** + * Quiesce all block map I/O, possibly writing out all dirty metadata. + * + * @param map The block map to drain + * @param operation The type of drain to perform + * @param parent The completion to notify when the drain is complete + **/ +void drainBlockMap(BlockMap *map, + AdminStateCode operation, + VDOCompletion *parent); + +/** + * Resume I/O for a quiescent block map. + * + * @param map The block map to resume + * @param parent The completion to notify when the resume is complete + **/ +void resumeBlockMap(BlockMap *map, VDOCompletion *parent); + +/** + * Prepare to grow the block map by allocating an expanded collection of trees. + * + * @param map The block map to grow + * @param newLogicalBlocks The new logical size of the VDO + * + * @return VDO_SUCCESS or an error + **/ +int prepareToGrowBlockMap(BlockMap *map, BlockCount newLogicalBlocks) + __attribute__((warn_unused_result)); + +/** + * Get the logical size to which this block map is prepared to grow. + * + * @param map The block map + * + * @return The new number of entries the block map will be grown to or 0 if + * the block map is not prepared to grow + **/ +BlockCount getNewEntryCount(BlockMap *map) + __attribute__((warn_unused_result)); + +/** + * Grow a block map on which prepareToGrowBlockMap() has already been called. + * + * @param map The block map to grow + * @param parent The object to notify when the growth is complete + **/ +void growBlockMap(BlockMap *map, VDOCompletion *parent); + +/** + * Abandon any preparations which were made to grow this block map. + * + * @param map The map which won't be grown + **/ +void abandonBlockMapGrowth(BlockMap *map); + +/** + * Decode the state of a block map saved in a buffer, without creating page + * caches. + * + * @param [in] buffer A buffer containing the super block state + * @param [in] logicalBlocks The number of logical blocks for the VDO + * @param [in] threadConfig The thread configuration of the VDO + * @param [out] mapPtr The pointer to hold the new block map + * + * @return VDO_SUCCESS or an error code + **/ +int decodeBlockMap(Buffer *buffer, + BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockMap **mapPtr) + __attribute__((warn_unused_result)); + +/** + * Create a block map from the saved state of a Sodium block map, and do any + * necessary upgrade work. + * + * @param [in] buffer A buffer containing the super block state + * @param [in] logicalBlocks The number of logical blocks for the VDO + * @param [in] threadConfig The thread configuration of the VDO + * @param [out] mapPtr The pointer to hold the new block map + * + * @return VDO_SUCCESS or an error code + **/ +int decodeSodiumBlockMap(Buffer *buffer, + BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockMap **mapPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate the page caches for a block map. 
+ * + * @param map The block map needing caches. + * @param layer The physical layer for the cache + * @param readOnlyNotifier The read only mode context + * @param journal The recovery journal (may be NULL) + * @param nonce The nonce to distinguish initialized pages + * @param cacheSize The block map cache size, in pages + * @param maximumAge The number of journal blocks before a dirtied page + * is considered old and must be written out + * + * @return VDO_SUCCESS or an error code + **/ +int makeBlockMapCaches(BlockMap *map, + PhysicalLayer *layer, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *journal, + Nonce nonce, + PageCount cacheSize, + BlockCount maximumAge) + __attribute__((warn_unused_result)); + +/** + * Free a block map and null out the reference to it. + * + * @param mapPtr A pointer to the block map to free + **/ +void freeBlockMap(BlockMap **mapPtr); + +/** + * Get the size of the encoded state of a block map. + * + * @return The encoded size of the map's state + **/ +size_t getBlockMapEncodedSize(void) + __attribute__((warn_unused_result)); + +/** + * Encode the state of a block map into a buffer. + * + * @param map The block map to encode + * @param buffer The buffer to encode into + * + * @return UDS_SUCCESS or an error + **/ +int encodeBlockMap(const BlockMap *map, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Obtain any necessary state from the recovery journal that is needed for + * normal block map operation. + * + * @param map The map in question + * @param journal The journal to initialize from + **/ +void initializeBlockMapFromJournal(BlockMap *map, RecoveryJournal *journal); + +/** + * Get the portion of the block map for a given logical zone. + * + * @param map The map + * @param zoneNumber The number of the zone + * + * @return The requested block map zone + **/ +BlockMapZone *getBlockMapZone(BlockMap *map, ZoneCount zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Compute the logical zone on which the entry for a DataVIO + * resides + * + * @param dataVIO The DataVIO + * + * @return The logical zone number for the DataVIO + **/ +ZoneCount computeLogicalZone(DataVIO *dataVIO); + +/** + * Compute the block map slot in which the block map entry for a DataVIO + * resides, and cache that number in the DataVIO. + * + * @param dataVIO The DataVIO + * @param callback The function to call once the slot has been found + * @param threadID The thread on which to run the callback + **/ +void findBlockMapSlotAsync(DataVIO *dataVIO, + VDOAction *callback, + ThreadID threadID); + +/** + * Get number of block map pages at predetermined locations. + * + * @param map The block map + * + * @return The number of fixed pages used by the map + **/ +PageCount getNumberOfFixedBlockMapPages(const BlockMap *map) + __attribute__((warn_unused_result)); + +/** + * Get number of block map entries. + * + * @param map The block map + * + * @return The number of entries stored in the map + **/ +BlockCount getNumberOfBlockMapEntries(const BlockMap *map) + __attribute__((warn_unused_result)); + +/** + * Notify the block map that the recovery journal has finished a new block. + * This method must be called from the journal zone thread. 
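Taking the declarations in this header together, the following is a hedged sketch of how a caller might wire up a block map. The ordering is only what the declarations imply, the numeric arguments are placeholders, and this is not the actual VDO load path.

/* A sketch only: construction sequence implied by blockMap.h. Error paths
 * beyond the minimum and the real call sites are omitted. */
#include "blockMap.h"

static int exampleBringUpBlockMap(BlockCount          logicalBlocks,
                                  const ThreadConfig *threadConfig,
                                  PhysicalLayer      *layer,
                                  ReadOnlyNotifier   *notifier,
                                  RecoveryJournal    *journal,
                                  Nonce               nonce,
                                  BlockMap          **mapPtr)
{
  // Build the in-memory structure; flat page count, root origin, and root
  // count are placeholder values for illustration.
  int result = makeBlockMap(logicalBlocks, threadConfig, 0,
                            /* rootOrigin */ 1, /* rootCount */ 60, mapPtr);
  if (result != VDO_SUCCESS) {
    return result;
  }

  // Attach the per-zone page caches and tree zones (sizes are placeholders).
  result = makeBlockMapCaches(*mapPtr, layer, notifier, journal, nonce,
                              /* cacheSize */ 128, /* maximumAge */ 16);
  if (result != VDO_SUCCESS) {
    freeBlockMap(mapPtr);
    return result;
  }

  // Pick up the current era point from the recovery journal.
  initializeBlockMapFromJournal(*mapPtr, journal);
  return VDO_SUCCESS;
}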
+ * + * @param map The block map + * @param recoveryBlockNumber The sequence number of the finished recovery + * journal block + **/ +void advanceBlockMapEra(BlockMap *map, SequenceNumber recoveryBlockNumber); + +/** + * Get the block number of the physical block containing the data for the + * specified logical block number. All blocks are mapped to physical block + * zero by default, which is conventionally the zero block. + * + * @param dataVIO The DataVIO of the block to map + **/ +void getMappedBlockAsync(DataVIO *dataVIO); + +/** + * Associate the logical block number for a block represented by a DataVIO + * with the physical block number in its newMapped field. + * + * @param dataVIO The DataVIO of the block to map + **/ +void putMappedBlockAsync(DataVIO *dataVIO); + +/** + * Get the stats for the block map page cache. + * + * @param map The block map containing the cache + * + * @return The block map statistics + **/ +BlockMapStatistics getBlockMapStatistics(BlockMap *map) + __attribute__((warn_unused_result)); + +#endif // BLOCK_MAP_H diff --git a/vdo/base/blockMapEntry.h b/vdo/base/blockMapEntry.h new file mode 100644 index 0000000..78304e9 --- /dev/null +++ b/vdo/base/blockMapEntry.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapEntry.h#4 $ + */ + +#ifndef BLOCK_MAP_ENTRY_H +#define BLOCK_MAP_ENTRY_H + +#include "blockMappingState.h" +#include "constants.h" +#include "numeric.h" +#include "types.h" + +/** + * The entry for each logical block in the block map is encoded into five + * bytes, which saves space in both the on-disk and in-memory layouts. It + * consists of the 36 low-order bits of a PhysicalBlockNumber (addressing 256 + * terabytes with a 4KB block size) and a 4-bit encoding of a + * BlockMappingState. + **/ +typedef union __attribute__((packed)) blockMapEntry { + struct __attribute__((packed)) { + /** + * Bits 7..4: The four highest bits of the 36-bit physical block number + * Bits 3..0: The 4-bit BlockMappingState + **/ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned mappingState : 4; + unsigned pbnHighNibble : 4; +#else + unsigned pbnHighNibble : 4; + unsigned mappingState : 4; +#endif + + /** 32 low-order bits of the 36-bit PBN, in little-endian byte order */ + byte pbnLowWord[4]; + } fields; + + // A raw view of the packed encoding. + uint8_t raw[5]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining packed entries in GDB. 
+ struct __attribute__((packed)) { + unsigned mappingState : 4; + unsigned pbnHighNibble : 4; + uint32_t pbnLowWord; + } littleEndian; +#endif +} BlockMapEntry; + +/** + * Unpack the fields of a BlockMapEntry, returning them as a DataLocation. + * + * @param entry A pointer to the entry to unpack + * + * @return the location of the data mapped by the block map entry + **/ +static inline DataLocation unpackBlockMapEntry(const BlockMapEntry *entry) +{ + PhysicalBlockNumber low32 = getUInt32LE(entry->fields.pbnLowWord); + PhysicalBlockNumber high4 = entry->fields.pbnHighNibble; + return (DataLocation) { + .pbn = ((high4 << 32) | low32), + .state = entry->fields.mappingState, + }; +} + +/**********************************************************************/ +static inline bool isMappedLocation(const DataLocation *location) +{ + return (location->state != MAPPING_STATE_UNMAPPED); +} + +/**********************************************************************/ +static inline bool isValidLocation(const DataLocation *location) +{ + if (location->pbn == ZERO_BLOCK) { + return !isCompressed(location->state); + } else { + return isMappedLocation(location); + } +} + +/** + * Pack a PhysicalBlockNumber into a BlockMapEntry. + * + * @param pbn The physical block number to convert to its + * packed five-byte representation + * @param mappingState The mapping state of the block + * + * @return the packed representation of the block number and mapping state + * + * @note unrepresentable high bits of the unpacked PBN are silently truncated + **/ +static inline BlockMapEntry packPBN(PhysicalBlockNumber pbn, + BlockMappingState mappingState) +{ + BlockMapEntry entry; + entry.fields.mappingState = (mappingState & 0x0F); + entry.fields.pbnHighNibble = ((pbn >> 32) & 0x0F), + storeUInt32LE(entry.fields.pbnLowWord, pbn & UINT_MAX); + return entry; +} + +#endif // BLOCK_MAP_ENTRY_H diff --git a/vdo/base/blockMapInternals.h b/vdo/base/blockMapInternals.h new file mode 100644 index 0000000..9b2f7a5 --- /dev/null +++ b/vdo/base/blockMapInternals.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapInternals.h#12 $ + */ + +#ifndef BLOCK_MAP_INTERNALS_H +#define BLOCK_MAP_INTERNALS_H + +#include "adminState.h" +#include "blockMapEntry.h" +#include "blockMapTree.h" +#include "completion.h" +#include "dirtyLists.h" +#include "header.h" +#include "intMap.h" +#include "ringNode.h" +#include "types.h" +#include "vdoPageCache.h" +#include "vioPool.h" + +/** + * The per-zone fields used by the block map tree. 
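Before moving into blockMapInternals.h, the five-byte entry encoding from blockMapEntry.h above (a 36-bit PBN plus a 4-bit mapping state) can be made concrete with a standalone round trip. This sketch uses explicit byte arithmetic instead of the packed bit-fields, and the state value 1 merely stands in for some mapped state.

/* Illustrative only: pack and unpack the five-byte block map entry layout
 * described in blockMapEntry.h without relying on bit-field order. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
  uint8_t bytes[5];
} ExamplePackedEntry;

static ExamplePackedEntry examplePack(uint64_t pbn, unsigned state)
{
  ExamplePackedEntry entry;
  // Byte 0: mapping state in bits 3..0, the PBN's top nibble in bits 7..4.
  entry.bytes[0] = (uint8_t) ((state & 0x0F) | (((pbn >> 32) & 0x0F) << 4));
  // Bytes 1..4: the low 32 bits of the PBN, little-endian.
  for (int i = 0; i < 4; i++) {
    entry.bytes[i + 1] = (uint8_t) (pbn >> (8 * i));
  }
  return entry;
}

static void exampleUnpack(const ExamplePackedEntry *entry,
                          uint64_t *pbn, unsigned *state)
{
  *state = entry->bytes[0] & 0x0F;
  uint64_t value = ((uint64_t) (entry->bytes[0] >> 4)) << 32;
  for (int i = 0; i < 4; i++) {
    value |= ((uint64_t) entry->bytes[i + 1]) << (8 * i);
  }
  *pbn = value;
}

int main(void)
{
  // 0x123456789 needs all 36 bits; state 1 is an arbitrary non-zero state.
  ExamplePackedEntry entry = examplePack(0x123456789ULL, 1);
  uint64_t pbn;
  unsigned state;
  exampleUnpack(&entry, &pbn, &state);
  assert((pbn == 0x123456789ULL) && (state == 1));
  printf("pbn=0x%llx state=%u\n", (unsigned long long) pbn, state);
  return 0;
}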
+ **/ +struct blockMapTreeZone { + /** The BlockMapZone which owns this tree zone */ + BlockMapZone *mapZone; + /** The lists of dirty tree pages */ + DirtyLists *dirtyLists; + /** The number of tree lookups in progress */ + VIOCount activeLookups; + /** The map of pages currently being loaded */ + IntMap *loadingPages; + /** The pool of VIOs for tree I/O */ + VIOPool *vioPool; + /** The tree page which has issued or will be issuing a flush */ + TreePage *flusher; + /** The queue of pages waiting for a flush so they can be written out */ + WaitQueue flushWaiters; + /** The generation after the most recent flush */ + uint8_t generation; + /** The oldest active generation */ + uint8_t oldestGeneration; + /** The counts of dirty pages in each generation */ + uint32_t dirtyPageCounts[256]; +}; + +/** + * The per-zone fields of the block map. + **/ +struct blockMapZone { + /** The number of the zone this is */ + ZoneCount zoneNumber; + /** The ID of this zone's logical thread */ + ThreadID threadID; + /** The BlockMap which owns this BlockMapZone */ + BlockMap *blockMap; + /** The ReadOnlyNotifier of the VDO */ + ReadOnlyNotifier *readOnlyNotifier; + /** The page cache for this zone */ + VDOPageCache *pageCache; + /** The per-zone portion of the tree for this zone */ + BlockMapTreeZone treeZone; + /** The administrative state of the zone */ + AdminState state; +}; + +struct blockMap { + /** The manager for block map actions */ + ActionManager *actionManager; + /** The count of pages in the linear part of the block map */ + BlockCount flatPageCount; + /** The absolute PBN of the first root of the tree part of the block map */ + PhysicalBlockNumber rootOrigin; + /** The count of root pages of the tree part of the block map */ + BlockCount rootCount; + + /** The era point we are currently distributing to the zones */ + SequenceNumber currentEraPoint; + /** The next era point, not yet distributed to any zone */ + SequenceNumber pendingEraPoint; + + /** The number of entries in block map */ + BlockCount entryCount; + /** The VDO's nonce, for the pages */ + Nonce nonce; + /** The recovery journal for this map */ + RecoveryJournal *journal; + + /** The trees for finding block map pages */ + Forest *forest; + /** The expanded trees awaiting growth */ + Forest *nextForest; + /** The number of entries after growth */ + BlockCount nextEntryCount; + + /** The number of logical zones */ + ZoneCount zoneCount; + /** The per zone block map structure */ + BlockMapZone zones[]; +}; + +/** + * Compute the number of pages required for a block map with the specified + * parameters. + * + * @param entries The number of block map entries + * + * @return The number of pages required + **/ +PageCount computeBlockMapPageCount(BlockCount entries); + +/** + * Compute the number of the block map page on which the entry for a given + * logical block resides. + * + * @param lbn The logical block number whose page is desired + * + * @return The number of the block map page containing the entry for + * the given logical block number + **/ +__attribute__((warn_unused_result)) +static inline PageNumber computePageNumber(LogicalBlockNumber lbn) +{ + return (lbn / BLOCK_MAP_ENTRIES_PER_PAGE); +} + +/** + * Find the block map page slot in which the entry for a given logical + * block resides. 
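struct blockMap above ends with a flexible array member (BlockMapZone zones[]), which is why makeBlockMap() allocates it with ALLOCATE_EXTENDED(BlockMap, logicalZoneCount, BlockMapZone, ...). A minimal standalone equivalent of that allocation pattern, with stand-in types and plain calloc():

/* Illustrative only: one allocation covering a header plus its trailing
 * per-zone structures, as ALLOCATE_EXTENDED does for BlockMap. */
#include <stdio.h>
#include <stdlib.h>

typedef struct {
  unsigned zoneNumber;
} ExampleZone;

typedef struct {
  unsigned    zoneCount;
  ExampleZone zones[];   /* one trailing element per logical zone */
} ExampleMap;

static ExampleMap *exampleMakeMap(unsigned zoneCount)
{
  ExampleMap *map = calloc(1, sizeof(ExampleMap)
                              + (zoneCount * sizeof(ExampleZone)));
  if (map == NULL) {
    return NULL;
  }
  for (unsigned zone = 0; zone < zoneCount; zone++) {
    map->zones[zone].zoneNumber = zone;
    map->zoneCount++;
  }
  return map;
}

int main(void)
{
  ExampleMap *map = exampleMakeMap(3);
  if (map != NULL) {
    printf("zone 2 has number %u\n", map->zones[2].zoneNumber);
    free(map);
  }
  return 0;
}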
+ * + * @param lbn The logical block number whose slot + * + * @return The slot containing the entry for the given logical block number + **/ +__attribute__((warn_unused_result)) +static inline SlotNumber computeSlot(LogicalBlockNumber lbn) +{ + return (lbn % BLOCK_MAP_ENTRIES_PER_PAGE); +} + +/** + * Check whether a zone of the block map has drained, and if so, send a + * notification thereof. + * + * @param zone The zone to check + **/ +void checkForDrainComplete(BlockMapZone *zone); + + +#endif // BLOCK_MAP_INTERNALS_H diff --git a/vdo/base/blockMapPage.c b/vdo/base/blockMapPage.c new file mode 100644 index 0000000..8272e12 --- /dev/null +++ b/vdo/base/blockMapPage.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapPage.c#8 $ + */ + +#include "blockMapPage.h" + +#include "permassert.h" + +#include "blockMap.h" +#include "blockMapInternals.h" +#include "blockMapTree.h" +#include "constants.h" +#include "dataVIO.h" +#include "recoveryJournal.h" +#include "statusCodes.h" +#include "types.h" + +enum { + PAGE_HEADER_4_1_SIZE = 8 + 8 + 8 + 1 + 1 + 1 + 1, +}; + +static const VersionNumber BLOCK_MAP_4_1 = { + .majorVersion = 4, + .minorVersion = 1, +}; + +/**********************************************************************/ +bool isCurrentBlockMapPage(const BlockMapPage *page) +{ + return areSameVersion(BLOCK_MAP_4_1, unpackVersionNumber(page->version)); +} + +/**********************************************************************/ +BlockMapPage *formatBlockMapPage(void *buffer, + Nonce nonce, + PhysicalBlockNumber pbn, + bool initialized) +{ + memset(buffer, 0, VDO_BLOCK_SIZE); + BlockMapPage *page = (BlockMapPage *) buffer; + page->version = packVersionNumber(BLOCK_MAP_4_1); + storeUInt64LE(page->header.fields.nonce, nonce); + storeUInt64LE(page->header.fields.pbn, pbn); + page->header.fields.initialized = initialized; + return page; +} + +/**********************************************************************/ +BlockMapPageValidity validateBlockMapPage(BlockMapPage *page, + Nonce nonce, + PhysicalBlockNumber pbn) +{ + // Make sure the page layout isn't accidentally changed by changing the + // length of the page header. 
+ STATIC_ASSERT_SIZEOF(PageHeader, PAGE_HEADER_4_1_SIZE); + + if (!areSameVersion(BLOCK_MAP_4_1, unpackVersionNumber(page->version)) + || !isBlockMapPageInitialized(page) + || (nonce != getUInt64LE(page->header.fields.nonce))) { + return BLOCK_MAP_PAGE_INVALID; + } + + if (pbn != getBlockMapPagePBN(page)) { + return BLOCK_MAP_PAGE_BAD; + } + + return BLOCK_MAP_PAGE_VALID; +} + +/**********************************************************************/ +void updateBlockMapPage(BlockMapPage *page, + DataVIO *dataVIO, + PhysicalBlockNumber pbn, + BlockMappingState mappingState, + SequenceNumber *recoveryLock) +{ + // Encode the new mapping. + TreeLock *treeLock = &dataVIO->treeLock; + SlotNumber slot = treeLock->treeSlots[treeLock->height].blockMapSlot.slot; + page->entries[slot] = packPBN(pbn, mappingState); + + // Adjust references (locks) on the recovery journal blocks. + BlockMapZone *zone = getBlockMapForZone(dataVIO->logical.zone); + BlockMap *blockMap = zone->blockMap; + RecoveryJournal *journal = blockMap->journal; + SequenceNumber oldLocked = *recoveryLock; + SequenceNumber newLocked = dataVIO->recoverySequenceNumber; + + if ((oldLocked == 0) || (oldLocked > newLocked)) { + // Acquire a lock on the newly referenced journal block. + acquireRecoveryJournalBlockReference(journal, newLocked, ZONE_TYPE_LOGICAL, + zone->zoneNumber); + + // If the block originally held a newer lock, release it. + if (oldLocked > 0) { + releaseRecoveryJournalBlockReference(journal, oldLocked, + ZONE_TYPE_LOGICAL, + zone->zoneNumber); + } + + *recoveryLock = newLocked; + } + + // Release the transferred lock from the DataVIO. + releasePerEntryLockFromOtherZone(journal, newLocked); + dataVIO->recoverySequenceNumber = 0; +} diff --git a/vdo/base/blockMapPage.h b/vdo/base/blockMapPage.h new file mode 100644 index 0000000..ee011b3 --- /dev/null +++ b/vdo/base/blockMapPage.h @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapPage.h#8 $ + */ + +#ifndef BLOCK_MAP_PAGE_H +#define BLOCK_MAP_PAGE_H + +#include "numeric.h" + +#include "blockMapEntry.h" +#include "header.h" +#include "types.h" + +/** + * The packed, on-disk representation of a block map page header. + **/ +typedef union __attribute__((packed)) { + struct __attribute__((packed)) { + /** + * The 64-bit nonce of the current VDO, in little-endian byte order. Used + * to determine whether or not a page has been formatted. + **/ + byte nonce[8]; + + /** The 64-bit PBN of this page, in little-endian byte order */ + byte pbn[8]; + + /** Formerly recoverySequenceNumber; may be non-zero on disk */ + byte unusedLongWord[8]; + + /** Whether this page has been initialized on disk (i.e. 
written twice) */ + bool initialized; + + /** Formerly entryOffset; now unused since it should always be zero */ + byte unusedByte1; + + /** Formerly interiorTreePageWriting; may be non-zero on disk */ + byte unusedByte2; + + /** Formerly generation (for dirty tree pages); may be non-zero on disk */ + byte unusedByte3; + } fields; + + // A raw view of the packed encoding. + uint8_t raw[8 + 8 + 8 + 1 + 1 + 1 + 1]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining packed entries in GDB. + struct __attribute__((packed)) { + uint64_t nonce; + PhysicalBlockNumber pbn; + uint64_t unusedLongWord; + bool initialized; + uint8_t unusedByte1; + uint8_t unusedByte2; + uint8_t unusedByte3; + } littleEndian; +#endif +} PageHeader; + +/** + * The format of a block map page. + **/ +typedef struct __attribute__((packed)) { + PackedVersionNumber version; + PageHeader header; + BlockMapEntry entries[]; +} BlockMapPage; + +typedef enum { + // A block map page is correctly initialized + BLOCK_MAP_PAGE_VALID, + // A block map page is uninitialized + BLOCK_MAP_PAGE_INVALID, + // A block map page is intialized, but is the wrong page + BLOCK_MAP_PAGE_BAD, +} BlockMapPageValidity; + +/** + * Check whether a block map page has been initialized. + * + * @param page The page to check + * + * @return true if the page has been initialized + **/ +__attribute__((warn_unused_result)) +static inline bool isBlockMapPageInitialized(const BlockMapPage *page) +{ + return page->header.fields.initialized; +} + +/** + * Mark whether a block map page has been initialized. + * + * @param page The page to mark + * @param initialized The state to set + * + * @return true if the initialized flag was modified + **/ +static inline bool markBlockMapPageInitialized(BlockMapPage *page, + bool initialized) +{ + if (initialized == page->header.fields.initialized) { + return false; + } + + page->header.fields.initialized = initialized; + return true; +} + +/** + * Get the physical block number where a block map page is stored. + * + * @param page The page to query + * + * @return the page's physical block number + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber getBlockMapPagePBN(const BlockMapPage *page) +{ + return getUInt64LE(page->header.fields.pbn); +} + +/** + * Check whether a block map page is of the current version. + * + * @param page The page to check + * + * @return true if the page has the current version + **/ +bool isCurrentBlockMapPage(const BlockMapPage *page) + __attribute__((warn_unused_result)); + +/** + * Format a block map page in memory. + * + * @param buffer The buffer which holds the page + * @param nonce The VDO nonce + * @param pbn The absolute PBN of the page + * @param initialized Whether the page should be marked as initialized + * + * @return the buffer pointer, as a block map page (for convenience) + **/ +BlockMapPage *formatBlockMapPage(void *buffer, + Nonce nonce, + PhysicalBlockNumber pbn, + bool initialized); + +/** + * Check whether a newly read page is valid, upgrading its in-memory format if + * possible and necessary. If the page is valid, clear fields which are not + * meaningful on disk. 
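The checks behind this declaration (see the validateBlockMapPage() definition in the blockMapPage.c hunk above) reduce to a short decision order: anything that looks unformatted is INVALID, a formatted page recorded for a different PBN is BAD, and everything else is VALID. A standalone restatement over a simplified page descriptor, purely for illustration:

/* Illustrative only: the decision order used by validateBlockMapPage(). */
#include <stdbool.h>
#include <stdint.h>

typedef enum {
  EXAMPLE_PAGE_VALID,    /* correctly initialized and at the expected PBN */
  EXAMPLE_PAGE_INVALID,  /* wrong version, never written, or wrong nonce */
  EXAMPLE_PAGE_BAD,      /* a real page, but not the one that was asked for */
} ExamplePageValidity;

typedef struct {
  bool     currentVersion;
  bool     initialized;
  uint64_t nonce;
  uint64_t pbn;
} ExamplePage;

static ExamplePageValidity exampleValidate(const ExamplePage *page,
                                           uint64_t expectedNonce,
                                           uint64_t expectedPBN)
{
  // An unformatted-looking page is INVALID; the read path reformats it.
  if (!page->currentVersion || !page->initialized
      || (page->nonce != expectedNonce)) {
    return EXAMPLE_PAGE_INVALID;
  }

  // A formatted page recorded for a different PBN is an error (VDO_BAD_PAGE).
  if (page->pbn != expectedPBN) {
    return EXAMPLE_PAGE_BAD;
  }

  return EXAMPLE_PAGE_VALID;
}

int main(void)
{
  ExamplePage page = { .currentVersion = true, .initialized = true,
                       .nonce = 42, .pbn = 7 };
  return (exampleValidate(&page, 42, 9) == EXAMPLE_PAGE_BAD) ? 0 : 1;
}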
+ * + * @param page The page to validate + * @param nonce The VDO nonce + * @param pbn The expected absolute PBN of the page + * + * @return The validity of the page + **/ +BlockMapPageValidity validateBlockMapPage(BlockMapPage *page, + Nonce nonce, + PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Update an entry on a block map page. + * + * @param [in] page The page to update + * @param [in] dataVIO The DataVIO making the update + * @param [in] pbn The new PBN for the entry + * @param [in] mappingState The new mapping state for the entry + * @param [in,out] recoveryLock A reference to the current recovery sequence + * number lock held by the page. Will be updated + * if the lock changes to protect the new entry + **/ +void updateBlockMapPage(BlockMapPage *page, + DataVIO *dataVIO, + PhysicalBlockNumber pbn, + BlockMappingState mappingState, + SequenceNumber *recoveryLock); + +#endif // BLOCK_MAP_PAGE_H diff --git a/vdo/base/blockMapRecovery.c b/vdo/base/blockMapRecovery.c new file mode 100644 index 0000000..f70be42 --- /dev/null +++ b/vdo/base/blockMapRecovery.c @@ -0,0 +1,542 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapRecovery.c#7 $ + */ + +#include "blockMapRecovery.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "heap.h" +#include "numUtils.h" +#include "refCounts.h" +#include "slabDepot.h" +#include "types.h" +#include "vdoInternal.h" +#include "vdoPageCache.h" + +/** + * A completion to manage recovering the block map from the recovery journal. + * Note that the page completions kept in this structure are not immediately + * freed, so the corresponding pages will be locked down in the page cache + * until the recovery frees them. + **/ +typedef struct { + /** completion header */ + VDOCompletion completion; + /** the completion for flushing the block map */ + VDOCompletion subTaskCompletion; + /** the thread from which the block map may be flushed */ + ThreadID adminThread; + /** the thread on which all block map operations must be done */ + ThreadID logicalThreadID; + /** the block map */ + BlockMap *blockMap; + /** whether this recovery has been aborted */ + bool aborted; + /** whether we are currently launching the initial round of requests */ + bool launching; + + // Fields for the journal entries. + /** the journal entries to apply */ + NumberedBlockMapping *journalEntries; + /** + * a heap wrapping journalEntries. It re-orders and sorts journal entries in + * ascending LBN order, then original journal order. This permits efficient + * iteration over the journal entries in order. + **/ + Heap replayHeap; + + // Fields tracking progress through the journal entries. 
+ /** a pointer to the next journal entry to apply */ + NumberedBlockMapping *currentEntry; + /** the next entry for which the block map page has not been requested */ + NumberedBlockMapping *currentUnfetchedEntry; + + // Fields tracking requested pages. + /** the absolute PBN of the current page being processed */ + PhysicalBlockNumber pbn; + /** number of pending (non-ready) requests */ + PageCount outstanding; + /** number of page completions */ + PageCount pageCount; + /** array of requested, potentially ready page completions */ + VDOPageCompletion pageCompletions[]; +} BlockMapRecoveryCompletion; + +/** + * This is a HeapComparator function that orders NumberedBlockMappings using + * the 'blockMapSlot' field as the primary key and the mapping 'number' field + * as the secondary key. Using the mapping number preserves the journal order + * of entries for the same slot, allowing us to sort by slot while still + * ensuring we replay all entries with the same slot in the exact order as they + * appeared in the journal. + * + *
The comparator order is reversed from the usual sense since Heap is a + * max-heap, returning larger elements before smaller ones, but we want to pop + * entries off the heap in ascending LBN order. + **/ +static int compareMappings(const void *item1, const void *item2) +{ + const NumberedBlockMapping *mapping1 = (const NumberedBlockMapping *) item1; + const NumberedBlockMapping *mapping2 = (const NumberedBlockMapping *) item2; + + if (mapping1->blockMapSlot.pbn != mapping2->blockMapSlot.pbn) { + return + ((mapping1->blockMapSlot.pbn < mapping2->blockMapSlot.pbn) ? 1 : -1); + } + + if (mapping1->blockMapSlot.slot != mapping2->blockMapSlot.slot) { + return + ((mapping1->blockMapSlot.slot < mapping2->blockMapSlot.slot) ? 1 : -1); + } + + if (mapping1->number != mapping2->number) { + return ((mapping1->number < mapping2->number) ? 1 : -1); + } + + return 0; +} + +/** + * Swap two NumberedBlockMapping structures. Implements HeapSwapper. + **/ +static void swapMappings(void *item1, void *item2) +{ + NumberedBlockMapping *mapping1 = item1; + NumberedBlockMapping *mapping2 = item2; + NumberedBlockMapping temp = *mapping1; + *mapping1 = *mapping2; + *mapping2 = temp; +} + +/** + * Convert a VDOCompletion to a BlockMapRecoveryCompletion. + * + * @param completion The completion to convert + * + * @return The completion as a BlockMapRecoveryCompletion + **/ +__attribute__((warn_unused_result)) +static inline BlockMapRecoveryCompletion * +asBlockMapRecoveryCompletion(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(BlockMapRecoveryCompletion, completion) == 0); + assertCompletionType(completion->type, BLOCK_MAP_RECOVERY_COMPLETION); + return (BlockMapRecoveryCompletion *) completion; +} + +/** + * Free a BlockMapRecoveryCompletion and null out the reference to it. + * + * @param completionPtr a pointer to the completion to free + **/ +static void freeRecoveryCompletion(VDOCompletion **completionPtr) +{ + VDOCompletion *completion = *completionPtr; + if (completion == NULL) { + return; + } + + BlockMapRecoveryCompletion *recovery + = asBlockMapRecoveryCompletion(*completionPtr); + destroyEnqueueable(completion); + destroyEnqueueable(&recovery->subTaskCompletion); + FREE(recovery); + *completionPtr = NULL; +} + +/** + * Free the BlockMapRecoveryCompletion and notify the parent that the block map + * recovery is done. This callback is registered in makeRecoveryCompletion(). + * + * @param completion The BlockMapRecoveryCompletion + **/ +static void finishBlockMapRecovery(VDOCompletion *completion) +{ + int result = completion->result; + VDOCompletion *parent = completion->parent; + freeRecoveryCompletion(&completion); + finishCompletion(parent, result); +} + +/** + * Make a new block map recovery completion. 
+ * + * @param [in] vdo The VDO + * @param [in] entryCount The number of journal entries + * @param [in] journalEntries An array of journal entries to process + * @param [in] parent The parent of the recovery completion + * @param [out] recoveryPtr The new block map recovery completion + * + * @return a success or error code + **/ +static int makeRecoveryCompletion(VDO *vdo, + BlockCount entryCount, + NumberedBlockMapping *journalEntries, + VDOCompletion *parent, + BlockMapRecoveryCompletion **recoveryPtr) +{ + BlockMap *blockMap = getBlockMap(vdo); + PageCount pageCount + = minPageCount(getConfiguredCacheSize(vdo) >> 1, + MAXIMUM_SIMULTANEOUS_BLOCK_MAP_RESTORATION_READS); + + BlockMapRecoveryCompletion *recovery; + int result = ALLOCATE_EXTENDED(BlockMapRecoveryCompletion, pageCount, + VDOPageCompletion, __func__, &recovery); + if (result != UDS_SUCCESS) { + return result; + } + + result = initializeEnqueueableCompletion(&recovery->completion, + BLOCK_MAP_RECOVERY_COMPLETION, + vdo->layer); + if (result != VDO_SUCCESS) { + VDOCompletion *completion = &recovery->completion; + freeRecoveryCompletion(&completion); + return result; + } + + result = initializeEnqueueableCompletion(&recovery->subTaskCompletion, + SUB_TASK_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + VDOCompletion *completion = &recovery->completion; + freeRecoveryCompletion(&completion); + return result; + } + + recovery->blockMap = blockMap; + recovery->journalEntries = journalEntries; + recovery->pageCount = pageCount; + recovery->currentEntry = &recovery->journalEntries[entryCount - 1]; + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + recovery->adminThread = getAdminThread(threadConfig); + recovery->logicalThreadID = getLogicalZoneThread(threadConfig, 0); + + // Organize the journal entries into a binary heap so we can iterate over + // them in sorted order incrementally, avoiding an expensive sort call. + initializeHeap(&recovery->replayHeap, compareMappings, swapMappings, + journalEntries, entryCount, sizeof(NumberedBlockMapping)); + buildHeap(&recovery->replayHeap, entryCount); + + ASSERT_LOG_ONLY((getCallbackThreadID() == recovery->logicalThreadID), + "%s must be called on logical thread %u (not %u)", __func__, + recovery->logicalThreadID, getCallbackThreadID()); + prepareCompletion(&recovery->completion, finishBlockMapRecovery, + finishBlockMapRecovery, recovery->logicalThreadID, parent); + + // This message must be recognizable by VDOTest::RebuildBase. + logInfo("Replaying %zu recovery entries into block map", + recovery->replayHeap.count); + + *recoveryPtr = recovery; + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void flushBlockMap(VDOCompletion *completion) +{ + logInfo("Flushing block map changes"); + BlockMapRecoveryCompletion *recovery + = asBlockMapRecoveryCompletion(completion->parent); + ASSERT_LOG_ONLY((completion->callbackThreadID == recovery->adminThread), + "flushBlockMap() called on admin thread"); + + prepareToFinishParent(completion, completion->parent); + drainBlockMap(recovery->blockMap, ADMIN_STATE_RECOVERING, completion); +} + +/** + * Check whether the recovery is done. If so, finish it by either flushing the + * block map (if the recovery was successful), or by cleaning up (if it + * wasn't). 
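+ *
+ * (Concretely, the recovery is done once the launch loop has finished, no
+ * page fetches are outstanding, and either an error has aborted the replay
+ * or currentEntry has walked off the front of the journalEntries array.)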
+ * + * @param recovery The recovery completion + * + * @return true if the recovery or recovery is complete + **/ +static bool finishIfDone(BlockMapRecoveryCompletion *recovery) +{ + // Pages are still being launched or there is still work to do + if (recovery->launching || (recovery->outstanding > 0) + || (!recovery->aborted + && (recovery->currentEntry >= recovery->journalEntries))) { + return false; + } + + if (recovery->aborted) { + /* + * We need to be careful here to only free completions that exist. But + * since we know none are outstanding, we just go through the ready ones. + */ + for (size_t i = 0; i < recovery->pageCount; i++) { + VDOPageCompletion *pageCompletion = &recovery->pageCompletions[i]; + if (recovery->pageCompletions[i].ready) { + releaseVDOPageCompletion(&pageCompletion->completion); + } + } + completeCompletion(&recovery->completion); + } else { + launchCallbackWithParent(&recovery->subTaskCompletion, flushBlockMap, + recovery->adminThread, &recovery->completion); + } + + return true; +} + +/** + * Note that there has been an error during the recovery and finish it if there + * is nothing else outstanding. + * + * @param recovery The BlockMapRecoveryCompletion + * @param result The error result to use, if one is not already saved + **/ +static void abortRecovery(BlockMapRecoveryCompletion *recovery, int result) +{ + recovery->aborted = true; + setCompletionResult(&recovery->completion, result); + finishIfDone(recovery); +} + +/** + * Find the first journal entry after a given entry which is not on the same + * block map page. + * + * @param recovery the BlockMapRecoveryCompletion + * @param currentEntry the entry to search from + * @param needsSort Whether sorting is needed to proceed + * + * @return Pointer to the first later journal entry on a different block map + * page, or a pointer to just before the journal entries if no + * subsequent entry is on a different block map page. + **/ +static NumberedBlockMapping * +findEntryStartingNextPage(BlockMapRecoveryCompletion *recovery, + NumberedBlockMapping *currentEntry, + bool needsSort) +{ + // If currentEntry is invalid, return immediately. + if (currentEntry < recovery->journalEntries) { + return currentEntry; + } + size_t currentPage = currentEntry->blockMapSlot.pbn; + + // Decrement currentEntry until it's out of bounds or on a different page. + while ((currentEntry >= recovery->journalEntries) + && (currentEntry->blockMapSlot.pbn == currentPage)) { + if (needsSort) { + NumberedBlockMapping *justSortedEntry + = sortNextHeapElement(&recovery->replayHeap); + ASSERT_LOG_ONLY(justSortedEntry < currentEntry, + "heap is returning elements in an unexpected order"); + } + currentEntry--; + } + return currentEntry; +} + +/** + * Apply a range of journal entries to a block map page. 
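+ *
+ * (The range is walked by decrementing from startingEntry toward endingEntry,
+ * which visits the entries for each slot in ascending journal order, so a
+ * later journal entry for the same slot correctly overwrites an earlier one.)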
+ * + * @param page The block map page being modified + * @param startingEntry The first journal entry to apply + * @param endingEntry The entry just past the last journal entry to apply + **/ +static void applyJournalEntriesToPage(BlockMapPage *page, + NumberedBlockMapping *startingEntry, + NumberedBlockMapping *endingEntry) +{ + NumberedBlockMapping *currentEntry = startingEntry; + while (currentEntry != endingEntry) { + page->entries[currentEntry->blockMapSlot.slot] + = currentEntry->blockMapEntry; + currentEntry--; + } +} + +/**********************************************************************/ +static void recoverReadyPages(BlockMapRecoveryCompletion *recovery, + VDOCompletion *completion); + +/** + * Note that a page is now ready and attempt to process pages. This callback is + * registered in fetchPage(). + * + * @param completion The VDOPageCompletion for the fetched page + **/ +static void pageLoaded(VDOCompletion *completion) +{ + BlockMapRecoveryCompletion *recovery + = asBlockMapRecoveryCompletion(completion->parent); + recovery->outstanding--; + if (!recovery->launching) { + recoverReadyPages(recovery, completion); + } +} + +/** + * Handle an error loading a page. + * + * @param completion The VDOPageCompletion + **/ +static void handlePageLoadError(VDOCompletion *completion) +{ + BlockMapRecoveryCompletion *recovery + = asBlockMapRecoveryCompletion(completion->parent); + recovery->outstanding--; + abortRecovery(recovery, completion->result); +} + +/** + * Fetch a page from the block map. + * + * @param recovery the BlockMapRecoveryCompletion + * @param completion the page completion to use + **/ +static void fetchPage(BlockMapRecoveryCompletion *recovery, + VDOCompletion *completion) +{ + if (recovery->currentUnfetchedEntry < recovery->journalEntries) { + // Nothing left to fetch. + return; + } + + // Fetch the next page we haven't yet requested. + PhysicalBlockNumber newPBN + = recovery->currentUnfetchedEntry->blockMapSlot.pbn; + recovery->currentUnfetchedEntry + = findEntryStartingNextPage(recovery, recovery->currentUnfetchedEntry, + true); + initVDOPageCompletion(((VDOPageCompletion *) completion), + recovery->blockMap->zones[0].pageCache, + newPBN, true, &recovery->completion, + pageLoaded, handlePageLoadError); + recovery->outstanding++; + getVDOPageAsync(completion); +} + +/** + * Get the next page completion to process. If it isn't ready, we'll try again + * when it is. + * + * @param recovery The recovery completion + * @param completion The current page completion + * + * @return The next page completion to process + **/ +static VDOPageCompletion * +getNextPageCompletion(BlockMapRecoveryCompletion *recovery, + VDOPageCompletion *completion) +{ + completion++; + if (completion == (&recovery->pageCompletions[recovery->pageCount])) { + completion = &recovery->pageCompletions[0]; + } + return completion; +} + +/** + * Recover from as many pages as possible. 
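+ *
+ * (Ready completions are consumed in ring order, starting with the one for
+ * the current PBN: each ready page gets its slice of journal entries applied,
+ * is written back, and its completion is immediately reused to fetch the next
+ * unfetched page.)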
+ * + * @param recovery The recovery completion + * @param completion The first page completion to process + **/ +static void recoverReadyPages(BlockMapRecoveryCompletion *recovery, + VDOCompletion *completion) +{ + if (finishIfDone(recovery)) { + return; + } + + VDOPageCompletion *pageCompletion = (VDOPageCompletion *) completion; + if (recovery->pbn != pageCompletion->pbn) { + return; + } + + while (pageCompletion->ready) { + BlockMapPage *page = dereferenceWritableVDOPage(completion); + int result = ASSERT(page != NULL, "page available"); + if (result != VDO_SUCCESS) { + abortRecovery(recovery, result); + return; + } + + NumberedBlockMapping *startOfNextPage + = findEntryStartingNextPage(recovery, recovery->currentEntry, false); + applyJournalEntriesToPage(page, recovery->currentEntry, startOfNextPage); + recovery->currentEntry = startOfNextPage; + requestVDOPageWrite(completion); + releaseVDOPageCompletion(completion); + + if (finishIfDone(recovery)) { + return; + } + + recovery->pbn = recovery->currentEntry->blockMapSlot.pbn; + fetchPage(recovery, completion); + pageCompletion = getNextPageCompletion(recovery, pageCompletion); + completion = &pageCompletion->completion; + } +} + +/**********************************************************************/ +void recoverBlockMap(VDO *vdo, + BlockCount entryCount, + NumberedBlockMapping *journalEntries, + VDOCompletion *parent) +{ + BlockMapRecoveryCompletion *recovery; + int result = makeRecoveryCompletion(vdo, entryCount, journalEntries, parent, + &recovery); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + if (isHeapEmpty(&recovery->replayHeap)) { + finishCompletion(&recovery->completion, VDO_SUCCESS); + return; + } + + NumberedBlockMapping *firstSortedEntry + = sortNextHeapElement(&recovery->replayHeap); + ASSERT_LOG_ONLY(firstSortedEntry == recovery->currentEntry, + "heap is returning elements in an unexpected order"); + + // Prevent any page from being processed until all pages have been launched. + recovery->launching = true; + recovery->pbn = recovery->currentEntry->blockMapSlot.pbn; + recovery->currentUnfetchedEntry = recovery->currentEntry; + for (PageCount i = 0; i < recovery->pageCount; i++) { + if (recovery->currentUnfetchedEntry < recovery->journalEntries) { + break; + } + + fetchPage(recovery, &recovery->pageCompletions[i].completion); + } + recovery->launching = false; + + // Process any ready pages. + recoverReadyPages(recovery, &recovery->pageCompletions[0].completion); +} diff --git a/vdo/base/blockMapRecovery.h b/vdo/base/blockMapRecovery.h new file mode 100644 index 0000000..9029bf0 --- /dev/null +++ b/vdo/base/blockMapRecovery.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapRecovery.h#1 $ + */ + +#ifndef BLOCK_MAP_RECOVERY_H +#define BLOCK_MAP_RECOVERY_H + +#include "blockMap.h" +#include "blockMappingState.h" +#include "types.h" + +/** + * An explicitly numbered block mapping. Numbering the mappings allows them to + * be sorted by logical block number during recovery while still preserving + * the relative order of journal entries with the same logical block number. + **/ +typedef struct { + BlockMapSlot blockMapSlot; // Block map slot to map + BlockMapEntry blockMapEntry; // The encoded block map entry for the LBN + uint32_t number; // The serial number to use during replay +} __attribute__((packed)) NumberedBlockMapping; + +/** + * Recover the block map (normal rebuild). + * + * @param vdo The VDO + * @param entryCount The number of journal entries + * @param journalEntries An array of journal entries to process + * @param parent The completion to notify when the rebuild is complete + **/ +void recoverBlockMap(VDO *vdo, + BlockCount entryCount, + NumberedBlockMapping *journalEntries, + VDOCompletion *parent); + +#endif // BLOCK_MAP_RECOVERY_H diff --git a/vdo/base/blockMapTree.c b/vdo/base/blockMapTree.c new file mode 100644 index 0000000..fb2b4f4 --- /dev/null +++ b/vdo/base/blockMapTree.c @@ -0,0 +1,1272 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapTree.c#21 $ + */ + +#include "blockMapTree.h" + +#include "logger.h" + +#include "blockMap.h" +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "blockMapTreeInternals.h" +#include "constants.h" +#include "dataVIO.h" +#include "dirtyLists.h" +#include "forest.h" +#include "numUtils.h" +#include "recoveryJournal.h" +#include "referenceOperation.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "types.h" +#include "vdoInternal.h" +#include "vdoPageCache.h" +#include "vioPool.h" + +enum { + BLOCK_MAP_VIO_POOL_SIZE = 64, +}; + +typedef struct __attribute__((packed)) { + RootCount rootIndex; + Height height; + PageNumber pageIndex; + SlotNumber slot; +} PageDescriptor; + +typedef union { + PageDescriptor descriptor; + uint64_t key; +} PageKey; + +typedef struct { + BlockMapTreeZone *zone; + uint8_t generation; +} WriteIfNotDirtiedContext; + +/** + * An invalid PBN used to indicate that the page holding the location of a + * tree root has been "loaded". + **/ +const PhysicalBlockNumber INVALID_PBN = 0xFFFFFFFFFFFFFFFF; + +/** + * Convert a RingNode to a TreePage. 
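+ *
+ * This is the usual container_of idiom: the RingNode is embedded in the
+ * TreePage as its 'node' member, so subtracting offsetof(TreePage, node) from
+ * the node's address recovers the enclosing page. A generic sketch of the
+ * same idiom (the macro below is an illustration only, not something this
+ * change defines):
+ *
+ *   #define container_of(ptr, type, member) \
+ *     ((type *) ((char *) (ptr) - offsetof(type, member)))
+ *
+ *   // treePageFromRingNode(node) == container_of(node, TreePage, node)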
+ * + * @param ringNode The RingNode to convert + * + * @return The TreePage which owns the RingNode + **/ +static inline TreePage *treePageFromRingNode(RingNode *ringNode) +{ + return (TreePage *) ((byte *) ringNode - offsetof(TreePage, node)); +} + +/**********************************************************************/ +static void writeDirtyPagesCallback(RingNode *expired, void *context); + +/** + * Make VIOs for reading, writing, and allocating the arboreal block map. + * + * Implements VIOConstructor. + **/ +__attribute__((warn_unused_result)) +static int makeBlockMapVIOs(PhysicalLayer *layer, + void *parent, + void *buffer, + VIO **vioPtr) +{ + return createVIO(layer, VIO_TYPE_BLOCK_MAP_INTERIOR, VIO_PRIORITY_METADATA, + parent, buffer, vioPtr); +} + +/**********************************************************************/ +int initializeTreeZone(BlockMapZone *zone, + PhysicalLayer *layer, + BlockCount eraLength) +{ + STATIC_ASSERT_SIZEOF(PageDescriptor, sizeof(uint64_t)); + BlockMapTreeZone *treeZone = &zone->treeZone; + treeZone->mapZone = zone; + + int result = makeDirtyLists(eraLength, writeDirtyPagesCallback, treeZone, + &treeZone->dirtyLists); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeIntMap(LOCK_MAP_CAPACITY, 0, &treeZone->loadingPages); + if (result != VDO_SUCCESS) { + return result; + } + + return makeVIOPool(layer, BLOCK_MAP_VIO_POOL_SIZE, zone->threadID, + makeBlockMapVIOs, treeZone, &treeZone->vioPool); +} + +/**********************************************************************/ +int replaceTreeZoneVIOPool(BlockMapTreeZone *zone, + PhysicalLayer *layer, + size_t poolSize) +{ + freeVIOPool(&zone->vioPool); + return makeVIOPool(layer, poolSize, zone->mapZone->threadID, + makeBlockMapVIOs, zone, &zone->vioPool); +} + +/**********************************************************************/ +void uninitializeBlockMapTreeZone(BlockMapTreeZone *treeZone) +{ + freeDirtyLists(&treeZone->dirtyLists); + freeVIOPool(&treeZone->vioPool); + freeIntMap(&treeZone->loadingPages); +} + +/**********************************************************************/ +void setTreeZoneInitialPeriod(BlockMapTreeZone *treeZone, + SequenceNumber period) +{ + setCurrentPeriod(treeZone->dirtyLists, period); +} + +/** + * Get the BlockMapTreeZone in which a DataVIO is operating. + * + * @param dataVIO The DataVIO + * + * @return The BlockMapTreeZone + **/ +__attribute__((warn_unused_result)) +static inline BlockMapTreeZone *getBlockMapTreeZone(DataVIO *dataVIO) +{ + return &(getBlockMapForZone(dataVIO->logical.zone)->treeZone); +} + +/** + * Get the TreePage for a given lock. This will be the page referred to by the + * lock's tree slot for the lock's current height. 
+ * + * @param zone The tree zone of the tree + * @param lock The lock describing the page to get + * + * @return The requested page + **/ +static inline TreePage *getTreePage(const BlockMapTreeZone *zone, + const TreeLock *lock) +{ + return getTreePageByIndex(zone->mapZone->blockMap->forest, + lock->rootIndex, + lock->height, + lock->treeSlots[lock->height].pageIndex); +} + +/**********************************************************************/ +bool copyValidPage(char *buffer, + Nonce nonce, + PhysicalBlockNumber pbn, + BlockMapPage *page) +{ + BlockMapPage *loaded = (BlockMapPage *) buffer; + BlockMapPageValidity validity = validateBlockMapPage(loaded, nonce, pbn); + if (validity == BLOCK_MAP_PAGE_VALID) { + memcpy(page, loaded, VDO_BLOCK_SIZE); + return true; + } + + if (validity == BLOCK_MAP_PAGE_BAD) { + logErrorWithStringError(VDO_BAD_PAGE, + "Expected page %" PRIu64 + " but got page %llu instead", + pbn, getBlockMapPagePBN(loaded)); + } + + return false; +} + +/**********************************************************************/ +bool isTreeZoneActive(BlockMapTreeZone *zone) +{ + return ((zone->activeLookups != 0) + || hasWaiters(&zone->flushWaiters) + || isVIOPoolBusy(zone->vioPool)); +} + +/** + * Put the VDO in read-only mode and wake any VIOs waiting for a flush. + * + * @param zone The zone + * @param result The error which is causing read-only mode + **/ +static void enterZoneReadOnlyMode(BlockMapTreeZone *zone, int result) +{ + enterReadOnlyMode(zone->mapZone->readOnlyNotifier, result); + + // We are in read-only mode, so we won't ever write any page out. Just take + // all waiters off the queue so the tree zone can be closed. + while (hasWaiters(&zone->flushWaiters)) { + dequeueNextWaiter(&zone->flushWaiters); + } + + checkForDrainComplete(zone->mapZone); +} + +/** + * Check whether a generation is strictly older than some other generation in + * the context of a zone's current generation range. + * + * @param zone The zone in which to do the comparison + * @param a The generation in question + * @param b The generation to compare to + * + * @return true if generation a is not strictly older than + * generation b in the context of the zone + **/ +__attribute__((warn_unused_result)) +static bool isNotOlder(BlockMapTreeZone *zone, uint8_t a, uint8_t b) +{ + int result = ASSERT((inCyclicRange(zone->oldestGeneration, a, + zone->generation, 1 << 8) + && inCyclicRange(zone->oldestGeneration, b, + zone->generation, 1 << 8)), + "generation(s) %u, %u are out of range [%u, %u]", + a, b, zone->oldestGeneration, zone->generation); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + return true; + } + + return inCyclicRange(b, a, zone->generation, 1 << 8); +} + +/** + * Decrement the count for a generation and roll the oldest generation if there + * are no longer any active pages in it. + * + * @param zone The zone + * @param generation The generation to release + **/ +static void releaseGeneration(BlockMapTreeZone *zone, uint8_t generation) +{ + int result = ASSERT((zone->dirtyPageCounts[generation] > 0), + "dirty page count underflow for generation %u", + generation); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + return; + } + + zone->dirtyPageCounts[generation]--; + while ((zone->dirtyPageCounts[zone->oldestGeneration] == 0) + && (zone->oldestGeneration != zone->generation)) { + zone->oldestGeneration++; + } +} + +/** + * Set the generation of a page and update the dirty page count in the zone. 
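+ *
+ * (Generations are 8-bit counters that wrap modulo 256, so ordering is only
+ * meaningful inside the window [oldestGeneration, generation]; see
+ * isNotOlder() above. As an illustrative example with made-up values: if
+ * oldestGeneration is 250 and generation is 3, then generation 252 is older
+ * than generation 1, even though 252 > 1 numerically.)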
+ * + * @param zone The zone which owns the page + * @param page The page + * @param newGeneration The generation to set + * @param decrementOld Whether to decrement the count of the page's old + * generation + **/ +static void setGeneration(BlockMapTreeZone *zone, + TreePage *page, + uint8_t newGeneration, + bool decrementOld) +{ + uint8_t oldGeneration = page->generation; + if (decrementOld && (oldGeneration == newGeneration)) { + return; + } + + page->generation = newGeneration; + uint32_t newCount = ++zone->dirtyPageCounts[newGeneration]; + int result = ASSERT((newCount != 0), + "dirty page count overflow for generation %u", + newGeneration); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + return; + } + + if (decrementOld) { + releaseGeneration(zone, oldGeneration); + } +} + +/**********************************************************************/ +static void writePage(TreePage *treePage, VIOPoolEntry *entry); + +/** + * Write out a dirty page if it is still covered by the most recent flush + * or if it is the flusher. + * + *
Implements WaiterCallback + * + * @param waiter The page to write + * @param context The VIOPoolEntry with which to do the write + **/ +static void writePageCallback(Waiter *waiter, void *context) +{ + STATIC_ASSERT(offsetof(TreePage, waiter) == 0); + writePage((TreePage *) waiter, (VIOPoolEntry *) context); +} + +/** + * Acquire a VIO for writing a dirty page. + * + * @param waiter The page which needs a VIO + * @param zone The zone + **/ +static void acquireVIO(Waiter *waiter, BlockMapTreeZone *zone) +{ + waiter->callback = writePageCallback; + int result = acquireVIOFromPool(zone->vioPool, waiter); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + } +} + +/** + * Attempt to increment the generation. + * + * @param zone The zone whose generation is to be incremented + * + * @return true if all possible generations were not already + * active + **/ +static bool attemptIncrement(BlockMapTreeZone *zone) +{ + uint8_t generation = zone->generation + 1; + if (zone->oldestGeneration == generation) { + return false; + } + + zone->generation = generation; + return true; +} + +/** + * Enqueue a page to either launch a flush or wait for the current flush which + * is already in progress. + * + * @param page The page to enqueue + * @param zone The zone + **/ +static void enqueuePage(TreePage *page, BlockMapTreeZone *zone) +{ + if ((zone->flusher == NULL) && attemptIncrement(zone)) { + zone->flusher = page; + acquireVIO(&page->waiter, zone); + return; + } + + int result = enqueueWaiter(&zone->flushWaiters, &page->waiter); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + } +} + +/** + * Write pages which were waiting for a flush and have not been redirtied. + * Requeue those pages which were redirtied. + * + *
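+ * (A waiting page whose generation still equals the generation that was just
+ * flushed is handed a VIO and written immediately; a page that was dirtied
+ * again in a newer generation is re-enqueued to wait for the next flush.)
+ *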
Implements WaiterCallback. + * + * @param waiter The dirty page + * @param context The zone and generation + **/ +static void writePageIfNotDirtied(Waiter *waiter, void *context) +{ + STATIC_ASSERT(offsetof(TreePage, waiter) == 0); + TreePage *page = (TreePage *) waiter; + WriteIfNotDirtiedContext *writeContext = context; + if (page->generation == writeContext->generation) { + acquireVIO(waiter, writeContext->zone); + return; + } + + enqueuePage(page, writeContext->zone); +} + +/** + * Return a VIO to the zone's pool. + * + * @param zone The zone which owns the pool + * @param entry The pool entry to return + **/ +static void returnToPool(BlockMapTreeZone *zone, VIOPoolEntry *entry) +{ + returnVIOToPool(zone->vioPool, entry); + checkForDrainComplete(zone->mapZone); +} + +/** + * Handle the successful write of a tree page. This callback is registered in + * writeInitializedPage(). + * + * @param completion The VIO doing the write + **/ +static void finishPageWrite(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + TreePage *page = entry->parent; + BlockMapTreeZone *zone = entry->context; + releaseRecoveryJournalBlockReference(zone->mapZone->blockMap->journal, + page->writingRecoveryLock, + ZONE_TYPE_LOGICAL, + zone->mapZone->zoneNumber); + + bool dirty = (page->writingGeneration != page->generation); + releaseGeneration(zone, page->writingGeneration); + page->writing = false; + + if (zone->flusher == page) { + WriteIfNotDirtiedContext context = { + .zone = zone, + .generation = page->writingGeneration, + }; + notifyAllWaiters(&zone->flushWaiters, writePageIfNotDirtied, &context); + if (dirty && attemptIncrement(zone)) { + writePage(page, entry); + return; + } + + zone->flusher = NULL; + } + + if (dirty) { + enqueuePage(page, zone); + } else if ((zone->flusher == NULL) + && hasWaiters(&zone->flushWaiters) + && attemptIncrement(zone)) { + zone->flusher = (TreePage *) dequeueNextWaiter(&zone->flushWaiters); + writePage(zone->flusher, entry); + return; + } + + returnToPool(zone, entry); +} + +/** + * Handle an error writing a tree page. This error handler is registered in + * writePage() and writeInitializedPage(). + * + * @param completion The VIO doing the write + **/ +static void handleWriteError(VDOCompletion *completion) +{ + int result = completion->result; + VIOPoolEntry *entry = completion->parent; + BlockMapTreeZone *zone = entry->context; + enterZoneReadOnlyMode(zone, result); + returnToPool(zone, entry); +} + +/** + * Write a page which has been written at least once. This callback is + * registered in (or called directly from) writePage(). + * + * @param completion The VIO which will do the write + **/ +static void writeInitializedPage(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + BlockMapTreeZone *zone = (BlockMapTreeZone *) entry->context; + TreePage *treePage = (TreePage *) entry->parent; + + /* + * Set the initialized field of the copy of the page we are writing to true. + * We don't want to set it true on the real page in memory until after this + * write succeeds. + */ + BlockMapPage *page = (BlockMapPage *) entry->buffer; + markBlockMapPageInitialized(page, true); + launchWriteMetadataVIOWithFlush(entry->vio, getBlockMapPagePBN(page), + finishPageWrite, handleWriteError, + (zone->flusher == treePage), false); +} + +/** + * Write a dirty tree page now that we have a VIO with which to write it. 
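+ *
+ * (If the page was re-dirtied in a generation newer than the flush currently
+ * in progress, it is simply requeued and the VIO is returned to the pool,
+ * since the outstanding flush would not cover the new update; otherwise the
+ * page buffer is copied into the VIO and written out.)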
+ * + * @param treePage The page to write + * @param entry The VIOPoolEntry with which to write + **/ +static void writePage(TreePage *treePage, VIOPoolEntry *entry) +{ + BlockMapTreeZone *zone = (BlockMapTreeZone *) entry->context; + if ((zone->flusher != treePage) + && (isNotOlder(zone, treePage->generation, zone->generation))) { + // This page was re-dirtied after the last flush was issued, hence we need + // to do another flush. + enqueuePage(treePage, zone); + returnToPool(zone, entry); + return; + } + + entry->parent = treePage; + memcpy(entry->buffer, treePage->pageBuffer, VDO_BLOCK_SIZE); + + VDOCompletion *completion = vioAsCompletion(entry->vio); + completion->callbackThreadID = zone->mapZone->threadID; + + treePage->writing = true; + treePage->writingGeneration = treePage->generation; + treePage->writingRecoveryLock = treePage->recoveryLock; + + // Clear this now so that we know this page is not on any dirty list. + treePage->recoveryLock = 0; + + BlockMapPage *page = asBlockMapPage(treePage); + if (!markBlockMapPageInitialized(page, true)) { + writeInitializedPage(completion); + return; + } + + launchWriteMetadataVIO(entry->vio, getBlockMapPagePBN(page), + writeInitializedPage, handleWriteError); +} + +/** + * Schedule a batch of dirty pages for writing. + * + *
Implements DirtyListsCallback. + * + * @param expired The pages to write + * @param context The zone + **/ +static void writeDirtyPagesCallback(RingNode *expired, void *context) +{ + BlockMapTreeZone *zone = (BlockMapTreeZone *) context; + uint8_t generation = zone->generation; + while (!isRingEmpty(expired)) { + TreePage *page = treePageFromRingNode(chopRingNode(expired)); + + int result = ASSERT(!isWaiting(&page->waiter), + "Newly expired page not already waiting to write"); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + continue; + } + + setGeneration(zone, page, generation, false); + if (!page->writing) { + enqueuePage(page, zone); + } + } +} + +/**********************************************************************/ +void advanceZoneTreePeriod(BlockMapTreeZone *zone, SequenceNumber period) +{ + advancePeriod(zone->dirtyLists, period); +} + +/**********************************************************************/ +void drainZoneTrees(BlockMapTreeZone *zone) +{ + ASSERT_LOG_ONLY((zone->activeLookups == 0), + "drainZoneTrees() called with no active lookups"); + if (!isSuspending(&zone->mapZone->state)) { + flushDirtyLists(zone->dirtyLists); + } +} + +/** + * Release a lock on a page which was being loaded or allocated. + * + * @param dataVIO The DataVIO releasing the page lock + * @param what What the DataVIO was doing (for logging) + **/ +static void releasePageLock(DataVIO *dataVIO, char *what) +{ + TreeLock *lock = &dataVIO->treeLock; + ASSERT_LOG_ONLY(lock->locked, + "release of unlocked block map page %s for key %" PRIu64 + " in tree %u", + what, lock->key, lock->rootIndex); + BlockMapTreeZone *zone = getBlockMapTreeZone(dataVIO); + TreeLock *lockHolder = intMapRemove(zone->loadingPages, lock->key); + ASSERT_LOG_ONLY((lockHolder == lock), + "block map page %s mismatch for key %llu in tree %u", + what, lock->key, lock->rootIndex); + lock->locked = false; +} + +/** + * Continue a DataVIO now that the lookup is complete. + * + * @param dataVIO The DataVIO + * @param result The result of the lookup + **/ +static void finishLookup(DataVIO *dataVIO, int result) +{ + dataVIO->treeLock.height = 0; + + BlockMapTreeZone *zone = getBlockMapTreeZone(dataVIO); + --zone->activeLookups; + + VDOCompletion *completion = dataVIOAsCompletion(dataVIO); + setCompletionResult(completion, result); + launchCallback(completion, dataVIO->treeLock.callback, + dataVIO->treeLock.threadID); +} + +/** + * Abort a block map PBN lookup due to an error in the load or allocation on + * which we were waiting. + * + * @param waiter The DataVIO which was waiting for a page load or allocation + * @param context The error which caused the abort + **/ +static void abortLookupForWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + int result = *((int *) context); + if (isReadDataVIO(dataVIO)) { + if (result == VDO_NO_SPACE) { + result = VDO_SUCCESS; + } + } else if (result != VDO_NO_SPACE) { + result = VDO_READ_ONLY; + } + + finishLookup(dataVIO, result); +} + +/** + * Abort a block map PBN lookup due to an error loading or allocating a page. 
+ * + * @param dataVIO The DataVIO which was loading or allocating a page + * @param result The error code + * @param what What the DataVIO was doing (for logging) + **/ +static void abortLookup(DataVIO *dataVIO, int result, char *what) +{ + if (result != VDO_NO_SPACE) { + enterZoneReadOnlyMode(getBlockMapTreeZone(dataVIO), result); + } + + if (dataVIO->treeLock.locked) { + releasePageLock(dataVIO, what); + notifyAllWaiters(&dataVIO->treeLock.waiters, abortLookupForWaiter, + &result); + } + + finishLookup(dataVIO, result); +} + +/** + * Abort a block map PBN lookup due to an error loading a page. + * + * @param dataVIO The DataVIO doing the page load + * @param result The error code + **/ +static void abortLoad(DataVIO *dataVIO, int result) +{ + abortLookup(dataVIO, result, "load"); +} + +/** + * Determine if a location represents a valid mapping for a tree page. + * + * @param vdo The VDO + * @param mapping The DataLocation to check + * @param height The height of the entry in the tree + * + * @return true if the entry represents a invalid page mapping + **/ +__attribute__((warn_unused_result)) +static bool isInvalidTreeEntry(const VDO *vdo, + const DataLocation *mapping, + Height height) +{ + if (!isValidLocation(mapping) + || isCompressed(mapping->state) + || (isMappedLocation(mapping) && (mapping->pbn == ZERO_BLOCK))) { + return true; + } + + // Roots aren't physical data blocks, so we can't check their PBNs. + if (height == BLOCK_MAP_TREE_HEIGHT) { + return false; + } + + return !isPhysicalDataBlock(vdo->depot, mapping->pbn); +} + +/**********************************************************************/ +static void loadBlockMapPage(BlockMapTreeZone *zone, DataVIO *dataVIO); +static void allocateBlockMapPage(BlockMapTreeZone *zone, DataVIO *dataVIO); + +/** + * Continue a block map PBN lookup now that a page has been loaded by + * descending one level in the tree. + * + * @param dataVIO The DataVIO doing the lookup + * @param page The page which was just loaded + **/ +static void continueWithLoadedPage(DataVIO *dataVIO, BlockMapPage *page) +{ + TreeLock *lock = &dataVIO->treeLock; + BlockMapTreeSlot slot = lock->treeSlots[lock->height]; + DataLocation mapping + = unpackBlockMapEntry(&page->entries[slot.blockMapSlot.slot]); + if (isInvalidTreeEntry(getVDOFromDataVIO(dataVIO), &mapping, lock->height)) { + logErrorWithStringError(VDO_BAD_MAPPING, + "Invalid block map tree PBN: %llu with " + "state %u for page index %u at height %u", + mapping.pbn, mapping.state, + lock->treeSlots[lock->height - 1].pageIndex, + lock->height - 1); + abortLoad(dataVIO, VDO_BAD_MAPPING); + return; + } + + if (!isMappedLocation(&mapping)) { + // The page we need is unallocated + allocateBlockMapPage(getBlockMapTreeZone(dataVIO), dataVIO); + return; + } + + lock->treeSlots[lock->height - 1].blockMapSlot.pbn = mapping.pbn; + if (lock->height == 1) { + finishLookup(dataVIO, VDO_SUCCESS); + return; + } + + // We know what page we need to load next + loadBlockMapPage(getBlockMapTreeZone(dataVIO), dataVIO); +} + +/** + * Continue a block map PBN lookup now that the page load we were waiting on + * has finished. + * + * @param waiter The DataVIO waiting for a page to be loaded + * @param context The page which was just loaded + **/ +static void continueLoadForWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + dataVIO->treeLock.height--; + continueWithLoadedPage(dataVIO, (BlockMapPage *) context); +} + +/** + * Finish loading a page now that it has been read in from disk. 
This callback + * is registered in loadPage(). + * + * @param completion The VIO doing the page read + **/ +static void finishBlockMapPageLoad(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + DataVIO *dataVIO = entry->parent; + BlockMapTreeZone *zone = (BlockMapTreeZone *) entry->context; + TreeLock *treeLock = &dataVIO->treeLock; + + treeLock->height--; + PhysicalBlockNumber pbn + = treeLock->treeSlots[treeLock->height].blockMapSlot.pbn; + TreePage *treePage = getTreePage(zone, treeLock); + BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; + Nonce nonce = zone->mapZone->blockMap->nonce; + if (!copyValidPage(entry->buffer, nonce, pbn, page)) { + formatBlockMapPage(page, nonce, pbn, false); + } + returnVIOToPool(zone->vioPool, entry); + + // Release our claim to the load and wake any waiters + releasePageLock(dataVIO, "load"); + notifyAllWaiters(&treeLock->waiters, continueLoadForWaiter, page); + continueWithLoadedPage(dataVIO, page); +} + +/** + * Handle an error loading a tree page. + * + * @param completion The VIO doing the page read + **/ +static void handleIOError(VDOCompletion *completion) +{ + int result = completion->result; + VIOPoolEntry *entry = completion->parent; + DataVIO *dataVIO = entry->parent; + BlockMapTreeZone *zone = (BlockMapTreeZone *) entry->context; + returnVIOToPool(zone->vioPool, entry); + abortLoad(dataVIO, result); +} + +/** + * Read a tree page from disk now that we've gotten a VIO with which to do the + * read. This WaiterCallback is registered in loadBlockMapPage(). + * + * @param waiter The DataVIO which requires a page load + * @param context The VIOPool entry with which to do the read + **/ +static void loadPage(Waiter *waiter, void *context) +{ + VIOPoolEntry *entry = context; + DataVIO *dataVIO = waiterAsDataVIO(waiter); + + entry->parent = dataVIO; + entry->vio->completion.callbackThreadID + = getBlockMapForZone(dataVIO->logical.zone)->threadID; + + TreeLock *lock = &dataVIO->treeLock; + launchReadMetadataVIO(entry->vio, + lock->treeSlots[lock->height - 1].blockMapSlot.pbn, + finishBlockMapPageLoad, handleIOError); +} + +/** + * Attempt to acquire a lock on a page in the block map tree. If the page is + * already locked, queue up to wait for the lock to be released. If the lock is + * acquired, the DataVIO's treeLock.locked field will be set to true. + * + * @param zone The BlockMapTreeZone in which the DataVIO operates + * @param dataVIO The DataVIO which desires a page lock + * + * @return VDO_SUCCESS or an error + **/ +static int attemptPageLock(BlockMapTreeZone *zone, DataVIO *dataVIO) +{ + TreeLock *lock = &dataVIO->treeLock; + Height height = lock->height; + BlockMapTreeSlot treeSlot = lock->treeSlots[height]; + PageKey key; + key.descriptor = (PageDescriptor) { + .rootIndex = lock->rootIndex, + .height = height, + .pageIndex = treeSlot.pageIndex, + .slot = treeSlot.blockMapSlot.slot, + }; + lock->key = key.key; + + TreeLock *lockHolder; + int result = intMapPut(zone->loadingPages, lock->key, lock, false, + (void **) &lockHolder); + if (result != VDO_SUCCESS) { + return result; + } + + if (lockHolder == NULL) { + // We got the lock + dataVIO->treeLock.locked = true; + return VDO_SUCCESS; + } + + // Someone else is loading or allocating the page we need + return enqueueDataVIO(&lockHolder->waiters, dataVIO, + THIS_LOCATION("$F;cb=blockMapTreePage")); +} + +/** + * Load a block map tree page from disk. 
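+ *
+ * (The page lock taken via attemptPageLock() serializes concurrent loads of
+ * the same tree page: only the lock winner acquires a VIO and issues the
+ * read, while later DataVIOs queue on the lock holder and are resumed from
+ * finishBlockMapPageLoad() once the page is in memory.)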
+ * + * @param zone The BlockMapTreeZone in which the DataVIO operates + * @param dataVIO The DataVIO which requires a page to be loaded + **/ +static void loadBlockMapPage(BlockMapTreeZone *zone, DataVIO *dataVIO) +{ + int result = attemptPageLock(zone, dataVIO); + if (result != VDO_SUCCESS) { + abortLoad(dataVIO, result); + return; + } + + if (dataVIO->treeLock.locked) { + Waiter *waiter = dataVIOAsWaiter(dataVIO); + waiter->callback = loadPage; + result = acquireVIOFromPool(zone->vioPool, waiter); + if (result != VDO_SUCCESS) { + abortLoad(dataVIO, result); + } + } +} + +/** + * Set the callback of a DataVIO after it has allocated a block map page. + * + * @param dataVIO The DataVIO + **/ +static void setPostAllocationCallback(DataVIO *dataVIO) +{ + setCallback(dataVIOAsCompletion(dataVIO), dataVIO->treeLock.callback, + dataVIO->treeLock.threadID); +} + +/** + * Abort a block map PBN lookup due to an error allocating a page. + * + * @param dataVIO The DataVIO doing the page allocation + * @param result The error code + **/ +static void abortAllocation(DataVIO *dataVIO, int result) +{ + setPostAllocationCallback(dataVIO); + abortLookup(dataVIO, result, "allocation"); +} + +/** + * Callback to handle an error while attempting to allocate a page. This + * callback is used to transfer back to the logical zone along the block map + * page allocation path. + * + * @param completion The DataVIO doing the allocation + **/ +static void allocationFailure(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + abortAllocation(dataVIO, completion->result); +} + +/** + * Continue with page allocations now that a parent page has been allocated. + * + * @param waiter The DataVIO which was waiting for a page to be allocated + * @param context The physical block number of the page which was just + * allocated + **/ +static void continueAllocationForWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + TreeLock *treeLock = &dataVIO->treeLock; + PhysicalBlockNumber pbn = *((PhysicalBlockNumber *) context); + + treeLock->height--; + dataVIO->treeLock.treeSlots[treeLock->height].blockMapSlot.pbn = pbn; + + if (treeLock->height == 0) { + finishLookup(dataVIO, VDO_SUCCESS); + return; + } + + allocateBlockMapPage(getBlockMapTreeZone(dataVIO), dataVIO); +} + +/** + * Finish the page allocation process by recording the allocation in the tree + * and waking any waiters now that the write lock has been released. This + * callback is registered in releaseBlockMapWriteLock(). + * + * @param completion The DataVIO doing the allocation + **/ +static void finishBlockMapAllocation(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + if (completion->result != VDO_SUCCESS) { + allocationFailure(completion); + return; + } + + BlockMapTreeZone *zone = getBlockMapTreeZone(dataVIO); + TreeLock *treeLock = &dataVIO->treeLock; + TreePage *treePage = getTreePage(zone, treeLock); + Height height = treeLock->height; + + PhysicalBlockNumber pbn = treeLock->treeSlots[height - 1].blockMapSlot.pbn; + + // Record the allocation. + BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; + SequenceNumber oldLock = treePage->recoveryLock; + updateBlockMapPage(page, dataVIO, pbn, MAPPING_STATE_UNCOMPRESSED, + &treePage->recoveryLock); + + if (isWaiting(&treePage->waiter)) { + // This page is waiting to be written out. 
+ if (zone->flusher != treePage) { + // The outstanding flush won't cover the update we just made, so mark + // the page as needing another flush. + setGeneration(zone, treePage, zone->generation, true); + } + } else { + // Put the page on a dirty list + if (oldLock == 0) { + initializeRing(&treePage->node); + } + addToDirtyLists(zone->dirtyLists, &treePage->node, oldLock, + treePage->recoveryLock); + } + + treeLock->height--; + if (height > 1) { + // Format the interior node we just allocated (in memory). + treePage = getTreePage(zone, treeLock); + formatBlockMapPage(treePage->pageBuffer, zone->mapZone->blockMap->nonce, + pbn, false); + } + + // Release our claim to the allocation and wake any waiters + releasePageLock(dataVIO, "allocation"); + notifyAllWaiters(&treeLock->waiters, continueAllocationForWaiter, &pbn); + if (treeLock->height == 0) { + finishLookup(dataVIO, VDO_SUCCESS); + return; + } + + allocateBlockMapPage(zone, dataVIO); +} + +/** + * Release the write lock on a newly allocated block map page now that we + * have made its journal entries and reference count updates. This callback + * is registered in setBlockMapPageReferenceCount(). + * + * @param completion The DataVIO doing the allocation + **/ +static void releaseBlockMapWriteLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + AllocatingVIO *allocatingVIO = dataVIOAsAllocatingVIO(dataVIO); + assertInAllocatedZone(dataVIO); + if (completion->result != VDO_SUCCESS) { + launchLogicalCallback(dataVIO, allocationFailure, THIS_LOCATION(NULL)); + return; + } + + releaseAllocationLock(allocatingVIO); + resetAllocation(allocatingVIO); + launchLogicalCallback(dataVIO, finishBlockMapAllocation, + THIS_LOCATION("$F;cb=finishBlockMapAllocation")); +} + +/** + * Set the reference count of a newly allocated block map page to + * MAXIMUM_REFERENCES now that we have made a recovery journal entry for it. + * MAXIMUM_REFERENCES is used to prevent deduplication against the block after + * we release the write lock on it, but before we write out the page. + * + * @param completion The DataVIO doing the allocation + **/ +static void setBlockMapPageReferenceCount(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInAllocatedZone(dataVIO); + if (completion->result != VDO_SUCCESS) { + launchLogicalCallback(dataVIO, allocationFailure, THIS_LOCATION(NULL)); + return; + } + + TreeLock *lock = &dataVIO->treeLock; + PhysicalBlockNumber pbn = lock->treeSlots[lock->height - 1].blockMapSlot.pbn; + completion->callback = releaseBlockMapWriteLock; + addSlabJournalEntry(getSlabJournal(getVDOFromDataVIO(dataVIO)->depot, pbn), + dataVIO); +} + +/** + * Make a recovery journal entry for a newly allocated block map page. + * This callback is registered in continueBlockMapPageAllocation(). + * + * @param completion The DataVIO doing the allocation + **/ +static void journalBlockMapAllocation(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (completion->result != VDO_SUCCESS) { + launchLogicalCallback(dataVIO, allocationFailure, THIS_LOCATION(NULL)); + return; + } + + setAllocatedZoneCallback(dataVIO, setBlockMapPageReferenceCount, + THIS_LOCATION(NULL)); + addRecoveryJournalEntry(getVDOFromDataVIO(dataVIO)->recoveryJournal, + dataVIO); +} + +/** + * Continue the process of allocating a block map page now that the + * BlockAllocator has given us a block. 
This method is supplied as the callback + * to allocateDataBlock() by allocateBlockMapPage(). + * + * @param allocatingVIO The DataVIO which is doing the allocation + **/ +static void continueBlockMapPageAllocation(AllocatingVIO *allocatingVIO) +{ + DataVIO *dataVIO = allocatingVIOAsDataVIO(allocatingVIO); + if (!hasAllocation(dataVIO)) { + setLogicalCallback(dataVIO, allocationFailure, THIS_LOCATION(NULL)); + continueDataVIO(dataVIO, VDO_NO_SPACE); + return; + } + + PhysicalBlockNumber pbn = allocatingVIO->allocation; + TreeLock *lock = &dataVIO->treeLock; + lock->treeSlots[lock->height - 1].blockMapSlot.pbn = pbn; + setUpReferenceOperationWithLock(BLOCK_MAP_INCREMENT, pbn, + MAPPING_STATE_UNCOMPRESSED, + allocatingVIO->allocationLock, + &dataVIO->operation); + launchJournalCallback(dataVIO, journalBlockMapAllocation, + THIS_LOCATION("$F;cb=journalBlockMapAllocation")); +} + +/** + * Allocate a block map page. + * + * @param zone The zone in which the DataVIO is operating + * @param dataVIO The DataVIO which needs to allocate a page + **/ +static void allocateBlockMapPage(BlockMapTreeZone *zone, DataVIO *dataVIO) +{ + if (!isWriteDataVIO(dataVIO) || isTrimDataVIO(dataVIO)) { + // This is a pure read, the read phase of a read-modify-write, or a trim, + // so there's nothing left to do here. + finishLookup(dataVIO, VDO_SUCCESS); + return; + } + + int result = attemptPageLock(zone, dataVIO); + if (result != VDO_SUCCESS) { + abortAllocation(dataVIO, result); + return; + } + + if (!dataVIO->treeLock.locked) { + return; + } + + allocateDataBlock(dataVIOAsAllocatingVIO(dataVIO), + getAllocationSelector(dataVIO->logical.zone), + VIO_BLOCK_MAP_WRITE_LOCK, + continueBlockMapPageAllocation); +} + +/**********************************************************************/ +void lookupBlockMapPBN(DataVIO *dataVIO) +{ + BlockMapTreeZone *zone = getBlockMapTreeZone(dataVIO); + zone->activeLookups++; + if (isDraining(&zone->mapZone->state)) { + finishLookup(dataVIO, VDO_SHUTTING_DOWN); + return; + } + + TreeLock *lock = &dataVIO->treeLock; + PageNumber pageIndex + = ((lock->treeSlots[0].pageIndex - zone->mapZone->blockMap->flatPageCount) + / zone->mapZone->blockMap->rootCount); + BlockMapTreeSlot treeSlot = { + .pageIndex = pageIndex / BLOCK_MAP_ENTRIES_PER_PAGE, + .blockMapSlot = { + .pbn = 0, + .slot = pageIndex % BLOCK_MAP_ENTRIES_PER_PAGE, + }, + }; + + BlockMapPage *page = NULL; + for (lock->height = 1; lock->height <= BLOCK_MAP_TREE_HEIGHT; + lock->height++) { + lock->treeSlots[lock->height] = treeSlot; + page = (BlockMapPage *) (getTreePage(zone, lock)->pageBuffer); + PhysicalBlockNumber pbn = getBlockMapPagePBN(page); + if (pbn != ZERO_BLOCK) { + lock->treeSlots[lock->height].blockMapSlot.pbn = pbn; + break; + } + + // Calculate the index and slot for the next level. + treeSlot.blockMapSlot.slot + = treeSlot.pageIndex % BLOCK_MAP_ENTRIES_PER_PAGE; + treeSlot.pageIndex + = treeSlot.pageIndex / BLOCK_MAP_ENTRIES_PER_PAGE; + } + + // The page at this height has been allocated and loaded. 
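+  // (Illustration with made-up numbers, not from the original source: if each
+  // page held 4 entries, a relative leaf page index of 37 would yield slot
+  // 37 % 4 = 1 at height 1, then slot 9 % 4 = 1 at height 2, then slot 2 at
+  // height 3; the loop above peels one base-BLOCK_MAP_ENTRIES_PER_PAGE digit
+  // per level in exactly this way.)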
+ DataLocation mapping + = unpackBlockMapEntry(&page->entries[treeSlot.blockMapSlot.slot]); + if (isInvalidTreeEntry(getVDOFromDataVIO(dataVIO), &mapping, lock->height)) { + logErrorWithStringError(VDO_BAD_MAPPING, + "Invalid block map tree PBN: %llu with " + "state %u for page index %u at height %u", + mapping.pbn, mapping.state, + lock->treeSlots[lock->height - 1].pageIndex, + lock->height - 1); + abortLoad(dataVIO, VDO_BAD_MAPPING); + return; + } + + if (!isMappedLocation(&mapping)) { + // The page we want one level down has not been allocated, so allocate it. + allocateBlockMapPage(zone, dataVIO); + return; + } + + lock->treeSlots[lock->height - 1].blockMapSlot.pbn = mapping.pbn; + if (lock->height == 1) { + // This is the ultimate block map page, so we're done + finishLookup(dataVIO, VDO_SUCCESS); + return; + } + + // We know what page we need to load. + loadBlockMapPage(zone, dataVIO); +} + +/**********************************************************************/ +PhysicalBlockNumber findBlockMapPagePBN(BlockMap *map, PageNumber pageNumber) +{ + if (pageNumber < map->flatPageCount) { + return (BLOCK_MAP_FLAT_PAGE_ORIGIN + pageNumber); + } + + RootCount rootIndex = pageNumber % map->rootCount; + PageNumber pageIndex = ((pageNumber - map->flatPageCount) / map->rootCount); + SlotNumber slot = pageIndex % BLOCK_MAP_ENTRIES_PER_PAGE; + pageIndex /= BLOCK_MAP_ENTRIES_PER_PAGE; + + TreePage *treePage + = getTreePageByIndex(map->forest, rootIndex, 1, pageIndex); + BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; + if (!isBlockMapPageInitialized(page)) { + return ZERO_BLOCK; + } + + DataLocation mapping = unpackBlockMapEntry(&page->entries[slot]); + if (!isValidLocation(&mapping) || isCompressed(mapping.state)) { + return ZERO_BLOCK; + } + return mapping.pbn; +} + +/**********************************************************************/ +void writeTreePage(TreePage *page, BlockMapTreeZone *zone) +{ + bool waiting = isWaiting(&page->waiter); + if (waiting && (zone->flusher == page)) { + return; + } + + setGeneration(zone, page, zone->generation, waiting); + if (waiting || page->writing) { + return; + } + + enqueuePage(page, zone); +} diff --git a/vdo/base/blockMapTree.h b/vdo/base/blockMapTree.h new file mode 100644 index 0000000..c581454 --- /dev/null +++ b/vdo/base/blockMapTree.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapTree.h#7 $ + */ + +#ifndef BLOCK_MAP_TREE_H +#define BLOCK_MAP_TREE_H + +#include "constants.h" +#include "types.h" + +typedef struct treePage TreePage; + +/** + * Intialize a BlockMapTreeZone. 
+ * + * @param zone The BlockMapZone of the tree zone to intialize + * @param layer The physical layer + * @param maximumAge The number of journal blocks before a dirtied page + * is considered old and may be written out + * + * @return VDO_SUCCESS or an error + **/ +int initializeTreeZone(BlockMapZone *zone, + PhysicalLayer *layer, + BlockCount maximumAge) + __attribute__((warn_unused_result)); + +/** + * Clean up a BlockMapTreeZone. + * + * @param treeZone The zone to clean up + **/ +void uninitializeBlockMapTreeZone(BlockMapTreeZone *treeZone); + +/** + * Set the initial dirty period for a tree zone. + * + * @param treeZone The tree zone + * @param period The initial dirty period to set + **/ +void setTreeZoneInitialPeriod(BlockMapTreeZone *treeZone, + SequenceNumber period); + +/** + * Check whether a tree zone is active (i.e. has any active lookups, + * outstanding I/O, or pending I/O). + * + * @param zone The zone to check + * + * @return true if the zone is active + **/ +bool isTreeZoneActive(BlockMapTreeZone *zone) + __attribute__((warn_unused_result)); + +/** + * Advance the dirty period for a tree zone. + * + * @param zone The BlockMapTreeZone to advance + * @param period The new dirty period + **/ +void advanceZoneTreePeriod(BlockMapTreeZone *zone, SequenceNumber period); + +/** + * Drain the zone trees, i.e. ensure that all I/O is quiesced. If required by + * the drain type, all dirty block map trees will be written to disk. This + * method must not be called when lookups are active. + * + * @param zone The BlockMapTreeZone to drain + **/ +void drainZoneTrees(BlockMapTreeZone *zone); + +/** + * Look up the PBN of the block map page for a DataVIO's LBN in the arboreal + * block map. If necessary, the block map page will be allocated. Also, the + * ancestors of the block map page will be allocated or loaded if necessary. + * + * @param dataVIO The DataVIO requesting the lookup + **/ +void lookupBlockMapPBN(DataVIO *dataVIO); + +/** + * Find the PBN of a leaf block map page. This method may only be used after + * all allocated tree pages have been loaded, otherwise, it may give the wrong + * answer (0). + * + * @param map The block map containing the forest + * @param pageNumber The page number of the desired block map page + * + * @return The PBN of the page + **/ +PhysicalBlockNumber findBlockMapPagePBN(BlockMap *map, PageNumber pageNumber); + +/** + * Write a tree page or indicate that it has been re-dirtied if it is already + * being written. This method is used when correcting errors in the tree during + * read-only rebuild. + * + * @param page The page to write + * @param zone The tree zone managing the page + **/ +void writeTreePage(TreePage *page, BlockMapTreeZone *zone); + +#endif // BLOCK_MAP_TREE_H diff --git a/vdo/base/blockMapTreeInternals.h b/vdo/base/blockMapTreeInternals.h new file mode 100644 index 0000000..49b69eb --- /dev/null +++ b/vdo/base/blockMapTreeInternals.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapTreeInternals.h#4 $ + */ + +#ifndef BLOCK_MAP_TREE_INTERNALS_H +#define BLOCK_MAP_TREE_INTERNALS_H + +#include "blockMapTree.h" + +#include "blockMapPage.h" +#include "types.h" + +/** A single page of a block map tree */ +struct treePage { + /** Waiter for a VIO to write out this page */ + Waiter waiter; + + /** Dirty list node */ + RingNode node; + + /** + * If this is a dirty tree page, the tree zone flush generation in which it + * was last dirtied. + */ + uint8_t generation; + + /** Whether this page is an interior tree page being written out. */ + bool writing; + + /** + * If this page is being written, the tree zone flush generation of the + * copy of the page being written. + **/ + uint8_t writingGeneration; + + /** The earliest journal block containing uncommitted updates to this page */ + SequenceNumber recoveryLock; + + /** The value of recoveryLock when the this page last started writing */ + SequenceNumber writingRecoveryLock; + + /** The buffer to hold the on-disk representation of this page */ + char pageBuffer[VDO_BLOCK_SIZE]; +}; + +typedef struct { + PageNumber levels[BLOCK_MAP_TREE_HEIGHT]; +} Boundary; + +/** + * An invalid PBN used to indicate that the page holding the location of a + * tree root has been "loaded". + **/ +extern const PhysicalBlockNumber INVALID_PBN; + +/** + * Extract the BlockMapPage from a TreePage. + * + * @param treePage The TreePage + * + * @return The BlockMapPage of the TreePage + **/ +__attribute__((warn_unused_result)) +static inline BlockMapPage *asBlockMapPage(TreePage *treePage) +{ + return (BlockMapPage *) treePage->pageBuffer; +} + +/** + * Replace the VIOPool in a tree zone. This method is used by unit tests. + * + * @param zone The zone whose pool is to be replaced + * @param layer The physical layer from which to make VIOs + * @param poolSize The size of the new pool + * + * @return VDO_SUCCESS or an error + **/ +int replaceTreeZoneVIOPool(BlockMapTreeZone *zone, + PhysicalLayer *layer, + size_t poolSize) + __attribute__((warn_unused_result)); + +/** + * Check whether a buffer contains a valid page. If the page is bad, log an + * error. If the page is valid, copy it to the supplied page. + * + * @param buffer The buffer to validate (and copy) + * @param nonce The VDO nonce + * @param pbn The absolute PBN of the page + * @param page The page to copy into if valid + * + * @return true if the page was copied (valid) + **/ +bool copyValidPage(char *buffer, + Nonce nonce, + PhysicalBlockNumber pbn, + BlockMapPage *page); + +#endif // BLOCK_MAP_TREE_INTERNALS_H diff --git a/vdo/base/blockMappingState.h b/vdo/base/blockMappingState.h new file mode 100644 index 0000000..ad2460a --- /dev/null +++ b/vdo/base/blockMappingState.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMappingState.h#1 $ + */ + +#ifndef BLOCK_MAPPING_STATE_H +#define BLOCK_MAPPING_STATE_H + +#include "common.h" + +/** + * Four bits of each five-byte block map entry contain a mapping state value + * used to distinguish unmapped or trimmed logical blocks (which are treated + * as mapped to the zero block) from entries that have been mapped to a + * physical block, including the zero block. + **/ +typedef enum { + MAPPING_STATE_UNMAPPED = 0, // Must be zero to be the default value + MAPPING_STATE_UNCOMPRESSED = 1, // A normal (uncompressed) block + MAPPING_STATE_COMPRESSED_BASE = 2, // Compressed in slot 0 + MAPPING_STATE_COMPRESSED_MAX = 15, // Compressed in slot 13 +} BlockMappingState; + +/** + * The total number of compressed blocks that can live in a physical block. + **/ +enum { + MAX_COMPRESSION_SLOTS = + MAPPING_STATE_COMPRESSED_MAX - MAPPING_STATE_COMPRESSED_BASE + 1, +}; + +/**********************************************************************/ +static inline BlockMappingState getStateForSlot(byte slotNumber) +{ + return (slotNumber + MAPPING_STATE_COMPRESSED_BASE); +} + +/**********************************************************************/ +static inline byte getSlotFromState(BlockMappingState mappingState) +{ + return (mappingState - MAPPING_STATE_COMPRESSED_BASE); +} + +/**********************************************************************/ +static inline bool isCompressed(const BlockMappingState mappingState) +{ + return (mappingState > MAPPING_STATE_UNCOMPRESSED); +} + +#endif // BLOCK_MAPPING_STATE_H diff --git a/vdo/base/completion.c b/vdo/base/completion.c new file mode 100644 index 0000000..d27fd72 --- /dev/null +++ b/vdo/base/completion.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/completion.c#10 $ + */ + +#include "completion.h" + +#include "logger.h" +#include "statusCodes.h" + +static const char *VDO_COMPLETION_TYPE_NAMES[] = { + // Keep UNSET_COMPLETION_TYPE at the top. + "UNSET_COMPLETION_TYPE", + + // Keep the rest of these in sorted order. If you add or remove an entry, + // be sure to update the corresponding list in completion.h. 
+ "ACTION_COMPLETION", + "ADMIN_COMPLETION", + "ASYNC_ACTION_CONTEXT", + "BLOCK_ALLOCATOR_COMPLETION", + "BLOCK_MAP_RECOVERY_COMPLETION", + "CHECK_IDENTIFIER_COMPLETION", + "EXTERNAL_COMPLETION", + "FLUSH_NOTIFICATION_COMPLETION", + "GENERATION_FLUSHED_COMPLETION", + "HEARTBEAT_COMPLETION", + "LOCK_COUNTER_COMPLETION", + "PARTITION_COPY_COMPLETION", + "READ_ONLY_MODE_COMPLETION", + "READ_ONLY_REBUILD_COMPLETION", + "RECOVERY_COMPLETION", + "REFERENCE_COUNT_REBUILD_COMPLETION", + "SLAB_SCRUBBER_COMPLETION", + "SUB_TASK_COMPLETION", + "TEST_COMPLETION", + "VDO_COMMAND_COMPLETION", + "VDO_COMMAND_SUB_COMPLETION", + "VDO_EXTENT_COMPLETION", + "VDO_PAGE_COMPLETION", + "VIO_COMPLETION", + "WRAPPING_COMPLETION", +}; + +/**********************************************************************/ +void initializeCompletion(VDOCompletion *completion, + VDOCompletionType type, + PhysicalLayer *layer) +{ + memset(completion, 0, sizeof(*completion)); + completion->layer = layer; + completion->type = type; + resetCompletion(completion); +} + +/**********************************************************************/ +int initializeEnqueueableCompletion(VDOCompletion *completion, + VDOCompletionType type, + PhysicalLayer *layer) +{ + initializeCompletion(completion, type, layer); + return ((layer->createEnqueueable == NULL) + ? VDO_SUCCESS : layer->createEnqueueable(completion)); +} + +/**********************************************************************/ +void resetCompletion(VDOCompletion *completion) +{ + completion->result = VDO_SUCCESS; + completion->complete = false; +} + +/** + * Assert that a completion is not complete. + * + * @param completion The completion to check + **/ +static inline void assertIncomplete(VDOCompletion *completion) +{ + ASSERT_LOG_ONLY(!completion->complete, "completion is not complete"); +} + +/**********************************************************************/ +void setCompletionResult(VDOCompletion *completion, int result) +{ + assertIncomplete(completion); + if (completion->result == VDO_SUCCESS) { + completion->result = result; + } +} + +/** + * Check whether a completion's callback must be enqueued, or if it can be run + * on the current thread. Side effect: clears the requeue flag if it is set, + * so the caller MUST requeue if this returns true. 
+ * + * @param completion The completion whose callback is to be invoked + * + * @return false if the callback must be run on this thread + * true if the callback must be enqueued + **/ +__attribute__((warn_unused_result)) +static inline bool requiresEnqueue(VDOCompletion *completion) +{ + if (completion->requeue) { + completion->requeue = false; + return true; + } + + ThreadID callbackThread = completion->callbackThreadID; + return (callbackThread != completion->layer->getCurrentThreadID()); +} + +/**********************************************************************/ +void invokeCallback(VDOCompletion *completion) +{ + if (requiresEnqueue(completion)) { + if (completion->enqueueable != NULL) { + completion->layer->enqueue(completion->enqueueable); + return; + } + ASSERT_LOG_ONLY(false, + "non-enqueueable completion (type %s) on correct thread", + getCompletionTypeName(completion->type)); + } + + runCallback(completion); +} + +/**********************************************************************/ +void continueCompletion(VDOCompletion *completion, int result) +{ + setCompletionResult(completion, result); + invokeCallback(completion); +} + +/**********************************************************************/ +void completeCompletion(VDOCompletion *completion) +{ + assertIncomplete(completion); + completion->complete = true; + if (completion->callback != NULL) { + invokeCallback(completion); + } +} + +/**********************************************************************/ +void releaseCompletion(VDOCompletion **completionPtr) +{ + VDOCompletion *completion = *completionPtr; + if (completion == NULL) { + return; + } + + *completionPtr = NULL; + completeCompletion(completion); +} + +/**********************************************************************/ +void releaseCompletionWithResult(VDOCompletion **completionPtr, int result) +{ + if (*completionPtr == NULL) { + return; + } + + setCompletionResult(*completionPtr, result); + releaseCompletion(completionPtr); +} + +/**********************************************************************/ +void finishParentCallback(VDOCompletion *completion) +{ + finishCompletion((VDOCompletion *) completion->parent, completion->result); +} + +/**********************************************************************/ +void preserveErrorAndContinue(VDOCompletion *completion) +{ + if (completion->parent != NULL) { + setCompletionResult(completion->parent, completion->result); + } + + resetCompletion(completion); + invokeCallback(completion); +} + +/**********************************************************************/ +const char *getCompletionTypeName(VDOCompletionType completionType) +{ + // Try to catch failures to update the array when the enum values change. 
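+  /*
+   * Illustrative note (not part of the original change): since
+   * UNSET_COMPLETION_TYPE is 0, the difference below is simply the number
+   * of enumerators preceding MAX_COMPLETION_TYPE, which must equal the
+   * number of strings in VDO_COMPLETION_TYPE_NAMES. Adding an enumerator
+   * in completion.h without adding the matching string above makes
+   * COUNT_OF() come up one short, so this STATIC_ASSERT fails at compile
+   * time.
+   */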
+ STATIC_ASSERT(COUNT_OF(VDO_COMPLETION_TYPE_NAMES) + == (MAX_COMPLETION_TYPE - UNSET_COMPLETION_TYPE)); + + if (completionType >= MAX_COMPLETION_TYPE) { + static char numeric[100]; + snprintf(numeric, 99, "%d (%#x)", completionType, completionType); + return numeric; + } + + return VDO_COMPLETION_TYPE_NAMES[completionType]; +} + +/**********************************************************************/ +void destroyEnqueueable(VDOCompletion *completion) +{ + if ((completion == NULL) || (completion->layer == NULL) + || (completion->layer->destroyEnqueueable == NULL)) { + return; + } + + completion->layer->destroyEnqueueable(&completion->enqueueable); +} + +/**********************************************************************/ +int assertCompletionType(VDOCompletionType actual, + VDOCompletionType expected) +{ + return ASSERT((expected == actual), + "completion type is %s instead of %s", + getCompletionTypeName(actual), + getCompletionTypeName(expected)); +} diff --git a/vdo/base/completion.h b/vdo/base/completion.h new file mode 100644 index 0000000..d245814 --- /dev/null +++ b/vdo/base/completion.h @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/completion.h#11 $ + */ + +#ifndef COMPLETION_H +#define COMPLETION_H + +#include "permassert.h" + +#include "physicalLayer.h" +#include "ringNode.h" +#include "types.h" + +typedef enum __attribute__((packed)) { + // Keep UNSET_COMPLETION_TYPE at the top. + UNSET_COMPLETION_TYPE = 0, + + // Keep the rest of these in sorted order. If you add or remove an entry, + // be sure to update the corresponding list in completion.c. + ACTION_COMPLETION, + ADMIN_COMPLETION, + ASYNC_ACTION_CONTEXT, + BLOCK_ALLOCATOR_COMPLETION, + BLOCK_MAP_RECOVERY_COMPLETION, + CHECK_IDENTIFIER_COMPLETION, + EXTERNAL_COMPLETION, + FLUSH_NOTIFICATION_COMPLETION, + GENERATION_FLUSHED_COMPLETION, + HEARTBEAT_COMPLETION, + LOCK_COUNTER_COMPLETION, + PARTITION_COPY_COMPLETION, + READ_ONLY_MODE_COMPLETION, + READ_ONLY_REBUILD_COMPLETION, + RECOVERY_COMPLETION, + REFERENCE_COUNT_REBUILD_COMPLETION, + SLAB_SCRUBBER_COMPLETION, + SUB_TASK_COMPLETION, + TEST_COMPLETION, // each unit test may define its own + VDO_COMMAND_COMPLETION, + VDO_COMMAND_SUB_COMPLETION, + VDO_EXTENT_COMPLETION, + VDO_PAGE_COMPLETION, + VIO_COMPLETION, + WRAPPING_COMPLETION, + + // Keep MAX_COMPLETION_TYPE at the bottom. + MAX_COMPLETION_TYPE +} VDOCompletionType; + +/** + * An asynchronous VDO operation. + * + * @param completion the completion of the operation + **/ +typedef void VDOAction(VDOCompletion *completion); + +struct vdoCompletion { + /** The type of completion this is */ + VDOCompletionType type; + + /** + * true once the processing of the operation is complete. 
+ * This flag should not be used by waiters external to the VDO base as + * it is used to gate calling the callback. + **/ + bool complete; + + /** + * If true, queue this completion on the next callback invocation, even if + * it is already running on the correct thread. + **/ + bool requeue; + + /** The ID of the thread which should run the next callback */ + ThreadID callbackThreadID; + + /** The result of the operation */ + int result; + + /** The physical layer on which this completion operates */ + PhysicalLayer *layer; + + /** The callback which will be called once the operation is complete */ + VDOAction *callback; + + /** The callback which, if set, will be called if an error result is set */ + VDOAction *errorHandler; + + /** The parent object, if any, that spawned this completion */ + void *parent; + + /** The enqueueable for this completion (may be NULL) */ + Enqueueable *enqueueable; +}; + +/** + * Actually run the callback. This function must be called from the correct + * callback thread. + **/ +static inline void runCallback(VDOCompletion *completion) +{ + if ((completion->result != VDO_SUCCESS) + && (completion->errorHandler != NULL)) { + completion->errorHandler(completion); + return; + } + + completion->callback(completion); +} + +/** + * Set the result of a completion. Older errors will not be masked. + * + * @param completion The completion whose result is to be set + * @param result The result to set + **/ +void setCompletionResult(VDOCompletion *completion, int result); + +/** + * Initialize a completion to a clean state, for reused completions. + * + * @param completion The completion to initialize + * @param type The type of the completion + * @param layer The physical layer of the completion + **/ +void initializeCompletion(VDOCompletion *completion, + VDOCompletionType type, + PhysicalLayer *layer); + +/** + * Initialize a completion to a clean state and make an enqueueable for it. + * + * @param completion The completion to initialize + * @param type The type of the completion + * @param layer The physical layer of the completion + * + * @return VDO_SUCCESS or an error + **/ +int initializeEnqueueableCompletion(VDOCompletion *completion, + VDOCompletionType type, + PhysicalLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Reset a completion to a clean state, while keeping + * the type, layer and parent information. + * + * @param completion the completion to reset + **/ +void resetCompletion(VDOCompletion *completion); + +/** + * Invoke the callback of a completion. If called on the correct thread (i.e. + * the one specified in the completion's callbackThreadID field), the + * completion will be run immediately. Otherwise, the completion will be + * enqueued on the correct callback thread. + **/ +void invokeCallback(VDOCompletion *completion); + +/** + * Continue processing a completion by setting the current result and calling + * invokeCallback(). + * + * @param completion The completion to continue + * @param result The current result (will not mask older errors) + **/ +void continueCompletion(VDOCompletion *completion, int result); + +/** + * Complete a completion. + * + * @param completion The completion to complete + **/ +void completeCompletion(VDOCompletion *completion); + +/** + * Finish a completion. 
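+ *
+ * Illustrative usage sketch (doSomething() is hypothetical, not part of
+ * the original code):
+ *
+ *   int result = doSomething();
+ *   if (result != VDO_SUCCESS) {
+ *     finishCompletion(completion, result);
+ *     return;
+ *   }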
+ * + * @param completion The completion to finish + * @param result The result of the completion (will not mask older errors) + **/ +static inline void finishCompletion(VDOCompletion *completion, int result) +{ + setCompletionResult(completion, result); + completeCompletion(completion); +} + +/** + * Complete a completion and NULL out the reference to it. + * + * @param completionPtr A pointer to the completion to release + **/ +void releaseCompletion(VDOCompletion **completionPtr); + +/** + * Finish a completion and NULL out the reference to it. + * + * @param completionPtr A pointer to the completion to release + * @param result The result of the completion + **/ +void releaseCompletionWithResult(VDOCompletion **completionPtr, int result); + +/** + * A callback to finish the parent of a completion. + * + * @param completion The completion which has finished and whose parent should + * be finished + **/ +void finishParentCallback(VDOCompletion *completion); + +/** + * Error handler which preserves an error in the parent (if there is one), + * and then resets the failing completion and calls its non-error callback. + * + * @param completion The completion which failed + **/ +void preserveErrorAndContinue(VDOCompletion *completion); + +/** + * A callback which does nothing. This callback is intended to be set as an + * error handler in the case where an error should do nothing. + * + * @param completion The completion being called back + **/ +static inline +void noopCallback(VDOCompletion *completion __attribute__((unused))) +{ +} + +/** + * Destroy the enqueueable associated with this completion. + * + * @param completion The completion + **/ +void destroyEnqueueable(VDOCompletion *completion); + +/** + * Assert that a completion is of the correct type + * + * @param actual The actual completion type + * @param expected The expected completion type + * + * @return VDO_SUCCESS or VDO_PARAMETER_MISMATCH + **/ +int assertCompletionType(VDOCompletionType actual, + VDOCompletionType expected); + +/** + * Return the name of a completion type. + * + * @param completionType the completion type + * + * @return a pointer to a static string; if the completionType is unknown + * this is to a static buffer that may be overwritten. + **/ +const char *getCompletionTypeName(VDOCompletionType completionType); + +/** + * Set the callback for a completion. + * + * @param completion The completion + * @param callback The callback to register + * @param threadID The ID of the thread on which the callback should run + **/ +static inline void setCallback(VDOCompletion *completion, + VDOAction *callback, + ThreadID threadID) +{ + completion->callback = callback; + completion->callbackThreadID = threadID; +} + +/** + * Set the callback for a completion and invoke it immediately. + * + * @param completion The completion + * @param callback The callback to register + * @param threadID The ID of the thread on which the callback should run + **/ +static inline void launchCallback(VDOCompletion *completion, + VDOAction *callback, + ThreadID threadID) +{ + setCallback(completion, callback, threadID); + invokeCallback(completion); +} + +/** + * Set the callback and parent for a completion. 
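+ *
+ * Illustrative note (not part of the original comment): this differs from
+ * setCallback() only in also recording the parent;
+ * launchCallbackWithParent() and prepareCompletion() below are built on
+ * top of it.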
+ * + * @param completion The completion + * @param callback The callback to register + * @param threadID The ID of the thread on which the callback should run + * @param parent The new parent of the completion + **/ +static inline void setCallbackWithParent(VDOCompletion *completion, + VDOAction *callback, + ThreadID threadID, + void *parent) +{ + setCallback(completion, callback, threadID); + completion->parent = parent; +} + +/** + * Set the callback and parent for a completion and invoke the callback + * immediately. + * + * @param completion The completion + * @param callback The callback to register + * @param threadID The ID of the thread on which the callback should run + * @param parent The new parent of the completion + **/ +static inline void launchCallbackWithParent(VDOCompletion *completion, + VDOAction *callback, + ThreadID threadID, + void *parent) +{ + setCallbackWithParent(completion, callback, threadID, parent); + invokeCallback(completion); +} + +/** + * Prepare a completion for launch. Reset it, and then set its callback, error + * handler, callback thread, and parent. + * + * @param completion The completion + * @param callback The callback to register + * @param errorHandler The error handler to register + * @param threadID The ID of the thread on which the callback should run + * @param parent The new parent of the completion + **/ +static inline void prepareCompletion(VDOCompletion *completion, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID, + void *parent) +{ + resetCompletion(completion); + setCallbackWithParent(completion, callback, threadID, parent); + completion->errorHandler = errorHandler; +} + +/** + * Prepare a completion for launch ensuring that it will always be requeued. + * Reset it, and then set its callback, error handler, callback thread, and + * parent. + * + * @param completion The completion + * @param callback The callback to register + * @param errorHandler The error handler to register + * @param threadID The ID of the thread on which the callback should run + * @param parent The new parent of the completion + **/ +static inline void prepareForRequeue(VDOCompletion *completion, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID, + void *parent) +{ + prepareCompletion(completion, callback, errorHandler, threadID, parent); + completion->requeue = true; +} + +/** + * Prepare a completion for launch which will complete its parent when + * finished. + * + * @param completion The completion + * @param parent The parent to complete + **/ +static inline void prepareToFinishParent(VDOCompletion *completion, + VDOCompletion *parent) +{ + prepareCompletion(completion, finishParentCallback, finishParentCallback, + parent->callbackThreadID, parent); +} + +#endif // COMPLETION_H diff --git a/vdo/base/compressedBlock.c b/vdo/base/compressedBlock.c new file mode 100644 index 0000000..d9f93e8 --- /dev/null +++ b/vdo/base/compressedBlock.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressedBlock.c#3 $ + */ + +#include "compressedBlock.h" + +#include "memoryAlloc.h" +#include "numeric.h" + +static const VersionNumber COMPRESSED_BLOCK_1_0 = { + .majorVersion = 1, + .minorVersion = 0, +}; + +/**********************************************************************/ +void resetCompressedBlockHeader(CompressedBlockHeader *header) +{ + STATIC_ASSERT(sizeof(header->fields) == sizeof(header->raw)); + + header->fields.version = packVersionNumber(COMPRESSED_BLOCK_1_0); + memset(header->fields.sizes, 0, sizeof(header->fields.sizes)); +} + +/**********************************************************************/ +static uint16_t +getCompressedFragmentSize(const CompressedBlockHeader *header, byte slot) +{ + return getUInt16LE(header->fields.sizes[slot]); +} + +/**********************************************************************/ +int getCompressedBlockFragment(BlockMappingState mappingState, + char *buffer, + BlockSize blockSize, + uint16_t *fragmentOffset, + uint16_t *fragmentSize) +{ + if (!isCompressed(mappingState)) { + return VDO_INVALID_FRAGMENT; + } + + CompressedBlockHeader *header = (CompressedBlockHeader *) buffer; + VersionNumber version = unpackVersionNumber(header->fields.version); + if (!areSameVersion(version, COMPRESSED_BLOCK_1_0)) { + return VDO_INVALID_FRAGMENT; + } + + byte slot = getSlotFromState(mappingState); + if (slot >= MAX_COMPRESSION_SLOTS) { + return VDO_INVALID_FRAGMENT; + } + + uint16_t compressedSize = getCompressedFragmentSize(header, slot); + uint16_t offset = sizeof(CompressedBlockHeader); + for (unsigned int i = 0; i < slot; i++) { + offset += getCompressedFragmentSize(header, i); + if (offset >= blockSize) { + return VDO_INVALID_FRAGMENT; + } + } + + if ((offset + compressedSize) > blockSize) { + return VDO_INVALID_FRAGMENT; + } + + *fragmentOffset = offset; + *fragmentSize = compressedSize; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void putCompressedBlockFragment(CompressedBlock *block, + unsigned int fragment, + uint16_t offset, + const char *data, + uint16_t size) +{ + storeUInt16LE(block->header.fields.sizes[fragment], size); + memcpy(&block->data[offset], data, size); +} diff --git a/vdo/base/compressedBlock.h b/vdo/base/compressedBlock.h new file mode 100644 index 0000000..603841f --- /dev/null +++ b/vdo/base/compressedBlock.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressedBlock.h#3 $ + */ + +#ifndef COMPRESSED_BLOCK_H +#define COMPRESSED_BLOCK_H + +#include "blockMappingState.h" +#include "header.h" + +/** + * The header of a compressed block. + **/ +typedef union __attribute__((packed)) { + struct __attribute__((packed)) { + /** Unsigned 32-bit major and minor versions, in little-endian byte order */ + PackedVersionNumber version; + + /** List of unsigned 16-bit compressed block sizes, in little-endian order */ + byte sizes[MAX_COMPRESSION_SLOTS][2]; + } fields; + + // A raw view of the packed encoding. + byte raw[4 + 4 + (2 * MAX_COMPRESSION_SLOTS)]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining compressed block headers in GDB. + struct __attribute__((packed)) { + VersionNumber version; + uint16_t sizes[MAX_COMPRESSION_SLOTS]; + } littleEndian; +#endif +} CompressedBlockHeader; + +/** + * The compressed block overlay. + **/ +typedef struct { + CompressedBlockHeader header; + char data[]; +} __attribute__((packed)) CompressedBlock; + +/** + * Initializes/resets a compressed block header. + * + * @param header the header + * + * When done, the version number is set to the current version, and all + * fragments are empty. + **/ +void resetCompressedBlockHeader(CompressedBlockHeader *header); + +/** + * Get a reference to a compressed fragment from a compression block. + * + * @param [in] mappingState the mapping state for the look up + * @param [in] buffer buffer that contains compressed data + * @param [in] blockSize size of a data block + * @param [out] fragmentOffset the offset of the fragment within a + * compressed block + * @param [out] fragmentSize the size of the fragment + * + * @return If a valid compressed fragment is found, VDO_SUCCESS; + * otherwise, VDO_INVALID_FRAGMENT if the fragment is invalid. + **/ +int getCompressedBlockFragment(BlockMappingState mappingState, + char *buffer, + BlockSize blockSize, + uint16_t *fragmentOffset, + uint16_t *fragmentSize); + +/** + * Copy a fragment into the compressed block. + * + * @param block the compressed block + * @param fragment the number of the fragment + * @param offset the byte offset of the fragment in the data area + * @param data a pointer to the compressed data + * @param size the size of the data + * + * @note no bounds checking -- the data better fit without smashing other stuff + **/ +void putCompressedBlockFragment(CompressedBlock *block, + unsigned int fragment, + uint16_t offset, + const char *data, + uint16_t size); + +#endif // COMPRESSED_BLOCK_H diff --git a/vdo/base/compressionState.c b/vdo/base/compressionState.c new file mode 100644 index 0000000..d773756 --- /dev/null +++ b/vdo/base/compressionState.c @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressionState.c#2 $ + */ + +#include "compressionStateInternals.h" + +#include "dataVIO.h" +#include "packer.h" + +static const uint32_t STATUS_MASK = 0xff; +static const uint32_t MAY_NOT_COMPRESS_MASK = 0x80000000; + +/**********************************************************************/ +VIOCompressionState getCompressionState(DataVIO *dataVIO) +{ + uint32_t packedValue = atomicLoad32(&dataVIO->compression.state); + return (VIOCompressionState) { + .status = packedValue & STATUS_MASK, + .mayNotCompress = ((packedValue & MAY_NOT_COMPRESS_MASK) != 0), + }; +} + +/** + * Convert a VIOCompressionState into a uint32_t which may be stored + * atomically. + * + * @param state The state to convert + * + * @return The compression state packed into a uint32_t + **/ +__attribute__((warn_unused_result)) +static uint32_t packState(VIOCompressionState state) +{ + return state.status | (state.mayNotCompress ? MAY_NOT_COMPRESS_MASK : 0); +} + +/**********************************************************************/ +bool setCompressionState(DataVIO *dataVIO, + VIOCompressionState state, + VIOCompressionState newState) +{ + return compareAndSwap32(&dataVIO->compression.state, packState(state), + packState(newState)); +} + +/** + * Advance to the next compression state along the compression path. + * + * @param dataVIO The DataVIO to advance + * + * @return The new compression status of the DataVIO + **/ +static VIOCompressionStatus advanceStatus(DataVIO *dataVIO) +{ + for (;;) { + VIOCompressionState state = getCompressionState(dataVIO); + if (state.status == VIO_POST_PACKER) { + // We're already in the last state. + return state.status; + } + + VIOCompressionState newState = state; + if (state.mayNotCompress) { + // Compression has been dis-allowed for this VIO, so skip the rest of the + // path and go to the end. + newState.status = VIO_POST_PACKER; + } else { + // Go to the next state. + newState.status++; + } + + if (setCompressionState(dataVIO, state, newState)) { + return newState.status; + } + + // Another thread changed the state out from under us so try again. + } +} + +/**********************************************************************/ +bool mayCompressDataVIO(DataVIO *dataVIO) +{ + if (!hasAllocation(dataVIO) + || ((getWritePolicy(getVDOFromDataVIO(dataVIO)) != WRITE_POLICY_SYNC) + && vioRequiresFlushAfter(dataVIOAsVIO(dataVIO))) + || !getVDOCompressing(getVDOFromDataVIO(dataVIO))) { + /* + * If this VIO didn't get an allocation, the compressed write probably + * won't either, so don't try compressing it. Also, if compression is off, + * don't compress. + */ + setCompressionDone(dataVIO); + return false; + } + + if (dataVIO->hashLock == NULL) { + // DataVIOs without a HashLock (which should be extremely rare) aren't + // able to share the packer's PBN lock, so don't try to compress them. 
+ return false; + } + + return (advanceStatus(dataVIO) == VIO_COMPRESSING); +} + +/**********************************************************************/ +bool mayPackDataVIO(DataVIO *dataVIO) +{ + if (!isSufficientlyCompressible(dataVIO) + || !getVDOCompressing(getVDOFromDataVIO(dataVIO)) + || getCompressionState(dataVIO).mayNotCompress) { + // If the data in this VIO doesn't compress, or compression is off, or + // compression for this VIO has been canceled, don't send it to the packer. + setCompressionDone(dataVIO); + return false; + } + + return true; +} + +/**********************************************************************/ +bool mayBlockInPacker(DataVIO *dataVIO) +{ + return (advanceStatus(dataVIO) == VIO_PACKING); +} + +/**********************************************************************/ +bool mayWriteCompressedDataVIO(DataVIO *dataVIO) +{ + advanceStatus(dataVIO); + return !getCompressionState(dataVIO).mayNotCompress; +} + +/**********************************************************************/ +void setCompressionDone(DataVIO *dataVIO) +{ + for (;;) { + VIOCompressionState state = getCompressionState(dataVIO); + if (state.status == VIO_POST_PACKER) { + // The VIO is already done. + return; + } + + // If compression was cancelled on this VIO, preserve that fact. + VIOCompressionState newState = { + .status = VIO_POST_PACKER, + .mayNotCompress = true, + }; + if (setCompressionState(dataVIO, state, newState)) { + return; + } + } +} + +/**********************************************************************/ +bool cancelCompression(DataVIO *dataVIO) +{ + VIOCompressionState state; + for (;;) { + state = getCompressionState(dataVIO); + if (state.mayNotCompress || (state.status == VIO_POST_PACKER)) { + // This DataVIO is already set up to not block in the packer. + break; + } + + VIOCompressionState newState = { + .status = state.status, + .mayNotCompress = true, + }; + if (setCompressionState(dataVIO, state, newState)) { + break; + } + } + + return ((state.status == VIO_PACKING) && !state.mayNotCompress); +} diff --git a/vdo/base/compressionState.h b/vdo/base/compressionState.h new file mode 100644 index 0000000..19a4143 --- /dev/null +++ b/vdo/base/compressionState.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressionState.h#2 $ + */ + +#ifndef COMPRESSION_STATE_H +#define COMPRESSION_STATE_H + +#include "atomic.h" +#include "types.h" + +/** + * Where a DataVIO is on the compression path; advanceStatus() depends on the + * order of this enum. 
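+ *
+ * Illustrative note (not part of the original header): the normal
+ * progression is
+ *
+ *   VIO_PRE_COMPRESSOR -> VIO_COMPRESSING -> VIO_PACKING -> VIO_POST_PACKER
+ *
+ * and advanceStatus() in compressionState.c moves a DataVIO one step at a
+ * time with a compare-and-swap, jumping directly to VIO_POST_PACKER once
+ * mayNotCompress has been set.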
+ **/ +typedef enum { + /* A VIO which has not yet entered the compression path */ + VIO_PRE_COMPRESSOR = 0, + /* A VIO which is in the compressor */ + VIO_COMPRESSING, + /* A VIO which is blocked in the packer */ + VIO_PACKING, + /* A VIO which is no longer on the compression path (and never will be) */ + VIO_POST_PACKER, +} VIOCompressionStatus; + +typedef struct { + VIOCompressionStatus status; + bool mayNotCompress; +} VIOCompressionState; + +/** + * Get the compression state of a DataVIO. + * + * @param dataVIO The DataVIO + * + * @return The compression state + **/ +__attribute__((warn_unused_result)) +VIOCompressionState getCompressionState(DataVIO *dataVIO); + +/** + * Check whether a DataVIO may go to the compressor. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO may be compressed at this time + **/ +bool mayCompressDataVIO(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Check whether a DataVIO may go to the packer. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO may be packed at this time + **/ +bool mayPackDataVIO(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Check whether a DataVIO which has gone to the packer may block there. Any + * cancelation after this point and before the DataVIO is written out requires + * this DataVIO to be picked up by the canceling DataVIO. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO may block in the packer + **/ +bool mayBlockInPacker(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Check whether the packer may write out a DataVIO as part of a compressed + * block. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO may be written as part of a + * compressed block at this time + **/ +bool mayWriteCompressedDataVIO(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Indicate that this DataVIO is leaving the compression path. + * + * @param dataVIO The DataVIO leaving the compression path + **/ +void setCompressionDone(DataVIO *dataVIO); + +/** + * Prevent this DataVIO from being compressed or packed. + * + * @param dataVIO The DataVIO to cancel + * + * @return true if the DataVIO is in the packer and the caller + * was the first caller to cancel it + **/ +bool cancelCompression(DataVIO *dataVIO); + +#endif /* COMPRESSION_STATE_H */ diff --git a/vdo/base/compressionStateInternals.h b/vdo/base/compressionStateInternals.h new file mode 100644 index 0000000..a9b8dec --- /dev/null +++ b/vdo/base/compressionStateInternals.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressionStateInternals.h#1 $ + */ + +#ifndef COMPRESSION_STATE_INTERNALS_H +#define COMPRESSION_STATE_INTERNALS_H + +#include "compressionState.h" + +/** + * Set the compression state of a DataVIO (exposed for testing). + * + * @param dataVIO The DataVIO whose compression state is to be set + * @param state The expected current state of the DataVIO + * @param newState The state to set + * + * @return true if the new state was set, false if the DataVIO's + * compression state did not match the expected state, and so was + * left unchanged + **/ +bool setCompressionState(DataVIO *dataVIO, + VIOCompressionState state, + VIOCompressionState newState); + +#endif /* COMPRESSION_STATE_H */ diff --git a/vdo/base/constants.c b/vdo/base/constants.c new file mode 100644 index 0000000..05d3a42 --- /dev/null +++ b/vdo/base/constants.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/constants.c#1 $ + */ + +#include "types.h" + +/** The maximum logical space is 4 petabytes, which is 1 terablock. */ +const BlockCount MAXIMUM_LOGICAL_BLOCKS = 1024ULL * 1024 * 1024 * 1024; + +/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */ +const BlockCount MAXIMUM_PHYSICAL_BLOCKS = 1024ULL * 1024 * 1024 * 64; + +// unit test minimum +const BlockCount MINIMUM_SLAB_JOURNAL_BLOCKS = 2; diff --git a/vdo/base/constants.h b/vdo/base/constants.h new file mode 100644 index 0000000..8b61c5f --- /dev/null +++ b/vdo/base/constants.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/constants.h#2 $ + */ + +#ifndef CONSTANTS_H +#define CONSTANTS_H + +#include "types.h" + +enum { + /** The number of entries on a block map page */ + BLOCK_MAP_ENTRIES_PER_PAGE = 812, + + /** The origin of the flat portion of the block map */ + BLOCK_MAP_FLAT_PAGE_ORIGIN = 1, + + /** + * The height of a block map tree. 
Assuming a root count of 60 and 812 + * entries per page, this is big enough to represent almost 95 PB of logical + * space. + **/ + BLOCK_MAP_TREE_HEIGHT = 5, + + /** The number of trees in the arboreal block map */ + DEFAULT_BLOCK_MAP_TREE_ROOT_COUNT = 60, + + /** The default size of the recovery journal, in blocks */ + DEFAULT_RECOVERY_JOURNAL_SIZE = 32 * 1024, + + /** The default size of each slab journal, in blocks */ + DEFAULT_SLAB_JOURNAL_SIZE = 224, + + /** + * The initial size of lbnOperations and pbnOperations, which is based + * upon the expected maximum number of outstanding VIOs. This value was + * chosen to make it highly unlikely that the maps would need to be resized. + **/ + LOCK_MAP_CAPACITY = 10000, + + /** The maximum number of logical zones */ + MAX_LOGICAL_ZONES = 60, + + /** The maximum number of physical zones */ + MAX_PHYSICAL_ZONES = 16, + + /** The base-2 logarithm of the maximum blocks in one slab */ + MAX_SLAB_BITS = 23, + + /** The maximum number of slabs the slab depot supports */ + MAX_SLABS = 8192, + + /** + * The maximum number of block map pages to load simultaneously during + * recovery or rebuild. + **/ + MAXIMUM_SIMULTANEOUS_BLOCK_MAP_RESTORATION_READS = 1024, + + /** The maximum number of VIOs in the system at once */ + MAXIMUM_USER_VIOS = 2048, + + /** + * The number of in-memory recovery journal blocks is determined by: + * -- 311 journal entries in a 4k block + * -- maximum of 2048 VIOs making entries at once + * so we need at least 2048 / 312 = 7 journal blocks. + **/ + RECOVERY_JOURNAL_TAIL_BUFFER_SIZE = 64, + + /** The number of sectors per block */ + SECTORS_PER_BLOCK = 8, + + /** The only physical block size supported by VDO */ + VDO_BLOCK_SIZE = 4096, + + /** The size of a sector that will not be torn */ + VDO_SECTOR_SIZE = 512, + + /** The physical block number reserved for storing the zero block */ + ZERO_BLOCK = 0, +}; + +/** The maximum logical space is 4 petabytes, which is 1 terablock. */ +extern const BlockCount MAXIMUM_LOGICAL_BLOCKS; + +/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */ + extern const BlockCount MAXIMUM_PHYSICAL_BLOCKS; + +// unit test minimum +extern const BlockCount MINIMUM_SLAB_JOURNAL_BLOCKS; + +#endif // CONSTANTS_H diff --git a/vdo/base/dataVIO.c b/vdo/base/dataVIO.c new file mode 100644 index 0000000..a9778f5 --- /dev/null +++ b/vdo/base/dataVIO.c @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dataVIO.c#7 $ + */ + +#include "dataVIO.h" + +#include "logger.h" + +#include "atomic.h" +#include "blockMap.h" +#include "compressionState.h" +#include "extent.h" +#include "logicalZone.h" +#include "threadConfig.h" +#include "vdoInternal.h" +#include "vioRead.h" +#include "vioWrite.h" + +static const char *ASYNC_OPERATION_NAMES[] = { + "launch", + "acknowledgeWrite", + "acquireHashLock", + "acquireLogicalBlockLock", + "acquirePBNReadLock", + "checkForDedupeForRollover", + "checkForDeduplication", + "compressData", + "continueVIOAsync", + "findBlockMapSlot", + "getMappedBlock", + "getMappedBlockForDedupe", + "getMappedBlockForWrite", + "hashData", + "journalDecrementForDedupe", + "journalDecrementForWrite", + "journalIncrementForCompression", + "journalIncrementForDedupe", + "journalIncrementForWrite", + "journalMappingForCompression", + "journalMappingForDedupe", + "journalMappingForWrite", + "journalUnmappingForDedupe", + "journalUnmappingForWrite", + "attemptPacking", + "putMappedBlock", + "putMappedBlockForDedupe", + "readData", + "updateIndex", + "verifyDeduplication", + "writeData", +}; + +/** + * Initialize the LBN lock of a DataVIO. In addition to recording the LBN on + * which the DataVIO will operate, it will also find the logical zone + * associated with the LBN. + * + * @param dataVIO The dataVIO to initialize + * @param lbn The lbn on which the dataVIO will operate + **/ +static void initializeLBNLock(DataVIO *dataVIO, LogicalBlockNumber lbn) +{ + LBNLock *lock = &dataVIO->logical; + lock->lbn = lbn; + lock->locked = false; + initializeWaitQueue(&lock->waiters); + + VDO *vdo = getVDOFromDataVIO(dataVIO); + lock->zone = getLogicalZone(vdo->logicalZones, computeLogicalZone(dataVIO)); +} + +/**********************************************************************/ +void prepareDataVIO(DataVIO *dataVIO, + LogicalBlockNumber lbn, + VIOOperation operation, + bool isTrim, + VDOAction *callback) +{ + // Clearing the tree lock must happen before initializing the LBN lock, + // which also adds information to the tree lock. + memset(&dataVIO->treeLock, 0, sizeof(dataVIO->treeLock)); + initializeLBNLock(dataVIO, lbn); + initializeRing(&dataVIO->hashLockNode); + initializeRing(&dataVIO->writeNode); + + resetAllocation(dataVIOAsAllocatingVIO(dataVIO)); + + dataVIO->isDuplicate = false; + + memset(&dataVIO->chunkName, 0, sizeof(dataVIO->chunkName)); + memset(&dataVIO->duplicate, 0, sizeof(dataVIO->duplicate)); + + VIO *vio = dataVIOAsVIO(dataVIO); + vio->operation = operation; + vio->callback = callback; + dataVIO->pageCompletion.completion.enqueueable + = vioAsCompletion(vio)->enqueueable; + + dataVIO->mapped.state = MAPPING_STATE_UNCOMPRESSED; + dataVIO->newMapped.state + = (isTrim ? 
MAPPING_STATE_UNMAPPED : MAPPING_STATE_UNCOMPRESSED); + resetCompletion(vioAsCompletion(vio)); + setLogicalCallback(dataVIO, attemptLogicalBlockLock, + THIS_LOCATION("$F;cb=acquireLogicalBlockLock")); +} + +/**********************************************************************/ +void completeDataVIO(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + if (completion->result != VDO_SUCCESS) { + VIO *vio = dataVIOAsVIO(dataVIO); + updateVIOErrorStats(vio, + "Completing %s VIO for LBN %" PRIu64 + " with error after %s", + getVIOReadWriteFlavor(vio), dataVIO->logical.lbn, + getOperationName(dataVIO)); + } + + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION("$F($io)")); + if (isReadDataVIO(dataVIO)) { + cleanupReadDataVIO(dataVIO); + } else { + cleanupWriteDataVIO(dataVIO); + } +} + +/**********************************************************************/ +void finishDataVIO(DataVIO *dataVIO, int result) +{ + VDOCompletion *completion = dataVIOAsCompletion(dataVIO); + setCompletionResult(completion, result); + completeDataVIO(completion); +} + +/**********************************************************************/ +const char *getOperationName(DataVIO *dataVIO) +{ + STATIC_ASSERT((MAX_ASYNC_OPERATION_NUMBER - MIN_ASYNC_OPERATION_NUMBER) + == COUNT_OF(ASYNC_OPERATION_NAMES)); + + return ((dataVIO->lastAsyncOperation < MAX_ASYNC_OPERATION_NUMBER) + ? ASYNC_OPERATION_NAMES[dataVIO->lastAsyncOperation] + : "unknown async operation"); +} + +/**********************************************************************/ +void receiveDedupeAdvice(DataVIO *dataVIO, const DataLocation *advice) +{ + /* + * NOTE: this is called on non-base-code threads. Be very careful to not do + * anything here that needs a base code thread-local variable, such as + * trying to get the current thread ID, or that does a lot of work. + */ + + VDO *vdo = getVDOFromDataVIO(dataVIO); + ZonedPBN duplicate = validateDedupeAdvice(vdo, advice, dataVIO->logical.lbn); + setDuplicateLocation(dataVIO, duplicate); +} + +/**********************************************************************/ +void setDuplicateLocation(DataVIO *dataVIO, const ZonedPBN source) +{ + dataVIO->isDuplicate = (source.pbn != ZERO_BLOCK); + dataVIO->duplicate = source; +} + +/**********************************************************************/ +void clearMappedLocation(DataVIO *dataVIO) +{ + dataVIO->mapped = (ZonedPBN) { .state = MAPPING_STATE_UNMAPPED }; +} + +/**********************************************************************/ +int setMappedLocation(DataVIO *dataVIO, + PhysicalBlockNumber pbn, + BlockMappingState state) +{ + PhysicalZone *zone; + int result = getPhysicalZone(getVDOFromDataVIO(dataVIO), pbn, &zone); + if (result != VDO_SUCCESS) { + return result; + } + + dataVIO->mapped = (ZonedPBN) { + .pbn = pbn, + .state = state, + .zone = zone, + }; + return VDO_SUCCESS; +} + +/** + * Launch a request which has acquired an LBN lock. 
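+ *
+ * Illustrative note (not part of the original comment): this is reached
+ * either directly from attemptLogicalBlockLock() below, when intMapPut()
+ * finds no existing lock holder for the LBN, or later from
+ * releaseLogicalBlockLock(), when the lock is transferred to the first
+ * waiter.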
+ * + * @param dataVIO The DataVIO which has just acquired a lock + **/ +static void launchLockedRequest(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + dataVIO->logical.locked = true; + + if (isWriteDataVIO(dataVIO)) { + launchWriteDataVIO(dataVIO); + } else { + launchReadDataVIO(dataVIO); + } +} + +/**********************************************************************/ +void attemptLogicalBlockLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + + if (dataVIO->logical.lbn + >= getVDOFromDataVIO(dataVIO)->config.logicalBlocks) { + finishDataVIO(dataVIO, VDO_OUT_OF_RANGE); + return; + } + + DataVIO *lockHolder; + LBNLock *lock = &dataVIO->logical; + int result = intMapPut(getLBNLockMap(lock->zone), lock->lbn, dataVIO, false, + (void **) &lockHolder); + if (result != VDO_SUCCESS) { + finishDataVIO(dataVIO, result); + return; + } + + if (lockHolder == NULL) { + // We got the lock + launchLockedRequest(dataVIO); + return; + } + + result = ASSERT(lockHolder->logical.locked, "logical block lock held"); + if (result != VDO_SUCCESS) { + finishDataVIO(dataVIO, result); + return; + } + + /* + * If the new request is a pure read request (not read-modify-write) and + * the lockHolder is writing and has received an allocation (VDO-2683), + * service the read request immediately by copying data from the lockHolder + * to avoid having to flush the write out of the packer just to prevent the + * read from waiting indefinitely. If the lockHolder does not yet have an + * allocation, prevent it from blocking in the packer and wait on it. + */ + if (isReadDataVIO(dataVIO) && atomicLoadBool(&lockHolder->hasAllocation)) { + dataVIOAsCompletion(dataVIO)->layer->copyData(lockHolder, dataVIO); + finishDataVIO(dataVIO, VDO_SUCCESS); + return; + } + + dataVIO->lastAsyncOperation = ACQUIRE_LOGICAL_BLOCK_LOCK; + result = enqueueDataVIO(&lockHolder->logical.waiters, dataVIO, + THIS_LOCATION("$F;cb=logicalBlockLock")); + if (result != VDO_SUCCESS) { + finishDataVIO(dataVIO, result); + return; + } + + // Prevent writes and read-modify-writes from blocking indefinitely on + // lock holders in the packer. + if (!isReadDataVIO(lockHolder) && cancelCompression(lockHolder)) { + dataVIO->compression.lockHolder = lockHolder; + launchPackerCallback(dataVIO, removeLockHolderFromPacker, + THIS_LOCATION("$F;cb=removeLockHolderFromPacker")); + } +} + +/** + * Release an uncontended LBN lock. + * + * @param dataVIO The DataVIO holding the lock + **/ +static void releaseLock(DataVIO *dataVIO) +{ + LBNLock *lock = &dataVIO->logical; + IntMap *lockMap = getLBNLockMap(lock->zone); + if (!lock->locked) { + // The lock is not locked, so it had better not be registered in the lock + // map. + DataVIO *lockHolder = intMapGet(lockMap, lock->lbn); + ASSERT_LOG_ONLY((dataVIO != lockHolder), + "no logical block lock held for block %llu", + lock->lbn); + return; + } + + // Remove the lock from the logical block lock map, releasing the lock. 
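+  // releaseLock() is reached from releaseLogicalBlockLock() only when the
+  // waiter queue is empty; locks with waiters are transferred directly to the
+  // next waiter there, so no waiter can be stranded by this removal.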
+ DataVIO *lockHolder = intMapRemove(lockMap, lock->lbn); + ASSERT_LOG_ONLY((dataVIO == lockHolder), + "logical block lock mismatch for block %llu", lock->lbn); + lock->locked = false; + return; +} + +/**********************************************************************/ +void releaseLogicalBlockLock(DataVIO *dataVIO) +{ + assertInLogicalZone(dataVIO); + if (!hasWaiters(&dataVIO->logical.waiters)) { + releaseLock(dataVIO); + return; + } + + LBNLock *lock = &dataVIO->logical; + ASSERT_LOG_ONLY(lock->locked, "LBNLock with waiters is not locked"); + + // Another DataVIO is waiting for the lock, so just transfer it in a single + // lock map operation + DataVIO *nextLockHolder = waiterAsDataVIO(dequeueNextWaiter(&lock->waiters)); + + // Transfer the remaining lock waiters to the next lock holder. + transferAllWaiters(&lock->waiters, &nextLockHolder->logical.waiters); + + DataVIO *lockHolder; + int result = intMapPut(getLBNLockMap(lock->zone), lock->lbn, nextLockHolder, + true, (void **) &lockHolder); + if (result != VDO_SUCCESS) { + finishDataVIO(nextLockHolder, result); + return; + } + + ASSERT_LOG_ONLY((lockHolder == dataVIO), + "logical block lock mismatch for block %llu", lock->lbn); + lock->locked = false; + + /* + * If there are still waiters, other DataVIOs must be trying to get the lock + * we just transferred. We must ensure that the new lock holder doesn't block + * in the packer. + */ + if (hasWaiters(&nextLockHolder->logical.waiters)) { + cancelCompression(nextLockHolder); + } + + // Avoid stack overflow on lock transfer. + // XXX: this is only an issue in the 1 thread config. + dataVIOAsCompletion(nextLockHolder)->requeue = true; + launchLockedRequest(nextLockHolder); +} diff --git a/vdo/base/dataVIO.h b/vdo/base/dataVIO.h new file mode 100644 index 0000000..ec6e9f6 --- /dev/null +++ b/vdo/base/dataVIO.h @@ -0,0 +1,945 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dataVIO.h#4 $ + */ + +#ifndef DATA_VIO_H +#define DATA_VIO_H + +#include "allocatingVIO.h" +#include "atomic.h" +#include "blockMapEntry.h" +#include "blockMappingState.h" +#include "constants.h" +#include "hashZone.h" +#include "journalPoint.h" +#include "logicalZone.h" +#include "referenceOperation.h" +#include "ringNode.h" +#include "threadConfig.h" +#include "trace.h" +#include "types.h" +#include "vdoPageCache.h" +#include "vio.h" +#include "waitQueue.h" + +/** + * Codes for describing the last asynchronous operation performed on a VIO. 
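+ *
+ * These values are used to index ASYNC_OPERATION_NAMES in dataVIO.c, so the
+ * two lists must be kept in the same order; getOperationName() statically
+ * asserts that they have the same number of entries.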
+ **/ +typedef enum __attribute__((packed)) { + MIN_ASYNC_OPERATION_NUMBER = 0, + LAUNCH = MIN_ASYNC_OPERATION_NUMBER, + ACKNOWLEDGE_WRITE, + ACQUIRE_HASH_LOCK, + ACQUIRE_LOGICAL_BLOCK_LOCK, + ACQUIRE_PBN_READ_LOCK, + CHECK_FOR_DEDUPE_FOR_ROLLOVER, + CHECK_FOR_DEDUPLICATION, + COMPRESS_DATA, + CONTINUE_VIO_ASYNC, + FIND_BLOCK_MAP_SLOT, + GET_MAPPED_BLOCK, + GET_MAPPED_BLOCK_FOR_DEDUPE, + GET_MAPPED_BLOCK_FOR_WRITE, + HASH_DATA, + JOURNAL_DECREMENT_FOR_DEDUPE, + JOURNAL_DECREMENT_FOR_WRITE, + JOURNAL_INCREMENT_FOR_COMPRESSION, + JOURNAL_INCREMENT_FOR_DEDUPE, + JOURNAL_INCREMENT_FOR_WRITE, + JOURNAL_MAPPING_FOR_COMPRESSION, + JOURNAL_MAPPING_FOR_DEDUPE, + JOURNAL_MAPPING_FOR_WRITE, + JOURNAL_UNMAPPING_FOR_DEDUPE, + JOURNAL_UNMAPPING_FOR_WRITE, + PACK_COMPRESSED_BLOCK, + PUT_MAPPED_BLOCK, + PUT_MAPPED_BLOCK_FOR_DEDUPE, + READ_DATA, + UPDATE_INDEX, + VERIFY_DEDUPLICATION, + WRITE_DATA, + MAX_ASYNC_OPERATION_NUMBER, +} AsyncOperationNumber; + +/* + * An LBN lock. + */ +struct lbnLock { + /* The LBN being locked */ + LogicalBlockNumber lbn; + /* Whether the lock is locked */ + bool locked; + /* The queue of waiters for the lock */ + WaitQueue waiters; + /* The logical zone of the LBN */ + LogicalZone *zone; +}; + +/* + * Fields for using the arboreal block map. + */ +typedef struct { + /* The current height at which this DataVIO is operating */ + Height height; + /* The block map tree for this LBN */ + RootCount rootIndex; + /* Whether we hold a page lock */ + bool locked; + /* The thread on which to run the callback */ + ThreadID threadID; + /* The function to call after looking up a block map slot */ + VDOAction *callback; + /* The key for the lock map */ + uint64_t key; + /* The queue of waiters for the page this VIO is allocating or loading */ + WaitQueue waiters; + /* The block map tree slots for this LBN */ + BlockMapTreeSlot treeSlots[BLOCK_MAP_TREE_HEIGHT + 1]; +} TreeLock; + +typedef struct { + /* + * The current compression state of this VIO. This field contains a value + * which consists of a VIOCompressionState possibly ORed with a flag + * indicating that a request has been made to cancel (or prevent) compression + * for this VIO. + * + * This field should be accessed through the getCompressionState() and + * setCompressionState() methods. It should not be accessed directly. + */ + Atomic32 state; + + /* The compressed size of this block */ + uint16_t size; + + /* The packer input or output bin slot which holds the enclosing DataVIO */ + SlotNumber slot; + + /* The packer input bin to which the enclosing DataVIO has been assigned */ + InputBin *bin; + + /* A pointer to the compressed form of this block */ + char *data; + + /* + * A VIO which is blocked in the packer while holding a lock this VIO needs. + */ + DataVIO *lockHolder; + +} CompressionState; + +/** + * A VIO for processing user data requests. 
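+ *
+ * The embedded AllocatingVIO must remain the first member of this structure;
+ * the conversion functions below (allocatingVIOAsDataVIO() and vioAsDataVIO())
+ * statically assert that its offset is zero and cast accordingly.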
+ **/ +struct dataVIO { + /* The underlying AllocatingVIO */ + AllocatingVIO allocatingVIO; + + /* The logical block of this request */ + LBNLock logical; + + /* The state for traversing the block map tree */ + TreeLock treeLock; + + /* The current partition address of this block */ + ZonedPBN mapped; + + /** The hash of this VIO (if not zero) */ + UdsChunkName chunkName; + + /* Used for logging and debugging */ + AsyncOperationNumber lastAsyncOperation; + + /* The operation to record in the recovery and slab journals */ + ReferenceOperation operation; + + /* Whether this VIO is a read-and-write VIO */ + bool isPartialWrite; + + /* Whether this VIO contains all zeros */ + bool isZeroBlock; + + /* Whether this VIO write is a duplicate */ + bool isDuplicate; + + /* + * Whether this VIO has received an allocation (needs to be atomic so it can + * be examined from threads not in the allocation zone). + */ + AtomicBool hasAllocation; + + /* The new partition address of this block after the VIO write completes */ + ZonedPBN newMapped; + + /* The hash zone responsible for the chunk name (NULL if isZeroBlock) */ + HashZone *hashZone; + + /* The lock this VIO holds or shares with other VIOs with the same data */ + HashLock *hashLock; + + /* All DataVIOs sharing a hash lock are kept in a ring linking these nodes */ + RingNode hashLockNode; + + /* The block number in the partition of the albireo deduplication advice */ + ZonedPBN duplicate; + + /* + * The sequence number of the recovery journal block containing the increment + * entry for this VIO. + */ + SequenceNumber recoverySequenceNumber; + + /* The point in the recovery journal where this write last made an entry */ + JournalPoint recoveryJournalPoint; + + /* The RingNode of VIOs in user initiated write requests */ + RingNode writeNode; + + /* A flag indicating that a data write VIO has a flush generation lock */ + bool hasFlushGenerationLock; + + /* The generation number of the VDO that this VIO belongs to */ + SequenceNumber flushGeneration; + + /* The completion to use for fetching block map pages for this vio */ + VDOPageCompletion pageCompletion; + + /* All of the fields necessary for the compression path */ + CompressionState compression; +}; + +/** + * Convert an AllocatingVIO to a DataVIO. + * + * @param allocatingVIO The AllocatingVIO to convert + * + * @return The AllocatingVIO as a DataVIO + **/ +static inline DataVIO *allocatingVIOAsDataVIO(AllocatingVIO *allocatingVIO) +{ + STATIC_ASSERT(offsetof(DataVIO, allocatingVIO) == 0); + ASSERT_LOG_ONLY((allocatingVIOAsVIO(allocatingVIO)->type == VIO_TYPE_DATA), + "AllocatingVIO is a DataVIO"); + return (DataVIO *) allocatingVIO; +} + +/** + * Convert a VIO to a DataVIO. + * + * @param vio The VIO to convert + * + * @return The VIO as a DataVIO + **/ +static inline DataVIO *vioAsDataVIO(VIO *vio) +{ + STATIC_ASSERT(offsetof(DataVIO, allocatingVIO) == 0); + STATIC_ASSERT(offsetof(AllocatingVIO, vio) == 0); + ASSERT_LOG_ONLY((vio->type == VIO_TYPE_DATA), "VIO is a DataVIO"); + return (DataVIO *) vio; +} + +/** + * Convert a DataVIO to an AllocatingVIO. + * + * @param dataVIO The DataVIO to convert + * + * @return The DataVIO as an AllocatingVIO + **/ +static inline AllocatingVIO *dataVIOAsAllocatingVIO(DataVIO *dataVIO) +{ + return &dataVIO->allocatingVIO; +} + +/** + * Convert a DataVIO to a VIO. 
+ * + * @param dataVIO The DataVIO to convert + * + * @return The DataVIO as a VIO + **/ +static inline VIO *dataVIOAsVIO(DataVIO *dataVIO) +{ + return allocatingVIOAsVIO(dataVIOAsAllocatingVIO(dataVIO)); +} + +/** + * Convert a generic VDOCompletion to a DataVIO. + * + * @param completion The completion to convert + * + * @return The completion as a DataVIO + **/ +static inline DataVIO *asDataVIO(VDOCompletion *completion) +{ + return vioAsDataVIO(asVIO(completion)); +} + +/** + * Convert a DataVIO to a generic completion. + * + * @param dataVIO The DataVIO to convert + * + * @return The DataVIO as a completion + **/ +static inline VDOCompletion *dataVIOAsCompletion(DataVIO *dataVIO) +{ + return allocatingVIOAsCompletion(dataVIOAsAllocatingVIO(dataVIO)); +} + +/** + * Convert a DataVIO to a generic wait queue entry. + * + * @param dataVIO The DataVIO to convert + * + * @return The DataVIO as a wait queue entry + **/ +static inline Waiter *dataVIOAsWaiter(DataVIO *dataVIO) +{ + return allocatingVIOAsWaiter(dataVIOAsAllocatingVIO(dataVIO)); +} + +/** + * Convert a DataVIO's generic wait queue entry back to the DataVIO. + * + * @param waiter The wait queue entry to convert + * + * @return The wait queue entry as a DataVIO + **/ +static inline DataVIO *waiterAsDataVIO(Waiter *waiter) +{ + if (waiter == NULL) { + return NULL; + } + + return allocatingVIOAsDataVIO(waiterAsAllocatingVIO(waiter)); +} + +/** + * Check whether a DataVIO is a read. + * + * @param dataVIO The DataVIO to check + **/ +static inline bool isReadDataVIO(DataVIO *dataVIO) +{ + return isReadVIO(dataVIOAsVIO(dataVIO)); +} + +/** + * Check whether a DataVIO is a write. + * + * @param dataVIO The DataVIO to check + **/ +static inline bool isWriteDataVIO(DataVIO *dataVIO) +{ + return isWriteVIO(dataVIOAsVIO(dataVIO)); +} + +/** + * Check whether a DataVIO is a compressed block write. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO is a compressed block write + **/ +static inline bool isCompressedWriteDataVIO(DataVIO *dataVIO) +{ + return isCompressedWriteVIO(dataVIOAsVIO(dataVIO)); +} + +/** + * Check whether a DataVIO is a trim. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO is a trim + **/ +static inline bool isTrimDataVIO(DataVIO *dataVIO) +{ + return (dataVIO->newMapped.state == MAPPING_STATE_UNMAPPED); +} + +/** + * Get the location that should passed Albireo as the new advice for where to + * find the data written by this DataVIO. + * + * @param dataVIO The write DataVIO that is ready to update Albireo + * + * @return a DataLocation containing the advice to store in Albireo + **/ +static inline DataLocation getDataVIONewAdvice(const DataVIO *dataVIO) +{ + return (DataLocation) { + .pbn = dataVIO->newMapped.pbn, + .state = dataVIO->newMapped.state, + }; +} + +/** + * Get the VDO from a DataVIO. + * + * @param dataVIO The DataVIO from which to get the VDO + * + * @return The VDO to which a DataVIO belongs + **/ +static inline VDO *getVDOFromDataVIO(DataVIO *dataVIO) +{ + return dataVIOAsVIO(dataVIO)->vdo; +} + +/** + * Get the ThreadConfig from a DataVIO. + * + * @param dataVIO The DataVIO from which to get the ThreadConfig + * + * @return The ThreadConfig of the VDO to which a DataVIO belongs + **/ +static inline const ThreadConfig *getThreadConfigFromDataVIO(DataVIO *dataVIO) +{ + return getThreadConfig(getVDOFromDataVIO(dataVIO)); +} + +/** + * Get the allocation of a DataVIO. 
+ * + * @param dataVIO The DataVIO + * + * @return The allocation of the DataVIO + **/ +static inline PhysicalBlockNumber getDataVIOAllocation(DataVIO *dataVIO) +{ + return dataVIOAsAllocatingVIO(dataVIO)->allocation; +} + +/** + * Check whether a DataVIO has an allocation. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO has an allocated block + **/ +static inline bool hasAllocation(DataVIO *dataVIO) +{ + return (getDataVIOAllocation(dataVIO) != ZERO_BLOCK); +} + +/** + * (Re)initialize a DataVIO to have a new logical block number, keeping the + * same parent and other state. This method must be called before using a + * DataVIO. + * + * @param dataVIO The DataVIO to initialize + * @param lbn The logical block number of the DataVIO + * @param operation The operation this DataVIO will perform + * @param isTrim true if this DataVIO is for a trim request + * @param callback The function to call once the VIO has completed its + * operation + **/ +void prepareDataVIO(DataVIO *dataVIO, + LogicalBlockNumber lbn, + VIOOperation operation, + bool isTrim, + VDOAction *callback); + +/** + * Complete the processing of a DataVIO. + * + * @param completion The completion of the VIO to complete + **/ +void completeDataVIO(VDOCompletion *completion); + +/** + * Finish processing a DataVIO, possibly due to an error. This function will + * set any error, and then initiate DataVIO clean up. + * + * @param dataVIO The DataVIO to abort + * @param result The result of processing the DataVIO + **/ +void finishDataVIO(DataVIO *dataVIO, int result); + +/** + * Continue processing a DataVIO that has been waiting for an event, setting + * the result from the event and calling the current callback. + * + * @param dataVIO The DataVIO to continue + * @param result The current result (will not mask older errors) + **/ +static inline void continueDataVIO(DataVIO *dataVIO, int result) +{ + continueCompletion(dataVIOAsCompletion(dataVIO), result); +} + +/** + * Get the name of the last asynchronous operation performed on a DataVIO. + * + * @param dataVIO The DataVIO in question + * + * @return The name of the last operation performed on the DataVIO + **/ +const char *getOperationName(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Add a trace record for the current source location. + * + * @param dataVIO The DataVIO structure to be updated + * @param location The source-location descriptor to be recorded + **/ +static inline void dataVIOAddTraceRecord(DataVIO *dataVIO, + TraceLocation location) +{ + vioAddTraceRecord(dataVIOAsVIO(dataVIO), location); +} + +/** + * Add a DataVIO to the tail end of a wait queue. The DataVIO must not already + * be waiting in a queue. A trace record is also generated for the DataVIO. + * + * @param queue The queue to which to add the waiter + * @param waiter The DataVIO to add to the queue + * @param location The source-location descriptor to be traced in the DataVIO + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static inline int enqueueDataVIO(WaitQueue *queue, + DataVIO *waiter, + TraceLocation location) +{ + dataVIOAddTraceRecord(waiter, location); + return enqueueWaiter(queue, dataVIOAsWaiter(waiter)); +} + +/** + * Check that a DataVIO is running on the correct thread for its hash zone. 
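+ *
+ * This is the first of a family of per-zone helpers below: each assertIn*()
+ * checks that the current callback thread is the thread which owns the
+ * relevant zone (hash, logical, allocated, duplicate, mapped, newMapped) or
+ * singleton thread (journal, packer), while the set*Callback() and
+ * launch*Callback() helpers register the next callback on that thread and
+ * record a trace location.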
+ * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInHashZone(DataVIO *dataVIO) +{ + ThreadID expected = getHashZoneThreadID(dataVIO->hashZone); + ThreadID threadID = getCallbackThreadID(); + // It's odd to use the LBN, but converting the chunk name to hex is a bit + // clunky for an inline, and the LBN better than nothing as an identifier. + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for logical block %" PRIu64 + " on thread %u, should be on hash zone thread %u", + dataVIO->logical.lbn, threadID, expected); +} + +/** + * Set a callback as a hash zone operation. This function presumes that the + * hashZone field of the DataVIO has already been set. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setHashZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getHashZoneThreadID(dataVIO->hashZone)); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a hash zone operation and invoke it immediately. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void launchHashZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setHashZoneCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check that a DataVIO is running on the correct thread for its logical zone. + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInLogicalZone(DataVIO *dataVIO) +{ + ThreadID expected = getLogicalZoneThreadID(dataVIO->logical.zone); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for logical block %" PRIu64 + " on thread %u, should be on thread %u", + dataVIO->logical.lbn, threadID, expected); +} + +/** + * Set a callback as a logical block operation. This function presumes that the + * logicalZone field of the DataVIO has already been set. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setLogicalCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getLogicalZoneThreadID(dataVIO->logical.zone)); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a logical block operation and invoke it immediately. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void launchLogicalCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setLogicalCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check that a DataVIO is running on the correct thread for its allocated + * zone. + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInAllocatedZone(DataVIO *dataVIO) +{ + assertInPhysicalZone(dataVIOAsAllocatingVIO(dataVIO)); +} + +/** + * Set a callback as a physical block operation in a DataVIO's allocated zone. 
+ * + * @param dataVIO The DataVIO + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setAllocatedZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setPhysicalZoneCallback(dataVIOAsAllocatingVIO(dataVIO), callback, + location); +} + +/** + * Set a callback as a physical block operation in a DataVIO's allocated zone + * and queue the DataVIO and invoke it immediately. + * + * @param dataVIO The DataVIO + * @param callback The callback to invoke + * @param location The tracing info for the call site + **/ +static inline void launchAllocatedZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + launchPhysicalZoneCallback(dataVIOAsAllocatingVIO(dataVIO), callback, + location); +} + +/** + * Check that a DataVIO is running on the correct thread for its duplicate + * zone. + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInDuplicateZone(DataVIO *dataVIO) +{ + ThreadID expected = getPhysicalZoneThreadID(dataVIO->duplicate.zone); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for duplicate physical block %" PRIu64 + " on thread %u, should be on thread %u", + dataVIO->duplicate.pbn, threadID, expected); +} + +/** + * Set a callback as a physical block operation in a DataVIO's duplicate zone. + * + * @param dataVIO The DataVIO + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setDuplicateZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getPhysicalZoneThreadID(dataVIO->duplicate.zone)); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a physical block operation in a DataVIO's duplicate zone + * and queue the DataVIO and invoke it immediately. + * + * @param dataVIO The DataVIO + * @param callback The callback to invoke + * @param location The tracing info for the call site + **/ +static inline void launchDuplicateZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setDuplicateZoneCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check that a DataVIO is running on the correct thread for its mapped zone. + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInMappedZone(DataVIO *dataVIO) +{ + ThreadID expected = getPhysicalZoneThreadID(dataVIO->mapped.zone); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for mapped physical block %" PRIu64 + " on thread %u, should be on thread %u", + dataVIO->mapped.pbn, threadID, expected); +} + +/** + * Set a callback as a physical block operation in a DataVIO's mapped zone. + * + * @param dataVIO The DataVIO + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setMappedZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getPhysicalZoneThreadID(dataVIO->mapped.zone)); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Check that a DataVIO is running on the correct thread for its newMapped + * zone. 
+ * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInNewMappedZone(DataVIO *dataVIO) +{ + ThreadID expected = getPhysicalZoneThreadID(dataVIO->newMapped.zone); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for newMapped physical block %" PRIu64 + " on thread %u, should be on thread %u", + dataVIO->newMapped.pbn, threadID, expected); +} + +/** + * Set a callback as a physical block operation in a DataVIO's newMapped zone. + * + * @param dataVIO The DataVIO + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setNewMappedZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getPhysicalZoneThreadID(dataVIO->newMapped.zone)); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a physical block operation in a DataVIO's newMapped zone + * and queue the DataVIO and invoke it immediately. + * + * @param dataVIO The DataVIO + * @param callback The callback to invoke + * @param location The tracing info for the call site + **/ +static inline void launchNewMappedZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setNewMappedZoneCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check that a DataVIO is running on the journal thread. + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInJournalZone(DataVIO *dataVIO) +{ + ThreadID expected + = getJournalZoneThread(getThreadConfigFromDataVIO(dataVIO)); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for logical block %" PRIu64 + " on thread %u, should be on journal thread %u", + dataVIO->logical.lbn, threadID, expected); +} + +/** + * Set a callback as a journal operation. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setJournalCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getJournalZoneThread(getThreadConfigFromDataVIO(dataVIO))); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a journal operation and invoke it immediately. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void launchJournalCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setJournalCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check that a DataVIO is running on the packer thread + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInPackerZone(DataVIO *dataVIO) +{ + ThreadID expected = getPackerZoneThread(getThreadConfigFromDataVIO(dataVIO)); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for logical block %" PRIu64 + " on thread %u, should be on packer thread %u", + dataVIO->logical.lbn, threadID, expected); +} + +/** + * Set a callback as a packer operation. 
+ * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setPackerCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getPackerZoneThread(getThreadConfigFromDataVIO(dataVIO))); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a packer operation and invoke it immediately. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void launchPackerCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setPackerCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check whether the advice received from Albireo is a valid data location, + * and if it is, accept it as the location of a potential duplicate of the + * DataVIO. + * + * @param dataVIO The DataVIO that queried Albireo + * @param advice A potential location of the data, or NULL for no advice + **/ +void receiveDedupeAdvice(DataVIO *dataVIO, const DataLocation *advice); + +/** + * Set the location of the duplicate block for a DataVIO, updating the + * isDuplicate and duplicate fields from a ZonedPBN. + * + * @param dataVIO The DataVIO to modify + * @param source The location of the duplicate + **/ +void setDuplicateLocation(DataVIO *dataVIO, const ZonedPBN source); + +/** + * Clear a DataVIO's mapped block location, setting it to be unmapped. This + * indicates the block map entry for the logical block is either unmapped or + * corrupted. + * + * @param dataVIO The DataVIO whose mapped block location is to be reset + **/ +void clearMappedLocation(DataVIO *dataVIO); + +/** + * Set a DataVIO's mapped field to the physical location recorded in the block + * map for the logical block in the VIO. + * + * @param dataVIO The DataVIO whose field is to be set + * @param pbn The physical block number to set + * @param state The mapping state to set + * + * @return VDO_SUCCESS or an error code if the mapping is unusable + **/ +int setMappedLocation(DataVIO *dataVIO, + PhysicalBlockNumber pbn, + BlockMappingState state) + __attribute__((warn_unused_result)); + +/** + * Attempt to acquire the lock on a logical block. This is the start of the + * path for all external requests. It is registered in prepareDataVIO(). + * + * @param completion The DataVIO for an external data request as a completion + **/ +void attemptLogicalBlockLock(VDOCompletion *completion); + +/** + * Release the lock on the logical block, if any, that a DataVIO has acquired. + * + * @param dataVIO The DataVIO releasing its logical block lock + **/ +void releaseLogicalBlockLock(DataVIO *dataVIO); + +#endif // DATA_VIO_H diff --git a/vdo/base/dirtyLists.c b/vdo/base/dirtyLists.c new file mode 100644 index 0000000..d16b790 --- /dev/null +++ b/vdo/base/dirtyLists.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dirtyLists.c#1 $ + */ + +#include "dirtyLists.h" +#include "dirtyListsInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "types.h" + +struct dirtyLists { + /** The number of periods after which an element will be expired */ + BlockCount maximumAge; + /** The oldest period which has unexpired elements */ + SequenceNumber oldestPeriod; + /** One more than the current period */ + SequenceNumber nextPeriod; + /** The function to call on expired elements */ + DirtyCallback *callback; + /** The callback context */ + void *context; + /** The offset in the array of lists of the oldest period */ + BlockCount offset; + /** The list of elements which are being expired */ + RingNode expired; + /** The lists of dirty elements */ + RingNode lists[]; +}; + +/**********************************************************************/ +int makeDirtyLists(BlockCount maximumAge, + DirtyCallback *callback, + void *context, + DirtyLists **dirtyListsPtr) +{ + DirtyLists *dirtyLists; + int result = ALLOCATE_EXTENDED(DirtyLists, maximumAge, RingNode, __func__, + &dirtyLists); + if (result != VDO_SUCCESS) { + return result; + } + + dirtyLists->maximumAge = maximumAge; + dirtyLists->callback = callback; + dirtyLists->context = context; + + initializeRing(&dirtyLists->expired); + for (BlockCount i = 0; i < maximumAge; i++) { + initializeRing(&dirtyLists->lists[i]); + } + + *dirtyListsPtr = dirtyLists; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeDirtyLists(DirtyLists **dirtyListsPtr) +{ + DirtyLists *lists = *dirtyListsPtr; + if (lists == NULL) { + return; + } + + FREE(lists); + *dirtyListsPtr = NULL; +} + +/**********************************************************************/ +void setCurrentPeriod(DirtyLists *dirtyLists, SequenceNumber period) +{ + ASSERT_LOG_ONLY(dirtyLists->nextPeriod == 0, "current period not set"); + dirtyLists->oldestPeriod = period; + dirtyLists->nextPeriod = period + 1; + dirtyLists->offset = period % dirtyLists->maximumAge; +} + +/** + * Expire the oldest list. + * + * @param dirtyLists The DirtyLists to expire + **/ +static void expireOldestList(DirtyLists *dirtyLists) +{ + dirtyLists->oldestPeriod++; + RingNode *ring = &(dirtyLists->lists[dirtyLists->offset++]); + if (!isRingEmpty(ring)) { + spliceRingChainBefore(ring->next, ring->prev, &dirtyLists->expired); + } + + if (dirtyLists->offset == dirtyLists->maximumAge) { + dirtyLists->offset = 0; + } +} + +/** + * Update the period if necessary. + * + * @param dirtyLists The DirtyLists + * @param period The new period + **/ +static void updatePeriod(DirtyLists *dirtyLists, SequenceNumber period) +{ + while (dirtyLists->nextPeriod <= period) { + if ((dirtyLists->nextPeriod - dirtyLists->oldestPeriod) + == dirtyLists->maximumAge) { + expireOldestList(dirtyLists); + } + dirtyLists->nextPeriod++; + } +} + +/** + * Write out the expired list. 
+ *
+ * @param dirtyLists The dirtyLists
+ **/
+static void writeExpiredElements(DirtyLists *dirtyLists)
+{
+  if (isRingEmpty(&dirtyLists->expired)) {
+    return;
+  }
+
+  dirtyLists->callback(&dirtyLists->expired, dirtyLists->context);
+  ASSERT_LOG_ONLY(isRingEmpty(&dirtyLists->expired),
+                  "no expired elements remain");
+}
+
+/**********************************************************************/
+void addToDirtyLists(DirtyLists *dirtyLists,
+                     RingNode *node,
+                     SequenceNumber oldPeriod,
+                     SequenceNumber newPeriod)
+{
+  if ((oldPeriod == newPeriod)
+      || ((oldPeriod != 0) && (oldPeriod < newPeriod))) {
+    return;
+  }
+
+  if (newPeriod < dirtyLists->oldestPeriod) {
+    pushRingNode(&dirtyLists->expired, node);
+  } else {
+    updatePeriod(dirtyLists, newPeriod);
+    pushRingNode(&dirtyLists->lists[newPeriod % dirtyLists->maximumAge], node);
+  }
+
+  writeExpiredElements(dirtyLists);
+}
+
+/**********************************************************************/
+void advancePeriod(DirtyLists *dirtyLists, SequenceNumber period)
+{
+  updatePeriod(dirtyLists, period);
+  writeExpiredElements(dirtyLists);
+}
+
+/**********************************************************************/
+void flushDirtyLists(DirtyLists *dirtyLists)
+{
+  while (dirtyLists->oldestPeriod < dirtyLists->nextPeriod) {
+    expireOldestList(dirtyLists);
+  }
+  writeExpiredElements(dirtyLists);
+}
+
+/**********************************************************************/
+SequenceNumber getDirtyListsNextPeriod(DirtyLists *dirtyLists)
+{
+  return dirtyLists->nextPeriod;
+}
diff --git a/vdo/base/dirtyLists.h b/vdo/base/dirtyLists.h
new file mode 100644
index 0000000..f3d27f7
--- /dev/null
+++ b/vdo/base/dirtyLists.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dirtyLists.h#1 $
+ */
+
+#ifndef DIRTY_LISTS_H
+#define DIRTY_LISTS_H
+
+#include "ringNode.h"
+#include "types.h"
+
+/**
+ * A collection of lists of dirty elements ordered by age. An element is always
+ * placed on the oldest list in which it was dirtied (moving between lists or
+ * removing altogether is cheap). Whenever the current period is advanced, any
+ * elements older than the maximum age are expired. If an element is to be
+ * added with a dirty age older than the maximum age, it is expired
+ * immediately.
+ **/
+typedef struct dirtyLists DirtyLists;
+
+/**
+ * A function which will be called with a ring of dirty elements which have
+ * been expired. All of the expired elements must be removed from the ring
+ * before this function returns.
+ *
+ * @param expired The list of expired elements
+ * @param context The context for the callback
+ **/
+typedef void DirtyCallback(RingNode *expired, void *context);
+
+/**
+ * Construct a new set of dirty lists.
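+ *
+ * A rough usage sketch of this interface (writeElements, journal, element,
+ * and firstPeriod are hypothetical names, not taken from a real caller):
+ *
+ *   DirtyLists *lists;
+ *   int result = makeDirtyLists(maximumAge, writeElements, journal, &lists);
+ *   if (result == VDO_SUCCESS) {
+ *     setCurrentPeriod(lists, firstPeriod);
+ *     addToDirtyLists(lists, &element->node, 0, firstPeriod);
+ *     advancePeriod(lists, firstPeriod + maximumAge); // expires the element
+ *     flushDirtyLists(lists);
+ *     freeDirtyLists(&lists);
+ *   }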
+ *
+ * @param [in]  maximumAge    The age at which an element will be expired
+ * @param [in]  callback      The function to call when a set of elements have
+ *                            expired
+ * @param [in]  context       The context for the callback
+ * @param [out] dirtyListsPtr A pointer to hold the new DirtyLists
+ *
+ * @return VDO_SUCCESS or an error
+ **/
+int makeDirtyLists(BlockCount maximumAge,
+                   DirtyCallback *callback,
+                   void *context,
+                   DirtyLists **dirtyListsPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Free a set of dirty lists and null out the pointer to them.
+ *
+ * @param dirtyListsPtr A pointer to the dirty lists to be freed
+ **/
+void freeDirtyLists(DirtyLists **dirtyListsPtr);
+
+/**
+ * Set the current period. This function should only be called once.
+ *
+ * @param dirtyLists The dirtyLists
+ * @param period     The current period
+ **/
+void setCurrentPeriod(DirtyLists *dirtyLists, SequenceNumber period);
+
+/**
+ * Add an element to the dirty lists.
+ *
+ * @param dirtyLists The DirtyLists receiving the element
+ * @param node       The RingNode of the element to add
+ * @param oldPeriod  The period in which the element was previously dirtied,
+ *                   or 0 if it was not dirty
+ * @param newPeriod  The period in which the element has now been dirtied,
+ *                   or 0 if it does not hold a lock
+ **/
+void addToDirtyLists(DirtyLists *dirtyLists,
+                     RingNode *node,
+                     SequenceNumber oldPeriod,
+                     SequenceNumber newPeriod);
+
+/**
+ * Advance the current period. Any lists which are now older than the maximum
+ * age will be expired.
+ *
+ * @param dirtyLists The DirtyLists to advance
+ * @param period     The new current period
+ **/
+void advancePeriod(DirtyLists *dirtyLists, SequenceNumber period);
+
+/**
+ * Flush all dirty lists. This will cause the oldest period to be advanced
+ * past the current period, expiring every remaining element.
+ *
+ * @param dirtyLists The dirtyLists to flush
+ **/
+void flushDirtyLists(DirtyLists *dirtyLists);
+
+#endif // DIRTY_LISTS_H
diff --git a/vdo/base/dirtyListsInternals.h b/vdo/base/dirtyListsInternals.h
new file mode 100644
index 0000000..d5876d0
--- /dev/null
+++ b/vdo/base/dirtyListsInternals.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dirtyListsInternals.h#1 $
+ */
+
+#ifndef DIRTY_LISTS_INTERNALS_H
+#define DIRTY_LISTS_INTERNALS_H
+
+#include "dirtyLists.h"
+#include "types.h"
+
+/**
+ * Get the next period from a DirtyLists. This method is used by unit tests.
+ * + * @param dirtyLists The DirtyLists to examine + **/ +SequenceNumber getDirtyListsNextPeriod(DirtyLists *dirtyLists) + __attribute__((warn_unused_result)); + +#endif // DIRTY_LISTS_INTERNALS_H diff --git a/vdo/base/extent.c b/vdo/base/extent.c new file mode 100644 index 0000000..5983615 --- /dev/null +++ b/vdo/base/extent.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/extent.c#3 $ + */ + +#include "extent.h" + +#include "memoryAlloc.h" + +#include "completion.h" +#include "constants.h" +#include "logger.h" +#include "physicalLayer.h" +#include "types.h" +#include "vdo.h" +#include "vioRead.h" +#include "vioWrite.h" + +/**********************************************************************/ +int createExtent(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + BlockCount blockCount, + char *data, + VDOExtent **extentPtr) +{ + int result = ASSERT(isMetadataVIOType(vioType), + "createExtent() called for metadata"); + if (result != VDO_SUCCESS) { + return result; + } + + VDOExtent *extent; + result = ALLOCATE_EXTENDED(VDOExtent, blockCount, VIO *, __func__, &extent); + if (result != VDO_SUCCESS) { + return result; + } + + result = initializeEnqueueableCompletion(&extent->completion, + VDO_EXTENT_COMPLETION, layer); + if (result != VDO_SUCCESS) { + FREE(extent); + return result; + } + + for (; extent->count < blockCount; extent->count++) { + result = layer->createMetadataVIO(layer, vioType, priority, extent, data, + &extent->vios[extent->count]); + if (result != VDO_SUCCESS) { + freeExtent(&extent); + return result; + } + + data += VDO_BLOCK_SIZE; + } + + *extentPtr = extent; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeExtent(VDOExtent **extentPtr) +{ + VDOExtent *extent = *extentPtr; + if (extent == NULL) { + return; + } + + for (BlockCount i = 0; i < extent->count; i++) { + freeVIO(&extent->vios[i]); + } + + destroyEnqueueable(&extent->completion); + FREE(extent); + *extentPtr = NULL; +} + +/** + * Launch a metadata extent. 
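+ *
+ * This is the common path behind readMetadataExtent(), writeMetadataExtent(),
+ * and their partial variants. A rough sketch of that public interface (layer,
+ * vioType, priority, blockCount, data, and startBlock are assumed to be in
+ * scope; waiting on the completion is elided):
+ *
+ *   VDOExtent *extent;
+ *   int result = createExtent(layer, vioType, priority, blockCount, data,
+ *                             &extent);
+ *   if (result == VDO_SUCCESS) {
+ *     readMetadataExtent(extent, startBlock);
+ *     // ... wait for extent->completion to finish, then ...
+ *     freeExtent(&extent);
+ *   }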
+ * + * @param extent The extent + * @param startBlock The absolute physical block at which the extent should + * begin its I/O + * @param count The number of blocks to write + * @param operation The operation to perform on the extent + **/ +static void launchMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock, + BlockCount count, + VIOOperation operation) +{ + resetCompletion(&extent->completion); + if (count > extent->count) { + finishCompletion(&extent->completion, VDO_OUT_OF_RANGE); + return; + } + + extent->completeCount = extent->count - count; + for (BlockCount i = 0; i < count; i++) { + VIO *vio = extent->vios[i]; + vio->completion.callbackThreadID = extent->completion.callbackThreadID; + launchMetadataVIO(vio, startBlock++, handleVIOCompletion, + handleVIOCompletion, operation); + } +} + +/**********************************************************************/ +void readPartialMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock, + BlockCount count) +{ + launchMetadataExtent(extent, startBlock, count, VIO_READ); +} + +/**********************************************************************/ +void writePartialMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock, + BlockCount count) +{ + launchMetadataExtent(extent, startBlock, count, VIO_WRITE); +} + +/**********************************************************************/ +void handleVIOCompletion(VDOCompletion *completion) +{ + VDOExtent *extent = asVDOExtent(completion->parent); + if (++extent->completeCount != extent->count) { + setCompletionResult(extentAsCompletion(extent), completion->result); + return; + } + + finishCompletion(extentAsCompletion(extent), completion->result); +} diff --git a/vdo/base/extent.h b/vdo/base/extent.h new file mode 100644 index 0000000..b023c06 --- /dev/null +++ b/vdo/base/extent.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/extent.h#2 $ + */ + +#ifndef EXTENT_H +#define EXTENT_H + +#include "permassert.h" + +#include "completion.h" +#include "types.h" +#include "vio.h" + +/** + * A chain of VIOs which are part of the same request. An extent contains + * a chain of at least 'count' VIOs. The 'next' pointer of the last VIO + * in the extent (as indicated by the count) may not be NULL, but it is not + * part of the extent. A VIO may belong to a single extent. + **/ +struct vdoExtent { + // The completion for asynchronous extent processing + VDOCompletion completion; + // The number of VIOs in the extent + BlockCount count; + // The number of completed VIOs in the extent + BlockCount completeCount; + // The VIOs in the extent + VIO *vios[]; +}; + +/** + * Convert a generic VDOCompletion to a VDOExtent. 
+ * + * @param completion The completion to convert + * + * @return The completion as an extent + **/ +static inline VDOExtent *asVDOExtent(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(VDOExtent, completion) == 0); + assertCompletionType(completion->type, VDO_EXTENT_COMPLETION); + return (VDOExtent *) completion; +} + +/** + * Convert a VDOExtent to VDOCompletion. + * + * @param extent The extent to convert + * + * @return The extent as a VDOCompletion + **/ +static inline VDOCompletion *extentAsCompletion(VDOExtent *extent) +{ + return &extent->completion; +} + +/** + * Create a VDOExtent. + * + * @param [in] layer The layer + * @param [in] vioType The usage type to assign to the VIOs in the extent + * (data / block map / journal) + * @param [in] priority The relative priority to assign to the VIOs + * @param [in] blockCount The number of blocks in the buffer + * @param [in] data The buffer + * @param [out] extentPtr A pointer to hold the new extent + * + * @return VDO_SUCCESS or an error + **/ +int createExtent(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + BlockCount blockCount, + char *data, + VDOExtent **extentPtr) + __attribute__((warn_unused_result)); + +/** + * Free an extent and null out the reference to it. + * + * @param [in,out] extentPtr The reference to the extent to free + **/ +void freeExtent(VDOExtent **extentPtr); + +/** + * Read metadata from the underlying storage. + * + * @param extent The extent to read + * @param startBlock The physical block number of the first block + * in the extent + * @param count The number of blocks to read (must be less than or + * equal to the length of the extent) + **/ +void readPartialMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock, + BlockCount count); + +/** + * Read metadata from the underlying storage. + * + * @param extent The extent to read + * @param startBlock The physical block number of the first block + * in the extent + **/ +static inline void readMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock) +{ + readPartialMetadataExtent(extent, startBlock, extent->count); +} + +/** + * Write metadata to the underlying storage. + * + * @param extent The extent to write + * @param startBlock The physical block number of the first block in the + * extent + * @param count The number of blocks to read (must be less than or + * equal to the length of the extent) + **/ +void writePartialMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock, + BlockCount count); +/** + * Write metadata to the underlying storage. + * + * @param extent The extent to write + * @param startBlock The physical block number of the first block in the + * extent + **/ +static inline void writeMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock) +{ + writePartialMetadataExtent(extent, startBlock, extent->count); +} + +/** + * Notify an extent that one of its VIOs has completed. If the signaling VIO + * is the last of the extent's VIOs to complete, the extent will finish. This + * function is set as the VIO callback in completeVIO(). + * + * @param completion The completion of the VIO which has just finished + **/ +void handleVIOCompletion(VDOCompletion *completion); + +#endif /* EXTENT_H */ diff --git a/vdo/base/fixedLayout.c b/vdo/base/fixedLayout.c new file mode 100644 index 0000000..4ea048a --- /dev/null +++ b/vdo/base/fixedLayout.c @@ -0,0 +1,534 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/fixedLayout.c#3 $ + */ + +#include "fixedLayout.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" + +#include "header.h" +#include "statusCodes.h" + +const BlockCount ALL_FREE_BLOCKS = (uint64_t) -1; + +struct fixedLayout { + PhysicalBlockNumber firstFree; + PhysicalBlockNumber lastFree; + size_t numPartitions; + Partition *head; +}; + +struct partition { + PartitionID id; // The id of this partition + FixedLayout *layout; // The layout to which this partition belongs + PhysicalBlockNumber offset; // The offset into the layout of this partition + PhysicalBlockNumber base; // The untranslated number of the first block + BlockCount count; // The number of blocks in the partition + Partition *next; // A pointer to the next partition in the layout +}; + +typedef struct { + PhysicalBlockNumber firstFree; + PhysicalBlockNumber lastFree; + byte partitionCount; +} __attribute__((packed)) Layout3_0; + +typedef struct { + PartitionID id; + PhysicalBlockNumber offset; + PhysicalBlockNumber base; + BlockCount count; +} __attribute__((packed)) Partition3_0; + +static const Header LAYOUT_HEADER_3_0 = { + .id = FIXED_LAYOUT, + .version = { + .majorVersion = 3, + .minorVersion = 0, + }, + .size = sizeof(Layout3_0), // Minimum size (contains no partitions) +}; + +/**********************************************************************/ +int makeFixedLayout(BlockCount totalBlocks, + PhysicalBlockNumber startOffset, + FixedLayout **layoutPtr) +{ + FixedLayout *layout; + int result = ALLOCATE(1, FixedLayout, "fixed layout", &layout); + if (result != UDS_SUCCESS) { + return result; + } + + layout->firstFree = startOffset; + layout->lastFree = startOffset + totalBlocks; + layout->numPartitions = 0; + layout->head = NULL; + + *layoutPtr = layout; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeFixedLayout(FixedLayout **layoutPtr) +{ + FixedLayout *layout = *layoutPtr; + if (layout == NULL) { + return; + } + + while (layout->head != NULL) { + Partition *part = layout->head; + layout->head = part->next; + FREE(part); + } + + FREE(layout); + *layoutPtr = NULL; +} + +/**********************************************************************/ +BlockCount getTotalFixedLayoutSize(const FixedLayout *layout) +{ + BlockCount size = getFixedLayoutBlocksAvailable(layout); + for (Partition *partition = layout->head; partition != NULL; + partition = partition->next) { + size += partition->count; + } + + return size; +} + +/**********************************************************************/ +int getPartition(FixedLayout *layout, PartitionID id, Partition **partitionPtr) +{ + for (Partition *partition = layout->head; partition != NULL; + partition = 
partition->next) { + if (partition->id == id) { + if (partitionPtr != NULL) { + *partitionPtr = partition; + } + return VDO_SUCCESS; + } + } + + return VDO_UNKNOWN_PARTITION; +} + +/**********************************************************************/ +int translateToPBN(const Partition *partition, + PhysicalBlockNumber partitionBlockNumber, + PhysicalBlockNumber *layerBlockNumber) +{ + if (partition == NULL) { + *layerBlockNumber = partitionBlockNumber; + return VDO_SUCCESS; + } + + if (partitionBlockNumber < partition->base) { + return VDO_OUT_OF_RANGE; + } + + PhysicalBlockNumber offsetFromBase = partitionBlockNumber - partition->base; + if (offsetFromBase >= partition->count) { + return VDO_OUT_OF_RANGE; + } + + *layerBlockNumber = partition->offset + offsetFromBase; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int translateFromPBN(const Partition *partition, + PhysicalBlockNumber layerBlockNumber, + PhysicalBlockNumber *partitionBlockNumberPtr) +{ + if (partition == NULL) { + *partitionBlockNumberPtr = layerBlockNumber; + return VDO_SUCCESS; + } + + if (layerBlockNumber < partition->offset) { + return VDO_OUT_OF_RANGE; + } + + PhysicalBlockNumber partitionBlockNumber + = layerBlockNumber - partition->offset; + if (partitionBlockNumber >= partition->count) { + return VDO_OUT_OF_RANGE; + } + + *partitionBlockNumberPtr = partitionBlockNumber + partition->base; + return VDO_SUCCESS; +} + +/**********************************************************************/ +BlockCount getFixedLayoutBlocksAvailable(const FixedLayout *layout) +{ + return layout->lastFree - layout->firstFree; +} + +/** + * Allocate a partition. The partition will be attached to the partition + * list in the layout. + * + * @param layout The layout containing the partition + * @param id The id of the partition + * @param offset The offset into the layout at which the partition begins + * @param base The number of the first block for users of the partition + * @param blockCount The number of blocks in the partition + * + * @return VDO_SUCCESS or an error + **/ +static int allocatePartition(FixedLayout *layout, + byte id, + PhysicalBlockNumber offset, + PhysicalBlockNumber base, + BlockCount blockCount) +{ + Partition *partition; + int result = ALLOCATE(1, Partition, "fixed layout partition", &partition); + if (result != UDS_SUCCESS) { + return result; + } + + partition->id = id; + partition->layout = layout; + partition->offset = offset; + partition->base = base; + partition->count = blockCount; + partition->next = layout->head; + layout->head = partition; + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeFixedLayoutPartition(FixedLayout *layout, + PartitionID id, + BlockCount blockCount, + PartitionDirection direction, + PhysicalBlockNumber base) +{ + BlockCount freeBlocks = layout->lastFree - layout->firstFree; + if (blockCount == ALL_FREE_BLOCKS) { + if (freeBlocks == 0) { + return VDO_NO_SPACE; + } else { + blockCount = freeBlocks; + } + } else if (blockCount > freeBlocks) { + return VDO_NO_SPACE; + } + + int result = getPartition(layout, id, NULL); + if (result != VDO_UNKNOWN_PARTITION) { + return VDO_PARTITION_EXISTS; + } + + PhysicalBlockNumber offset = ((direction == FROM_END) + ? 
(layout->lastFree - blockCount) + : layout->firstFree); + result = allocatePartition(layout, id, offset, base, blockCount); + if (result != VDO_SUCCESS) { + return result; + } + + layout->numPartitions++; + if (direction == FROM_END) { + layout->lastFree = layout->lastFree - blockCount; + } else { + layout->firstFree += blockCount; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +BlockCount getFixedLayoutPartitionSize(const Partition *partition) +{ + return partition->count; +} + +/**********************************************************************/ +PhysicalBlockNumber getFixedLayoutPartitionOffset(const Partition *partition) +{ + return partition->offset; +} + +/**********************************************************************/ +PhysicalBlockNumber getFixedLayoutPartitionBase(const Partition *partition) +{ + return partition->base; +} + +/**********************************************************************/ +static inline size_t getEncodedSize(const FixedLayout *layout) +{ + return sizeof(Layout3_0) + (sizeof(Partition3_0) * layout->numPartitions); +} + +/**********************************************************************/ +size_t getFixedLayoutEncodedSize(const FixedLayout *layout) +{ + return ENCODED_HEADER_SIZE + getEncodedSize(layout); +} + +/** + * Encode a null-terminated list of fixed layout partitions into a buffer + * using partition format 3.0. + * + * @param layout The layout containing the list of partitions to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error code + **/ +static int encodePartitions_3_0(const FixedLayout *layout, Buffer *buffer) +{ + for (const Partition *partition = layout->head; + partition != NULL; + partition = partition->next) { + STATIC_ASSERT_SIZEOF(PartitionID, sizeof(byte)); + int result = putByte(buffer, partition->id); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, partition->offset); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, partition->base); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, partition->count); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/** + * Encode the header fields of a fixed layout into a buffer using layout + * format 3.0. 
+ * + * @param layout The layout to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error code + **/ +static int encodeLayout_3_0(const FixedLayout *layout, Buffer *buffer) +{ + int result = ASSERT(layout->numPartitions <= UINT8_MAX, + "fixed layout partition count must fit in a byte"); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, layout->firstFree); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, layout->lastFree); + if (result != UDS_SUCCESS) { + return result; + } + + return putByte(buffer, layout->numPartitions); +} + +/**********************************************************************/ +int encodeFixedLayout(const FixedLayout *layout, Buffer *buffer) +{ + if (!ensureAvailableSpace(buffer, getFixedLayoutEncodedSize(layout))) { + return UDS_BUFFER_ERROR; + } + + Header header = LAYOUT_HEADER_3_0; + header.size = getEncodedSize(layout); + int result = encodeHeader(&header, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + size_t initialLength = contentLength(buffer); + + result = encodeLayout_3_0(layout, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + size_t encodedSize = contentLength(buffer) - initialLength; + result = ASSERT(encodedSize == sizeof(Layout3_0), + "encoded size of fixed layout header must match structure"); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodePartitions_3_0(layout, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + encodedSize = contentLength(buffer) - initialLength; + return ASSERT(encodedSize == header.size, + "encoded size of fixed layout must match header size"); +} + +/** + * Decode a sequence of fixed layout partitions from a buffer + * using partition format 3.0. + * + * @param buffer A buffer positioned at the start of the encoding + * @param layout The layout in which to allocate the decoded partitions + * + * @return UDS_SUCCESS or an error code + **/ +static int decodePartitions_3_0(Buffer *buffer, FixedLayout *layout) +{ + for (size_t i = 0; i < layout->numPartitions; i++) { + byte id; + int result = getByte(buffer, &id); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t offset; + result = getUInt64LEFromBuffer(buffer, &offset); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t base; + result = getUInt64LEFromBuffer(buffer, &base); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t count; + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + + result = allocatePartition(layout, id, offset, base, count); + if (result != VDO_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/** + * Decode the header fields of a fixed layout from a buffer using layout + * format 3.0. 
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param layout The structure to receive the decoded fields + * + * @return UDS_SUCCESS or an error code + **/ +static int decodeLayout_3_0(Buffer *buffer, Layout3_0 *layout) +{ + size_t initialLength = contentLength(buffer); + + PhysicalBlockNumber firstFree; + int result = getUInt64LEFromBuffer(buffer, &firstFree); + if (result != UDS_SUCCESS) { + return result; + } + + PhysicalBlockNumber lastFree; + result = getUInt64LEFromBuffer(buffer, &lastFree); + if (result != UDS_SUCCESS) { + return result; + } + + byte partitionCount; + result = getByte(buffer, &partitionCount); + if (result != UDS_SUCCESS) { + return result; + } + + *layout = (Layout3_0) { + .firstFree = firstFree, + .lastFree = lastFree, + .partitionCount = partitionCount, + }; + + size_t decodedSize = initialLength - contentLength(buffer); + return ASSERT(decodedSize == sizeof(Layout3_0), + "decoded size of fixed layout header must match structure"); +} + +/**********************************************************************/ +int decodeFixedLayout(Buffer *buffer, FixedLayout **layoutPtr) +{ + Header header; + int result = decodeHeader(buffer, &header); + if (result != UDS_SUCCESS) { + return result; + } + + // Layout is variable size, so only do a minimum size check here. + result = validateHeader(&LAYOUT_HEADER_3_0, &header, false, __func__); + if (result != VDO_SUCCESS) { + return result; + } + + Layout3_0 layoutHeader; + result = decodeLayout_3_0(buffer, &layoutHeader); + if (result != UDS_SUCCESS) { + return result; + } + + if (contentLength(buffer) + < (sizeof(Partition3_0) * layoutHeader.partitionCount)) { + return VDO_UNSUPPORTED_VERSION; + } + + FixedLayout *layout; + result = ALLOCATE(1, FixedLayout, "fixed layout", &layout); + if (result != UDS_SUCCESS) { + return result; + } + + layout->firstFree = layoutHeader.firstFree; + layout->lastFree = layoutHeader.lastFree; + layout->numPartitions = layoutHeader.partitionCount; + + result = decodePartitions_3_0(buffer, layout); + if (result != VDO_SUCCESS) { + freeFixedLayout(&layout); + return result; + } + + *layoutPtr = layout; + return VDO_SUCCESS; +} diff --git a/vdo/base/fixedLayout.h b/vdo/base/fixedLayout.h new file mode 100644 index 0000000..0907299 --- /dev/null +++ b/vdo/base/fixedLayout.h @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/fixedLayout.h#1 $ + */ + +#ifndef FIXED_LAYOUT_H +#define FIXED_LAYOUT_H + +#include "buffer.h" + +#include "types.h" + +typedef enum { + FROM_BEGINNING, + FROM_END, +} PartitionDirection; + +extern const BlockCount ALL_FREE_BLOCKS; + +/** + * A fixed layout is like a traditional disk partitioning scheme. 
In the + * beginning there is one large unused area, of which parts are carved off. + * Each carved off section has its own internal offset and size. + **/ +typedef struct fixedLayout FixedLayout; +typedef struct partition Partition; + +/** + * Make an unpartitioned fixed layout. + * + * @param [in] totalBlocks The total size of the layout, in blocks + * @param [in] startOffset The block offset in the underlying layer at which + * the fixed layout begins + * @param [out] layoutPtr The pointer to hold the resulting layout + * + * @return a success or error code + **/ +int makeFixedLayout(BlockCount totalBlocks, + PhysicalBlockNumber startOffset, + FixedLayout **layoutPtr) + __attribute__((warn_unused_result)); + +/** + * Free the fixed layout and null out the reference to it. + * + * @param layoutPtr The reference to the layout to free + * + * @note all partitions created by this layout become invalid pointers + **/ +void freeFixedLayout(FixedLayout **layoutPtr); + +/** + * Get the total size of the layout in blocks. + * + * @param layout The layout + * + * @return The size of the layout + **/ +BlockCount getTotalFixedLayoutSize(const FixedLayout *layout) + __attribute__((warn_unused_result)); + +/** + * Get a partition by id. + * + * @param layout The layout from which to get a partition + * @param id The id of the partition + * @param partitionPtr A pointer to hold the partition + * + * @return VDO_SUCCESS or an error + **/ +int getPartition(FixedLayout *layout, PartitionID id, Partition **partitionPtr) + __attribute__((warn_unused_result)); + +/** + * Translate a block number from the partition's view to the layer's + * + * @param partition The partition to use for translation + * @param partitionBlockNumber The block number relative to the partition + * @param layerBlockNumber The block number relative to the layer + * + * @return VDO_SUCCESS or an error code + **/ +int translateToPBN(const Partition *partition, + PhysicalBlockNumber partitionBlockNumber, + PhysicalBlockNumber *layerBlockNumber) + __attribute__((warn_unused_result)); + +/** + * Translate a block number from the layer's view to the partition's. + * This is the inverse of translateToPBN(). + * + * @param partition The partition to use for translation + * @param layerBlockNumber The block number relative to the layer + * @param partitionBlockNumber The block number relative to the partition + * + * @return VDO_SUCCESS or an error code + **/ +int translateFromPBN(const Partition *partition, + PhysicalBlockNumber layerBlockNumber, + PhysicalBlockNumber *partitionBlockNumber) + __attribute__((warn_unused_result)); + +/** + * Return the number of unallocated blocks available. + * + * @param layout the fixed layout + * + * @return the number of blocks yet unallocated to partitions + **/ +BlockCount getFixedLayoutBlocksAvailable(const FixedLayout *layout) + __attribute__((warn_unused_result)); + +/** + * Create a new partition from the beginning or end of the unused space + * within a fixed layout. 
+ * + * @param layout the fixed layout + * @param id the id of the partition to make + * @param blockCount the number of blocks to carve out, if set + * to ALL_FREE_BLOCKS, all remaining blocks will + * be used + * @param direction whether to carve out from beginning or end + * @param base the number of the first block in the partition + * from the point of view of its users + * + * @return a success or error code, particularly + * VDO_NO_SPACE if there are less than blockCount blocks remaining + **/ +int makeFixedLayoutPartition(FixedLayout *layout, + PartitionID id, + BlockCount blockCount, + PartitionDirection direction, + PhysicalBlockNumber base) + __attribute__((warn_unused_result)); + +/** + * Return the size in blocks of a partition. + * + * @param partition a partition of the fixedLayout + * + * @return the size of the partition in blocks + **/ +BlockCount getFixedLayoutPartitionSize(const Partition *partition) + __attribute__((warn_unused_result)); + +/** + * Get the first block of the partition in the layout. + * + * @param partition a partition of the fixedLayout + * + * @return the partition's offset in blocks + **/ +PhysicalBlockNumber getFixedLayoutPartitionOffset(const Partition *partition) + __attribute__((warn_unused_result)); + +/** + * Get the number of the first block in the partition from the partition users + * point of view. + * + * @param partition a partition of the fixedLayout + * + * @return the number of the first block in the partition + **/ +PhysicalBlockNumber getFixedLayoutPartitionBase(const Partition *partition) + __attribute__((warn_unused_result)); + +/** + * Get the size of an encoded layout + * + * @param layout The layout + * + * @return The encoded size of the layout + **/ +size_t getFixedLayoutEncodedSize(const FixedLayout *layout) + __attribute__((warn_unused_result)); + +/** + * Encode a layout into a buffer. + * + * @param layout The layout to encode + * @param buffer The buffer to encode into + * + * @return UDS_SUCCESS or an error + **/ +int encodeFixedLayout(const FixedLayout *layout, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Decode a fixed layout from a buffer. + * + * @param [in] buffer The buffer from which to decode + * @param [out] layoutPtr A pointer to hold the layout + * + * @return VDO_SUCCESS or an error + **/ +int decodeFixedLayout(Buffer *buffer, FixedLayout **layoutPtr) + __attribute__((warn_unused_result)); + +#endif // FIXED_LAYOUT_H diff --git a/vdo/base/flush.c b/vdo/base/flush.c new file mode 100644 index 0000000..4c6b94c --- /dev/null +++ b/vdo/base/flush.c @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/flush.c#3 $ + */ + +#include "flush.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockAllocator.h" +#include "completion.h" +#include "logicalZone.h" +#include "numUtils.h" +#include "readOnlyNotifier.h" +#include "slabDepot.h" +#include "vdoInternal.h" + +struct flusher { + VDOCompletion completion; + /** The VDO to which this flusher belongs */ + VDO *vdo; + /** The current flush generation of the VDO */ + SequenceNumber flushGeneration; + /** The first unacknowledged flush generation */ + SequenceNumber firstUnacknowledgedGeneration; + /** The queue of flush requests waiting to notify other threads */ + WaitQueue notifiers; + /** The queue of flush requests waiting for VIOs to complete */ + WaitQueue pendingFlushes; + /** The flush generation for which notifications are being sent */ + SequenceNumber notifyGeneration; + /** The logical zone to notify next */ + LogicalZone *logicalZoneToNotify; + /** The ID of the thread on which flush requests should be made */ + ThreadID threadID; +}; + +/** + * Convert a generic VDOCompletion to a Flusher. + * + * @param completion The completion to convert + * + * @return The completion as a Flusher + **/ +static Flusher *asFlusher(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(Flusher, completion) == 0); + assertCompletionType(completion->type, FLUSH_NOTIFICATION_COMPLETION); + return (Flusher *) completion; +} + +/** + * Convert a VDOFlush's generic wait queue entry back to the VDOFlush. + * + * @param waiter The wait queue entry to convert + * + * @return The wait queue entry as a VDOFlush + **/ +static VDOFlush *waiterAsFlush(Waiter *waiter) +{ + STATIC_ASSERT(offsetof(VDOFlush, waiter) == 0); + return (VDOFlush *) waiter; +} + +/**********************************************************************/ +int makeFlusher(VDO *vdo) +{ + int result = ALLOCATE(1, Flusher, __func__, &vdo->flusher); + if (result != VDO_SUCCESS) { + return result; + } + + vdo->flusher->vdo = vdo; + vdo->flusher->threadID = getPackerZoneThread(getThreadConfig(vdo)); + return initializeEnqueueableCompletion(&vdo->flusher->completion, + FLUSH_NOTIFICATION_COMPLETION, + vdo->layer); +} + +/**********************************************************************/ +void freeFlusher(Flusher **flusherPtr) +{ + if (*flusherPtr == NULL) { + return; + } + + Flusher *flusher = *flusherPtr; + destroyEnqueueable(&flusher->completion); + FREE(flusher); + *flusherPtr = NULL; +} + +/**********************************************************************/ +ThreadID getFlusherThreadID(Flusher *flusher) +{ + return flusher->threadID; +} + +/**********************************************************************/ +static void notifyFlush(Flusher *flusher); + +/** + * Finish the notification process by checking if any flushes have completed + * and then starting the notification of the next flush request if one came in + * while the current notification was in progress. This callback is registered + * in flushPackerCallback(). 
+ * + * @param completion The flusher completion + **/ +static void finishNotification(VDOCompletion *completion) +{ + Flusher *flusher = asFlusher(completion); + ASSERT_LOG_ONLY((getCallbackThreadID() == flusher->threadID), + "finishNotification() called from flusher thread"); + + Waiter *waiter = dequeueNextWaiter(&flusher->notifiers); + int result = enqueueWaiter(&flusher->pendingFlushes, waiter); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(flusher->vdo->readOnlyNotifier, result); + VDOFlush *flush = waiterAsFlush(waiter); + completion->layer->completeFlush(&flush); + return; + } + + completeFlushes(flusher); + if (hasWaiters(&flusher->notifiers)) { + notifyFlush(flusher); + } +} + +/** + * Flush the packer now that all of the logical and physical zones have been + * notified of the new flush request. This callback is registered in + * incrementGeneration(). + * + * @param completion The flusher completion + **/ +static void flushPackerCallback(VDOCompletion *completion) +{ + Flusher *flusher = asFlusher(completion); + incrementPackerFlushGeneration(flusher->vdo->packer); + launchCallback(completion, finishNotification, flusher->threadID); +} + +/** + * Increment the flush generation in a logical zone. If there are more logical + * zones, go on to the next one, otherwise, prepare the physical zones. This + * callback is registered both in notifyFlush() and in itself. + * + * @param completion The flusher as a completion + **/ +static void incrementGeneration(VDOCompletion *completion) +{ + Flusher *flusher = asFlusher(completion); + incrementFlushGeneration(flusher->logicalZoneToNotify, + flusher->notifyGeneration); + flusher->logicalZoneToNotify + = getNextLogicalZone(flusher->logicalZoneToNotify); + if (flusher->logicalZoneToNotify == NULL) { + launchCallback(completion, flushPackerCallback, flusher->threadID); + return; + } + + launchCallback(completion, incrementGeneration, + getLogicalZoneThreadID(flusher->logicalZoneToNotify)); +} + +/** + * Lauch a flush notification. 
+ * + * @param flusher The flusher doing the notification + **/ +static void notifyFlush(Flusher *flusher) +{ + VDOFlush *flush = waiterAsFlush(getFirstWaiter(&flusher->notifiers)); + flusher->notifyGeneration = flush->flushGeneration; + flusher->logicalZoneToNotify = getLogicalZone(flusher->vdo->logicalZones, 0); + flusher->completion.requeue = true; + launchCallback(&flusher->completion, incrementGeneration, + getLogicalZoneThreadID(flusher->logicalZoneToNotify)); +} + +/**********************************************************************/ +void flush(VDO *vdo, VDOFlush *flush) +{ + Flusher *flusher = vdo->flusher; + ASSERT_LOG_ONLY((getCallbackThreadID() == flusher->threadID), + "flush() called from flusher thread"); + + flush->flushGeneration = flusher->flushGeneration++; + bool mayNotify = !hasWaiters(&flusher->notifiers); + + int result = enqueueWaiter(&flusher->notifiers, &flush->waiter); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(vdo->readOnlyNotifier, result); + flusher->completion.layer->completeFlush(&flush); + return; + } + + if (mayNotify) { + notifyFlush(flusher); + } +} + +/**********************************************************************/ +void completeFlushes(Flusher *flusher) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == flusher->threadID), + "completeFlushes() called from flusher thread"); + + SequenceNumber oldestActiveGeneration = UINT64_MAX; + for (LogicalZone *zone = getLogicalZone(flusher->vdo->logicalZones, 0); + zone != NULL; + zone = getNextLogicalZone(zone)) { + SequenceNumber oldestInZone = getOldestLockedGeneration(zone); + oldestActiveGeneration = minSequenceNumber(oldestActiveGeneration, + oldestInZone); + } + + while (hasWaiters(&flusher->pendingFlushes)) { + VDOFlush *flush = waiterAsFlush(getFirstWaiter(&flusher->pendingFlushes)); + if (flush->flushGeneration >= oldestActiveGeneration) { + return; + } + + ASSERT_LOG_ONLY((flush->flushGeneration + == flusher->firstUnacknowledgedGeneration), + "acknowledged next expected flush, %" PRIu64 + ", was: %llu", + flusher->firstUnacknowledgedGeneration, + flush->flushGeneration); + dequeueNextWaiter(&flusher->pendingFlushes); + flusher->completion.layer->completeFlush(&flush); + flusher->firstUnacknowledgedGeneration++; + } +} + +/**********************************************************************/ +void dumpFlusher(const Flusher *flusher) +{ + logInfo("Flusher"); + logInfo(" flushGeneration=%" PRIu64 + " firstUnacknowledgedGeneration=%llu", + flusher->flushGeneration, flusher->firstUnacknowledgedGeneration); + logInfo(" notifiers queue is %s; pendingFlushes queue is %s", + (hasWaiters(&flusher->notifiers) ? "not empty" : "empty"), + (hasWaiters(&flusher->pendingFlushes) ? "not empty" : "empty")); +} diff --git a/vdo/base/flush.h b/vdo/base/flush.h new file mode 100644 index 0000000..da7c8bc --- /dev/null +++ b/vdo/base/flush.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/flush.h#1 $ + */ + +#ifndef FLUSH_H +#define FLUSH_H + +#include "types.h" +#include "waitQueue.h" + +/** + * A marker for tracking which journal entries are affected by a flush request. + **/ +struct vdoFlush { + /** The wait queue entry for this flush */ + Waiter waiter; + /** Which flush this struct represents */ + SequenceNumber flushGeneration; +}; + +/** + * Make a flusher for a VDO. + * + * @param vdo The VDO which owns the flusher + * + * @return VDO_SUCCESS or an error + **/ +int makeFlusher(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Free a flusher and null out the reference to it. + * + * @param flusherPtr A pointer to the flusher to free + **/ +void freeFlusher(Flusher **flusherPtr); + +/** + * Get the ID of the thread on which flusher functions should be called. + * + * @param flusher The flusher to query + * + * @return The ID of the thread which handles the flusher + **/ +ThreadID getFlusherThreadID(Flusher *flusher) + __attribute__((warn_unused_result)); + +/** + * Handle empty flush requests. + * + * @param vdo The VDO + * @param vdoFlush The opaque flush request + **/ +void flush(VDO *vdo, VDOFlush *vdoFlush); + +/** + * Attempt to complete any flushes which might have finished. + * + * @param flusher The flusher + **/ +void completeFlushes(Flusher *flusher); + +/** + * Dump the flusher, in a thread-unsafe fashion. + * + * @param flusher The flusher + **/ +void dumpFlusher(const Flusher *flusher); + +#endif /* FLUSH_H */ diff --git a/vdo/base/forest.c b/vdo/base/forest.c new file mode 100644 index 0000000..eabd6c3 --- /dev/null +++ b/vdo/base/forest.c @@ -0,0 +1,565 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/forest.c#8 $ + */ + +#include "forest.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMap.h" +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "blockMapTree.h" +#include "blockMapTreeInternals.h" +#include "constants.h" +#include "dirtyLists.h" +#include "forest.h" +#include "numUtils.h" +#include "recoveryJournal.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "types.h" +#include "vdoInternal.h" +#include "vio.h" +#include "vioPool.h" + +enum { + BLOCK_MAP_VIO_POOL_SIZE = 64, +}; + +typedef struct { + TreePage *levels[BLOCK_MAP_TREE_HEIGHT]; +} BlockMapTreeSegment; + +typedef struct blockMapTree { + BlockMapTreeSegment *segments; +} BlockMapTree; + +struct forest { + BlockMap *map; + size_t segments; + Boundary *boundaries; + TreePage **pages; + BlockMapTree trees[]; +}; + +typedef struct { + PageNumber pageIndex; + SlotNumber slot; +} CursorLevel; + +typedef struct cursors Cursors; + +typedef struct { + Waiter waiter; + BlockMapTree *tree; + Height height; + Cursors *parent; + Boundary boundary; + CursorLevel levels[BLOCK_MAP_TREE_HEIGHT]; + VIOPoolEntry *vioPoolEntry; +} Cursor; + +struct cursors { + BlockMap *map; + BlockMapTreeZone *zone; + VIOPool *pool; + EntryCallback *entryCallback; + VDOCompletion *parent; + RootCount activeRoots; + Cursor cursors[]; +}; + +/**********************************************************************/ +TreePage *getTreePageByIndex(Forest *forest, + RootCount rootIndex, + Height height, + PageNumber pageIndex) +{ + PageNumber offset = 0; + for (size_t segment = 0; segment < forest->segments; segment++) { + PageNumber border = forest->boundaries[segment].levels[height - 1]; + if (pageIndex < border) { + BlockMapTree *tree = &forest->trees[rootIndex]; + return &(tree->segments[segment].levels[height - 1][pageIndex - offset]); + } + offset = border; + } + + return NULL; +} + +/** + * Compute the number of pages which must be allocated at each level in order + * to grow the forest to a new number of entries. + * + * @param [in] rootCount The number of roots + * @param [in] flatPageCount The number of flat block map pages + * @param [in] oldSizes The current size of the forest at each level + * @param [in] entries The new number of entries the block map must + * address + * @param [out] newSizes The new size of the forest at each level + * + * @return The total number of non-leaf pages required + **/ +static BlockCount computeNewPages(RootCount rootCount, + BlockCount flatPageCount, + Boundary *oldSizes, + BlockCount entries, + Boundary *newSizes) +{ + PageCount leafPages + = maxPageCount(computeBlockMapPageCount(entries) - flatPageCount, 1); + PageCount levelSize = computeBucketCount(leafPages, rootCount); + BlockCount totalPages = 0; + for (Height height = 0; height < BLOCK_MAP_TREE_HEIGHT; height++) { + levelSize = computeBucketCount(levelSize, BLOCK_MAP_ENTRIES_PER_PAGE); + newSizes->levels[height] = levelSize; + BlockCount newPages = levelSize; + if (oldSizes != NULL) { + newPages -= oldSizes->levels[height]; + } + totalPages += (newPages * rootCount); + } + + return totalPages; +} + +/**********************************************************************/ +static int makeSegment(Forest *oldForest, + BlockCount newPages, + Boundary *newBoundary, + Forest *forest) +{ + size_t index = (oldForest == NULL) ? 
0 : oldForest->segments; + forest->segments = index + 1; + + int result = ALLOCATE(forest->segments, Boundary, "forest boundary array", + &forest->boundaries); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(forest->segments, TreePage *, "forest page pointers", + &forest->pages); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(newPages, TreePage, "new forest pages", + &forest->pages[index]); + if (result != VDO_SUCCESS) { + return result; + } + + if (index > 0) { + memcpy(forest->boundaries, oldForest->boundaries, + index * sizeof(Boundary)); + memcpy(forest->pages, oldForest->pages, index * sizeof(TreePage *)); + } + + memcpy(&(forest->boundaries[index]), newBoundary, sizeof(Boundary)); + + PageCount segmentSizes[BLOCK_MAP_TREE_HEIGHT]; + for (Height height = 0; height < BLOCK_MAP_TREE_HEIGHT; height++) { + segmentSizes[height] = newBoundary->levels[height]; + if (index > 0) { + segmentSizes[height] -= oldForest->boundaries[index - 1].levels[height]; + } + } + + TreePage *pagePtr = forest->pages[index]; + for (RootCount root = 0; root < forest->map->rootCount; root++) { + BlockMapTree *tree = &(forest->trees[root]); + int result = ALLOCATE(forest->segments, BlockMapTreeSegment, + "tree root segments", &tree->segments); + if (result != VDO_SUCCESS) { + return result; + } + + if (index > 0) { + memcpy(tree->segments, oldForest->trees[root].segments, + index * sizeof(BlockMapTreeSegment)); + } + + BlockMapTreeSegment *segment = &(tree->segments[index]); + for (Height height = 0; height < BLOCK_MAP_TREE_HEIGHT; height++) { + if (segmentSizes[height] == 0) { + continue; + } + + segment->levels[height] = pagePtr; + if (height == (BLOCK_MAP_TREE_HEIGHT - 1)) { + // Record the root. + BlockMapPage *page = formatBlockMapPage(pagePtr->pageBuffer, + forest->map->nonce, + INVALID_PBN, true); + page->entries[0] = packPBN(forest->map->rootOrigin + root, + MAPPING_STATE_UNCOMPRESSED); + } + pagePtr += segmentSizes[height]; + } + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void deforest(Forest *forest, size_t firstPageSegment) +{ + if (forest->pages != NULL) { + for (size_t segment = firstPageSegment; segment < forest->segments; + segment++) { + FREE(forest->pages[segment]); + } + FREE(forest->pages); + } + + for (RootCount root = 0; root < forest->map->rootCount; root++) { + BlockMapTree *tree = &(forest->trees[root]); + FREE(tree->segments); + } + + FREE(forest->boundaries); + FREE(forest); +} + +/**********************************************************************/ +int makeForest(BlockMap *map, BlockCount entries) +{ + STATIC_ASSERT(offsetof(TreePage, waiter) == 0); + + Forest *oldForest = map->forest; + Boundary *oldBoundary = NULL; + if (oldForest != NULL) { + oldBoundary = &(oldForest->boundaries[oldForest->segments - 1]); + } + + Boundary newBoundary; + BlockCount newPages = computeNewPages(map->rootCount, map->flatPageCount, + oldBoundary, entries, &newBoundary); + if (newPages == 0) { + map->nextEntryCount = entries; + return VDO_SUCCESS; + } + + Forest *forest; + int result = ALLOCATE_EXTENDED(Forest, map->rootCount, BlockMapTree, + __func__, &forest); + if (result != VDO_SUCCESS) { + return result; + } + + forest->map = map; + result = makeSegment(oldForest, newPages, &newBoundary, forest); + if (result != VDO_SUCCESS) { + deforest(forest, forest->segments - 1); + return result; + } + + map->nextForest = forest; + map->nextEntryCount = entries; + return VDO_SUCCESS; +} + 
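As a rough illustration of the arithmetic makeForest() delegates to computeNewPages(), the following minimal, self-contained sketch walks the same per-root loop: each interior level needs one page for every ENTRIES_PER_PAGE pages of the level beneath it, so the level sizes shrink by a ceiling division at every height. The tree height, entries-per-page value, and leaf-page count used here are assumed, illustrative numbers only, not the constants defined in the block map headers.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins; the real constants live in the block map headers. */
#define EXAMPLE_TREE_HEIGHT      5
#define EXAMPLE_ENTRIES_PER_PAGE 812

/* Ceiling division, playing the role of computeBucketCount(). */
static uint64_t bucketCount(uint64_t items, uint64_t perBucket)
{
  return (items + perBucket - 1) / perBucket;
}

int main(void)
{
  /* Pretend each root must cover this many leaf block map pages. */
  uint64_t levelSize    = 100000;
  uint64_t nonLeafPages = 0;

  for (int height = 0; height < EXAMPLE_TREE_HEIGHT; height++) {
    /* Each interior page addresses EXAMPLE_ENTRIES_PER_PAGE pages below it. */
    levelSize = bucketCount(levelSize, EXAMPLE_ENTRIES_PER_PAGE);
    nonLeafPages += levelSize;
    printf("height %d needs %llu pages per root\n",
           height, (unsigned long long) levelSize);
  }

  printf("total non-leaf pages per root: %llu\n",
         (unsigned long long) nonLeafPages);
  return 0;
}

The real function additionally subtracts the sizes already present in the old forest and multiplies the per-root totals by the root count, which is how growing an existing forest only allocates the incremental pages.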
+/**********************************************************************/ +void freeForest(Forest **forestPtr) +{ + Forest *forest = *forestPtr; + if (forest == NULL) { + return; + } + + deforest(forest, 0); + *forestPtr = NULL; +} + +/**********************************************************************/ +void abandonForest(BlockMap *map) +{ + Forest *forest = map->nextForest; + map->nextForest = NULL; + if (forest != NULL) { + deforest(forest, forest->segments - 1); + } + + map->nextEntryCount = 0; +} + +/**********************************************************************/ +void replaceForest(BlockMap *map) +{ + if (map->nextForest != NULL) { + if (map->forest != NULL) { + deforest(map->forest, map->forest->segments); + } + map->forest = map->nextForest; + map->nextForest = NULL; + } + + map->entryCount = map->nextEntryCount; + map->nextEntryCount = 0; +} + +/** + * Finish the traversal of a single tree. If it was the last cursor, finish + * the traversal. + * + * @param cursor The cursor doing the traversal + **/ +static void finishCursor(Cursor *cursor) +{ + Cursors *cursors = cursor->parent; + returnVIOToPool(cursors->pool, cursor->vioPoolEntry); + if (--cursors->activeRoots > 0) { + return; + } + + VDOCompletion *parent = cursors->parent; + FREE(cursors); + + finishCompletion(parent, VDO_SUCCESS); +} + +/**********************************************************************/ +static void traverse(Cursor *cursor); + +/** + * Continue traversing a block map tree. + * + * @param completion The VIO doing a read or write + **/ +static void continueTraversal(VDOCompletion *completion) +{ + VIOPoolEntry *poolEntry = completion->parent; + Cursor *cursor = poolEntry->parent; + traverse(cursor); +} + +/** + * Continue traversing a block map tree now that a page has been loaded. + * + * @param completion The VIO doing the read + **/ +static void finishTraversalLoad(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + Cursor *cursor = entry->parent; + Height height = cursor->height; + CursorLevel *level = &cursor->levels[height]; + + TreePage *treePage + = &(cursor->tree->segments[0].levels[height][level->pageIndex]); + BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; + copyValidPage(entry->buffer, cursor->parent->map->nonce, + entry->vio->physical, page); + traverse(cursor); +} + +/** + * Traverse a single block map tree. This is the recursive heart of the + * traversal process. + * + * @param cursor The cursor doing the traversal + **/ +static void traverse(Cursor *cursor) +{ + for (; cursor->height < BLOCK_MAP_TREE_HEIGHT; cursor->height++) { + Height height = cursor->height; + CursorLevel *level = &cursor->levels[height]; + TreePage *treePage + = &(cursor->tree->segments[0].levels[height][level->pageIndex]); + BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; + if (!isBlockMapPageInitialized(page)) { + continue; + } + + for (; level->slot < BLOCK_MAP_ENTRIES_PER_PAGE; level->slot++) { + DataLocation location = unpackBlockMapEntry(&page->entries[level->slot]); + if (!isValidLocation(&location)) { + // This entry is invalid, so remove it from the page. + page->entries[level->slot] + = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + writeTreePage(treePage, cursor->parent->zone); + continue; + } + + if (!isMappedLocation(&location)) { + continue; + } + + PageNumber entryIndex + = (BLOCK_MAP_ENTRIES_PER_PAGE * level->pageIndex) + level->slot; + + // Erase mapped entries past the end of the logical space. 
+ if (entryIndex >= cursor->boundary.levels[height]) { + page->entries[level->slot] + = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + writeTreePage(treePage, cursor->parent->zone); + continue; + } + + if (cursor->height < BLOCK_MAP_TREE_HEIGHT - 1) { + int result = cursor->parent->entryCallback(location.pbn, + cursor->parent->parent); + if (result != VDO_SUCCESS) { + page->entries[level->slot] + = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + writeTreePage(treePage, cursor->parent->zone); + continue; + } + } + + if (cursor->height == 0) { + continue; + } + + cursor->height--; + CursorLevel *nextLevel = &cursor->levels[cursor->height]; + nextLevel->pageIndex = entryIndex; + nextLevel->slot = 0; + level->slot++; + launchReadMetadataVIO(cursor->vioPoolEntry->vio, location.pbn, + finishTraversalLoad, continueTraversal); + return; + } + } + + finishCursor(cursor); +} + +/** + * Start traversing a single block map tree now that the Cursor has a VIO with + * which to load pages. + * + *
Implements WaiterCallback. + * + * @param waiter The Cursor + * @param context The VIOPoolEntry just acquired + **/ +static void launchCursor(Waiter *waiter, void *context) +{ + STATIC_ASSERT(offsetof(Cursor, waiter) == 0); + Cursor *cursor = (Cursor *) waiter; + cursor->vioPoolEntry = (VIOPoolEntry *) context; + cursor->vioPoolEntry->parent = cursor; + vioAsCompletion(cursor->vioPoolEntry->vio)->callbackThreadID + = cursor->parent->zone->mapZone->threadID; + traverse(cursor); +} + +/** + * Compute the number of pages used at each level of the given root's tree. + * + * @param map The block map + * @param rootIndex The index of the root to measure + * + * @return The list of page counts as a Boundary + **/ +static Boundary computeBoundary(BlockMap *map, RootCount rootIndex) +{ + PageCount leafPages = computeBlockMapPageCount(map->entryCount); + PageCount treeLeafPages = leafPages - map->flatPageCount; + + /* + * Compute the leaf pages for this root. If the number of leaf pages does + * not distribute evenly, we must determine if this root gets an extra page. + * Extra pages are assigned to roots starting at firstTreeRoot and going up. + */ + PageCount firstTreeRoot = map->flatPageCount % map->rootCount; + PageCount lastTreeRoot = (leafPages - 1) % map->rootCount; + + PageCount levelPages = treeLeafPages / map->rootCount; + if (inCyclicRange(firstTreeRoot, rootIndex, lastTreeRoot, map->rootCount)) { + levelPages++; + } + + Boundary boundary; + for (Height height = 0; height < BLOCK_MAP_TREE_HEIGHT - 1; height++) { + boundary.levels[height] = levelPages; + levelPages = computeBucketCount(levelPages, BLOCK_MAP_ENTRIES_PER_PAGE); + } + + // The root node always exists, even if the root is otherwise unused. + boundary.levels[BLOCK_MAP_TREE_HEIGHT - 1] = 1; + + return boundary; +} + +/**********************************************************************/ +void traverseForest(BlockMap *map, + EntryCallback *entryCallback, + VDOCompletion *parent) +{ + if (computeBlockMapPageCount(map->entryCount) <= map->flatPageCount) { + // There are no tree pages, so there's nothing to do. + finishCompletion(parent, VDO_SUCCESS); + return; + } + + Cursors *cursors; + int result = ALLOCATE_EXTENDED(Cursors, map->rootCount, Cursor, __func__, + &cursors); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + cursors->map = map; + cursors->zone = &(getBlockMapZone(map, 0)->treeZone); + cursors->pool = cursors->zone->vioPool; + cursors->entryCallback = entryCallback; + cursors->parent = parent; + cursors->activeRoots = map->rootCount; + for (RootCount root = 0; root < map->rootCount; root++) { + Cursor *cursor = &cursors->cursors[root]; + *cursor = (Cursor) { + .tree = &map->forest->trees[root], + .height = BLOCK_MAP_TREE_HEIGHT - 1, + .parent = cursors, + .boundary = computeBoundary(map, root), + }; + + cursor->waiter.callback = launchCursor; + acquireVIOFromPool(cursors->pool, &cursor->waiter); + }; +} + +/**********************************************************************/ +BlockCount computeForestSize(BlockCount logicalBlocks, RootCount rootCount) +{ + Boundary newSizes; + BlockCount approximateNonLeaves + = computeNewPages(rootCount, 0, NULL, logicalBlocks, &newSizes); + + // Exclude the tree roots since those aren't allocated from slabs, + // and also exclude the super-roots, which only exist in memory. 
+ approximateNonLeaves + -= rootCount * (newSizes.levels[BLOCK_MAP_TREE_HEIGHT - 2] + + newSizes.levels[BLOCK_MAP_TREE_HEIGHT - 1]); + + BlockCount approximateLeaves + = computeBlockMapPageCount(logicalBlocks - approximateNonLeaves); + + // This can be a slight over-estimate since the tree will never have to + // address these blocks, so it might be a tiny bit smaller. + return (approximateNonLeaves + approximateLeaves); +} diff --git a/vdo/base/forest.h b/vdo/base/forest.h new file mode 100644 index 0000000..9a5a7cf --- /dev/null +++ b/vdo/base/forest.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/forest.h#2 $ + */ + +#ifndef FOREST_H +#define FOREST_H + +#include "blockMapTree.h" +#include "types.h" + +/** + * A function to be called for each allocated PBN when traversing the forest. + * + * @param pbn A PBN of a tree node + * @param completion The parent completion of the traversal + * + * @return VDO_SUCCESS or an error + **/ +typedef int EntryCallback(PhysicalBlockNumber pbn, VDOCompletion *completion); + +/** + * Get the tree page for a given height and page index. + * + * @param forest The forest which holds the page + * @param rootIndex The index of the tree that holds the page + * @param height The height of the desired page + * @param pageIndex The index of the desired page + * + * @return The requested page + **/ +TreePage *getTreePageByIndex(Forest *forest, + RootCount rootIndex, + Height height, + PageNumber pageIndex) + __attribute__((warn_unused_result)); + +/** + * Make a collection of trees for a BlockMap, expanding the existing forest if + * there is one. + * + * @param map The block map + * @param entries The number of entries the block map will hold + * + * @return VDO_SUCCESS or an error + **/ +int makeForest(BlockMap *map, BlockCount entries) + __attribute__((warn_unused_result)); + +/** + * Free a forest and all of the segments it contains and NULL out the reference + * to it. + * + * @param forestPtr A pointer to the forest to free + **/ +void freeForest(Forest **forestPtr); + +/** + * Abandon the unused next forest from a BlockMap. + * + * @param map The block map + **/ +void abandonForest(BlockMap *map); + +/** + * Replace a BlockMap's Forest with the already-prepared larger forest. + * + * @param map The block map + **/ +void replaceForest(BlockMap *map); + +/** + * Walk the entire forest of a block map. 
+ * + * @param map The block map to traverse + * @param entryCallback A function to call with the pbn of each allocated node + * in the forest + * @param parent The completion to notify on each traversed PBN, and + * when the traversal is complete + **/ +void traverseForest(BlockMap *map, + EntryCallback *entryCallback, + VDOCompletion *parent); + +/** + * Compute the approximate number of pages which the forest will allocate in + * order to map the specified number of logical blocks. This method assumes + * that the block map is entirely arboreal. + * + * @param logicalBlocks The number of blocks to map + * @param rootCount The number of trees in the forest + * + * @return A (slight) over-estimate of the total number of possible forest + * pages including the leaves + **/ +BlockCount computeForestSize(BlockCount logicalBlocks, RootCount rootCount) + __attribute__((warn_unused_result)); +#endif // FOREST_H diff --git a/vdo/base/hashLock.c b/vdo/base/hashLock.c new file mode 100644 index 0000000..8494f1d --- /dev/null +++ b/vdo/base/hashLock.c @@ -0,0 +1,1605 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashLock.c#5 $ + */ + +/** + * HashLock controls and coordinates writing, index access, and dedupe among + * groups of DataVIOs concurrently writing identical blocks, allowing them to + * deduplicate not only against advice but also against each other. This save + * on index queries and allows those DataVIOs to concurrently deduplicate + * against a single block instead of being serialized through a PBN read lock. + * Only one index query is needed for each HashLock, instead of one for every + * DataVIO. + * + * A HashLock acts like a state machine perhaps more than as a lock. Other + * than the starting and ending states INITIALIZING and DESTROYING, every + * state represents and is held for the duration of an asynchronous operation. + * All state transitions are performed on the thread of the HashZone + * containing the lock. An asynchronous operation is almost always performed + * upon entering a state, and the callback from that operation triggers + * exiting the state and entering a new state. + * + * In all states except DEDUPING, there is a single DataVIO, called the lock + * agent, performing the asynchronous operations on behalf of the lock. The + * agent will change during the lifetime of the lock if the lock is shared by + * more than one DataVIO. DataVIOs waiting to deduplicate are kept on a wait + * queue. Viewed a different way, the agent holds the lock exclusively until + * the lock enters the DEDUPING state, at which point it becomes a shared lock + * that all the waiters (and any new DataVIOs that arrive) use to share a PBN + * lock. In state DEDUPING, there is no agent. 
When the last DataVIO in the + * lock calls back in DEDUPING, it becomes the agent and the lock becomes + * exclusive again. New DataVIOs that arrive in the lock will also go on the + * wait queue. + * + * The existence of lock waiters is a key factor controlling which state the + * lock transitions to next. When the lock is new or has waiters, it will + * always try to reach DEDUPING, and when it doesn't, it will try to clean up + * and exit. + * + * Deduping requires holding a PBN lock on a block that is known to contain + * data identical to the DataVIOs in the lock, so the lock will send the + * agent to the duplicate zone to acquire the PBN lock (LOCKING), to the + * kernel I/O threads to read and verify the data (VERIFYING), or to write a + * new copy of the data to a full data block or a slot in a compressed block + * (WRITING). + * + * Cleaning up consists of updating the index when the data location is + * different from the initial index query (UPDATING, triggered by stale + * advice, compression, and rollover), releasing the PBN lock on the duplicate + * block (UNLOCKING), and releasing the HashLock itself back to the hash zone + * (DESTROYING). + * + * The shortest sequence of states is for non-concurrent writes of new data: + * INITIALIZING -> QUERYING -> WRITING -> DESTROYING + * This sequence is short because no PBN read lock or index update is needed. + * + * Non-concurrent, finding valid advice looks like this (endpoints elided): + * -> QUERYING -> LOCKING -> VERIFYING -> DEDUPING -> UNLOCKING -> + * Or with stale advice (endpoints elided): + * -> QUERYING -> LOCKING -> VERIFYING -> UNLOCKING -> WRITING -> UPDATING -> + * + * When there are not enough available reference count increments available on + * a PBN for a DataVIO to deduplicate, a new lock is forked and the excess + * waiters roll over to the new lock (which goes directly to WRITING). The new + * lock takes the place of the old lock in the lock map so new DataVIOs will + * be directed to it. The two locks will proceed independently, but only the + * new lock will have the right to update the index (unless it also forks). + * + * Since rollover happens in a lock instance, once a valid data location has + * been selected, it will not change. QUERYING and WRITING are only performed + * once per lock lifetime. All other non-endpoint states can be re-entered. + * + * XXX still need doc on BYPASSING + * + * The function names in this module follow a convention referencing the + * states and transitions in the state machine diagram for VDOSTORY-190. + * [XXX link or repository path to it?] + * For example, for the LOCKING state, there are startLocking() and + * finishLocking() functions. startLocking() is invoked by the finish function + * of the state (or states) that transition to LOCKING. It performs the actual + * lock state change and must be invoked on the hash zone thread. + * finishLocking() is called by (or continued via callback from) the code + * actually obtaining the lock. It does any bookkeeping or decision-making + * required and invokes the appropriate start function of the state being + * transitioned to after LOCKING. 
+ **/ + +#include "hashLock.h" +#include "hashLockInternals.h" + +#include "logger.h" +#include "permassert.h" + +#include "compressionState.h" +#include "constants.h" +#include "dataVIO.h" +#include "hashZone.h" +#include "packer.h" +#include "pbnLock.h" +#include "physicalZone.h" +#include "ringNode.h" +#include "slab.h" +#include "slabDepot.h" +#include "trace.h" +#include "types.h" +#include "vdoInternal.h" +#include "vioWrite.h" +#include "waitQueue.h" + +static const char *LOCK_STATE_NAMES[] = { + [HASH_LOCK_BYPASSING] = "BYPASSING", + [HASH_LOCK_DEDUPING] = "DEDUPING", + [HASH_LOCK_DESTROYING] = "DESTROYING", + [HASH_LOCK_INITIALIZING] = "INITIALIZING", + [HASH_LOCK_LOCKING] = "LOCKING", + [HASH_LOCK_QUERYING] = "QUERYING", + [HASH_LOCK_UNLOCKING] = "UNLOCKING", + [HASH_LOCK_UPDATING] = "UPDATING", + [HASH_LOCK_VERIFYING] = "VERIFYING", + [HASH_LOCK_WRITING] = "WRITING", +}; + +// There are loops in the state diagram, so some forward decl's are needed. +static void startDeduping(HashLock *lock, DataVIO *agent, bool agentIsDone); +static void startLocking(HashLock *lock, DataVIO *agent); +static void startWriting(HashLock *lock, DataVIO *agent); +static void unlockDuplicatePBN(VDOCompletion *completion); +static void transferAllocationLock(DataVIO *dataVIO); + +/**********************************************************************/ +PBNLock *getDuplicateLock(DataVIO *dataVIO) +{ + if (dataVIO->hashLock == NULL) { + return NULL; + } + return dataVIO->hashLock->duplicateLock; +} + +/**********************************************************************/ +const char *getHashLockStateName(HashLockState state) +{ + // Catch if a state has been added without updating the name array. + STATIC_ASSERT((HASH_LOCK_DESTROYING + 1) == COUNT_OF(LOCK_STATE_NAMES)); + return (state < COUNT_OF(LOCK_STATE_NAMES)) ? LOCK_STATE_NAMES[state] : NULL; +} + +/** + * Set the current state of a hash lock. + * + * @param lock The lock to update + * @param newState The new state + **/ +static void setHashLockState(HashLock *lock, HashLockState newState) +{ + if (false) { + logWarning("XXX %" PRIptr " %s -> %s", (void *) lock, + getHashLockStateName(lock->state), + getHashLockStateName(newState)); + } + lock->state = newState; +} + +/** + * Assert that a DataVIO is the agent of its hash lock, and that this is being + * called in the hash zone. + * + * @param dataVIO The DataVIO expected to be the lock agent + * @param where A string describing the function making the assertion + **/ +static void assertHashLockAgent(DataVIO *dataVIO, const char *where) +{ + // Not safe to access the agent field except from the hash zone. + assertInHashZone(dataVIO); + ASSERT_LOG_ONLY(dataVIO == dataVIO->hashLock->agent, + "%s must be for the hash lock agent", where); +} + +/** + * Set or clear the lock agent. + * + * @param lock The hash lock to update + * @param newAgent The new lock agent (may be NULL to clear the agent) + **/ +static void setAgent(HashLock *lock, DataVIO *newAgent) +{ + lock->agent = newAgent; +} + +/** + * Set the duplicate lock held by a hash lock. May only be called in the + * physical zone of the PBN lock. 
+ * + * @param hashLock The hash lock to update + * @param pbnLock The PBN read lock to use as the duplicate lock + **/ +static void setDuplicateLock(HashLock *hashLock, PBNLock *pbnLock) +{ + ASSERT_LOG_ONLY((hashLock->duplicateLock == NULL), + "hash lock must not already hold a duplicate lock"); + + pbnLock->holderCount += 1; + hashLock->duplicateLock = pbnLock; +} + +/** + * Convert a pointer to the hashLockNode field in a DataVIO to the enclosing + * DataVIO. + * + * @param lockNode The RingNode to convert + * + * @return A pointer to the DataVIO containing the RingNode + **/ +static inline DataVIO *dataVIOFromLockNode(RingNode *lockNode) +{ + return (DataVIO *) ((byte *) lockNode - offsetof(DataVIO, hashLockNode)); +} + +/** + * Remove the first DataVIO from the lock's wait queue and return it. + * + * @param lock The lock containing the wait queue + * + * @return The first (oldest) waiter in the queue, or NULL if + * the queue is empty + **/ +static inline DataVIO *dequeueLockWaiter(HashLock *lock) +{ + return waiterAsDataVIO(dequeueNextWaiter(&lock->waiters)); +} + +/** + * Continue processing a DataVIO that has been waiting for an event, setting + * the result from the event, and continuing in a specified callback function. + * + * @param dataVIO The DataVIO to continue + * @param result The current result (will not mask older errors) + * @param callback The function in which to continue processing + **/ +static void continueDataVIOIn(DataVIO *dataVIO, + int result, + VDOAction *callback) +{ + dataVIOAsCompletion(dataVIO)->callback = callback; + continueDataVIO(dataVIO, result); +} + +/** + * Set, change, or clear the hash lock a DataVIO is using. Updates the hash + * lock (or locks) to reflect the change in membership. + * + * @param dataVIO The DataVIO to update + * @param newLock The hash lock the DataVIO is joining + **/ +static void setHashLock(DataVIO *dataVIO, HashLock *newLock) +{ + HashLock *oldLock = dataVIO->hashLock; + if (oldLock != NULL) { + ASSERT_LOG_ONLY(dataVIO->hashZone != NULL, + "must have a hash zone when halding a hash lock"); + ASSERT_LOG_ONLY(!isRingEmpty(&dataVIO->hashLockNode), + "must be on a hash lock ring when holding a hash lock"); + ASSERT_LOG_ONLY(oldLock->referenceCount > 0, + "hash lock reference must be counted"); + + if ((oldLock->state != HASH_LOCK_BYPASSING) + && (oldLock->state != HASH_LOCK_UNLOCKING)) { + // If the reference count goes to zero in a non-terminal state, we're + // most likely leaking this lock. + ASSERT_LOG_ONLY(oldLock->referenceCount > 1, + "hash locks should only become unreferenced in" + " a terminal state, not state %s", + getHashLockStateName(oldLock->state)); + } + + unspliceRingNode(&dataVIO->hashLockNode); + oldLock->referenceCount -= 1; + + dataVIO->hashLock = NULL; + } + + if (newLock != NULL) { + // Keep all DataVIOs sharing the lock on a ring since they can complete in + // any order and we'll always need a pointer to one to compare data. + pushRingNode(&newLock->duplicateRing, &dataVIO->hashLockNode); + newLock->referenceCount += 1; + + // XXX Not needed for VDOSTORY-190, but useful for checking whether a test + // is getting concurrent dedupe, and how much. + if (newLock->maxReferences < newLock->referenceCount) { + newLock->maxReferences = newLock->referenceCount; + } + + dataVIO->hashLock = newLock; + } +} + +/** + * Bottleneck for DataVIOs that have written or deduplicated and that are no + * longer needed to be an agent for the hash lock. 
+ * + * @param dataVIO The DataVIO to complete and send to be cleaned up + **/ +static void exitHashLock(DataVIO *dataVIO) +{ + // XXX trace record? + + // Release the hash lock now, saving a thread transition in cleanup. + releaseHashLock(dataVIO); + + // Complete the DataVIO and start the clean-up path in vioWrite to release + // any locks it still holds. + finishDataVIO(dataVIO, VDO_SUCCESS); +} + +/** + * Retire the active lock agent, replacing it with the first lock waiter, and + * make the retired agent exit the hash lock. + * + * @param lock The hash lock to update + * + * @return The new lock agent (which will be NULL if there was no waiter) + **/ +static DataVIO *retireLockAgent(HashLock *lock) +{ + DataVIO *oldAgent = lock->agent; + DataVIO *newAgent = dequeueLockWaiter(lock); + setAgent(lock, newAgent); + exitHashLock(oldAgent); + if (newAgent != NULL) { + setDuplicateLocation(newAgent, lock->duplicate); + } + return newAgent; +} + +/** + * Callback to call compressData(), putting a DataVIO back on the write path. + * + * @param completion The DataVIO + **/ +static void compressDataCallback(VDOCompletion *completion) +{ + // XXX VDOSTORY-190 need an error check since compressData doesn't have one. + compressData(asDataVIO(completion)); +} + +/** + * Add a DataVIO to the lock's queue of waiters. + * + * @param lock The hash lock on which to wait + * @param dataVIO The DataVIO to add to the queue + **/ +static void waitOnHashLock(HashLock *lock, DataVIO *dataVIO) +{ + int result = enqueueDataVIO(&lock->waiters, dataVIO, THIS_LOCATION(NULL)); + if (result != VDO_SUCCESS) { + // This should be impossible, but if it somehow happens, give up on trying + // to dedupe the data. + setHashLock(dataVIO, NULL); + continueDataVIOIn(dataVIO, result, compressDataCallback); + return; + } + + // Make sure the agent doesn't block indefinitely in the packer since it now + // has at least one other DataVIO waiting on it. + if ((lock->state == HASH_LOCK_WRITING) && cancelCompression(lock->agent)) { + /* + * Even though we're waiting, we also have to send ourselves as a one-way + * message to the packer to ensure the agent continues executing. This is + * safe because cancelCompression() guarantees the agent won't continue + * executing until this message arrives in the packer, and because the + * wait queue link isn't used for sending the message. + */ + dataVIO->compression.lockHolder = lock->agent; + launchPackerCallback(dataVIO, removeLockHolderFromPacker, + THIS_LOCATION("$F;cb=removeLockHolderFromPacker")); + } +} + +/** + * WaiterCallback function that calls compressData on the DataVIO waiter. + * + * @param waiter The DataVIO's waiter link + * @param context Not used + **/ +static void compressWaiter(Waiter *waiter, + void *context __attribute__((unused))) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + dataVIO->isDuplicate = false; + compressData(dataVIO); +} + +/** + * Handle the result of the agent for the lock releasing a read lock on + * duplicate candidate due to aborting the hash lock. This continuation is + * registered in unlockDuplicatePBN(). 
+ * + * @param completion The completion of the DataVIO acting as the lock's agent + **/ +static void finishBypassing(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + assertHashLockAgent(agent, __func__); + HashLock *lock = agent->hashLock; + + ASSERT_LOG_ONLY(lock->duplicateLock == NULL, + "must have released the duplicate lock for the hash lock"); + exitHashLock(agent); +} + +/** + * Stop using the hash lock, resuming the old write path for the lock agent + * and any DataVIOs waiting on it, and put it in a state where DataVIOs + * entering the lock will use the old dedupe path instead of waiting. + * + * @param lock The hash lock + * @param agent The DataVIO acting as the agent for the lock + **/ +static void startBypassing(HashLock *lock, DataVIO *agent) +{ + setHashLockState(lock, HASH_LOCK_BYPASSING); + + // Ensure we don't attempt to update advice when cleaning up. + lock->updateAdvice = false; + + ASSERT_LOG_ONLY(((agent != NULL) || !hasWaiters(&lock->waiters)), + "should not have waiters without an agent"); + notifyAllWaiters(&lock->waiters, compressWaiter, NULL); + + if (lock->duplicateLock != NULL) { + if (agent != NULL) { + // The agent must reference the duplicate zone to launch it. + agent->duplicate = lock->duplicate; + launchDuplicateZoneCallback(agent, unlockDuplicatePBN, + THIS_LOCATION(NULL)); + return; + } + ASSERT_LOG_ONLY(false, "hash lock holding a PBN lock must have an agent"); + } + + if (agent == NULL) { + return; + } + + setAgent(lock, NULL); + agent->isDuplicate = false; + compressData(agent); +} + +/** + * Abort processing on this hash lock when noticing an error. Currently, this + * moves the hash lock to the BYPASSING state, to release all pending DataVIOs. + * + * @param lock The HashLock + * @param dataVIO The DataVIO with the error + **/ +static void abortHashLock(HashLock *lock, DataVIO *dataVIO) +{ + // If we've already aborted the lock, don't try to re-abort it; just exit. + if (lock->state == HASH_LOCK_BYPASSING) { + exitHashLock(dataVIO); + return; + } + + if (dataVIO != lock->agent) { + if ((lock->agent != NULL) || (lock->referenceCount > 1)) { + // Other DataVIOs are still sharing the lock (which should be DEDUPING), + // so just kick this one out of the lock to report its error. + ASSERT_LOG_ONLY(lock->agent == NULL, + "only active agent should call abortHashLock"); + exitHashLock(dataVIO); + return; + } + // Make the lone DataVIO the lock agent so it can abort and clean up. + setAgent(lock, dataVIO); + } + + startBypassing(lock, dataVIO); +} + +/** + * Handle the result of the agent for the lock releasing a read lock on + * duplicate candidate. This continuation is registered in + * unlockDuplicatePBN(). + * + * @param completion The completion of the DataVIO acting as the lock's agent + **/ +static void finishUnlocking(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + assertHashLockAgent(agent, __func__); + HashLock *lock = agent->hashLock; + + ASSERT_LOG_ONLY(lock->duplicateLock == NULL, + "must have released the duplicate lock for the hash lock"); + + if (completion->result != VDO_SUCCESS) { + abortHashLock(lock, agent); + return; + } + + if (!lock->verified) { + /* + * UNLOCKING -> WRITING transition: The lock we released was on an + * unverified block, so it must have been a lock on advice we were + * verifying, not on a location that was used for deduplication. Go write + * (or compress) the block to get a location to dedupe against. 
+ */ + startWriting(lock, agent); + return; + } + + // With the lock released, the verified duplicate block may already have + // changed and will need to be re-verified if a waiter arrived. + lock->verified = false; + + if (hasWaiters(&lock->waiters)) { + /* + * UNLOCKING -> LOCKING transition: A new DataVIO entered the hash lock + * while the agent was releasing the PBN lock. The current agent exits and + * the waiter has to re-lock and re-verify the duplicate location. + */ + // XXX VDOSTORY-190 If we used the current agent to re-acquire the PBN + // lock we wouldn't need to re-verify. + agent = retireLockAgent(lock); + startLocking(lock, agent); + return; + } + + /* + * UNLOCKING -> DESTROYING transition: The agent is done with the lock + * and no other DataVIOs reference it, so remove it from the lock map + * and return it to the pool. + */ + exitHashLock(agent); +} + +/** + * Release a read lock on the PBN of the block that may or may not have + * contained duplicate data. This continuation is launched by + * startUnlocking(), and calls back to finishUnlocking() on the hash zone + * thread. + * + * @param completion The completion of the DataVIO acting as the lock's agent + **/ +static void unlockDuplicatePBN(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + assertInDuplicateZone(agent); + HashLock *lock = agent->hashLock; + + ASSERT_LOG_ONLY(lock->duplicateLock != NULL, + "must have a duplicate lock to release"); + + releasePBNLock(agent->duplicate.zone, agent->duplicate.pbn, + &lock->duplicateLock); + + if (lock->state == HASH_LOCK_BYPASSING) { + launchHashZoneCallback(agent, finishBypassing, THIS_LOCATION(NULL)); + } else { + launchHashZoneCallback(agent, finishUnlocking, THIS_LOCATION(NULL)); + } +} + +/** + * Release a read lock on the PBN of the block that may or may not have + * contained duplicate data. + * + * @param lock The hash lock + * @param agent The DataVIO currently acting as the agent for the lock + **/ +static void startUnlocking(HashLock *lock, DataVIO *agent) +{ + setHashLockState(lock, HASH_LOCK_UNLOCKING); + + /* + * XXX If we arrange to continue on the duplicate zone thread when + * verification fails, and don't explicitly change lock states (or use an + * agent-local state, or an atomic), we can avoid a thread transition here. + */ + launchDuplicateZoneCallback(agent, unlockDuplicatePBN, THIS_LOCATION(NULL)); +} + +/** + * Process the result of a UDS update performed by the agent for the lock. + * This continuation is registered in startQuerying(). + * + * @param completion The completion of the DataVIO that performed the update + **/ +static void finishUpdating(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + assertHashLockAgent(agent, __func__); + HashLock *lock = agent->hashLock; + + if (completion->result != VDO_SUCCESS) { + abortHashLock(lock, agent); + return; + } + + // UDS was updated successfully, so don't update again unless the + // duplicate location changes due to rollover. + lock->updateAdvice = false; + + if (hasWaiters(&lock->waiters)) { + /* + * UPDATING -> DEDUPING transition: A new DataVIO arrived during the UDS + * update. Send it on the verified dedupe path. The agent is done with the + * lock, but the lock may still need to use it to clean up after rollover. + */ + startDeduping(lock, agent, true); + return; + } + + if (lock->duplicateLock != NULL) { + /* + * UPDATING -> UNLOCKING transition: No one is waiting to dedupe, but we + * hold a duplicate PBN lock, so go release it. 
+ */ + startUnlocking(lock, agent); + } else { + /* + * UPDATING -> DESTROYING transition: No one is waiting to dedupe and + * there's no lock to release. + */ + // XXX startDestroying(lock, agent); + startBypassing(lock, NULL); + exitHashLock(agent); + } +} + +/** + * Continue deduplication with the last step, updating UDS with the location + * of the duplicate that should be returned as advice in the future. + * + * @param lock The hash lock + * @param agent The DataVIO currently acting as the agent for the lock + **/ +static void startUpdating(HashLock *lock, DataVIO *agent) +{ + setHashLockState(lock, HASH_LOCK_UPDATING); + + ASSERT_LOG_ONLY(lock->verified, "new advice should have been verified"); + ASSERT_LOG_ONLY(lock->updateAdvice, "should only update advice if needed"); + + agent->lastAsyncOperation = UPDATE_INDEX; + setHashZoneCallback(agent, finishUpdating, THIS_LOCATION(NULL)); + dataVIOAsCompletion(agent)->layer->updateAlbireo(agent); +} + +/** + * Handle a DataVIO that has finished deduplicating against the block locked + * by the hash lock. If there are other DataVIOs still sharing the lock, this + * will just release the DataVIO's share of the lock and finish processing the + * DataVIO. If this is the last DataVIO holding the lock, this makes the + * DataVIO the lock agent and uses it to advance the state of the lock so it + * can eventually be released. + * + * @param lock The hash lock + * @param dataVIO The lock holder that has finished deduplicating + **/ +static void finishDeduping(HashLock *lock, DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(lock->agent == NULL, "shouldn't have an agent in DEDUPING"); + ASSERT_LOG_ONLY(!hasWaiters(&lock->waiters), + "shouldn't have any lock waiters in DEDUPING"); + + // Just release the lock reference if other DataVIOs are still deduping. + if (lock->referenceCount > 1) { + exitHashLock(dataVIO); + return; + } + + // The hash lock must have an agent for all other lock states. + DataVIO *agent = dataVIO; + setAgent(lock, agent); + + if (lock->updateAdvice) { + /* + * DEDUPING -> UPDATING transition: The location of the duplicate block + * changed since the initial UDS query because of compression, rollover, + * or because the query agent didn't have an allocation. The UDS update + * was delayed in case there was another change in location, but with only + * this DataVIO using the hash lock, it's time to update the advice. + */ + startUpdating(lock, agent); + } else { + /* + * DEDUPING -> UNLOCKING transition: Release the PBN read lock on the + * duplicate location so the hash lock itself can be released (contingent + * on no new DataVIOs arriving in the lock before the agent returns). + */ + startUnlocking(lock, agent); + } +} + +/** + * Implements WaiterCallback. Binds the DataVIO that was waiting to a new hash + * lock and waits on that lock. + **/ +static void enterForkedLock(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + HashLock *newLock = (HashLock *) context; + + setHashLock(dataVIO, newLock); + waitOnHashLock(newLock, dataVIO); +} + +/** + * Fork a hash lock because it has run out of increments on the duplicate PBN. + * Transfers the new agent and any lock waiters to a new hash lock instance + * which takes the place of the old lock in the lock map. The old lock remains + * active, but will not update advice. 
+ * + * @param oldLock The hash lock to fork + * @param newAgent The DataVIO that will be the agent for the new lock + **/ +static void forkHashLock(HashLock *oldLock, DataVIO *newAgent) +{ + HashLock *newLock; + int result = acquireHashLockFromZone(newAgent->hashZone, + &newAgent->chunkName, + oldLock, &newLock); + if (result != VDO_SUCCESS) { + abortHashLock(oldLock, newAgent); + return; + } + + // Only one of the two locks should update UDS. The old lock is out of + // references, so it would be poor dedupe advice in the short term. + oldLock->updateAdvice = false; + newLock->updateAdvice = true; + + setHashLock(newAgent, newLock); + setAgent(newLock, newAgent); + + notifyAllWaiters(&oldLock->waiters, enterForkedLock, newLock); + + newAgent->isDuplicate = false; + startWriting(newLock, newAgent); +} + +/** + * Reserve a reference count increment for a DataVIO and launch it on the + * dedupe path. If no increments are available, this will roll over to a new + * hash lock and launch the DataVIO as the writing agent for that lock. + * + * @param lock The hash lock + * @param dataVIO The DataVIO to deduplicate using the hash lock + * @param hasClaim true if the dataVIO already has claimed + * an increment from the duplicate lock + **/ +static void launchDedupe(HashLock *lock, DataVIO *dataVIO, bool hasClaim) +{ + if (!hasClaim && !claimPBNLockIncrement(lock->duplicateLock)) { + // Out of increments, so must roll over to a new lock. + forkHashLock(lock, dataVIO); + return; + } + + // Deduplicate against the lock's verified location. + setDuplicateLocation(dataVIO, lock->duplicate); + launchDuplicateZoneCallback(dataVIO, shareBlock, + THIS_LOCATION("$F;cb=shareBlock")); +} + +/** + * Enter the hash lock state where DataVIOs deduplicate in parallel against a + * true copy of their data on disk. If the agent itself needs to deduplicate, + * an increment for it must already have been claimed from the duplicate lock, + * ensuring the hash lock will still have a DataVIO holding it. + * + * @param lock The hash lock + * @param agent The DataVIO acting as the agent for the lock + * @param agentIsDone true only if the agent has already written + * or deduplicated against its data + **/ +static void startDeduping(HashLock *lock, DataVIO *agent, bool agentIsDone) +{ + setHashLockState(lock, HASH_LOCK_DEDUPING); + + // We don't take the downgraded allocation lock from the agent unless we + // actually need to deduplicate against it. + if (lock->duplicateLock == NULL) { + ASSERT_LOG_ONLY(!isCompressed(agent->newMapped.state), + "compression must have shared a lock"); + ASSERT_LOG_ONLY(agentIsDone, "agent must have written the new duplicate"); + transferAllocationLock(agent); + } + + ASSERT_LOG_ONLY(isPBNReadLock(lock->duplicateLock), + "duplicateLock must be a PBN read lock"); + + /* + * This state is not like any of the other states. There is no designated + * agent--the agent transitioning to this state and all the waiters will be + * launched to deduplicate in parallel. + */ + setAgent(lock, NULL); + + /* + * Launch the agent (if not already deduplicated) and as many lock waiters + * as we have available increments for on the dedupe path. If we run out of + * increments, rollover will be triggered and the remaining waiters will be + * transferred to the new lock. 
+   */
+  if (!agentIsDone) {
+    launchDedupe(lock, agent, true);
+    agent = NULL;
+  }
+  while (hasWaiters(&lock->waiters)) {
+    launchDedupe(lock, dequeueLockWaiter(lock), false);
+  }
+
+  if (agentIsDone) {
+    /*
+     * In the degenerate case where all the waiters rolled over to a new lock,
+     * this will continue to use the old agent to clean up this lock;
+     * otherwise it just lets the agent exit the lock.
+     */
+    finishDeduping(lock, agent);
+  }
+}
+
+/**
+ * Handle the result of the agent for the lock comparing its data to the
+ * duplicate candidate. This continuation is registered in startVerifying().
+ *
+ * @param completion  The completion of the DataVIO used to verify dedupe
+ **/
+static void finishVerifying(VDOCompletion *completion)
+{
+  DataVIO *agent = asDataVIO(completion);
+  assertHashLockAgent(agent, __func__);
+  HashLock *lock = agent->hashLock;
+
+  if (completion->result != VDO_SUCCESS) {
+    // XXX VDOSTORY-190 should convert verify IO errors to verification failure
+    abortHashLock(lock, agent);
+    return;
+  }
+
+  lock->verified = agent->isDuplicate;
+
+  // Only count the result of the initial verification of the advice as valid
+  // or stale, and not any re-verifications due to PBN lock releases.
+  if (!lock->verifyCounted) {
+    lock->verifyCounted = true;
+    if (lock->verified) {
+      bumpHashZoneValidAdviceCount(agent->hashZone);
+    } else {
+      bumpHashZoneStaleAdviceCount(agent->hashZone);
+    }
+  }
+
+  // Even if the block is a verified duplicate, we can't start to deduplicate
+  // unless we can claim a reference count increment for the agent.
+  if (lock->verified && !claimPBNLockIncrement(lock->duplicateLock)) {
+    agent->isDuplicate = false;
+    lock->verified = false;
+  }
+
+  if (lock->verified) {
+    /*
+     * VERIFYING -> DEDUPING transition: The advice is for a true duplicate,
+     * so start deduplicating against it, if references are available.
+     */
+    startDeduping(lock, agent, false);
+  } else {
+    /*
+     * VERIFYING -> UNLOCKING transition: Either the verify failed or we'd try
+     * to dedupe and roll over immediately, which would fail because it would
+     * leave the lock without an agent to release the PBN lock. In both cases,
+     * the data will have to be written or compressed, but first the advice
+     * PBN must be unlocked by the VERIFYING agent.
+     */
+    lock->updateAdvice = true;
+    startUnlocking(lock, agent);
+  }
+}
+
+/**
+ * Continue the deduplication path for a hash lock by using the agent to read
+ * (and possibly decompress) the data at the candidate duplicate location,
+ * comparing it to the data in the agent to verify that the candidate is
+ * identical to all the DataVIOs sharing the hash. If so, it can be
+ * deduplicated against; otherwise a DataVIO allocation will have to be
+ * written to and used for dedupe.
+ *
+ * @param lock   The hash lock (must be LOCKING)
+ * @param agent  The DataVIO to use to read and compare candidate data
+ **/
+static void startVerifying(HashLock *lock, DataVIO *agent)
+{
+  setHashLockState(lock, HASH_LOCK_VERIFYING);
+  ASSERT_LOG_ONLY(!lock->verified, "hash lock only verifies advice once");
+
+  /*
+   * XXX VDOSTORY-190 Optimization: This is one of those places where the zone
+   * and continuation we want to use depend on the outcome of the comparison.
+   * If we could choose which path in the layer thread before continuing, we
+   * could save a thread transition in one of the two cases (assuming we're
+   * willing to delay visibility of the hash lock state change).
+ */ + VDOCompletion *completion = dataVIOAsCompletion(agent); + agent->lastAsyncOperation = VERIFY_DEDUPLICATION; + setHashZoneCallback(agent, finishVerifying, THIS_LOCATION(NULL)); + completion->layer->verifyDuplication(agent); +} + +/** + * Handle the result of the agent for the lock attempting to obtain a PBN read + * lock on the candidate duplicate block. this continuation is registered in + * lockDuplicatePBN(). + * + * @param completion The completion of the DataVIO that attempted to get + * the read lock + **/ +static void finishLocking(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + assertHashLockAgent(agent, __func__); + HashLock *lock = agent->hashLock; + + if (completion->result != VDO_SUCCESS) { + // XXX clearDuplicateLocation()? + agent->isDuplicate = false; + abortHashLock(lock, agent); + return; + } + + if (!agent->isDuplicate) { + ASSERT_LOG_ONLY(lock->duplicateLock == NULL, + "must not hold duplicateLock if not flagged as a duplicate"); + /* + * LOCKING -> WRITING transition: The advice block is being modified or + * has no available references, so try to write or compress the data, + * remembering to update UDS later with the new advice. + */ + bumpHashZoneStaleAdviceCount(agent->hashZone); + lock->updateAdvice = true; + startWriting(lock, agent); + return; + } + + ASSERT_LOG_ONLY(lock->duplicateLock != NULL, + "must hold duplicateLock if flagged as a duplicate"); + + if (!lock->verified) { + /* + * LOCKING -> VERIFYING transition: Continue on the unverified dedupe path, + * reading the candidate duplicate and comparing it to the agent's data to + * decide whether it is a true duplicate or stale advice. + */ + startVerifying(lock, agent); + return; + } + + if (!claimPBNLockIncrement(lock->duplicateLock)) { + /* + * LOCKING -> UNLOCKING transition: The verified block was re-locked, but + * has no available increments left. Must first release the useless PBN + * read lock before rolling over to a new copy of the block. + */ + agent->isDuplicate = false; + lock->verified = false; + lock->updateAdvice = true; + startUnlocking(lock, agent); + return; + } + + /* + * LOCKING -> DEDUPING transition: Continue on the verified dedupe path, + * deduplicating against a location that was previously verified or + * written to. + */ + startDeduping(lock, agent, false); +} + +/** + * Acquire a read lock on the PBN of the block containing candidate duplicate + * data (compressed or uncompressed). If the PBN is already locked for + * writing, the lock attempt is abandoned and isDuplicate will be cleared + * before calling back. this continuation is launched from startLocking(), and + * calls back to finishLocking() on the hash zone thread. + * + * @param completion The completion of the DataVIO attempting to acquire the + * physical block lock on behalf of its hash lock + **/ +static void lockDuplicatePBN(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + PhysicalZone *zone = agent->duplicate.zone; + assertInDuplicateZone(agent); + + setHashZoneCallback(agent, finishLocking, THIS_LOCATION(NULL)); + + // While in the zone that owns it, find out how many additional references + // can be made to the block if it turns out to truly be a duplicate. 
+ SlabDepot *depot = getSlabDepot(getVDOFromDataVIO(agent)); + unsigned int incrementLimit = getIncrementLimit(depot, agent->duplicate.pbn); + if (incrementLimit == 0) { + // We could deduplicate against it later if a reference happened to be + // released during verification, but it's probably better to bail out now. + // XXX clearDuplicateLocation()? + agent->isDuplicate = false; + continueDataVIO(agent, VDO_SUCCESS); + return; + } + + PBNLock *lock; + int result = attemptPBNLock(zone, agent->duplicate.pbn, VIO_READ_LOCK, + &lock); + if (result != VDO_SUCCESS) { + continueDataVIO(agent, result); + return; + } + + if (!isPBNReadLock(lock)) { + /* + * There are three cases of write locks: uncompressed data block writes, + * compressed (packed) block writes, and block map page writes. In all + * three cases, we give up on trying to verify the advice and don't bother + * to try deduplicate against the data in the write lock holder. + * + * 1) We don't ever want to try to deduplicate against a block map page. + * + * 2a) It's very unlikely we'd deduplicate against an entire packed block, + * both because of the chance of matching it, and because we don't record + * advice for it, but for the uncompressed representation of all the + * fragments it contains. The only way we'd be getting lock contention is + * if we've written the same representation coincidentally before, had it + * become unreferenced, and it just happened to be packed together from + * compressed writes when we go to verify the lucky advice. Giving up is a + * miniscule loss of potential dedupe. + * + * 2b) If the advice is for a slot of a compressed block, it's about to + * get smashed, and the write smashing it cannot contain our data--it + * would have to be writing on behalf of our hash lock, but that's + * impossible since we're the lock agent. + * + * 3a) If the lock is held by a DataVIO with different data, the advice is + * already stale or is about to become stale. + * + * 3b) If the lock is held by a DataVIO that matches us, we may as well + * either write it ourselves (or reference the copy we already wrote) + * instead of potentially having many duplicates wait for the lock holder + * to write, journal, hash, and finally arrive in the hash lock. All we + * lose is a chance to avoid a UDS update in the very rare case of advice + * for a free block that just happened to be allocated to a DataVIO with + * the same hash. In async mode, there's also a chance to save on a block + * write, at the cost of a block verify. Saving on a full block compare in + * all stale advice cases almost certainly outweighs saving a UDS update + * in a lucky case where advice would have been saved from becoming stale. + */ + // XXX clearDuplicateLocation()? + agent->isDuplicate = false; + continueDataVIO(agent, VDO_SUCCESS); + return; + } + + if (lock->holderCount == 0) { + // Ensure that the newly-locked block is referenced. + Slab *slab = getSlab(depot, agent->duplicate.pbn); + result = acquireProvisionalReference(slab, agent->duplicate.pbn, lock); + if (result != VDO_SUCCESS) { + logWarningWithStringError(result, + "Error acquiring provisional reference for " + "dedupe candidate; aborting dedupe"); + agent->isDuplicate = false; + releasePBNLock(zone, agent->duplicate.pbn, &lock); + continueDataVIO(agent, result); + return; + } + + /* + * The increment limit we grabbed earlier is still valid. The lock now + * holds the rights to acquire all those references. Those rights will be + * claimed by hash locks sharing this read lock. 
+     */
+    lock->incrementLimit = incrementLimit;
+  }
+
+  // We've successfully acquired a read lock on behalf of the hash lock,
+  // so mark it as such.
+  setDuplicateLock(agent->hashLock, lock);
+
+  /*
+   * XXX VDOSTORY-190 Optimization: Same as startLocking() lazily changing
+   * state to save on having to switch back to the hash zone thread. Here we
+   * could directly launch the block verify, then switch to a hash thread.
+   */
+  continueDataVIO(agent, VDO_SUCCESS);
+}
+
+/**
+ * Continue deduplication for a hash lock that has obtained valid advice
+ * of a potential duplicate through its agent.
+ *
+ * @param lock   The hash lock (currently must be QUERYING)
+ * @param agent  The DataVIO bearing the dedupe advice
+ **/
+static void startLocking(HashLock *lock, DataVIO *agent)
+{
+  ASSERT_LOG_ONLY(lock->duplicateLock == NULL,
+                  "must not acquire a duplicate lock when already holding it");
+
+  setHashLockState(lock, HASH_LOCK_LOCKING);
+
+  /*
+   * XXX VDOSTORY-190 Optimization: If we arrange to continue on the duplicate
+   * zone thread when accepting the advice, and don't explicitly change lock
+   * states (or use an agent-local state, or an atomic), we can avoid a thread
+   * transition here.
+   */
+  agent->lastAsyncOperation = ACQUIRE_PBN_READ_LOCK;
+  launchDuplicateZoneCallback(agent, lockDuplicatePBN, THIS_LOCATION(NULL));
+}
+
+/**
+ * Re-entry point for the lock agent after it has finished writing or
+ * compressing its copy of the data block. The agent will never need to dedupe
+ * against anything, so it's done with the lock, but the lock may not be
+ * finished with it, as a UDS update might still be needed.
+ *
+ * If there are other lock holders, the agent will hand the job to one of them
+ * and exit, leaving the lock to deduplicate against the just-written block.
+ * If there are no other lock holders, the agent either exits (and later tears
+ * down the hash lock), or it remains the agent and updates UDS.
+ *
+ * @param lock   The hash lock, which must be in state WRITING
+ * @param agent  The DataVIO that wrote its data for the lock
+ **/
+static void finishWriting(HashLock *lock, DataVIO *agent)
+{
+  // Dedupe against the data block or compressed block slot the agent wrote.
+  // Since we know the write succeeded, there's no need to verify it.
+  lock->duplicate = agent->newMapped;
+  lock->verified  = true;
+
+  if (isCompressed(lock->duplicate.state) && lock->registered) {
+    // Compression means the location we gave in the UDS query is not the
+    // location we're using to deduplicate.
+    lock->updateAdvice = true;
+  }
+
+  // If there are any waiters, we need to start deduping them.
+  if (hasWaiters(&lock->waiters)) {
+    /*
+     * WRITING -> DEDUPING transition: An asynchronously-written block
+     * failed to compress, so the PBN lock on the written copy was already
+     * transferred. The agent is done with the lock, but the lock may
+     * still need to use it to clean up after rollover.
+     */
+    startDeduping(lock, agent, true);
+    return;
+  }
+
+  // There are no waiters and the agent has successfully written, so take a
+  // step towards being able to release the hash lock (or just release it).
+  if (lock->updateAdvice) {
+    /*
+     * WRITING -> UPDATING transition: There's no waiter and a UDS update is
+     * needed, so retain the WRITING agent and use it to launch the update.
+     * This happens on compression, rollover, or the QUERYING agent not having
+     * an allocation.
+ */ + startUpdating(lock, agent); + } else if (lock->duplicateLock != NULL) { + /* + * WRITING -> UNLOCKING transition: There's no waiter and no update + * needed, but the compressed write gave us a shared duplicate lock that + * we must release. + */ + setDuplicateLocation(agent, lock->duplicate); + startUnlocking(lock, agent); + } else { + /* + * WRITING -> DESTROYING transition: There's no waiter, no update needed, + * and no duplicate lock held, so both the agent and lock have no more + * work to do. The agent will release its allocation lock in cleanup. + */ + // XXX startDestroying(lock, agent); + startBypassing(lock, NULL); + exitHashLock(agent); + } +} + +/** + * Search through the lock waiters for a DataVIO that has an allocation. If + * one is found, swap agents, put the old agent at the head of the wait queue, + * then return the new agent. Otherwise, just return the current agent. + * + * @param lock The hash lock to modify + **/ +static DataVIO *selectWritingAgent(HashLock *lock) +{ + // This should-be-impossible condition is the only cause for + // enqueueDataVIO() to fail later on, where it would be a pain to handle. + int result = ASSERT(!isWaiting(dataVIOAsWaiter(lock->agent)), + "agent must not be waiting"); + if (result != VDO_SUCCESS) { + return lock->agent; + } + + WaitQueue tempQueue; + initializeWaitQueue(&tempQueue); + + // Move waiters to the temp queue one-by-one until we find an allocation. + // Not ideal to search, but it only happens when nearly out of space. + DataVIO *dataVIO; + while (((dataVIO = dequeueLockWaiter(lock)) != NULL) + && !hasAllocation(dataVIO)) { + // Use the lower-level enqueue since we're just moving waiters around. + int result = enqueueWaiter(&tempQueue, dataVIOAsWaiter(dataVIO)); + // The only error is the DataVIO already being on a wait queue, and since + // we just dequeued it, that could only happen due to a memory smash or + // concurrent use of that DataVIO. + ASSERT_LOG_ONLY(result == VDO_SUCCESS, "impossible enqueueWaiter error"); + } + + if (dataVIO != NULL) { + // Move the rest of the waiters over to the temp queue, preserving the + // order they arrived at the lock. + transferAllWaiters(&lock->waiters, &tempQueue); + + // The current agent is being replaced and will have to wait to dedupe; + // make it the first waiter since it was the first to reach the lock. + int result = enqueueDataVIO(&lock->waiters, lock->agent, + THIS_LOCATION(NULL)); + ASSERT_LOG_ONLY(result == VDO_SUCCESS, + "impossible enqueueDataVIO error after isWaiting checked"); + setAgent(lock, dataVIO); + } else { + // No one has an allocation, so keep the current agent. + dataVIO = lock->agent; + } + + // Swap all the waiters back onto the lock's queue. + transferAllWaiters(&tempQueue, &lock->waiters); + return dataVIO; +} + +/** + * Begin the non-duplicate write path for a hash lock that had no advice, + * selecting a DataVIO with an allocation as a new agent, if necessary, + * then resuming the agent on the DataVIO write path. + * + * @param lock The hash lock (currently must be QUERYING) + * @param agent The DataVIO currently acting as the agent for the lock + **/ +static void startWriting(HashLock *lock, DataVIO *agent) +{ + setHashLockState(lock, HASH_LOCK_WRITING); + + // The agent might not have received an allocation and so can't be used for + // writing, but it's entirely possible that one of the waiters did. + if (!hasAllocation(agent)) { + agent = selectWritingAgent(lock); + // If none of the waiters had an allocation, the writes all have to fail. 
+    if (!hasAllocation(agent)) {
+      /*
+       * XXX VDOSTORY-190 Should we keep a variant of BYPASSING that causes
+       * new arrivals to fail immediately if they don't have an allocation? It
+       * might be possible that on some path there would be non-waiters still
+       * referencing the lock, so it would remain in the map as everything is
+       * currently spelled, even if the agent and all the waiters release.
+       */
+      startBypassing(lock, agent);
+      return;
+    }
+  }
+
+  // If the agent compresses, it might wait indefinitely in the packer,
+  // which would be bad if there are any other DataVIOs waiting.
+  if (hasWaiters(&lock->waiters)) {
+    // XXX in sync mode, transition directly to LOCKING to start dedupe?
+    cancelCompression(agent);
+  }
+
+  /*
+   * Send the agent to the compress/pack/async-write path in vioWrite. If it
+   * succeeds, it will return to the hash lock via continueHashLock() and call
+   * finishWriting().
+   */
+  compressData(agent);
+}
+
+/**
+ * Process the result of a UDS query performed by the agent for the lock. This
+ * continuation is registered in startQuerying().
+ *
+ * @param completion  The completion of the DataVIO that performed the query
+ **/
+static void finishQuerying(VDOCompletion *completion)
+{
+  DataVIO *agent = asDataVIO(completion);
+  assertHashLockAgent(agent, __func__);
+  HashLock *lock = agent->hashLock;
+
+  if (completion->result != VDO_SUCCESS) {
+    abortHashLock(lock, agent);
+    return;
+  }
+
+  if (agent->isDuplicate) {
+    lock->duplicate = agent->duplicate;
+    /*
+     * QUERYING -> LOCKING transition: Valid advice was obtained from UDS.
+     * Use the QUERYING agent to start the hash lock on the unverified dedupe
+     * path, verifying that the advice can be used.
+     */
+    startLocking(lock, agent);
+  } else {
+    // The agent will be used as the duplicate if it has an allocation; if it
+    // does, that location was posted to UDS, so no update will be needed.
+    lock->updateAdvice = !hasAllocation(agent);
+    /*
+     * QUERYING -> WRITING transition: There was no advice or the advice
+     * wasn't valid, so try to write or compress the data.
+     */
+    startWriting(lock, agent);
+  }
+}
+
+/**
+ * Start deduplication for a hash lock that has finished initializing by
+ * making the DataVIO that requested it the agent, entering the QUERYING
+ * state, and using the agent to perform the UDS query on behalf of the lock.
+ *
+ * @param lock     The initialized hash lock
+ * @param dataVIO  The DataVIO that has just obtained the new lock
+ **/
+static void startQuerying(HashLock *lock, DataVIO *dataVIO)
+{
+  setAgent(lock, dataVIO);
+  setHashLockState(lock, HASH_LOCK_QUERYING);
+
+  VDOCompletion *completion = dataVIOAsCompletion(dataVIO);
+  dataVIO->lastAsyncOperation = CHECK_FOR_DEDUPLICATION;
+  setHashZoneCallback(dataVIO, finishQuerying, THIS_LOCATION(NULL));
+  completion->layer->checkForDuplication(dataVIO);
+}
+
+/**
+ * Complain that a DataVIO has entered a HashLock that is in an unimplemented
+ * or unusable state and continue the DataVIO with an error.
+ * + * @param lock The hash lock + * @param dataVIO The DataVIO attempting to enter the lock + **/ +static void reportBogusLockState(HashLock *lock, DataVIO *dataVIO) +{ + int result = ASSERT_FALSE("hash lock must not be in unimplemented state %s", + getHashLockStateName(lock->state)); + continueDataVIOIn(dataVIO, result, compressDataCallback); +} + +/**********************************************************************/ +void enterHashLock(DataVIO *dataVIO) +{ + HashLock *lock = dataVIO->hashLock; + switch (lock->state) { + case HASH_LOCK_INITIALIZING: + startQuerying(lock, dataVIO); + break; + + case HASH_LOCK_QUERYING: + case HASH_LOCK_WRITING: + case HASH_LOCK_UPDATING: + case HASH_LOCK_LOCKING: + case HASH_LOCK_VERIFYING: + case HASH_LOCK_UNLOCKING: + // The lock is busy, and can't be shared yet. + waitOnHashLock(lock, dataVIO); + break; + + case HASH_LOCK_BYPASSING: + // Bypass dedupe entirely. + compressData(dataVIO); + break; + + case HASH_LOCK_DEDUPING: + launchDedupe(lock, dataVIO, false); + break; + + case HASH_LOCK_DESTROYING: + // A lock in this state should not be acquired by new VIOs. + reportBogusLockState(lock, dataVIO); + break; + + default: + reportBogusLockState(lock, dataVIO); + } +} + +/**********************************************************************/ +void continueHashLock(DataVIO *dataVIO) +{ + HashLock *lock = dataVIO->hashLock; + // XXX VDOSTORY-190 Eventually we may be able to fold the error handling + // in at this point instead of using a separate entry point for it. + + switch (lock->state) { + case HASH_LOCK_WRITING: + ASSERT_LOG_ONLY(dataVIO == lock->agent, + "only the lock agent may continue the lock"); + finishWriting(lock, dataVIO); + break; + + case HASH_LOCK_DEDUPING: + finishDeduping(lock, dataVIO); + break; + + case HASH_LOCK_BYPASSING: + // This DataVIO has finished the write path and the lock doesn't need it. + // XXX This isn't going to be correct if DEDUPING ever uses BYPASSING. + finishDataVIO(dataVIO, VDO_SUCCESS); + break; + + case HASH_LOCK_INITIALIZING: + case HASH_LOCK_QUERYING: + case HASH_LOCK_UPDATING: + case HASH_LOCK_LOCKING: + case HASH_LOCK_VERIFYING: + case HASH_LOCK_UNLOCKING: + case HASH_LOCK_DESTROYING: + // A lock in this state should never be re-entered. + reportBogusLockState(lock, dataVIO); + break; + + default: + reportBogusLockState(lock, dataVIO); + } +} + +/**********************************************************************/ +void continueHashLockOnError(DataVIO *dataVIO) +{ + // XXX We could simply use continueHashLock() and check for errors in that. + abortHashLock(dataVIO->hashLock, dataVIO); +} + +/** + * Check whether the data in DataVIOs sharing a lock is different than in a + * DataVIO seeking to share the lock, which should only be possible in the + * extremely unlikely case of a hash collision. 
+ * + * @param lock The lock to check + * @param candidate The DataVIO seeking to share the lock + * + * @return true if the given DataVIO must not share the lock + * because it doesn't have the same data as the lock holders + **/ +static bool isHashCollision(HashLock *lock, DataVIO *candidate) +{ + if (isRingEmpty(&lock->duplicateRing)) { + return false; + } + + DataVIO *lockHolder = dataVIOFromLockNode(lock->duplicateRing.next); + PhysicalLayer *layer = dataVIOAsCompletion(candidate)->layer; + bool collides = !layer->compareDataVIOs(lockHolder, candidate); + + if (collides) { + bumpHashZoneCollisionCount(candidate->hashZone); + } else { + bumpHashZoneDataMatchCount(candidate->hashZone); + } + + return collides; +} + +/**********************************************************************/ +static inline int assertHashLockPreconditions(const DataVIO *dataVIO) +{ + int result = ASSERT(dataVIO->hashLock == NULL, + "must not already hold a hash lock"); + if (result != VDO_SUCCESS) { + return result; + } + result = ASSERT(isRingEmpty(&dataVIO->hashLockNode), + "must not already be a member of a hash lock ring"); + if (result != VDO_SUCCESS) { + return result; + } + return ASSERT(dataVIO->recoverySequenceNumber == 0, + "must not hold a recovery lock when getting a hash lock"); +} + +/**********************************************************************/ +int acquireHashLock(DataVIO *dataVIO) +{ + int result = assertHashLockPreconditions(dataVIO); + if (result != VDO_SUCCESS) { + return result; + } + + HashLock *lock; + result = acquireHashLockFromZone(dataVIO->hashZone, &dataVIO->chunkName, + NULL, &lock); + if (result != VDO_SUCCESS) { + return result; + } + + if (isHashCollision(lock, dataVIO)) { + // Hash collisions are extremely unlikely, but the bogus dedupe would be a + // data corruption. Bypass dedupe entirely by leaving hashLock unset. + // XXX clear hashZone too? + return VDO_SUCCESS; + } + + setHashLock(dataVIO, lock); + return VDO_SUCCESS; +} + +/**********************************************************************/ +void releaseHashLock(DataVIO *dataVIO) +{ + HashLock *lock = dataVIO->hashLock; + if (lock == NULL) { + return; + } + + setHashLock(dataVIO, NULL); + + if (lock->referenceCount > 0) { + // The lock is still in use by other DataVIOs. + return; + } + + setHashLockState(lock, HASH_LOCK_DESTROYING); + returnHashLockToZone(dataVIO->hashZone, &lock); +} + +/** + * Transfer a DataVIO's downgraded allocation PBN lock to the DataVIO's hash + * lock, converting it to a duplicate PBN lock. + * + * @param dataVIO The DataVIO holding the allocation lock to transfer + **/ +static void transferAllocationLock(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(dataVIO->newMapped.pbn == getDataVIOAllocation(dataVIO), + "transferred lock must be for the block written"); + + AllocatingVIO *allocatingVIO = dataVIOAsAllocatingVIO(dataVIO); + PBNLock *pbnLock = allocatingVIO->allocationLock; + allocatingVIO->allocationLock = NULL; + allocatingVIO->allocation = ZERO_BLOCK; + + ASSERT_LOG_ONLY(isPBNReadLock(pbnLock), + "must have downgraded the allocation lock before transfer"); + + HashLock *hashLock = dataVIO->hashLock; + hashLock->duplicate = dataVIO->newMapped; + dataVIO->duplicate = dataVIO->newMapped; + + // Since the lock is being transferred, the holder count doesn't change (and + // isn't even safe to examine on this thread). 
+ hashLock->duplicateLock = pbnLock; +} + +/**********************************************************************/ +void shareCompressedWriteLock(DataVIO *dataVIO, PBNLock *pbnLock) +{ + ASSERT_LOG_ONLY(getDuplicateLock(dataVIO) == NULL, + "a duplicate PBN lock should not exist when writing"); + ASSERT_LOG_ONLY(isCompressed(dataVIO->newMapped.state), + "lock transfer must be for a compressed write"); + assertInNewMappedZone(dataVIO); + + // First sharer downgrades the lock. + if (!isPBNReadLock(pbnLock)) { + downgradePBNWriteLock(pbnLock); + } + + // Get a share of the PBN lock, ensuring it cannot be released until + // after this DataVIO has had a chance to journal a reference. + dataVIO->duplicate = dataVIO->newMapped; + dataVIO->hashLock->duplicate = dataVIO->newMapped; + setDuplicateLock(dataVIO->hashLock, pbnLock); + + // Claim a reference for this DataVIO, which is necessary since another + // HashLock might start deduplicating against it before our incRef. + bool claimed = claimPBNLockIncrement(pbnLock); + ASSERT_LOG_ONLY(claimed, "impossible to fail to claim an initial increment"); +} diff --git a/vdo/base/hashLock.h b/vdo/base/hashLock.h new file mode 100644 index 0000000..b21e465 --- /dev/null +++ b/vdo/base/hashLock.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashLock.h#3 $ + */ + +#ifndef HASH_LOCK_H +#define HASH_LOCK_H + +#include "types.h" + +/** + * Get the PBN lock on the duplicate data location for a DataVIO from the + * HashLock the DataVIO holds (if there is one). + * + * @param dataVIO The DataVIO to query + * + * @return The PBN lock on the DataVIO's duplicate location + **/ +PBNLock *getDuplicateLock(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Acquire or share a lock on the hash (chunk name) of the data in a DataVIO, + * updating the DataVIO to reference the lock. This must only be called in the + * correct thread for the zone. In the unlikely case of a hash collision, this + * function will succeed, but the DataVIO will not get a lock reference. + * + * @param dataVIO The DataVIO acquiring a lock on its chunk name + **/ +int acquireHashLock(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Asynchronously process a DataVIO that has just acquired its reference to a + * hash lock. This may place the DataVIO on a wait queue, or it may use the + * DataVIO to perform operations on the lock's behalf. 
+ * + * @param dataVIO The DataVIO that has just acquired a lock on its chunk name + **/ +void enterHashLock(DataVIO *dataVIO); + +/** + * Asynchronously continue processing a DataVIO in its hash lock after it has + * finished writing, compressing, or deduplicating, so it can share the result + * with any DataVIOs waiting in the hash lock, or update Albireo, or simply + * release its share of the lock. This must only be called in the correct + * thread for the hash zone. + * + * @param dataVIO The DataVIO to continue processing in its hash lock + **/ +void continueHashLock(DataVIO *dataVIO); + +/** + * Re-enter the hash lock after encountering an error, to clean up the hash + * lock. + * + * @param dataVIO The DataVIO with an error + **/ +void continueHashLockOnError(DataVIO *dataVIO); + +/** + * Release a DataVIO's share of a hash lock, if held, and null out the + * DataVIO's reference to it. This must only be called in the correct thread + * for the hash zone. + * + * If the DataVIO is the only one holding the lock, this also releases any + * resources or locks used by the hash lock (such as a PBN read lock on a + * block containing data with the same hash) and returns the lock to the hash + * zone's lock pool. + * + * @param dataVIO The DataVIO releasing its hash lock + **/ +void releaseHashLock(DataVIO *dataVIO); + +/** + * Make a DataVIO's hash lock a shared holder of the PBN lock on the + * compressed block to which its data was just written. If the lock is still a + * write lock (as it will be for the first share), it will be converted to a + * read lock. This also reserves a reference count increment for the DataVIO. + * + * @param dataVIO The DataVIO which was just compressed + * @param pbnLock The PBN lock on the compressed block + **/ +void shareCompressedWriteLock(DataVIO *dataVIO, PBNLock *pbnLock); + +#endif // HASH_LOCK_H diff --git a/vdo/base/hashLockInternals.h b/vdo/base/hashLockInternals.h new file mode 100644 index 0000000..67b5634 --- /dev/null +++ b/vdo/base/hashLockInternals.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashLockInternals.h#2 $ + */ + +#ifndef HASH_LOCK_INTERNALS_H +#define HASH_LOCK_INTERNALS_H + +#include "completion.h" +#include "ringNode.h" +#include "types.h" +#include "uds.h" +#include "waitQueue.h" + +typedef enum { + /** State for locks that are not in use or are being initialized. */ + HASH_LOCK_INITIALIZING = 0, + + // This is the sequence of states typically used on the non-dedupe path. + HASH_LOCK_QUERYING, + HASH_LOCK_WRITING, + HASH_LOCK_UPDATING, + + // The remaining states are typically used on the dedupe path in this order. 
+  HASH_LOCK_LOCKING,
+  HASH_LOCK_VERIFYING,
+  HASH_LOCK_DEDUPING,
+  HASH_LOCK_UNLOCKING,
+
+  // XXX This is a temporary state denoting a lock which is sending VIOs back
+  // to the old dedupe and vioWrite pathways. It won't be in the final version
+  // of VDOSTORY-190.
+  HASH_LOCK_BYPASSING,
+
+  /**
+   * Terminal state for locks returning to the pool. Must be last both because
+   * it's the final state, and also because it's used to count the states.
+   **/
+  HASH_LOCK_DESTROYING,
+} HashLockState;
+
+struct hashLock {
+  /** When the lock is unused, this RingNode allows the lock to be pooled */
+  RingNode poolNode;
+
+  /** The block hash covered by this lock */
+  UdsChunkName hash;
+
+  /**
+   * A ring containing the DataVIOs sharing this lock, all having the same
+   * chunk name and data block contents, linked by their hashLockNode fields.
+   **/
+  RingNode duplicateRing;
+
+  /** The number of DataVIOs sharing this lock instance */
+  VIOCount referenceCount;
+
+  /** The maximum value of referenceCount in the lifetime of this lock */
+  VIOCount maxReferences;
+
+  /** The current state of this lock */
+  HashLockState state;
+
+  /** True if the UDS index should be updated with new advice */
+  bool updateAdvice;
+
+  /** True if the advice has been verified to be a true duplicate */
+  bool verified;
+
+  /** True if the lock has already accounted for an initial verification */
+  bool verifyCounted;
+
+  /** True if this lock is registered in the lock map (cleared on rollover) */
+  bool registered;
+
+  /**
+   * If verified is false, this is the location of a possible duplicate.
+   * If verified is true, it is the verified location of a true duplicate.
+   **/
+  ZonedPBN duplicate;
+
+  /** The PBN lock on the block containing the duplicate data */
+  PBNLock *duplicateLock;
+
+  /** The DataVIO designated to act on behalf of the lock */
+  DataVIO *agent;
+
+  /**
+   * Other DataVIOs with data identical to the agent that are currently
+   * waiting for the agent to get the information they all need to
+   * deduplicate--either against each other, or against an existing duplicate
+   * on disk.
+   **/
+  WaitQueue waiters;
+};
+
+/**
+ * Initialize a HashLock instance which has been newly allocated.
+ *
+ * @param lock  The lock to initialize
+ **/
+static inline void initializeHashLock(HashLock *lock)
+{
+  initializeRing(&lock->poolNode);
+  initializeRing(&lock->duplicateRing);
+  initializeWaitQueue(&lock->waiters);
+}
+
+/**
+ * Get the string representation of a hash lock state.
+ *
+ * @param state  The hash lock state
+ *
+ * @return The short string representing the state
+ **/
+const char *getHashLockStateName(HashLockState state)
+  __attribute__((warn_unused_result));
+
+#endif // HASH_LOCK_INTERNALS_H
diff --git a/vdo/base/hashZone.c b/vdo/base/hashZone.c
new file mode 100644
index 0000000..61345a7
--- /dev/null
+++ b/vdo/base/hashZone.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashZone.c#3 $
+ */
+
+#include "hashZone.h"
+
+#include "logger.h"
+#include "memoryAlloc.h"
+#include "numeric.h"
+#include "permassert.h"
+
+#include "constants.h"
+#include "dataVIO.h"
+#include "hashLock.h"
+#include "hashLockInternals.h"
+#include "pointerMap.h"
+#include "ringNode.h"
+#include "statistics.h"
+#include "threadConfig.h"
+#include "types.h"
+#include "vdoInternal.h"
+
+enum {
+  LOCK_POOL_CAPACITY = MAXIMUM_USER_VIOS,
+};
+
+/**
+ * These fields are only modified by the locks sharing the hash zone thread,
+ * but are queried by other threads.
+ **/
+typedef struct atomicHashLockStatistics {
+  /** Number of times the UDS advice proved correct */
+  Atomic64 dedupeAdviceValid;
+
+  /** Number of times the UDS advice proved incorrect */
+  Atomic64 dedupeAdviceStale;
+
+  /** Number of writes with the same data as another in-flight write */
+  Atomic64 concurrentDataMatches;
+
+  /** Number of writes whose hash collided with an in-flight write */
+  Atomic64 concurrentHashCollisions;
+} AtomicHashLockStatistics;
+
+struct hashZone {
+  /** Which hash zone this is */
+  ZoneCount zoneNumber;
+
+  /** The thread ID for this zone */
+  ThreadID threadID;
+
+  /** Mapping from chunkName fields to HashLocks */
+  PointerMap *hashLockMap;
+
+  /** Ring containing all unused HashLocks */
+  RingNode lockPool;
+
+  /** Statistics shared by all hash locks in this zone */
+  AtomicHashLockStatistics statistics;
+
+  /** Array of all HashLocks */
+  HashLock *lockArray;
+};
+
+/**
+ * Implements PointerKeyComparator.
+ **/
+static bool compareKeys(const void *thisKey, const void *thatKey)
+{
+  // Null keys are not supported.
+  return (memcmp(thisKey, thatKey, sizeof(UdsChunkName)) == 0);
+}
+
+/**
+ * Implements PointerKeyHasher.
+ **/
+static uint32_t hashKey(const void *key)
+{
+  const UdsChunkName *name = key;
+  /*
+   * Use a fragment of the chunk name as a hash code. It must not overlap with
+   * fragments used elsewhere to ensure uniform distributions.
+ */ + // XXX pick an offset in the chunk name that isn't used elsewhere + return getUInt32LE(&name->name[4]); +} + +/**********************************************************************/ +static inline HashLock *asHashLock(RingNode *poolNode) +{ + STATIC_ASSERT(offsetof(HashLock, poolNode) == 0); + return (HashLock *) poolNode; +} + +/**********************************************************************/ +int makeHashZone(VDO *vdo, ZoneCount zoneNumber, HashZone **zonePtr) +{ + HashZone *zone; + int result = ALLOCATE(1, HashZone, __func__, &zone); + if (result != VDO_SUCCESS) { + return result; + } + + result = makePointerMap(LOCK_MAP_CAPACITY, 0, compareKeys, hashKey, + &zone->hashLockMap); + if (result != VDO_SUCCESS) { + freeHashZone(&zone); + return result; + } + + zone->zoneNumber = zoneNumber; + zone->threadID = getHashZoneThread(getThreadConfig(vdo), zoneNumber); + initializeRing(&zone->lockPool); + + result = ALLOCATE(LOCK_POOL_CAPACITY, HashLock, "HashLock array", + &zone->lockArray); + if (result != VDO_SUCCESS) { + freeHashZone(&zone); + return result; + } + + for (VIOCount i = 0; i < LOCK_POOL_CAPACITY; i++) { + HashLock *lock = &zone->lockArray[i]; + initializeHashLock(lock); + pushRingNode(&zone->lockPool, &lock->poolNode); + } + + *zonePtr = zone; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeHashZone(HashZone **zonePtr) +{ + if (*zonePtr == NULL) { + return; + } + + HashZone *zone = *zonePtr; + freePointerMap(&zone->hashLockMap); + FREE(zone->lockArray); + FREE(zone); + *zonePtr = NULL; +} + +/**********************************************************************/ +ZoneCount getHashZoneNumber(const HashZone *zone) +{ + return zone->zoneNumber; +} + +/**********************************************************************/ +ThreadID getHashZoneThreadID(const HashZone *zone) +{ + return zone->threadID; +} + +/**********************************************************************/ +HashLockStatistics getHashZoneStatistics(const HashZone *zone) +{ + const AtomicHashLockStatistics *atoms = &zone->statistics; + return (HashLockStatistics) { + .dedupeAdviceValid = relaxedLoad64(&atoms->dedupeAdviceValid), + .dedupeAdviceStale = relaxedLoad64(&atoms->dedupeAdviceStale), + .concurrentDataMatches = relaxedLoad64(&atoms->concurrentDataMatches), + .concurrentHashCollisions + = relaxedLoad64(&atoms->concurrentHashCollisions), + }; +} + +/** + * Return a hash lock to the zone's pool and null out the reference to it. + * + * @param [in] zone The zone from which the lock was borrowed + * @param [in,out] lockPtr The last reference to the lock being returned + **/ +static void returnHashLockToPool(HashZone *zone, HashLock **lockPtr) +{ + HashLock *lock = *lockPtr; + *lockPtr = NULL; + + memset(lock, 0, sizeof(*lock)); + initializeHashLock(lock); + pushRingNode(&zone->lockPool, &lock->poolNode); +} + +/**********************************************************************/ +int acquireHashLockFromZone(HashZone *zone, + const UdsChunkName *hash, + HashLock *replaceLock, + HashLock **lockPtr) +{ + // Borrow and prepare a lock from the pool so we don't have to do two + // PointerMap accesses in the common case of no lock contention. 
+ HashLock *newLock = asHashLock(popRingNode(&zone->lockPool)); + int result = ASSERT(newLock != NULL, + "never need to wait for a free hash lock"); + if (result != VDO_SUCCESS) { + return result; + } + + // Fill in the hash of the new lock so we can map it, since we have to use + // the hash as the map key. + newLock->hash = *hash; + + HashLock *lock; + result = pointerMapPut(zone->hashLockMap, &newLock->hash, newLock, + (replaceLock != NULL), (void **) &lock); + if (result != VDO_SUCCESS) { + returnHashLockToPool(zone, &newLock); + return result; + } + + if (replaceLock != NULL) { + // XXX on mismatch put the old lock back and return a severe error + ASSERT_LOG_ONLY(lock == replaceLock, + "old lock must have been in the lock map"); + // XXX check earlier and bail out? + ASSERT_LOG_ONLY(replaceLock->registered, + "old lock must have been marked registered"); + replaceLock->registered = false; + } + + if (lock == replaceLock) { + lock = newLock; + lock->registered = true; + } else { + // There's already a lock for the hash, so we don't need the borrowed lock. + returnHashLockToPool(zone, &newLock); + } + + *lockPtr = lock; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void returnHashLockToZone(HashZone *zone, HashLock **lockPtr) +{ + HashLock *lock = *lockPtr; + *lockPtr = NULL; + + if (lock->registered) { + HashLock *removed = pointerMapRemove(zone->hashLockMap, &lock->hash); + ASSERT_LOG_ONLY(lock == removed, + "hash lock being released must have been mapped"); + } else { + ASSERT_LOG_ONLY(lock != pointerMapGet(zone->hashLockMap, &lock->hash), + "unregistered hash lock must not be in the lock map"); + } + + ASSERT_LOG_ONLY(!hasWaiters(&lock->waiters), + "hash lock returned to zone must have no waiters"); + ASSERT_LOG_ONLY((lock->duplicateLock == NULL), + "hash lock returned to zone must not reference a PBN lock"); + ASSERT_LOG_ONLY((lock->state == HASH_LOCK_DESTROYING), + "returned hash lock must not be in use with state %s", + getHashLockStateName(lock->state)); + ASSERT_LOG_ONLY(isRingEmpty(&lock->poolNode), + "hash lock returned to zone must not be in a pool ring"); + ASSERT_LOG_ONLY(isRingEmpty(&lock->duplicateRing), + "hash lock returned to zone must not reference DataVIOs"); + + returnHashLockToPool(zone, &lock); +} + +/** + * Dump a compact description of HashLock to the log if the lock is not on the + * free list. + * + * @param lock The hash lock to dump + **/ +static void dumpHashLock(const HashLock *lock) +{ + if (!isRingEmpty(&lock->poolNode)) { + // This lock is on the free list. + return; + } + + // Necessarily cryptic since we can log a lot of these. First three chars of + // state is unambiguous. 'U' indicates a lock not registered in the map. + const char *state = getHashLockStateName(lock->state); + logInfo(" hl %" PRIptr ": %3.3s %c%llu/%u rc=%u wc=%zu agt=%" PRIptr, + (const void *) lock, + state, + (lock->registered ? 'D' : 'U'), + lock->duplicate.pbn, + lock->duplicate.state, + lock->referenceCount, + countWaiters(&lock->waiters), + (void *) lock->agent); +} + +/**********************************************************************/ +void bumpHashZoneValidAdviceCount(HashZone *zone) +{ + // Must only be mutated on the hash zone thread. + relaxedAdd64(&zone->statistics.dedupeAdviceValid, 1); +} + +/**********************************************************************/ +void bumpHashZoneStaleAdviceCount(HashZone *zone) +{ + // Must only be mutated on the hash zone thread. 
+ relaxedAdd64(&zone->statistics.dedupeAdviceStale, 1); +} + +/**********************************************************************/ +void bumpHashZoneDataMatchCount(HashZone *zone) +{ + // Must only be mutated on the hash zone thread. + relaxedAdd64(&zone->statistics.concurrentDataMatches, 1); +} + +/**********************************************************************/ +void bumpHashZoneCollisionCount(HashZone *zone) +{ + // Must only be mutated on the hash zone thread. + relaxedAdd64(&zone->statistics.concurrentHashCollisions, 1); +} + +/**********************************************************************/ +void dumpHashZone(const HashZone *zone) +{ + if (zone->hashLockMap == NULL) { + logInfo("HashZone %u: NULL map", zone->zoneNumber); + return; + } + + logInfo("HashZone %u: mapSize=%zu", + zone->zoneNumber, pointerMapSize(zone->hashLockMap)); + for (VIOCount i = 0; i < LOCK_POOL_CAPACITY; i++) { + dumpHashLock(&zone->lockArray[i]); + } +} diff --git a/vdo/base/hashZone.h b/vdo/base/hashZone.h new file mode 100644 index 0000000..ac1b695 --- /dev/null +++ b/vdo/base/hashZone.h @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashZone.h#1 $ + */ + +#ifndef HASH_ZONE_H +#define HASH_ZONE_H + +#include "uds.h" + +#include "statistics.h" +#include "types.h" + +/** + * Create a hash zone. + * + * @param [in] vdo The VDO to which the zone will belong + * @param [in] zoneNumber The number of the zone to create + * @param [out] zonePtr A pointer to hold the new HashZone + * + * @return VDO_SUCCESS or an error code + **/ +int makeHashZone(VDO *vdo, ZoneCount zoneNumber, HashZone **zonePtr) + __attribute__((warn_unused_result)); + +/** + * Free a hash zone and null out the reference to it. + * + * @param zonePtr A pointer to the zone to free + **/ +void freeHashZone(HashZone **zonePtr); + +/** + * Get the zone number of a hash zone. + * + * @param zone The zone + * + * @return The number of the zone + **/ +ZoneCount getHashZoneNumber(const HashZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the ID of a hash zone's thread. + * + * @param zone The zone + * + * @return The zone's thread ID + **/ +ThreadID getHashZoneThreadID(const HashZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the statistics for this hash zone. + * + * @param zone The hash zone to query + * + * @return A copy of the current statistics for the hash zone + **/ +HashLockStatistics getHashZoneStatistics(const HashZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the lock for the hash (chunk name) of the data in a DataVIO, or if one + * does not exist (or if we are explicitly rolling over), initialize a new + * lock for the hash and register it in the zone. 
This must only be called in + * the correct thread for the zone. + * + * @param [in] zone The zone responsible for the hash + * @param [in] hash The hash to lock + * @param [in] replaceLock If non-NULL, the lock already registered for the + * hash which should be replaced by the new lock + * @param [out] lockPtr A pointer to receive the hash lock + * + * @return VDO_SUCCESS or an error code + **/ +int acquireHashLockFromZone(HashZone *zone, + const UdsChunkName *hash, + HashLock *replaceLock, + HashLock **lockPtr) + __attribute__((warn_unused_result)); + +/** + * Return a hash lock to the zone it was borrowed from, remove it from the + * zone's lock map, returning it to the pool, and nulling out the reference to + * it. This must only be called when the lock has been completely released, + * and only in the correct thread for the zone. + * + * @param [in] zone The zone from which the lock was borrowed + * @param [in,out] lockPtr The lock that is no longer in use + **/ +void returnHashLockToZone(HashZone *zone, HashLock **lockPtr); + +/** + * Increment the valid advice count in the hash zone statistics. + * Must only be called from the hash zone thread. + * + * @param zone The hash zone of the lock that received valid advice + **/ +void bumpHashZoneValidAdviceCount(HashZone *zone); + +/** + * Increment the stale advice count in the hash zone statistics. + * Must only be called from the hash zone thread. + * + * @param zone The hash zone of the lock that received stale advice + **/ +void bumpHashZoneStaleAdviceCount(HashZone *zone); + +/** + * Increment the concurrent dedupe count in the hash zone statistics. + * Must only be called from the hash zone thread. + * + * @param zone The hash zone of the lock that matched a new DataVIO + **/ +void bumpHashZoneDataMatchCount(HashZone *zone); + +/** + * Increment the concurrent hash collision count in the hash zone statistics. + * Must only be called from the hash zone thread. + * + * @param zone The hash zone of the lock that rejected a colliding DataVIO + **/ +void bumpHashZoneCollisionCount(HashZone *zone); + +/** + * Dump information about a hash zone to the log for debugging. + * + * @param zone The zone to dump + **/ +void dumpHashZone(const HashZone *zone); + +#endif // HASH_ZONE_H diff --git a/vdo/base/header.c b/vdo/base/header.c new file mode 100644 index 0000000..8f0582b --- /dev/null +++ b/vdo/base/header.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/header.c#5 $ + */ + +#include "header.h" + +#include "logger.h" +#include "permassert.h" +#include "statusCodes.h" + +/**********************************************************************/ +int validateVersion(VersionNumber expectedVersion, + VersionNumber actualVersion, + const char *componentName) +{ + if (!areSameVersion(expectedVersion, actualVersion)) { + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "%s version mismatch," + " expected %d.%d, got %d.%d", + componentName, + expectedVersion.majorVersion, + expectedVersion.minorVersion, + actualVersion.majorVersion, + actualVersion.minorVersion); + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +int validateHeader(const Header *expectedHeader, + const Header *actualHeader, + bool exactSize, + const char *componentName) +{ + if (expectedHeader->id != actualHeader->id) { + return logErrorWithStringError(VDO_INCORRECT_COMPONENT, + "%s ID mismatch, expected %d, got %d", + componentName, + expectedHeader->id, + actualHeader->id); + } + + int result = validateVersion(expectedHeader->version, + actualHeader->version, + componentName); + if (result != VDO_SUCCESS) { + return result; + } + + if ((expectedHeader->size > actualHeader->size) + || (exactSize && (expectedHeader->size < actualHeader->size))) { + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "%s size mismatch, expected %zu, got %zu", + componentName, + expectedHeader->size, + actualHeader->size); + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int encodeHeader(const Header *header, Buffer *buffer) +{ + if (!ensureAvailableSpace(buffer, ENCODED_HEADER_SIZE)) { + return UDS_BUFFER_ERROR; + } + + int result = putUInt32LEIntoBuffer(buffer, header->id); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodeVersionNumber(header->version, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + return putUInt64LEIntoBuffer(buffer, header->size); +} + +/**********************************************************************/ +int encodeVersionNumber(VersionNumber version, Buffer *buffer) +{ + PackedVersionNumber packed = packVersionNumber(version); + return putBytes(buffer, sizeof(packed), &packed); +} + +/**********************************************************************/ +int decodeHeader(Buffer *buffer, Header *header) +{ + ComponentID id; + int result = getUInt32LEFromBuffer(buffer, &id); + if (result != UDS_SUCCESS) { + return result; + } + + VersionNumber version; + result = decodeVersionNumber(buffer, &version); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t size; + result = getUInt64LEFromBuffer(buffer, &size); + if (result != UDS_SUCCESS) { + return result; + } + + *header = (Header) { + .id = id, + .version = version, + .size = size, + }; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int decodeVersionNumber(Buffer *buffer, VersionNumber *version) +{ + PackedVersionNumber packed; + int result = getBytesFromBuffer(buffer, sizeof(packed), &packed); + if (result != UDS_SUCCESS) { + return result; + } + + *version = unpackVersionNumber(packed); + return UDS_SUCCESS; +} diff --git a/vdo/base/header.h b/vdo/base/header.h new file mode 100644 index 0000000..d5b4f0e --- /dev/null +++ b/vdo/base/header.h @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/header.h#4 $ + */ + +#ifndef HEADER_H +#define HEADER_H + +#include "buffer.h" +#include "numeric.h" + +#include "types.h" + +/** + * An in-memory representation of a version number for versioned structures on + * disk. + * + * A version number consists of two portions, a major version and a + * minor version. Any format change which does not require an explicit + * upgrade step from the previous version should increment the minor + * version. Any format change which either requires an explicit + * upgrade step, or is wholly incompatible (i.e. can not be upgraded + * to), should increment the major version, and set the minor version + * to 0. + **/ +typedef struct { + uint32_t majorVersion; + uint32_t minorVersion; +} __attribute__((packed)) VersionNumber; + +/** + * A packed, machine-independent, on-disk representation of a VersionNumber. + * Both fields are stored in little-endian byte order. + **/ +typedef struct { + byte majorVersion[4]; + byte minorVersion[4]; +} __attribute__((packed)) PackedVersionNumber; + +/** + * The registry of component ids for use in headers + **/ +typedef enum { + SUPER_BLOCK = 0, + FIXED_LAYOUT = 1, + RECOVERY_JOURNAL = 2, + SLAB_DEPOT = 3, + BLOCK_MAP = 4, + GEOMETRY_BLOCK = 5, +} ComponentID; + +/** + * The header for versioned data stored on disk. + **/ +typedef struct { + ComponentID id; // The component this is a header for + VersionNumber version; // The version of the data format + size_t size; // The size of the data following this header +} __attribute__((packed)) Header; + +enum { + ENCODED_HEADER_SIZE = sizeof(Header), +}; + +/** + * Check whether two version numbers are the same. + * + * @param versionA The first version + * @param versionB The second version + * + * @return true if the two versions are the same + **/ +static inline bool areSameVersion(VersionNumber versionA, + VersionNumber versionB) +{ + return ((versionA.majorVersion == versionB.majorVersion) + && (versionA.minorVersion == versionB.minorVersion)); +} + +/** + * Check whether an actual version is upgradable to an expected version. + * An actual version is upgradable if its major number is expected but + * its minor number differs, and the expected version's minor number + * is greater than the actual version's minor number. + * + * @param expectedVersion The expected version + * @param actualVersion The version being validated + * + * @return true if the actual version is upgradable + **/ +static inline bool isUpgradableVersion(VersionNumber expectedVersion, + VersionNumber actualVersion) +{ + return ((expectedVersion.majorVersion == actualVersion.majorVersion) + && (expectedVersion.minorVersion > actualVersion.minorVersion)); +} + +/** + * Check whether a version matches an expected version. 
Logs an error + * describing a mismatch. + * + * @param expectedVersion The expected version + * @param actualVersion The version being validated + * @param componentName The name of the component or the calling function + * (for error logging) + * + * @return VDO_SUCCESS if the versions are the same + * VDO_UNSUPPORTED_VERSION if the versions don't match + **/ +int validateVersion(VersionNumber expectedVersion, + VersionNumber actualVersion, + const char *componentName) + __attribute__((warn_unused_result)); + +/** + * Check whether a header matches expectations. Logs an error describing the + * first mismatch found. + * + * @param expectedHeader The expected header + * @param actualHeader The header being validated + * @param exactSize If true, the size fields of the two headers must be + * the same, otherwise actualSize >= expectedSize is OK + * @param componentName The name of the component or the calling function + * (for error logging) + * + * @return VDO_SUCCESS if the header meets expectations + * VDO_INCORRECT_COMPONENT if the component ids don't match + * VDO_UNSUPPORTED_VERSION if the versions or sizes don't match + **/ +int validateHeader(const Header *expectedHeader, + const Header *actualHeader, + bool exactSize, + const char *componentName) + __attribute__((warn_unused_result)); + +/** + * Encode a header into a buffer. + * + * @param header The header to encode + * @param buffer The buffer in which to encode the header + * + * @return UDS_SUCCESS or an error + **/ +int encodeHeader(const Header *header, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Encode a version number into a buffer. + * + * @param version The version to encode + * @param buffer The buffer in which to encode the version + * + * @return UDS_SUCCESS or an error + **/ +int encodeVersionNumber(VersionNumber version, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Decode a header from a buffer. + * + * @param [in] buffer The buffer from which to decode the header + * @param [out] header The header to decode + * + * @return UDS_SUCCESS or an error + **/ +int decodeHeader(Buffer *buffer, Header *header) + __attribute__((warn_unused_result)); + +/** + * Decode a version number from a buffer. + * + * @param buffer The buffer from which to decode the version + * @param version The version structure to decode into + * + * @return UDS_SUCCESS or an error + **/ +int decodeVersionNumber(Buffer *buffer, VersionNumber *version) + __attribute__((warn_unused_result)); + +/** + * Convert a VersionNumber to its packed on-disk representation. + * + * @param version The version number to convert + * + * @return the platform-independent representation of the version + **/ +static inline PackedVersionNumber packVersionNumber(VersionNumber version) +{ + PackedVersionNumber packed; + storeUInt32LE(packed.majorVersion, version.majorVersion); + storeUInt32LE(packed.minorVersion, version.minorVersion); + return packed; +} + +/** + * Convert a PackedVersionNumber to its native in-memory representation. 
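+ *
+ * For example, given the packed layout above (two little-endian 32-bit
+ * fields), version 2.0 is stored on disk as the byte sequence
+ * 02 00 00 00 00 00 00 00, which this function converts back to
+ * { .majorVersion = 2, .minorVersion = 0 }.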
+ * + * @param version The version number to convert + * + * @return the platform-independent representation of the version + **/ +static inline VersionNumber unpackVersionNumber(PackedVersionNumber version) +{ + return (VersionNumber) { + .majorVersion = getUInt32LE(version.majorVersion), + .minorVersion = getUInt32LE(version.minorVersion), + }; +} + +#endif // HEADER_H diff --git a/vdo/base/heap.c b/vdo/base/heap.c new file mode 100644 index 0000000..0928023 --- /dev/null +++ b/vdo/base/heap.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/heap.c#2 $ + */ + +#include "heap.h" + +#include "errors.h" +#include "logger.h" +#include "numeric.h" + +#include "statusCodes.h" + +/**********************************************************************/ +void initializeHeap(Heap *heap, + HeapComparator *comparator, + HeapSwapper *swapper, + void *array, + size_t capacity, + size_t elementSize) +{ + *heap = (Heap) { + .comparator = comparator, + .swapper = swapper, + .capacity = capacity, + .elementSize = elementSize, + }; + if (array != NULL) { + // Calculating child indexes is simplified by pretending the element array + // is 1-based. + heap->array = ((byte *) array - elementSize); + } +} + +/**********************************************************************/ +static void siftHeapDown(Heap *heap, size_t topNode, size_t lastNode) +{ + // Keep sifting until the sub-heap rooted at topNode has no children. + size_t leftChild; + while ((leftChild = (2 * topNode)) <= lastNode) { + // If there are two children, select the largest child to swap with. + size_t swapNode = leftChild; + if (leftChild < lastNode) { + size_t rightChild = leftChild + heap->elementSize; + if (heap->comparator(&heap->array[leftChild], + &heap->array[rightChild]) < 0) { + swapNode = rightChild; + } + } + + // Stop sifting if topNode is at least as large as its largest child, + // which means the heap invariant was restored by the previous swap. + if (heap->comparator(&heap->array[topNode], &heap->array[swapNode]) >= 0) { + return; + } + + // Swap the element we've been sifting down with the larger child. + heap->swapper(&heap->array[topNode], &heap->array[swapNode]); + + // Descend into the sub-heap rooted at that child, going around the loop + // again in place of a tail-recursive call to siftHeapDown(). + topNode = swapNode; + } + + // We sifted the element all the way to a leaf node of the heap, so the heap + // invariant has now been restored. 
+} + +/**********************************************************************/ +void buildHeap(Heap *heap, size_t count) +{ + heap->count = minSizeT(count, heap->capacity); + + if ((heap->count < 2) || (heap->elementSize == 0)) { + return; + } + + /* + * All the leaf nodes are trivially valid sub-heaps. Starting with the parent + * of the right-most leaf node, restore the heap invariant in that sub-heap + * by sifting the top node of the sub-heap down into one of its children's + * valid sub-heaps (or not, if the top node is already larger than its + * children). Continue iterating through all the interior nodes in the heap, + * in sort of a reverse breadth-first traversal, restoring the heap + * invariant for each (increasingly larger) sub-heap until we reach the root + * of the heap. Once we sift the root node down into one of its two valid + * children, the entire heap must be valid, by induction. + * + * Even though we operate on every node and potentially perform an O(log N) + * traversal for each node, the combined probabilities of actually needing + * to do a swap and the heights of the sub-heaps sum to a constant, so + * restoring a heap from the bottom-up like this has only O(N) complexity. + */ + size_t size = heap->elementSize; + size_t lastParent = size * (heap->count / 2); + size_t lastNode = size * heap->count; + for (size_t topNode = lastParent; topNode > 0; topNode -= size) { + siftHeapDown(heap, topNode, lastNode); + } +} + +/**********************************************************************/ +bool popMaxHeapElement(Heap *heap, void *elementPtr) +{ + if (heap->count == 0) { + return false; + } + + size_t rootNode = (heap->elementSize * 1); + size_t lastNode = (heap->elementSize * heap->count); + + // Return the maximum element (the root of the heap) if the caller wanted it. + if (elementPtr != NULL) { + memcpy(elementPtr, &heap->array[rootNode], heap->elementSize); + } + + // Move the right-most leaf node to the vacated root node, reducing the + // number of elements by one and violating the heap invariant. + if (rootNode != lastNode) { + memcpy(&heap->array[rootNode], &heap->array[lastNode], heap->elementSize); + } + heap->count -= 1; + lastNode -= heap->elementSize; + + // Restore the heap invariant by sifting the root back down into the heap. + siftHeapDown(heap, rootNode, lastNode); + return true; +} + +/**********************************************************************/ +static inline size_t siftAndSort(Heap *heap, size_t rootNode, size_t lastNode) +{ + /* + * We have a valid heap, so the largest unsorted element is now at the top + * of the heap. That element belongs at the start of the partially-sorted + * array, preceding all the larger elements that we've already removed + * from the heap. Swap that largest unsorted element with the the + * right-most leaf node in the heap, moving it to its sorted position in + * the array. + */ + heap->swapper(&heap->array[rootNode], &heap->array[lastNode]); + // The sorted list is now one element larger and valid. The heap is + // one element smaller, and invalid. + lastNode -= heap->elementSize; + // Restore the heap invariant by sifting the swapped element back down + // into the heap. + siftHeapDown(heap, rootNode, lastNode); + return lastNode; +} + +/**********************************************************************/ +size_t sortHeap(Heap *heap) +{ + // All zero-length records are identical and therefore already sorted, as + // are empty or singleton arrays. 
+  if ((heap->count < 2) || (heap->elementSize == 0)) {
+    return heap->count;
+  }
+
+  // Get the byte array offset of the root node, and the right-most leaf node
+  // in the 1-based array of records that will form the heap.
+  size_t rootNode = (heap->elementSize * 1);
+  size_t lastNode = (heap->elementSize * heap->count);
+
+  while (lastNode > rootNode) {
+    lastNode = siftAndSort(heap, rootNode, lastNode);
+  }
+
+  size_t count = heap->count;
+  heap->count = 0;
+  return count;
+}
+
+/**********************************************************************/
+void *sortNextHeapElement(Heap *heap)
+{
+  if ((heap->count == 0) || (heap->elementSize == 0)) {
+    return NULL;
+  }
+
+  // Get the byte array offset of the root node, and the right-most leaf node
+  // in the 1-based array of records that will form the heap.
+  size_t rootNode = (heap->elementSize * 1);
+  size_t lastNode = (heap->elementSize * heap->count);
+  if (heap->count > 1) {
+    siftAndSort(heap, rootNode, lastNode);
+  }
+  heap->count--;
+
+  return &heap->array[lastNode];
+}
diff --git a/vdo/base/heap.h b/vdo/base/heap.h
new file mode 100644
index 0000000..916f017
--- /dev/null
+++ b/vdo/base/heap.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/heap.h#2 $
+ */
+
+#ifndef HEAP_H
+#define HEAP_H
+
+#include "common.h"
+
+/**
+ * Prototype for functions which compare two array elements. All the time
+ * complexity claims in this module assume this operation has O(1) time
+ * complexity.
+ *
+ * @param item1 The first element to compare
+ * @param item2 The second element to compare
+ *
+ * @return An integer which is less than, equal to, or greater than 0
+ * depending on whether item1 is less than, equal to, or greater
+ * than item2, respectively
+ **/
+typedef int HeapComparator(const void *item1, const void *item2);
+
+/**
+ * Prototype for functions which swap two array elements.
+ *
+ * @param item1 The first element to swap
+ * @param item2 The second element to swap
+ **/
+typedef void HeapSwapper(void *item1, void *item2);
+
+/**
+ * A heap array can be any array of fixed-length elements in which the heap
+ * invariant can be established. In a max-heap, every node must be at least
+ * as large as its children. Once that invariant is established in an array
+ * by calling buildHeap(), all the other heap operations may be used on that
+ * array.
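+ *
+ * For illustration only, a typical caller (with a hypothetical record type
+ * and caller-supplied comparator and swapper functions) might use the module
+ * roughly as follows:
+ *
+ *   Heap heap;
+ *   initializeHeap(&heap, compareRecords, swapRecords,
+ *                  records, recordCount, sizeof(records[0]));
+ *   buildHeap(&heap, recordCount);
+ *   size_t sorted = sortHeap(&heap);  // records[] is now in ascending order
+ *
+ * where compareRecords and swapRecords match the HeapComparator and
+ * HeapSwapper prototypes above, and records is an array of recordCount
+ * fixed-size elements.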
+ **/ +typedef struct heap { + /** the 1-based array of heap elements (nodes) */ + byte *array; + /** the function to use to compare two elements */ + HeapComparator *comparator; + /** the function to use to swap two elements */ + HeapSwapper *swapper; + /** the maximum number of elements that can be stored */ + size_t capacity; + /** the size of every element (in bytes) */ + size_t elementSize; + /** the current number of elements in the heap */ + size_t count; +} Heap; + +/** + * Initialize an binary heap by wrapping it around an array of elements. + * + * The heap will not own the array it wraps. Use buildHeap() subsequently to + * arrange any elements contained in the array into a valid heap. + * + * @param heap The heap to initialize + * @param comparator The function to use to compare two heap elements + * @param swapper The function to use to swap two heap elements + * @param array The array of elements (not modified by this call) + * @param capacity The maximum number of elements which fit in the array + * @param elementSize The size of every array element, in bytes + **/ +void initializeHeap(Heap *heap, + HeapComparator *comparator, + HeapSwapper *swapper, + void *array, + size_t capacity, + size_t elementSize); + +/** + * Build a max-heap in place in an array (heapify it) by re-ordering the + * elements to establish the heap invariant. Before calling this function, + * first copy the elements to be arranged into a heap into the array that was + * passed to initializeHeap(). This operation has O(N) time complexity in the + * number of elements in the array. + * + * @param heap The heap to build + * @param count The number of elements in the array to build into a heap + **/ +void buildHeap(Heap *heap, size_t count); + +/** + * Check whether the heap is currently empty. + * + * @param heap The heap to query + * + * @return true if there are no elements in the heap + **/ +static inline bool isHeapEmpty(const Heap *heap) +{ + return (heap->count == 0); +} + +/** + * Remove the largest element from the top of the heap and restore the heap + * invariant on the remaining elements. This operation has O(log2(N)) time + * complexity. + * + * @param [in] heap The heap to modify + * @param [out] elementPtr A pointer to receive the largest element (may be + * NULL if the caller just wishes to discard it) + * + * @return false if the heap was empty, so no element was removed + **/ +bool popMaxHeapElement(Heap *heap, void *elementPtr); + +/** + * Sort the elements contained in a heap. + * + * This function re-orders the elements contained in the heap to a sorted + * array in-place by repeatedly popping the maximum element off the heap and + * moving it to the spot vacated at the end of the heap array. When the + * function returns, the heap will be empty and the array will contain the + * elements in sorted order, from heap minimum to heap maximum. The sort is + * unstable--relative ordering of equal keys is not preserved. This operation + * has O(N*log2(N)) time complexity. + * + * @param heap The heap containing the elements to sort + * + * @return the number of elements that were sorted + **/ +size_t sortHeap(Heap *heap); + +/** + * Gets the next sorted heap element and returns a pointer to it, in O(log2(N)) + * time. + * + * @param heap The heap to sort one more step + * + * @return a pointer to the element sorted, or NULL if already fully sorted. 
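+ *
+ * Note that successive calls return the remaining elements from largest to
+ * smallest; once the heap is empty, the backing array is fully sorted in
+ * ascending order, just as with sortHeap().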
+ **/ +void *sortNextHeapElement(Heap *heap); + +#endif /* HEAP_H */ diff --git a/vdo/base/intMap.c b/vdo/base/intMap.c new file mode 100644 index 0000000..2c690a6 --- /dev/null +++ b/vdo/base/intMap.c @@ -0,0 +1,661 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/intMap.c#1 $ + */ + +/** + * Hash table implementation of a map from integers to pointers, implemented + * using the Hopscotch Hashing algorithm by Herlihy, Shavit, and Tzafrir (see + * http://en.wikipedia.org/wiki/Hopscotch_hashing). This implementation does + * not contain any of the locking/concurrency features of the algorithm, just + * the collision resolution scheme. + * + * Hopscotch Hashing is based on hashing with open addressing and linear + * probing. All the entries are stored in a fixed array of buckets, with no + * dynamic allocation for collisions. Unlike linear probing, all the entries + * that hash to a given bucket are stored within a fixed neighborhood starting + * at that bucket. Chaining is effectively represented as a bit vector + * relative to each bucket instead of as pointers or explicit offsets. + * + * When an empty bucket cannot be found within a given neighborhood, + * subsequent neighborhoods are searched, and one or more entries will "hop" + * into those neighborhoods. When this process works, an empty bucket will + * move into the desired neighborhood, allowing the entry to be added. When + * that process fails (typically when the buckets are around 90% full), the + * table must be resized and the all entries rehashed and added to the + * expanded table. + * + * Unlike linear probing, the number of buckets that must be searched in the + * worst case has a fixed upper bound (the size of the neighborhood). Those + * entries occupy a small number of memory cache lines, leading to improved + * use of the cache (fewer misses on both successful and unsuccessful + * searches). Hopscotch hashing outperforms linear probing at much higher load + * factors, so even with the increased memory burden for maintaining the hop + * vectors, less memory is needed to achieve that performance. Hopscotch is + * also immune to "contamination" from deleting entries since entries are + * genuinely removed instead of being replaced by a placeholder. + * + * The published description of the algorithm used a bit vector, but the paper + * alludes to an offset scheme which is used by this implementation. Since the + * entries in the neighborhood are within N entries of the hash bucket at the + * start of the neighborhood, a pair of small offset fields each log2(N) bits + * wide is all that's needed to maintain the hops as a linked list. In order + * to encode "no next hop" (i.e. 
NULL) as the natural initial value of zero, + * the offsets are biased by one (i.e. 0 => NULL, 1 => offset=0, 2 => + * offset=1, etc.) We can represent neighborhoods of up to 255 entries with + * just 8+8=16 bits per entry. The hop list is sorted by hop offset so the + * first entry in the list is always the bucket closest to the start of the + * neighborhood. + * + * While individual accesses tend to be very fast, the table resize operations + * are very very expensive. If an upper bound on the latency of adding an + * entry to the table is needed, we either need to ensure the table is + * pre-sized to be large enough so no resize is ever needed, or we'll need to + * develop an approach to incrementally resize the table. + **/ + +#include "intMap.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "permassert.h" + +enum { + DEFAULT_CAPACITY = 16, // the number of neighborhoods in a new table + NEIGHBORHOOD = 255, // the number of buckets in each neighborhood + MAX_PROBES = 1024, // limit on the number of probes for a free bucket + NULL_HOP_OFFSET = 0, // the hop offset value terminating the hop list + DEFAULT_LOAD = 75 // a compromise between memory use and performance +}; + +/** + * Buckets are packed together to reduce memory usage and improve cache + * efficiency. It would be tempting to encode the hop offsets separately and + * maintain alignment of key/value pairs, but it's crucial to keep the hop + * fields near the buckets that they use them so they'll tend to share cache + * lines. + **/ +typedef struct __attribute__((packed)) bucket { + uint8_t firstHop; // the biased offset of the first entry in the hop list + // of the neighborhood that hashes to this bucket + uint8_t nextHop; // the biased offset of the next bucket in the hop list + + uint64_t key; // the key stored in this bucket + void *value; // the value stored in this bucket (NULL if empty) +} Bucket; + +/** + * The concrete definition of the opaque IntMap type. To avoid having to wrap + * the neighborhoods of the last entries back around to the start of the + * bucket array, we allocate a few more buckets at the end of the array + * instead, which is why capacity and bucketCount are different. + **/ +struct intMap { + size_t size; // the number of entries stored in the map + size_t capacity; // the number of neighborhoods in the map + size_t bucketCount; // the number of buckets in the bucket array + Bucket *buckets; // the array of hash buckets +}; + +/** + * This is the Google CityHash 16-byte hash mixing function. + * + * @param input1 the first input value + * @param input2 the second input value + * + * @return a hash of the two inputs + **/ +static uint64_t mix(uint64_t input1, uint64_t input2) +{ + static const uint64_t CITY_MULTIPLIER = 0x9ddfea08eb382d69ULL; + + uint64_t hash = (input1 ^ input2); + hash *= CITY_MULTIPLIER; + hash ^= (hash >> 47); + hash ^= input2; + hash *= CITY_MULTIPLIER; + hash ^= (hash >> 47); + hash *= CITY_MULTIPLIER; + return hash; +} + +/** + * Calculate a 64-bit non-cryptographic hash value for the provided 64-bit + * integer key. The implementation is based on Google's CityHash, only + * handling the specific case of an 8-byte input. + * + * @param key the mapping key + * + * @return the hash of the mapping key + **/ +static uint64_t hashKey(uint64_t key) +{ + // Aliasing restrictions forbid us from casting pointer types, so use a + // union to convert a single uint64_t to two uint32_t values. 
+ union { + uint64_t u64; + uint32_t u32[2]; + } pun = { .u64 = key }; + return mix(sizeof(key) + (((uint64_t) pun.u32[0]) << 3), pun.u32[1]); +} + +/** + * Initialize an IntMap. + * + * @param map the map to initialize + * @param capacity the initial capacity of the map + * + * @return UDS_SUCCESS or an error code + **/ +static int allocateBuckets(IntMap *map, size_t capacity) +{ + map->size = 0; + map->capacity = capacity; + + // Allocate NEIGHBORHOOD - 1 extra buckets so the last bucket can have a + // full neighborhood without have to wrap back around to element zero. + map->bucketCount = capacity + (NEIGHBORHOOD - 1); + return ALLOCATE(map->bucketCount, Bucket, "IntMap buckets", &map->buckets); +} + +/**********************************************************************/ +int makeIntMap(size_t initialCapacity, + unsigned int initialLoad, + IntMap **mapPtr) +{ + // Use the default initial load if the caller did not specify one. + if (initialLoad == 0) { + initialLoad = DEFAULT_LOAD; + } + if (initialLoad > 100) { + return UDS_INVALID_ARGUMENT; + } + + IntMap *map; + int result = ALLOCATE(1, IntMap, "IntMap", &map); + if (result != UDS_SUCCESS) { + return result; + } + + // Use the default capacity if the caller did not specify one. + size_t capacity = (initialCapacity > 0) ? initialCapacity : DEFAULT_CAPACITY; + + // Scale up the capacity by the specified initial load factor. + // (i.e to hold 1000 entries at 80% load we need a capacity of 1250) + capacity = capacity * 100 / initialLoad; + + result = allocateBuckets(map, capacity); + if (result != UDS_SUCCESS) { + freeIntMap(&map); + return result; + } + + *mapPtr = map; + return UDS_SUCCESS; +} + +/** + * Free the bucket array for the map. + * + * @param map the map whose bucket array is to be freed + **/ +static void freeBuckets(IntMap *map) +{ + FREE(map->buckets); + map->buckets = NULL; +} + +/**********************************************************************/ +void freeIntMap(IntMap **mapPtr) +{ + if (*mapPtr != NULL) { + freeBuckets(*mapPtr); + FREE(*mapPtr); + *mapPtr = NULL; + } +} + +/**********************************************************************/ +size_t intMapSize(const IntMap *map) +{ + return map->size; +} + +/** + * Convert a biased hop offset within a neighborhood to a pointer to the + * bucket it references. + * + * @param neighborhood the first bucket in the neighborhood + * @param hopOffset the biased hop offset to the desired bucket + * + * @return NULL if hopOffset is zero, otherwise a pointer to + * the bucket in the neighborhood at hopOffset - 1 + **/ +static Bucket *dereferenceHop(Bucket *neighborhood, unsigned int hopOffset) +{ + if (hopOffset == NULL_HOP_OFFSET) { + return NULL; + } + + STATIC_ASSERT(NULL_HOP_OFFSET == 0); + return &neighborhood[hopOffset - 1]; +} + +/** + * Add a bucket into the hop list for the neighborhood, inserting it into the + * list so the hop list remains sorted by hop offset. + * + * @param neighborhood the first bucket in the neighborhood + * @param newBucket the bucket to add to the hop list + **/ +static void insertInHopList(Bucket *neighborhood, Bucket *newBucket) +{ + // Zero indicates a NULL hop offset, so bias the hop offset by one. + int hopOffset = 1 + (newBucket - neighborhood); + + // Handle the special case of adding a bucket at the start of the list. 
+ int nextHop = neighborhood->firstHop; + if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) { + newBucket->nextHop = nextHop; + neighborhood->firstHop = hopOffset; + return; + } + + // Search the hop list for the insertion point that maintains the sort + // order. + for (;;) { + Bucket *bucket = dereferenceHop(neighborhood, nextHop); + nextHop = bucket->nextHop; + + if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) { + newBucket->nextHop = nextHop; + bucket->nextHop = hopOffset; + return; + } + } +} + +/** + * Select and return the hash bucket for a given search key. + * + * @param map the map to search + * @param key the mapping key + **/ +static Bucket *selectBucket(const IntMap *map, uint64_t key) +{ + // Calculate a good hash value for the provided key. We want exactly 32 + // bits, so mask the result. + uint64_t hash = hashKey(key) & 0xFFFFFFFF; + + /* + * Scale the 32-bit hash to a bucket index by treating it as a binary + * fraction and multiplying that by the capacity. If the hash is uniformly + * distributed over [0 .. 2^32-1], then (hash * capacity / 2^32) should be + * uniformly distributed over [0 .. capacity-1]. The multiply and shift is + * much faster than a divide (modulus) on X86 CPUs. + */ + return &map->buckets[(hash * map->capacity) >> 32]; +} + +/** + * Search the hop list associated with given hash bucket for a given search + * key. If the key is found, returns a pointer to the entry (bucket or + * collision), otherwise returns NULL. + * + * @param [in] map the map being searched + * @param [in] bucket the map bucket to search for the key + * @param [in] key the mapping key + * @param [out] previousPtr if not NULL, a pointer in which to + * store the bucket in the list preceding the one + * that had the matching key + * + * @return an entry that matches the key, or NULL if not found + **/ +static Bucket *searchHopList(IntMap *map __attribute__((unused)), + Bucket *bucket, + uint64_t key, + Bucket **previousPtr) +{ + Bucket *previous = NULL; + unsigned int nextHop = bucket->firstHop; + while (nextHop != NULL_HOP_OFFSET) { + // Check the neighboring bucket indexed by the offset for the desired key. + Bucket *entry = dereferenceHop(bucket, nextHop); + if ((key == entry->key) && (entry->value != NULL)) { + if (previousPtr != NULL) { + *previousPtr = previous; + } + return entry; + } + nextHop = entry->nextHop; + previous = entry; + } + return NULL; +} + +/**********************************************************************/ +void *intMapGet(IntMap *map, uint64_t key) +{ + Bucket *match = searchHopList(map, selectBucket(map, key), key, NULL); + return ((match != NULL) ? match->value : NULL); +} + +/** + * Increase the number of hash buckets and rehash all the existing entries, + * storing them in the new buckets. + * + * @param map the map to resize + **/ +static int resizeBuckets(IntMap *map) +{ + // Copy the top-level map data to the stack. + IntMap oldMap = *map; + + // Re-initialize the map to be empty and 50% larger. + size_t newCapacity = map->capacity / 2 * 3; + logInfo("%s: attempting resize from %zu to %zu, current size=%zu", + __func__, map->capacity, newCapacity, map->size); + int result = allocateBuckets(map, newCapacity); + if (result != UDS_SUCCESS) { + *map = oldMap; + return result; + } + + // Populate the new hash table from the entries in the old bucket array. 
+ for (size_t i = 0; i < oldMap.bucketCount; i++) { + Bucket *entry = &oldMap.buckets[i]; + if (entry->value == NULL) { + continue; + } + + result = intMapPut(map, entry->key, entry->value, true, NULL); + if (result != UDS_SUCCESS) { + // Destroy the new partial map and restore the map from the stack. + freeBuckets(map); + *map = oldMap; + return result; + } + } + + // Destroy the old bucket array. + freeBuckets(&oldMap); + return UDS_SUCCESS; +} + +/** + * Probe the bucket array starting at the given bucket for the next empty + * bucket, returning a pointer to it. NULL will be returned if + * the search reaches the end of the bucket array or if the number of linear + * probes exceeds a specified limit. + * + * @param map the map containing the buckets to search + * @param bucket the bucket at which to start probing + * @param maxProbes the maximum number of buckets to search + * + * @return the next empty bucket, or NULL if the search failed + **/ +static Bucket *findEmptyBucket(IntMap *map, + Bucket *bucket, + unsigned int maxProbes) +{ + // Limit the search to either the nearer of the end of the bucket array or a + // fixed distance beyond the initial bucket. + size_t remaining = &map->buckets[map->bucketCount] - bucket; + Bucket *sentinel = &bucket[minSizeT(remaining, maxProbes)]; + + for (Bucket *entry = bucket; entry < sentinel; entry++) { + if (entry->value == NULL) { + return entry; + } + } + return NULL; +} + +/** + * Move an empty bucket closer to the start of the bucket array. This searches + * the neighborhoods that contain the empty bucket for a non-empty bucket + * closer to the start of the array. If such a bucket is found, this swaps the + * two buckets by moving the entry to the empty bucket. + * + * @param map the map containing the bucket + * @param hole the empty bucket to fill with an entry that precedes it in one + * of its enclosing neighborhoods + * + * @return the bucket that was vacated by moving its entry to the provided + * hole, or NULL if no entry could be moved + **/ +static Bucket *moveEmptyBucket(IntMap *map __attribute__((unused)), + Bucket *hole) +{ + /* + * Examine every neighborhood that the empty bucket is part of, starting + * with the one in which it is the last bucket. No boundary check is needed + * for the negative array arithmetic since this function is only called when + * hole is at least NEIGHBORHOOD cells deeper into the array than a valid + * bucket. + */ + for (Bucket *bucket = &hole[1 - NEIGHBORHOOD]; bucket < hole; bucket++) { + // Find the entry that is nearest to the bucket, which means it will be + // nearest to the hash bucket whose neighborhood is full. + Bucket *newHole = dereferenceHop(bucket, bucket->firstHop); + if (newHole == NULL) { + // There are no buckets in this neighborhood that are in use by this one + // (they must all be owned by overlapping neighborhoods). + continue; + } + + // Skip this bucket if its first entry is actually further away than the + // hole that we're already trying to fill. + if (hole < newHole) { + continue; + } + + /* + * We've found an entry in this neighborhood that we can "hop" further + * away, moving the hole closer to the hash bucket, if not all the way + * into its neighborhood. + */ + + // The entry that will be the new hole is the first bucket in the list, + // so setting firstHop is all that's needed remove it from the list. + bucket->firstHop = newHole->nextHop; + newHole->nextHop = NULL_HOP_OFFSET; + + // Move the entry into the original hole. 
+ hole->key = newHole->key; + hole->value = newHole->value; + newHole->value = NULL; + + // Insert the filled hole into the hop list for the neighborhood. + insertInHopList(bucket, hole); + return newHole; + } + + // We couldn't find an entry to relocate to the hole. + return NULL; +} + +/** + * Find and update any existing mapping for a given key, returning the value + * associated with the key in the provided pointer. + * + * @param [in] map the IntMap to attempt to modify + * @param [in] neighborhood the first bucket in the neighborhood that + * would contain the search key + * @param [in] key the key with which to associate the new value + * @param [in] newValue the value to be associated with the key + * @param [in] update whether to overwrite an existing value + * @param [out] oldValuePtr a pointer in which to store the old value + * (unmodified if no mapping was found) + * + * @return true if the map contains a mapping for the key + * false if it does not + **/ +static bool updateMapping(IntMap *map, + Bucket *neighborhood, + uint64_t key, + void *newValue, + bool update, + void **oldValuePtr) +{ + Bucket *bucket = searchHopList(map, neighborhood, key, NULL); + if (bucket == NULL) { + // There is no bucket containing the key in the neighborhood. + return false; + } + + // Return the value of the current mapping (if desired) and update the + // mapping with the new value (if desired). + if (oldValuePtr != NULL) { + *oldValuePtr = bucket->value; + } + if (update) { + bucket->value = newValue; + } + return true; +} + +/** + * Find an empty bucket in a specified neighborhood for a new mapping or + * attempt to re-arrange mappings so there is such a bucket. This operation + * may fail (returning NULL) if an empty bucket is not available or could not + * be relocated to the neighborhood. + * + * @param map the IntMap to search or modify + * @param neighborhood the first bucket in the neighborhood in which + * an empty bucket is needed for a new mapping + * + * @return a pointer to an empty bucket in the desired neighborhood, or + * NULL if a vacancy could not be found or arranged + **/ +static Bucket *findOrMakeVacancy(IntMap *map, Bucket *neighborhood) +{ + // Probe within and beyond the neighborhood for the first empty bucket. + Bucket *hole = findEmptyBucket(map, neighborhood, MAX_PROBES); + + // Keep trying until the empty bucket is in the bucket's neighborhood or we + // are unable to move it any closer by swapping it with a filled bucket. + while (hole != NULL) { + int distance = hole - neighborhood; + if (distance < NEIGHBORHOOD) { + // We've found or relocated an empty bucket close enough to the initial + // hash bucket to be referenced by its hop vector. + return hole; + } + + // The nearest empty bucket isn't within the neighborhood that must + // contain the new entry, so try to swap it with bucket that is closer. + hole = moveEmptyBucket(map, hole); + } + + return NULL; +} + +/**********************************************************************/ +int intMapPut(IntMap *map, + uint64_t key, + void *newValue, + bool update, + void **oldValuePtr) +{ + if (newValue == NULL) { + return UDS_INVALID_ARGUMENT; + } + + // Select the bucket at the start of the neighborhood that must contain any + // entry for the provided key. + Bucket *neighborhood = selectBucket(map, key); + + // Check whether the neighborhood already contains an entry for the key, in + // which case we optionally update it, returning the old value. 
+ if (updateMapping(map, neighborhood, key, newValue, update, oldValuePtr)) { + return UDS_SUCCESS; + } + + /* + * Find an empty bucket in the desired neighborhood for the new entry or + * re-arrange entries in the map so there is such a bucket. This operation + * will usually succeed; the loop body will only be executed on the rare + * occasions that we have to resize the map. + */ + Bucket *bucket; + while ((bucket = findOrMakeVacancy(map, neighborhood)) == NULL) { + /* + * There is no empty bucket in which to put the new entry in the current + * map, so we're forced to allocate a new bucket array with a larger + * capacity, re-hash all the entries into those buckets, and try again (a + * very expensive operation for large maps). + */ + int result = resizeBuckets(map); + if (result != UDS_SUCCESS) { + return result; + } + + // Resizing the map invalidates all pointers to buckets, so recalculate + // the neighborhood pointer. + neighborhood = selectBucket(map, key); + } + + // Put the new entry in the empty bucket, adding it to the neighborhood. + bucket->key = key; + bucket->value = newValue; + insertInHopList(neighborhood, bucket); + map->size += 1; + + // There was no existing entry, so there was no old value to be returned. + if (oldValuePtr != NULL) { + *oldValuePtr = NULL; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +void *intMapRemove(IntMap *map, uint64_t key) +{ + // Select the bucket to search and search it for an existing entry. + Bucket *bucket = selectBucket(map, key); + Bucket *previous; + Bucket *victim = searchHopList(map, bucket, key, &previous); + + if (victim == NULL) { + // There is no matching entry to remove. + return NULL; + } + + // We found an entry to remove. Save the mapped value to return later and + // empty the bucket. + map->size -= 1; + void *value = victim->value; + victim->value = NULL; + victim->key = 0; + + // The victim bucket is now empty, but it still needs to be spliced out of + // the hop list. + if (previous == NULL) { + // The victim is the head of the list, so swing firstHop. + bucket->firstHop = victim->nextHop; + } else { + previous->nextHop = victim->nextHop; + } + victim->nextHop = NULL_HOP_OFFSET; + + return value; +} diff --git a/vdo/base/intMap.h b/vdo/base/intMap.h new file mode 100644 index 0000000..0b18209 --- /dev/null +++ b/vdo/base/intMap.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/intMap.h#1 $ + */ + +#ifndef INT_MAP_H +#define INT_MAP_H + +#include "common.h" + +/** + * IntMap associates pointers (void *) with integer keys + * (uint64_t). NULL pointer values are not + * supported. 
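+ *
+ * For illustration only, typical usage (with error handling elided and a
+ * hypothetical non-NULL value pointer) looks roughly like:
+ *
+ *   IntMap *map;
+ *   makeIntMap(0, 0, &map);            // default capacity and load factor
+ *   intMapPut(map, 42, value, true, NULL);
+ *   void *found = intMapGet(map, 42);  // == value
+ *   intMapRemove(map, 42);             // returns value and unmaps the key
+ *   freeIntMap(&map);                  // map is set to NULL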
+ * + * The map is implemented as hash table, which should provide constant-time + * insert, query, and remove operations, although the insert may occasionally + * grow the table, which is linear in the number of entries in the map. The + * table will grow as needed to hold new entries, but will not shrink as + * entries are removed. + **/ + +typedef struct intMap IntMap; + +/** + * Allocate and initialize an IntMap. + * + * @param [in] initialCapacity the number of entries the map should + * initially be capable of holding (zero tells + * the map to use its own small default) + * @param [in] initialLoad the load factor of the map, expressed as an + * integer percentage (typically in the range + * 50 to 90, with zero telling the map to use + * its own default) + * @param [out] mapPtr a pointer to hold the new IntMap + * + * @return UDS_SUCCESS or an error code + **/ +int makeIntMap(size_t initialCapacity, + unsigned int initialLoad, + IntMap **mapPtr) + __attribute__((warn_unused_result)); + +/** + * Free an IntMap and null out the reference to it. NOTE: The map does not own + * the pointer values stored in the map and they are not freed by this call. + * + * @param [in,out] mapPtr the reference to the IntMap to free + **/ +void freeIntMap(IntMap **mapPtr); + +/** + * Get the number of entries stored in an IntMap. + * + * @param map the IntMap to query + * + * @return the number of entries in the map + **/ +size_t intMapSize(const IntMap *map); + +/** + * Retrieve the value associated with a given key from the IntMap. + * + * @param map the IntMap to query + * @param key the key to look up + * + * @return the value associated with the given key, or NULL + * if the key is not mapped to any value + **/ +void *intMapGet(IntMap *map, uint64_t key); + +/** + * Try to associate a value (a pointer) with an integer in an IntMap. If the + * map already contains a mapping for the provided key, the old value is + * only replaced with the specified value if update is true. In either case + * the old value is returned. If the map does not already contain a value for + * the specified key, the new value is added regardless of the value of update. + * + * @param [in] map the IntMap to attempt to modify + * @param [in] key the key with which to associate the new value + * @param [in] newValue the value to be associated with the key + * @param [in] update whether to overwrite an existing value + * @param [out] oldValuePtr a pointer in which to store either the old value + * (if the key was already mapped) or + * NULL if the map did not contain the + * key; NULL may be provided if the + * caller does not need to know the old value + * + * @return UDS_SUCCESS or an error code + **/ +int intMapPut(IntMap *map, + uint64_t key, + void *newValue, + bool update, + void **oldValuePtr) + __attribute__((warn_unused_result)); + +/** + * Remove the mapping for a given key from the IntMap. + * + * @param map the IntMap from which to remove the mapping + * @param key the key whose mapping is to be removed + * + * @return the value that was associated with the key, or + * NULL if it was not mapped + **/ +void *intMapRemove(IntMap *map, uint64_t key); + +#endif /* INT_MAP_H */ diff --git a/vdo/base/journalPoint.h b/vdo/base/journalPoint.h new file mode 100644 index 0000000..30d44cd --- /dev/null +++ b/vdo/base/journalPoint.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/journalPoint.h#2 $ + */ + +#ifndef JOURNAL_POINT_H +#define JOURNAL_POINT_H + +#include "numeric.h" +#include "types.h" + +typedef uint16_t JournalEntryCount; + +/** + * The absolute position of an entry in a recovery journal or slab journal. + **/ +typedef struct { + SequenceNumber sequenceNumber; + JournalEntryCount entryCount; +} JournalPoint; + +/** + * A packed, platform-independent encoding of a JournalPoint. + **/ +typedef struct { + /** + * The packed representation is the little-endian 64-bit representation of + * the low-order 48 bits of the sequence number, shifted up 16 bits, or'ed + * with the 16-bit entry count. + * + * Very long-term, the top 16 bits of the sequence number may not always be + * zero, as this encoding assumes--see BZ 1523240. + **/ + byte encodedPoint[8]; +} __attribute__((packed)) PackedJournalPoint; + +/** + * Move the given journal point forward by one entry. + * + * @param point the journal point to adjust + * @param entriesPerBlock the number of entries in one full block + **/ +static inline void advanceJournalPoint(JournalPoint *point, + JournalEntryCount entriesPerBlock) +{ + point->entryCount++; + if (point->entryCount == entriesPerBlock) { + point->sequenceNumber++; + point->entryCount = 0; + } +} + +/** + * Check whether a journal point is valid. + * + * @param point the journal point + * + * @return true if the journal point is valid + **/ +static inline bool isValidJournalPoint(const JournalPoint *point) +{ + return ((point != NULL) && (point->sequenceNumber > 0)); +} + +/** + * Check whether the first point precedes the second point. + * + * @param first the first journal point + * @param second the second journal point + + * + * @return true if the first point precedes the second point. + **/ +static inline bool beforeJournalPoint(const JournalPoint *first, + const JournalPoint *second) +{ + return ((first->sequenceNumber < second->sequenceNumber) + || ((first->sequenceNumber == second->sequenceNumber) + && (first->entryCount < second->entryCount))); +} + +/** + * Check whether the first point is the same as the second point. + * + * @param first the first journal point + * @param second the second journal point + * + * @return true if both points reference the same logical + * position of an entry the journal + **/ +static inline bool areEquivalentJournalPoints(const JournalPoint *first, + const JournalPoint *second) +{ + return ((first->sequenceNumber == second->sequenceNumber) + && (first->entryCount == second->entryCount)); +} + +/** + * Encode the journal location represented by a JournalPoint into a + * PackedJournalPoint. 
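+ *
+ * As an illustrative example (not part of the original comment): a point
+ * with sequenceNumber == 5 and entryCount == 3 is stored as the
+ * little-endian encoding of
+ *
+ *   (5 << 16) | 3 == 0x0000000000050003
+ *
+ * and unpackJournalPoint() recovers {5, 3} by shifting right 16 bits and
+ * masking with 0xffff.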
+ * + * @param unpacked The unpacked input point + * @param packed The packed output point + **/ +static inline void packJournalPoint(const JournalPoint *unpacked, + PackedJournalPoint *packed) +{ + uint64_t native = ((unpacked->sequenceNumber << 16) | unpacked->entryCount); + storeUInt64LE(packed->encodedPoint, native); +} + +/** + * Decode the journal location represented by a PackedJournalPoint into a + * JournalPoint. + * + * @param packed The packed input point + * @param unpacked The unpacked output point + **/ +static inline void unpackJournalPoint(const PackedJournalPoint *packed, + JournalPoint *unpacked) +{ + uint64_t native = getUInt64LE(packed->encodedPoint); + unpacked->sequenceNumber = (native >> 16); + unpacked->entryCount = (native & 0xffff); +} + +#endif // JOURNAL_POINT_H diff --git a/vdo/base/lockCounter.c b/vdo/base/lockCounter.c new file mode 100644 index 0000000..e762576 --- /dev/null +++ b/vdo/base/lockCounter.c @@ -0,0 +1,391 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/lockCounter.c#3 $ + */ + +#include "lockCounter.h" + +#include "atomic.h" +#include "memoryAlloc.h" + +/** + * LockCounter is intended to keep all of the locks for the blocks in the + * recovery journal. The per-zone counters are all kept in a single array which + * is arranged by zone (i.e. zone 0's lock 0 is at index 0, zone 0's lock 1 is + * at index 1, and zone 1's lock 0 is at index 'locks'. This arrangement is + * intended to minimize cache-line contention for counters from different + * zones. + * + * The locks are implemented as a single object instead of as a lock counter + * per lock both to afford this opportunity to reduce cache line contention and + * also to eliminate the need to have a completion per lock. + * + * Lock sets are laid out with the set for recovery journal first, followed by + * the logical zones, and then the physical zones. 
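+ *
+ * As an illustrative example (not part of the original comment): with
+ * 'locks' == 4, the counter for lock 2 held by logical zone 1 lives at
+ * index (4 * 1) + 2 == 6 of the logicalCounters array, which is the same
+ * calculation getCounter() performs below.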
+ **/ +typedef enum lockCounterState { + LOCK_COUNTER_STATE_NOT_NOTIFYING = 0, + LOCK_COUNTER_STATE_NOTIFYING, + LOCK_COUNTER_STATE_SUSPENDED, +} LockCounterState; + +struct lockCounter { + /** The completion for notifying the owner of a lock release */ + VDOCompletion completion; + /** The number of logical zones which may hold locks */ + ZoneCount logicalZones; + /** The number of physical zones which may hold locks */ + ZoneCount physicalZones; + /** The number of locks */ + BlockCount locks; + /** Whether the lock release notification is in flight */ + Atomic32 state; + /** The number of logical zones which hold each lock */ + Atomic32 *logicalZoneCounts; + /** The number of physical zones which hold each lock */ + Atomic32 *physicalZoneCounts; + /** The per-zone, per-lock counts for the journal zone */ + uint16_t *journalCounters; + /** The per-zone, per-lock decrement counts for the journal zone */ + Atomic32 *journalDecrementCounts; + /** The per-zone, per-lock reference counts for logical zones */ + uint16_t *logicalCounters; + /** The per-zone, per-lock reference counts for physical zones */ + uint16_t *physicalCounters; +}; + +/**********************************************************************/ +int makeLockCounter(PhysicalLayer *layer, + void *parent, + VDOAction callback, + ThreadID threadID, + ZoneCount logicalZones, + ZoneCount physicalZones, + BlockCount locks, + LockCounter **lockCounterPtr) +{ + LockCounter *lockCounter; + + int result = ALLOCATE(1, LockCounter, __func__, &lockCounter); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(locks, uint16_t, __func__, &lockCounter->journalCounters); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = ALLOCATE(locks, Atomic32, __func__, + &lockCounter->journalDecrementCounts); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = ALLOCATE(locks * logicalZones, uint16_t, __func__, + &lockCounter->logicalCounters); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = ALLOCATE(locks, Atomic32, __func__, + &lockCounter->logicalZoneCounts); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = ALLOCATE(locks * physicalZones, uint16_t, __func__, + &lockCounter->physicalCounters); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = ALLOCATE(locks, Atomic32, __func__, + &lockCounter->physicalZoneCounts); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = initializeEnqueueableCompletion(&lockCounter->completion, + LOCK_COUNTER_COMPLETION, layer); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + setCallbackWithParent(&lockCounter->completion, callback, threadID, parent); + lockCounter->logicalZones = logicalZones; + lockCounter->physicalZones = physicalZones; + lockCounter->locks = locks; + *lockCounterPtr = lockCounter; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeLockCounter(LockCounter **lockCounterPtr) +{ + if (*lockCounterPtr == NULL) { + return; + } + + LockCounter *lockCounter = *lockCounterPtr; + destroyEnqueueable(&lockCounter->completion); + freeVolatile(lockCounter->physicalZoneCounts); + freeVolatile(lockCounter->logicalZoneCounts); + freeVolatile(lockCounter->journalDecrementCounts); + FREE(lockCounter->journalCounters); + 
FREE(lockCounter->logicalCounters); + FREE(lockCounter->physicalCounters); + FREE(lockCounter); + *lockCounterPtr = NULL; +} + +/** + * Get a pointer to the zone count for a given lock on a given zone. + * + * @param counter The lock counter + * @param lockNumber The lock to get + * @param zoneType The zone type whose count is desired + * + * @return A pointer to the zone count for the given lock and zone + **/ +static inline Atomic32 *getZoneCountPtr(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType) +{ + return ((zoneType == ZONE_TYPE_LOGICAL) + ? &counter->logicalZoneCounts[lockNumber] + : &counter->physicalZoneCounts[lockNumber]); +} + +/** + * Get the zone counter for a given lock on a given zone. + * + * @param counter The lock counter + * @param lockNumber The lock to get + * @param zoneType The zone type whose count is desired + * @param zoneID The zone index whose count is desired + * + * @return The counter for the given lock and zone + **/ +static inline uint16_t *getCounter(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + BlockCount zoneCounter = (counter->locks * zoneID) + lockNumber; + if (zoneType == ZONE_TYPE_JOURNAL) { + return &counter->journalCounters[zoneCounter]; + } + + if (zoneType == ZONE_TYPE_LOGICAL) { + return &counter->logicalCounters[zoneCounter]; + } + + return &counter->physicalCounters[zoneCounter]; +} + +/** + * Check whether the journal zone is locked for a given lock. + * + * @param counter The LockCounter + * @param lockNumber The lock to check + * + * @return true if the journal zone is locked + **/ +static bool isJournalZoneLocked(LockCounter *counter, BlockCount lockNumber) +{ + uint16_t journalValue + = *(getCounter(counter, lockNumber, ZONE_TYPE_JOURNAL, 0)); + uint32_t decrements + = atomicLoad32(&(counter->journalDecrementCounts[lockNumber])); + ASSERT_LOG_ONLY((decrements <= journalValue), + "journal zone lock counter must not underflow"); + + return (journalValue != decrements); +} + +/**********************************************************************/ +bool isLocked(LockCounter *lockCounter, + BlockCount lockNumber, + ZoneType zoneType) +{ + ASSERT_LOG_ONLY((zoneType != ZONE_TYPE_JOURNAL), + "isLocked() called for non-journal zone"); + return (isJournalZoneLocked(lockCounter, lockNumber) + || (atomicLoad32(getZoneCountPtr(lockCounter, lockNumber, zoneType)) + != 0)); +} + +/** + * Check that we are on the journal thread. 
+ * + * @param counter The LockCounter + * @param caller The name of the caller (for logging) + **/ +static void assertOnJournalThread(LockCounter *counter, const char *caller) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + == counter->completion.callbackThreadID), + "%s() called from journal zone", caller); +} + +/**********************************************************************/ +void initializeLockCount(LockCounter *counter, + BlockCount lockNumber, + uint16_t value) +{ + assertOnJournalThread(counter, __func__); + uint16_t *journalValue = getCounter(counter, lockNumber, ZONE_TYPE_JOURNAL, + 0); + Atomic32 *decrementCount = &(counter->journalDecrementCounts[lockNumber]); + ASSERT_LOG_ONLY((*journalValue == atomicLoad32(decrementCount)), + "count to be initialized not in use"); + + *journalValue = value; + atomicStore32(decrementCount, 0); +} + +/**********************************************************************/ +void acquireLockCountReference(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + ASSERT_LOG_ONLY((zoneType != ZONE_TYPE_JOURNAL), + "invalid lock count increment from journal zone"); + + uint16_t *currentValue = getCounter(counter, lockNumber, zoneType, zoneID); + ASSERT_LOG_ONLY(*currentValue < UINT16_MAX, + "increment of lock counter must not overflow"); + + if (*currentValue == 0) { + // This zone is acquiring this lock for the first time. + atomicAdd32(getZoneCountPtr(counter, lockNumber, zoneType), 1); + } + *currentValue += 1; +} + +/** + * Decrement a non-atomic counter. + * + * @param counter The LockCounter + * @param lockNumber Which lock to decrement + * @param zoneType The type of the zone releasing the reference + * @param zoneID The ID of the zone releasing the reference + * + * @return The new value of the counter + **/ +static uint16_t releaseReference(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + uint16_t *currentValue = getCounter(counter, lockNumber, zoneType, zoneID); + ASSERT_LOG_ONLY((*currentValue >= 1), + "decrement of lock counter must not underflow"); + + *currentValue -= 1; + return *currentValue; +} + +/** + * Attempt to notify the owner of this LockCounter that some lock has been + * released for some zone type. Will do nothing if another notification is + * already in progress. + * + * @param counter The LockCounter + **/ +static void attemptNotification(LockCounter *counter) +{ + if (compareAndSwap32(&counter->state, + LOCK_COUNTER_STATE_NOT_NOTIFYING, + LOCK_COUNTER_STATE_NOTIFYING)) { + resetCompletion(&counter->completion); + invokeCallback(&counter->completion); + } +} + +/**********************************************************************/ +void releaseLockCountReference(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + ASSERT_LOG_ONLY((zoneType != ZONE_TYPE_JOURNAL), + "invalid lock count decrement from journal zone"); + if (releaseReference(counter, lockNumber, zoneType, zoneID) != 0) { + return; + } + + if (atomicAdd32(getZoneCountPtr(counter, lockNumber, zoneType), -1) == 0) { + // This zone was the last lock holder of its type, so try to notify the + // owner. 
+ attemptNotification(counter); + } +} + +/**********************************************************************/ +void releaseJournalZoneReference(LockCounter *counter, BlockCount lockNumber) +{ + assertOnJournalThread(counter, __func__); + releaseReference(counter, lockNumber, ZONE_TYPE_JOURNAL, 0); + if (!isJournalZoneLocked(counter, lockNumber)) { + // The journal zone is not locked, so try to notify the owner. + attemptNotification(counter); + } +} + +/**********************************************************************/ +void releaseJournalZoneReferenceFromOtherZone(LockCounter *counter, + BlockCount lockNumber) +{ + atomicAdd32(&(counter->journalDecrementCounts[lockNumber]), 1); +} + +/**********************************************************************/ +void acknowledgeUnlock(LockCounter *counter) +{ + atomicStore32(&counter->state, LOCK_COUNTER_STATE_NOT_NOTIFYING); +} + +/**********************************************************************/ +bool suspendLockCounter(LockCounter *counter) +{ + assertOnJournalThread(counter, __func__); + return ((atomicLoad32(&counter->state) == LOCK_COUNTER_STATE_SUSPENDED) + || compareAndSwap32(&counter->state, + LOCK_COUNTER_STATE_NOT_NOTIFYING, + LOCK_COUNTER_STATE_SUSPENDED)); +} + +/**********************************************************************/ +bool resumeLockCounter(LockCounter *counter) +{ + assertOnJournalThread(counter, __func__); + return compareAndSwap32(&counter->state, + LOCK_COUNTER_STATE_SUSPENDED, + LOCK_COUNTER_STATE_NOT_NOTIFYING); +} diff --git a/vdo/base/lockCounter.h b/vdo/base/lockCounter.h new file mode 100644 index 0000000..cbda7bd --- /dev/null +++ b/vdo/base/lockCounter.h @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/lockCounter.h#2 $ + */ + +#ifndef LOCK_COUNTER_H +#define LOCK_COUNTER_H + +#include "completion.h" +#include "types.h" + +/** + * LockCounter provides a set of shared reference count locks which is safe + * across multiple zones with a minimum of cross-thread synchronization + * operations. For each lock in the set, it maintains a set of per-zone lock + * counts, and a single, atomic count of the number of zones holding locks. + * Whenever a zone's individual counter for a lock goes from 0 to 1, the + * zone count for that lock is incremented. Whenever a zone's individual + * counter for a lock goes from 1 to 0, the zone count for that lock is + * decremented. If the zone count goes to 0, and the lock counter's + * completion is not in use, the completion is launched to inform the counter's + * owner that some lock has been released. 
It is the owner's responsibility to + * check for which locks have been released, and to inform the lock counter + * that it has received the notification by calling acknowledgeUnlock(). + **/ + +/** + * Create a lock counter. + * + * @param [in] layer The physical layer of the VDO + * @param [in] parent The parent to notify when the lock count goes + * to zero + * @param [in] callback The function to call when the lock count goes + * to zero + * @param [in] threadID The id of thread on which to run the callback + * @param [in] logicalZones The total number of logical zones + * @param [in] physicalZones The total number of physical zones + * @param [in] locks The number of locks + * @param [out] lockCounterPtr A pointer to hold the new counter + * + * @return VDO_SUCCESS or an error + **/ +int makeLockCounter(PhysicalLayer *layer, + void *parent, + VDOAction callback, + ThreadID threadID, + ZoneCount logicalZones, + ZoneCount physicalZones, + BlockCount locks, + LockCounter **lockCounterPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a lock counter and NULL out the reference to it. + * + * @param lockCounterPtr A pointer to the lock counter reference to free + **/ +void freeLockCounter(LockCounter **lockCounterPtr); + +/** + * Check whether a lock is locked for a zone type. If the recovery journal has + * a lock on the lock number, both logical and physical zones are considered + * locked. + * + * @param lockCounter The set of locks to check + * @param lockNumber The lock to check + * @param zoneType The type of the zone + * + * @return true if the specified lock has references (is locked) + **/ +bool isLocked(LockCounter *lockCounter, + BlockCount lockNumber, + ZoneType zoneType) + __attribute__((warn_unused_result)); + +/** + * Initialize the value of the journal zone's counter for a given lock. This + * must be called from the journal zone. + * + * @param counter The counter to initialize + * @param lockNumber Which lock to initialize + * @param value The value to set + **/ +void initializeLockCount(LockCounter *counter, + BlockCount lockNumber, + uint16_t value); + +/** + * Acquire a reference to a given lock in the specified zone. This method must + * not be used from the journal zone. + * + * @param counter The LockCounter + * @param lockNumber Which lock to increment + * @param zoneType The type of the zone acquiring the reference + * @param zoneID The ID of the zone acquiring the reference + **/ +void acquireLockCountReference(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID); + +/** + * Release a reference to a given lock in the specified zone. This method + * must not be used from the journal zone. + * + * @param counter The LockCounter + * @param lockNumber Which lock to increment + * @param zoneType The type of the zone releasing the reference + * @param zoneID The ID of the zone releasing the reference + **/ +void releaseLockCountReference(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID); + +/** + * Release a single journal zone reference from the journal zone. This method + * must be called from the journal zone. + * + * @param counter The counter from which to release a reference + * @param lockNumber The lock from which to release a reference + **/ +void releaseJournalZoneReference(LockCounter *counter, BlockCount lockNumber); + +/** + * Release a single journal zone reference from any zone. 
This method shouldn't + * be called from the journal zone as it would be inefficient; use + * releaseJournalZoneReference() instead. + * + * @param counter The counter from which to release a reference + * @param lockNumber The lock from which to release a reference + **/ +void releaseJournalZoneReferenceFromOtherZone(LockCounter *counter, + BlockCount lockNumber); + +/** + * Inform a lock counter that an unlock notification was received by the + * caller. + * + * @param counter The counter to inform + **/ +void acknowledgeUnlock(LockCounter *counter); + +/** + * Prevent the lock counter from issuing notifications. + * + * @param counter The counter + * + * @return true if the lock counter was not notifying and hence + * the suspend was efficacious + **/ +bool suspendLockCounter(LockCounter *counter) + __attribute__((warn_unused_result)); + +/** + * Re-allow notifications from a suspended lock counter. + * + * @param counter The counter + * + * @return true if the lock counter was suspended + **/ +bool resumeLockCounter(LockCounter *counter) + __attribute__((warn_unused_result)); + +#endif // LOCK_COUNTER_H diff --git a/vdo/base/logicalZone.c b/vdo/base/logicalZone.c new file mode 100644 index 0000000..0834ff1 --- /dev/null +++ b/vdo/base/logicalZone.c @@ -0,0 +1,463 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/logicalZone.c#6 $ + */ + +#include "logicalZone.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "actionManager.h" +#include "adminState.h" +#include "allocationSelector.h" +#include "atomic.h" +#include "blockMap.h" +#include "completion.h" +#include "constants.h" +#include "dataVIO.h" +#include "flush.h" +#include "intMap.h" +#include "vdoInternal.h" + +struct logicalZone { + /** The completion for flush notifications */ + VDOCompletion completion; + /** The owner of this zone */ + LogicalZones *zones; + /** Which logical zone this is */ + ZoneCount zoneNumber; + /** The thread id for this zone */ + ThreadID threadID; + /** In progress operations keyed by LBN */ + IntMap *lbnOperations; + /** The logical to physical map */ + BlockMapZone *blockMapZone; + /** The current flush generation */ + SequenceNumber flushGeneration; + /** The oldest active generation in this zone */ + SequenceNumber oldestActiveGeneration; + /** The number of IOs in the current flush generation */ + BlockCount iosInFlushGeneration; + /** + * The oldest locked generation in this zone (an atomic copy of + * oldestActiveGeneration) + **/ + Atomic64 oldestLockedGeneration; + /** The youngest generation of the current notification */ + SequenceNumber notificationGeneration; + /** Whether a notification is in progress */ + bool notifying; + /** The queue of active data write VIOs */ + RingNode writeVIOs; + /** The administrative state of the zone */ + AdminState state; + /** The selector for determining which physical zone to allocate from */ + AllocationSelector *selector; +}; + +struct logicalZones { + /** The VDO whose zones these are */ + VDO *vdo; + /** The manager for administrative actions */ + ActionManager *manager; + /** The number of zones */ + ZoneCount zoneCount; + /** The logical zones themselves */ + LogicalZone zones[]; +}; + +/** + * Convert a generic VDOCompletion to a LogicalZone. + * + * @param completion The completion to convert + * + * @return The completion as a LogicalZone + **/ +static LogicalZone *asLogicalZone(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(LogicalZone, completion) == 0); + assertCompletionType(completion->type, GENERATION_FLUSHED_COMPLETION); + return (LogicalZone *) completion; +} + +/**********************************************************************/ +LogicalZone *getLogicalZone(LogicalZones *zones, ZoneCount zoneNumber) +{ + return (zoneNumber < zones->zoneCount) ? &zones->zones[zoneNumber] : NULL; +} + +/** + * Implements ZoneThreadGetter + **/ +static ThreadID getThreadIDForZone(void *context, ZoneCount zoneNumber) +{ + return getLogicalZoneThreadID(getLogicalZone(context, zoneNumber)); +} + +/** + * Initialize a logical zone. 
+ * + * @param zones The LogicalZones to which this zone belongs + * @param zoneNumber The LogicalZone's index + **/ +static int initializeZone(LogicalZones *zones, ZoneCount zoneNumber) +{ + LogicalZone *zone = &zones->zones[zoneNumber]; + zone->zones = zones; + int result = makeIntMap(LOCK_MAP_CAPACITY, 0, &zone->lbnOperations); + if (result != VDO_SUCCESS) { + return result; + } + + VDO *vdo = zones->vdo; + result = initializeEnqueueableCompletion(&zone->completion, + GENERATION_FLUSHED_COMPLETION, + vdo->layer); + if (result != VDO_SUCCESS) { + return result; + } + + zone->zoneNumber = zoneNumber; + zone->threadID = getLogicalZoneThread(getThreadConfig(vdo), + zoneNumber); + zone->blockMapZone = getBlockMapZone(vdo->blockMap, zoneNumber); + initializeRing(&zone->writeVIOs); + atomicStore64(&zone->oldestLockedGeneration, 0); + + return makeAllocationSelector(getThreadConfig(vdo)->physicalZoneCount, + zone->threadID, &zone->selector); +} + +/**********************************************************************/ +int makeLogicalZones(VDO *vdo, LogicalZones **zonesPtr) +{ + const ThreadConfig *threadConfig = getThreadConfig(vdo); + if (threadConfig->logicalZoneCount == 0) { + return VDO_SUCCESS; + } + + LogicalZones *zones; + int result = ALLOCATE_EXTENDED(LogicalZones, threadConfig->logicalZoneCount, + LogicalZone, __func__, &zones); + if (result != VDO_SUCCESS) { + return result; + } + + zones->vdo = vdo; + zones->zoneCount = threadConfig->logicalZoneCount; + for (ZoneCount zone = 0; zone < threadConfig->logicalZoneCount; zone++) { + result = initializeZone(zones, zone); + if (result != VDO_SUCCESS) { + freeLogicalZones(&zones); + return result; + } + } + + result = makeActionManager(zones->zoneCount, getThreadIDForZone, + getAdminThread(threadConfig), zones, NULL, + vdo->layer, &zones->manager); + if (result != VDO_SUCCESS) { + freeLogicalZones(&zones); + return result; + } + + *zonesPtr = zones; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeLogicalZones(LogicalZones **zonesPtr) +{ + LogicalZones *zones = *zonesPtr; + if (zones == NULL) { + return; + } + + freeActionManager(&zones->manager); + + for (ZoneCount index = 0; index < zones->zoneCount; index++) { + LogicalZone *zone = &zones->zones[index]; + freeAllocationSelector(&zone->selector); + destroyEnqueueable(&zone->completion); + freeIntMap(&zone->lbnOperations); + } + + FREE(zones); + *zonesPtr = NULL; +} + +/**********************************************************************/ +static inline void assertOnZoneThread(LogicalZone *zone, const char *what) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == zone->threadID), + "%s() called on correct thread", what); +} + +/** + * Check whether this zone has drained. + * + * @param zone The zone to check + **/ +static void checkForDrainComplete(LogicalZone *zone) +{ + if (!isDraining(&zone->state) || zone->notifying + || !isRingEmpty(&zone->writeVIOs)) { + return; + } + + finishDraining(&zone->state); +} + +/** + * Initiate a drain. + * + * Implements AdminInitiator. + **/ +static void initiateDrain(AdminState *state) +{ + checkForDrainComplete(container_of(state, LogicalZone, state)); +} + +/** + * Drain a logical zone. + * + *
Implements ZoneAction. + **/ +static void drainLogicalZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + LogicalZone *zone = getLogicalZone(context, zoneNumber); + startDraining(&zone->state, getCurrentManagerOperation(zone->zones->manager), + parent, initiateDrain); +} + +/**********************************************************************/ +void drainLogicalZones(LogicalZones *zones, + AdminStateCode operation, + VDOCompletion *parent) +{ + scheduleOperation(zones->manager, operation, NULL, drainLogicalZone, NULL, + parent); +} + +/** + * Resume a logical zone. + * + *
Implements ZoneAction. + **/ +static void resumeLogicalZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + LogicalZone *zone = getLogicalZone(context, zoneNumber); + finishCompletion(parent, resumeIfQuiescent(&zone->state)); +} + +/**********************************************************************/ +void resumeLogicalZones(LogicalZones *zones, VDOCompletion *parent) +{ + scheduleOperation(zones->manager, ADMIN_STATE_RESUMING, NULL, + resumeLogicalZone, NULL, parent); +} + +/**********************************************************************/ +ThreadID getLogicalZoneThreadID(const LogicalZone *zone) +{ + return zone->threadID; +} + +/**********************************************************************/ +BlockMapZone *getBlockMapForZone(const LogicalZone *zone) +{ + return zone->blockMapZone; +} + +/**********************************************************************/ +IntMap *getLBNLockMap(const LogicalZone *zone) +{ + return zone->lbnOperations; +} + +/**********************************************************************/ +LogicalZone *getNextLogicalZone(const LogicalZone *zone) +{ + return getLogicalZone(zone->zones, zone->zoneNumber + 1); +} + +/** + * Convert a RingNode to a DataVIO. + * + * @param ringNode The RingNode to convert + * + * @return The DataVIO which owns the RingNode + **/ +static inline DataVIO *dataVIOFromRingNode(RingNode *ringNode) +{ + return (DataVIO *) ((byte *) ringNode - offsetof(DataVIO, writeNode)); +} + +/** + * Update the oldest active generation. If it has changed, update the + * atomic copy as well. + * + * @param zone The zone + * + * @return true if the oldest active generation has changed + **/ +static bool updateOldestActiveGeneration(LogicalZone *zone) +{ + SequenceNumber currentOldest = zone->oldestActiveGeneration; + if (isRingEmpty(&zone->writeVIOs)) { + zone->oldestActiveGeneration = zone->flushGeneration; + } else { + zone->oldestActiveGeneration + = dataVIOFromRingNode(zone->writeVIOs.next)->flushGeneration; + } + + if (zone->oldestActiveGeneration == currentOldest) { + return false; + } + + atomicStore64(&zone->oldestLockedGeneration, zone->oldestActiveGeneration); + return true; +} + +/**********************************************************************/ +void incrementFlushGeneration(LogicalZone *zone, + SequenceNumber expectedGeneration) +{ + assertOnZoneThread(zone, __func__); + ASSERT_LOG_ONLY((zone->flushGeneration == expectedGeneration), + "logical zone %u flush generation %" PRIu64 + " should be %llu before increment", + zone->zoneNumber, zone->flushGeneration, + expectedGeneration); + + zone->flushGeneration++; + zone->iosInFlushGeneration = 0; + updateOldestActiveGeneration(zone); +} + +/**********************************************************************/ +SequenceNumber getOldestLockedGeneration(const LogicalZone *zone) +{ + return (SequenceNumber) atomicLoad64(&zone->oldestLockedGeneration); +} + +/**********************************************************************/ +int acquireFlushGenerationLock(DataVIO *dataVIO) +{ + LogicalZone *zone = dataVIO->logical.zone; + assertOnZoneThread(zone, __func__); + if (!isNormal(&zone->state)) { + return VDO_INVALID_ADMIN_STATE; + } + + dataVIO->flushGeneration = zone->flushGeneration; + pushRingNode(&zone->writeVIOs, &dataVIO->writeNode); + dataVIO->hasFlushGenerationLock = true; + zone->iosInFlushGeneration++; + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void 
attemptGenerationCompleteNotification(VDOCompletion *completion); + +/** + * Notify the flush that at least one generation no longer has active VIOs. + * This callback is registered in attemptGenerationCompleteNotification(). + * + * @param completion The zone completion + **/ +static void notifyFlusher(VDOCompletion *completion) +{ + LogicalZone *zone = asLogicalZone(completion); + completeFlushes(zone->zones->vdo->flusher); + launchCallback(completion, attemptGenerationCompleteNotification, + zone->threadID); +} + +/** + * Notify the flusher if some generation no longer has active VIOs. + * + * @param completion The zone completion + **/ +static void attemptGenerationCompleteNotification(VDOCompletion *completion) +{ + LogicalZone *zone = asLogicalZone(completion); + assertOnZoneThread(zone, __func__); + if (zone->oldestActiveGeneration <= zone->notificationGeneration) { + zone->notifying = false; + checkForDrainComplete(zone); + return; + } + + zone->notifying = true; + zone->notificationGeneration = zone->oldestActiveGeneration; + launchCallback(&zone->completion, notifyFlusher, + getFlusherThreadID(zone->zones->vdo->flusher)); +} + +/**********************************************************************/ +void releaseFlushGenerationLock(DataVIO *dataVIO) +{ + LogicalZone *zone = dataVIO->logical.zone; + assertOnZoneThread(zone, __func__); + if (isRingEmpty(&dataVIO->writeNode)) { + // This VIO never got a lock, either because it is a read, or because + // we are in read-only mode. + ASSERT_LOG_ONLY(!dataVIO->hasFlushGenerationLock, + "hasFlushGenerationLock false for VIO not on active list"); + return; + } + + unspliceRingNode(&dataVIO->writeNode); + dataVIO->hasFlushGenerationLock = false; + ASSERT_LOG_ONLY(zone->oldestActiveGeneration <= dataVIO->flushGeneration, + "DataVIO releasing lock on generation %" PRIu64 + " is not older than oldest active generation %llu", + dataVIO->flushGeneration, zone->oldestActiveGeneration); + + if (!updateOldestActiveGeneration(zone) || zone->notifying) { + return; + } + + attemptGenerationCompleteNotification(&zone->completion); +} + +/**********************************************************************/ +AllocationSelector *getAllocationSelector(LogicalZone *zone) +{ + return zone->selector; +} + +/**********************************************************************/ +void dumpLogicalZone(const LogicalZone *zone) +{ + logInfo("LogicalZone %u", zone->zoneNumber); + logInfo(" flushGeneration=%llu oldestActiveGeneration=%" PRIu64 + " oldestLockedGeneration=%llu notificationGeneration=%" PRIu64 + " notifying=%s iosInCurrentGeneration=%llu", + zone->flushGeneration, zone->oldestActiveGeneration, + relaxedLoad64(&zone->oldestLockedGeneration), + zone->notificationGeneration, boolToString(zone->notifying), + zone->iosInFlushGeneration); +} diff --git a/vdo/base/logicalZone.h b/vdo/base/logicalZone.h new file mode 100644 index 0000000..8e0eae6 --- /dev/null +++ b/vdo/base/logicalZone.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/logicalZone.h#3 $ + */ + +#ifndef LOGICAL_ZONE_H +#define LOGICAL_ZONE_H + +#include "adminState.h" +#include "intMap.h" +#include "types.h" + +/** + * Get a logical zone by number. + * + * @param zones A set of logical zones + * @param zoneNumber The number of the zone to get + * + * @return The requested zone + **/ +LogicalZone *getLogicalZone(LogicalZones *zones, ZoneCount zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Create a set of logical zones. + * + * @param [in] vdo The VDO to which the zones will belong + * @param [out] zonesPtr A pointer to hold the new zones + * + * @return VDO_SUCCESS or an error code + **/ +int makeLogicalZones(VDO *vdo, LogicalZones **zonesPtr) + __attribute__((warn_unused_result)); + +/** + * Free a set of logical zones and null out the reference to it. + * + * @param zonePtr A pointer to the zone to free + **/ +void freeLogicalZones(LogicalZones **zonePtr); + +/** + * Drain a set of logical zones. + * + * @param zones The logical zones to suspend + * @param operation The type of drain to perform + * @param completion The object to notify when the zones are suspended + **/ +void drainLogicalZones(LogicalZones *zones, + AdminStateCode operation, + VDOCompletion *completion); + +/** + * Resume a set of logical zones. + * + * @param zones The logical zones to resume + * @param parent The object to notify when the zones have resumed + **/ +void resumeLogicalZones(LogicalZones *zones, VDOCompletion *parent); + +/** + * Get the ID of a logical zone's thread. + * + * @param zone The zone + * + * @return The zone's thread ID + **/ +ThreadID getLogicalZoneThreadID(const LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the portion of the block map for this zone. + * + * @param zone The zone + * + * @return The block map zone + **/ +BlockMapZone *getBlockMapForZone(const LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the logical lock map for this zone. + * + * @param zone The zone + * + * @return The logical lock map for the zone + **/ +IntMap *getLBNLockMap(const LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the next-highest-numbered logical zone, or NULL if the + * zone is the highest-numbered zone in its VDO. + * + * @param zone The logical zone to query + * + * @return The logical zone whose zone number is one greater than the given + * zone, or NULL if there is no such zone + **/ +LogicalZone *getNextLogicalZone(const LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Increment the flush generation in a logical zone. + * + * @param zone The logical zone + * @param expectedGeneration The expected value of the flush generation + * before the increment + **/ +void incrementFlushGeneration(LogicalZone *zone, + SequenceNumber expectedGeneration); + +/** + * Get the oldest flush generation which is locked by a logical zone. + * + * @param zone The logical zone + * + * @return The oldest generation locked by the zone + **/ +SequenceNumber getOldestLockedGeneration(const LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Acquire the shared lock on a flush generation by a write DataVIO. 
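+ *
+ * (Illustrative sketch, not from the original header: a write DataVIO on its
+ * logical zone thread would typically pair this call with
+ * releaseFlushGenerationLock() once its write has completed.)
+ *
+ *   int result = acquireFlushGenerationLock(dataVIO);  // on the zone thread
+ *   if (result == VDO_SUCCESS) {
+ *     // ... carry out the write for this flush generation ...
+ *     releaseFlushGenerationLock(dataVIO);  // may let pending flushes finish
+ *   }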
+ * + * @param dataVIO The DataVIO + * + * @return VDO_SUCCESS or an error code + **/ +int acquireFlushGenerationLock(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Release the shared lock on a flush generation held by a write DataVIO. If + * there are pending flushes, and this DataVIO completes the oldest generation + * active in this zone, an attempt will be made to finish any flushes which may + * now be complete. + * + * @param dataVIO The DataVIO whose lock is to be released + **/ +void releaseFlushGenerationLock(DataVIO *dataVIO); + +/** + * Get the selector for deciding which physical zone should be allocated from + * next for activities in a logical zone. + * + * @param zone The logical zone of the operation which needs an allocation + * + * @return The allocation selector for this zone + **/ +AllocationSelector *getAllocationSelector(LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Dump information about a logical zone to the log for debugging, in a + * thread-unsafe fashion. + * + * @param zone The zone to dump + **/ +void dumpLogicalZone(const LogicalZone *zone); + +#endif // LOGICAL_ZONE_H diff --git a/vdo/base/lz4.c b/vdo/base/lz4.c new file mode 100644 index 0000000..1114aa8 --- /dev/null +++ b/vdo/base/lz4.c @@ -0,0 +1,886 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/lz4.c#2 $ + */ + +// Get the memcpy fixup from common.h. +#include "common.h" + +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-2012, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + - LZ4 source repository : http://code.google.com/p/lz4/ +*/ +/* + * With authors permission dual licensed as BSD/GPL for linux kernel + * + * Origin: http://lz4.googlecode.com/svn/trunk + * Revision: 88 + */ + +//************************************** +// Tuning parameters +//************************************** +// MEMORY_USAGE : +// Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) +// Increasing memory usage improves compression ratio +// Reduced memory usage can improve speed, due to cache effect +// Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache +#define MEMORY_USAGE 14 + +// NOTCOMPRESSIBLE_DETECTIONLEVEL : +// Decreasing this value will make the algorithm skip faster data segments considered "incompressible" +// This may decrease compression ratio dramatically, but will be faster on incompressible data +// Increasing this value will make the algorithm search more before declaring a segment "incompressible" +// This could improve compression a bit, but will be slower on incompressible data +// The default value (6) is recommended +#define NOTCOMPRESSIBLE_DETECTIONLEVEL 6 + +// BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE : +// This will provide a small boost to performance for big endian cpu, but the resulting compressed stream will be incompatible with little-endian CPU. +// You can set this option to 1 in situations where data will remain within closed environment +// This option is useless on Little_Endian CPU (such as x86) +//#define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 + + + +//************************************** +// CPU Feature Detection +//************************************** +// 32 or 64 bits ? +#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) || defined(__amd64) || defined(__ppc64__) || defined(_WIN64) || defined(__LP64__) || defined(_LP64) ) // Detects 64 bits mode +# define LZ4_ARCH64 1 +#else +# define LZ4_ARCH64 0 +#endif + +// Little Endian or Big Endian ? +// GCC normally defines these three macros (and PDP-endian which we ignore). +#if !defined(__ORDER_LITTLE_ENDIAN__) || !defined(__ORDER_BIG_ENDIAN__) \ + || !defined(__BYTE_ORDER__) +#error "GCC byte order macros not defined?" +#endif +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# define LZ4_BIG_ENDIAN 1 +#elif __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ +# error "fix byte order check" +#endif + +// Unaligned memory access is automatically enabled for "common" CPU, such as x86. 
+// For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected +// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance +#if defined(__ARM_FEATURE_UNALIGNED) +# define LZ4_FORCE_UNALIGNED_ACCESS 1 +#endif + +// Define this parameter if your target system or compiler does not support hardware bit count +#if defined(_MSC_VER) && defined(_WIN32_WCE) // Visual Studio for Windows CE does not support Hardware bit count +# define LZ4_FORCE_SW_BITCOUNT +#endif + + +//************************************** +// Compiler Options +//************************************** +#if __STDC_VERSION__ >= 199901L // C99 +/* "restrict" is a known keyword */ +#else +# define restrict // Disable restrict +#endif + +#define _GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifdef _MSC_VER // Visual Studio +# include // For Visual 2005 +# if LZ4_ARCH64 // 64-bit +# pragma intrinsic(_BitScanForward64) // For Visual 2005 +# pragma intrinsic(_BitScanReverse64) // For Visual 2005 +# else +# pragma intrinsic(_BitScanForward) // For Visual 2005 +# pragma intrinsic(_BitScanReverse) // For Visual 2005 +# endif +#endif + +#ifdef _MSC_VER +# define lz4_bswap16(x) _byteswap_ushort(x) +#else +# define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8))) +#endif + +#if (_GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +//************************************** +// Includes +//************************************** +#ifdef __KERNEL__ +# include // for memset +#else /* __KERNEL__ */ +# include // for malloc +# include // for memset +#endif /* __KERNEL__ */ +#include "lz4.h" + + +//************************************** +// Basic Types +//************************************** +#if defined(_MSC_VER) // Visual Studio does not support 'stdint' natively +# define BYTE unsigned __int8 +# define U16 unsigned __int16 +# define U32 unsigned __int32 +# define S32 __int32 +# define U64 unsigned __int64 +#else +# ifdef __KERNEL__ +# include +# else /* __KERNEL__ */ +# include +# endif /* __KERNEL__ */ +# define BYTE uint8_t +# define U16 uint16_t +# define U32 uint32_t +# define S32 int32_t +# define U64 uint64_t +#endif + +#ifndef LZ4_FORCE_UNALIGNED_ACCESS +# pragma pack(push, 1) +#endif + +typedef struct _U16_S { U16 v; } U16_S; +typedef struct _U32_S { U32 v; } U32_S; +typedef struct _U64_S { U64 v; } U64_S; + +#ifndef LZ4_FORCE_UNALIGNED_ACCESS +# pragma pack(pop) +#endif + +#define A64(x) (((U64_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A16(x) (((U16_S *)(x))->v) + + +//************************************** +// Constants +//************************************** +#define MINMATCH 4 + +#define HASH_LOG (MEMORY_USAGE-2) +#define HASHTABLESIZE (1 << HASH_LOG) +#define HASH_MASK (HASHTABLESIZE - 1) + +#define SKIPSTRENGTH (NOTCOMPRESSIBLE_DETECTIONLEVEL>2?NOTCOMPRESSIBLE_DETECTIONLEVEL:2) +#define STACKLIMIT 13 +#define HEAPMODE (HASH_LOG>STACKLIMIT) // Defines if memory is allocated into the stack (local variable), or into the heap (malloc()). 
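+// Worked example (annotation added for clarity, not in the original source):
+// with MEMORY_USAGE 14, HASH_LOG is 12, so HASHTABLESIZE is 1 << 12 == 4096
+// entries, and HEAPMODE is (12 > 13) == 0, meaning the compression hash table
+// is a stack-allocated local rather than a malloc()'d buffer.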
+#define COPYLENGTH 8 +#define LASTLITERALS 5 +#define MFLIMIT (COPYLENGTH+MINMATCH) +#define MINLENGTH (MFLIMIT+1) + +#define MAXD_LOG 16 +#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) + +#define ML_BITS 4 +#define ML_MASK ((1U<> ((MINMATCH*8)-HASH_LOG)) +#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p)) +#define LZ4_WILDCOPY(s,d,e) do { LZ4_COPYPACKET(s,d) } while (d>3); + #elif defined(__GNUC__) && (_GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll(val) >> 3); + #else + int r; + if (!(val>>32)) { r=4; } else { r=0; val>>=32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; + #endif +#else + #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64( &r, val ); + return (int)(r>>3); + #elif defined(__GNUC__) && (_GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll(val) >> 3); + #else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -val) * 0x0218A392CDABBD3F)) >> 58]; + #endif +#endif +} + +#else + +static inline int LZ4_NbCommonBytes (register U32 val) +{ +#if defined(LZ4_BIG_ENDIAN) + #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, val ); + return (int)(r>>3); + #elif defined(__GNUC__) && (_GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz(val) >> 3); + #else + int r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; + #endif +#else + #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward( &r, val ); + return (int)(r>>3); + #elif defined(__GNUC__) && (_GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz(val) >> 3); + #else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; + #endif +#endif +} + +#endif + + + +//****************************** +// Compression functions +//****************************** + +// LZ4_compressCtx : +// ----------------- +// Compress 'isize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'. +// If it cannot achieve it, compression will stop, and result of the function will be zero. 
+// return : the number of bytes written in buffer 'dest', or 0 if the compression fails + +static inline int LZ4_compressCtx(void** ctx, + const char* source, + char* dest, + int isize, + int maxOutputSize) +{ +#if HEAPMODE + struct refTables *srt = (struct refTables *) (*ctx); + HTYPE* HashTable; +#else + HTYPE HashTable[HASHTABLESIZE] = {0}; +#endif + + const BYTE* ip = (BYTE*) source; + INITBASE(base); + const BYTE* anchor = ip; + const BYTE* const iend = ip + isize; + const BYTE* const mflimit = iend - MFLIMIT; +#define matchlimit (iend - LASTLITERALS) + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + maxOutputSize; + + int len, length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + + // Init + if (isizehashTable); + memset((void*)HashTable, 0, sizeof(srt->hashTable)); +#else + (void) ctx; +#endif + + + // First Byte + HashTable[LZ4_HASH_VALUE(ip)] = ip - base; + ip++; forwardH = LZ4_HASH_VALUE(ip); + + // Main Loop + for ( ; ; ) + { + int findMatchAttempts = (1U << skipStrength) + 3; + const BYTE* forwardIp = ip; + const BYTE* ref; + BYTE* token; + + // Find a match + do { + U32 h = forwardH; + int step = findMatchAttempts++ >> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if (unlikely(forwardIp > mflimit)) { goto _last_literals; } + + forwardH = LZ4_HASH_VALUE(forwardIp); + ref = base + HashTable[h]; + HashTable[h] = ip - base; + + } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); + + // Catch up + while ((ip>anchor) && (ref>(BYTE*)source) && unlikely(ip[-1]==ref[-1])) { ip--; ref--; } + + // Encode Literal length + length = (int)(ip - anchor); + token = op++; + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + (length>>8) > oend)) return 0; // Check output limit +#ifdef _MSC_VER + if (length>=(int)RUN_MASK) + { + int len = length-RUN_MASK; + *token=(RUN_MASK<254) + { + do { *op++ = 255; len -= 255; } while (len>254); + *op++ = (BYTE)len; + memcpy(op, anchor, length); + op += length; + goto _next_match; + } + else + *op++ = (BYTE)len; + } + else *token = (length<=(int)RUN_MASK) { *token=(RUN_MASK< 254 ; len-=255) *op++ = 255; *op++ = (BYTE)len; } + else *token = (length<>8) > oend)) return 0; // Check output limit + if (len>=(int)ML_MASK) { *token+=ML_MASK; len-=ML_MASK; for(; len > 509 ; len-=510) { *op++ = 255; *op++ = 255; } if (len > 254) { len-=255; *op++ = 255; } *op++ = (BYTE)len; } + else *token += len; + + // Test end of chunk + if (ip > mflimit) { anchor = ip; break; } + + // Fill table + HashTable[LZ4_HASH_VALUE(ip-2)] = ip - 2 - base; + + // Test next position + ref = base + HashTable[LZ4_HASH_VALUE(ip)]; + HashTable[LZ4_HASH_VALUE(ip)] = ip - base; + if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { token = op++; *token=0; goto _next_match; } + + // Prepare next loop + anchor = ip++; + forwardH = LZ4_HASH_VALUE(ip); + } + +_last_literals: + // Encode Last Literals + { + int lastRun = (int)(iend - anchor); + if (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize) return 0; + if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK< 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } + else *op++ = (lastRun<> ((MINMATCH*8)-HASHLOG64K)) +#define LZ4_HASH64K_VALUE(p) LZ4_HASH64K_FUNCTION(A32(p)) +static inline int LZ4_compress64kCtx(void** ctx, + const char* source, + char* dest, + int isize, + int maxOutputSize) +{ +#if HEAPMODE + struct refTables *srt = (struct refTables *) (*ctx); + U16* HashTable; +#else + U16 HashTable[HASH64KTABLESIZE] = {0}; +#endif + + const BYTE* ip = 
(BYTE*) source; + const BYTE* anchor = ip; + const BYTE* const base = ip; + const BYTE* const iend = ip + isize; + const BYTE* const mflimit = iend - MFLIMIT; +#define matchlimit (iend - LASTLITERALS) + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + maxOutputSize; + + int len, length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + + // Init + if (isizehashTable); + memset((void*)HashTable, 0, sizeof(srt->hashTable)); +#else + (void) ctx; +#endif + + + // First Byte + ip++; forwardH = LZ4_HASH64K_VALUE(ip); + + // Main Loop + for ( ; ; ) + { + int findMatchAttempts = (1U << skipStrength) + 3; + const BYTE* forwardIp = ip; + const BYTE* ref; + BYTE* token; + + // Find a match + do { + U32 h = forwardH; + int step = findMatchAttempts++ >> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if (forwardIp > mflimit) { goto _last_literals; } + + forwardH = LZ4_HASH64K_VALUE(forwardIp); + ref = base + HashTable[h]; + HashTable[h] = (U16)(ip - base); + + } while (A32(ref) != A32(ip)); + + // Catch up + while ((ip>anchor) && (ref>(BYTE*)source) && (ip[-1]==ref[-1])) { ip--; ref--; } + + // Encode Literal length + length = (int)(ip - anchor); + token = op++; + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + (length>>8) > oend)) return 0; // Check output limit +#ifdef _MSC_VER + if (length>=(int)RUN_MASK) + { + int len = length-RUN_MASK; + *token=(RUN_MASK<254) + { + do { *op++ = 255; len -= 255; } while (len>254); + *op++ = (BYTE)len; + memcpy(op, anchor, length); + op += length; + goto _next_match; + } + else + *op++ = (BYTE)len; + } + else *token = (length<=(int)RUN_MASK) { *token=(RUN_MASK< 254 ; len-=255) *op++ = 255; *op++ = (BYTE)len; } + else *token = (length<>8) > oend)) return 0; // Check output limit + if (len>=(int)ML_MASK) { *token+=ML_MASK; len-=ML_MASK; for(; len > 509 ; len-=510) { *op++ = 255; *op++ = 255; } if (len > 254) { len-=255; *op++ = 255; } *op++ = (BYTE)len; } + else *token += len; + + // Test end of chunk + if (ip > mflimit) { anchor = ip; break; } + + // Fill table + HashTable[LZ4_HASH64K_VALUE(ip-2)] = (U16)(ip - 2 - base); + + // Test next position + ref = base + HashTable[LZ4_HASH64K_VALUE(ip)]; + HashTable[LZ4_HASH64K_VALUE(ip)] = (U16)(ip - base); + if (A32(ref) == A32(ip)) { token = op++; *token=0; goto _next_match; } + + // Prepare next loop + anchor = ip++; + forwardH = LZ4_HASH64K_VALUE(ip); + } + +_last_literals: + // Encode Last Literals + { + int lastRun = (int)(iend - anchor); + if (op + lastRun + 1 + (lastRun-RUN_MASK+255)/255 > oend) return 0; + if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK< 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } + else *op++ = (lastRun<>ML_BITS)) == RUN_MASK) { size_t len; for (;(len=*ip++)==255;length+=255){} length += len; } + + // copy literals + cpy = op+length; + if (unlikely(cpy>oend-COPYLENGTH)) + { + if (cpy != oend) goto _output_error; // Error : not enough place for another match (min 4) + 5 literals + memcpy(op, ip, length); + ip += length; + break; // EOF + } + LZ4_WILDCOPY(ip, op, cpy); ip -= (op-cpy); op = cpy; + + // get offset + LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; + if (unlikely(ref < (BYTE* const)dest)) goto _output_error; // Error : offset create reference outside destination buffer + + // get matchlength + if ((length=(token&ML_MASK)) == ML_MASK) { for (;*ip==255;length+=255) {ip++;} length += *ip++; } + + // copy repeated sequence + if (unlikely((op-ref)oend-COPYLENGTH) + { + if (cpy > oend) goto _output_error; // Error : request to write beyond destination 
buffer + LZ4_SECURECOPY(ref, op, (oend-COPYLENGTH)); + while(op>ML_BITS)) == RUN_MASK) { int s=255; while ((ipoend-COPYLENGTH) || (ip+length>iend-COPYLENGTH)) + { + if (cpy > oend) goto _output_error; // Error : writes beyond output buffer + if (ip+length != iend) goto _output_error; // Error : LZ4 format requires to consume all input at this stage + memcpy(op, ip, length); + op += length; + break; // Necessarily EOF, due to parsing restrictions + } + LZ4_WILDCOPY(ip, op, cpy); ip -= (op-cpy); op = cpy; + + // get offset + LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; + if (ref < (BYTE* const)dest) goto _output_error; // Error : offset creates reference outside of destination buffer + + // get matchlength + if ((length=(token&ML_MASK)) == ML_MASK) { while (ipoend-COPYLENGTH) + { + if (cpy > oend) goto _output_error; // Error : request to write outside of destination buffer + LZ4_SECURECOPY(ref, op, (oend-COPYLENGTH)); + while(op 0) && ((n & (n - 1)) == 0); +} + +/** + * Efficiently calculate the base-2 logarithm of a number truncated to an + * integer value. + * + * This also happens to be the bit index of the highest-order non-zero bit in + * the binary representation of the number, which can easily be used to + * calculate the bit shift corresponding to a bit mask or an array capacity, + * or to calculate the binary floor or ceiling (next lowest or highest power + * of two). + * + * @param n The input value + * + * @return the integer log2 of the value, or -1 if the value is zero + **/ +static inline int logBaseTwo(uint64_t n) +{ + if (n == 0) { + return -1; + } + // Many CPUs, including x86, directly support this calculation, so use the + // GCC function for counting the number of leading high-order zero bits. + return 63 - __builtin_clzll(n); +} + +/** + * Find the minimum of two physical block numbers. + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber minBlock(PhysicalBlockNumber a, + PhysicalBlockNumber b) +{ + return (a < b) ? a : b; +} + +/** + * Find the maximum of two physical block numbers. + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber maxBlock(PhysicalBlockNumber a, + PhysicalBlockNumber b) +{ + return (a > b) ? a : b; +} + +/** + * Find the minimum of two block counts. + **/ +__attribute__((warn_unused_result)) +static inline BlockCount minBlockCount(BlockCount a, BlockCount b) +{ + return (a < b) ? a : b; +} + +/** + * Find the maximum of two block counts. + **/ +__attribute__((warn_unused_result)) +static inline BlockCount maxBlockCount(BlockCount a, BlockCount b) +{ + return (a > b) ? a : b; +} + +/** + * Find the minimum of two sequence numbers. + **/ +__attribute__((warn_unused_result)) +static inline SequenceNumber minSequenceNumber(SequenceNumber a, + SequenceNumber b) +{ + return (a < b) ? a : b; +} + +/** + * Return the minimum of two page counts. + **/ +__attribute__((warn_unused_result)) +static inline PageCount minPageCount(PageCount a, PageCount b) +{ + return (a < b) ? a : b; +} + +/** + * Return the maximum of two page counts. + **/ +__attribute__((warn_unused_result)) +static inline PageCount maxPageCount(PageCount a, PageCount b) +{ + return (a > b) ? a : b; +} + +/** + * Round upward towards the nearest multiple of quantum. 
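+ *
+ * For example (illustrative): roundUpToMultipleSizeT(1000, 4096) returns
+ * 4096, and roundUpToMultipleSizeT(8192, 4096) returns 8192, since 8192 is
+ * already a multiple of the quantum.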
+ * + * @param number a number + * @param quantum the quantum + * + * @return the least multiple of quantum not less than number + **/ +__attribute__((warn_unused_result)) +static inline size_t roundUpToMultipleSizeT(size_t number, size_t quantum) +{ + return number + quantum - 1 - ((number + quantum - 1) % quantum); +} + +/** + * Round upward towards the nearest multiple of quantum for uint64_t + * + * @param number a number + * @param quantum the quantum + * + * @return the least multiple of quantum not less than number + **/ +__attribute__((warn_unused_result)) +static inline uint64_t roundUpToMultipleUInt64T(uint64_t number, + uint64_t quantum) +{ + return number + quantum - 1 - ((number + quantum - 1) % quantum); +} + +/** + * Check whether the given value is between the lower and upper bounds, + * within a cyclic range of values from 0 to (modulus - 1). The value + * and both bounds must be smaller than the modulus. + * + * @param lower The lowest value to accept + * @param value The value to check + * @param upper The highest value to accept + * @param modulus The size of the cyclic space, no more than 2^15 + * + * @return true if the value is in range + **/ +static inline bool inCyclicRange(uint16_t lower, + uint16_t value, + uint16_t upper, + uint16_t modulus) +{ + if (value < lower) { + value += modulus; + } + if (upper < lower) { + upper += modulus; + } + return (value <= upper); +} + +/** + * Compute the number of buckets of a given size which are required to hold a + * given number of objects. + * + * @param objectCount The number of objects to hold + * @param bucketSize The size of a bucket + * + * @return The number of buckets required + **/ +static inline uint64_t computeBucketCount(uint64_t objectCount, + uint64_t bucketSize) +{ + uint64_t quotient = objectCount / bucketSize; + if ((objectCount % bucketSize) > 0) { + ++quotient; + } + return quotient; +} + +#endif // NUM_UTILS_H diff --git a/vdo/base/packedRecoveryJournalBlock.h b/vdo/base/packedRecoveryJournalBlock.h new file mode 100644 index 0000000..b592225 --- /dev/null +++ b/vdo/base/packedRecoveryJournalBlock.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/packedRecoveryJournalBlock.h#3 $ + */ + +#ifndef PACKED_RECOVERY_JOURNAL_BLOCK_H +#define PACKED_RECOVERY_JOURNAL_BLOCK_H + +#include "numeric.h" + +#include "constants.h" +#include "recoveryJournalEntry.h" +#include "types.h" + +typedef struct { + SequenceNumber blockMapHead; // Block map head sequence number + SequenceNumber slabJournalHead; // Slab journal head sequence number + SequenceNumber sequenceNumber; // Sequence number for this block + Nonce nonce; // A given VDO instance's nonce + BlockCount logicalBlocksUsed; // Count of logical blocks in use + BlockCount blockMapDataBlocks; // Count of allocated block map pages + JournalEntryCount entryCount; // Number of entries written + uint8_t checkByte; // The protection check byte + uint8_t recoveryCount; // The number of recoveries completed + VDOMetadataType metadataType; // Metadata type +} RecoveryBlockHeader; + +/** + * The packed, on-disk representation of a recovery journal block header. + * All fields are kept in little-endian byte order. + **/ +typedef union __attribute__((packed)) { + struct __attribute__((packed)) { + /** Block map head 64-bit sequence number */ + byte blockMapHead[8]; + + /** Slab journal head 64-bit sequence number */ + byte slabJournalHead[8]; + + /** The 64-bit sequence number for this block */ + byte sequenceNumber[8]; + + /** A given VDO instance's 64-bit nonce */ + byte nonce[8]; + + /** 8-bit metadata type (should always be one for the recovery journal) */ + uint8_t metadataType; + + /** 16-bit count of the entries encoded in the block */ + byte entryCount[2]; + + /** 64-bit count of the logical blocks used when this block was opened */ + byte logicalBlocksUsed[8]; + + /** 64-bit count of the block map blocks used when this block was opened */ + byte blockMapDataBlocks[8]; + + /** The protection check byte */ + uint8_t checkByte; + + /** The number of recoveries completed */ + uint8_t recoveryCount; + } fields; + + // A raw view of the packed encoding. + uint8_t raw[8 + 8 + 8 + 8 + 1 + 2 + 8 + 8 + 1 + 1]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining packed entries in GDB. + struct __attribute__((packed)) { + SequenceNumber blockMapHead; + SequenceNumber slabJournalHead; + SequenceNumber sequenceNumber; + Nonce nonce; + VDOMetadataType metadataType; + JournalEntryCount entryCount; + BlockCount logicalBlocksUsed; + BlockCount blockMapDataBlocks; + uint8_t checkByte; + uint8_t recoveryCount; + } littleEndian; +#endif +} PackedJournalHeader; + +typedef struct { + /** The protection check byte */ + uint8_t checkByte; + + /** The number of recoveries completed */ + uint8_t recoveryCount; + + /** The number of entries in this sector */ + uint8_t entryCount; + + /** Journal entries for this sector */ + PackedRecoveryJournalEntry entries[]; +} __attribute__((packed)) PackedJournalSector; + +enum { + // Allowing more than 311 entries in each block changes the math + // concerning the amortization of metadata writes and recovery speed. 
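+ // (Added illustration, assuming 512-byte sectors within a 4 KB journal
+ // block: the block header occupies the first sector, and the remaining
+ // seven sectors hold the entries, so 311 must fit within the per-sector
+ // capacities computed below.)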
+ RECOVERY_JOURNAL_ENTRIES_PER_BLOCK = 311, + /** The number of entries in each sector (except the last) when filled */ + RECOVERY_JOURNAL_ENTRIES_PER_SECTOR + = ((VDO_SECTOR_SIZE - sizeof(PackedJournalSector)) + / sizeof(PackedRecoveryJournalEntry)), + /** The number of entries in the last sector when a block is full */ + RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR + = (RECOVERY_JOURNAL_ENTRIES_PER_BLOCK + % RECOVERY_JOURNAL_ENTRIES_PER_SECTOR), +}; + +/** + * Find the recovery journal sector from the block header and sector number. + * + * @param header The header of the recovery journal block + * @param sectorNumber The index of the sector (1-based) + * + * @return A packed recovery journal sector + **/ +__attribute__((warn_unused_result)) +static inline +PackedJournalSector *getJournalBlockSector(PackedJournalHeader *header, + int sectorNumber) +{ + char *sectorData = ((char *) header) + (VDO_SECTOR_SIZE * sectorNumber); + return (PackedJournalSector *) sectorData; +} + +/** + * Generate the packed representation of a recovery block header. + * + * @param header The header containing the values to encode + * @param packed The header into which to pack the values + **/ +static inline void packRecoveryBlockHeader(const RecoveryBlockHeader *header, + PackedJournalHeader *packed) +{ + storeUInt64LE(packed->fields.blockMapHead, header->blockMapHead); + storeUInt64LE(packed->fields.slabJournalHead, header->slabJournalHead); + storeUInt64LE(packed->fields.sequenceNumber, header->sequenceNumber); + storeUInt64LE(packed->fields.nonce, header->nonce); + storeUInt64LE(packed->fields.logicalBlocksUsed, header->logicalBlocksUsed); + storeUInt64LE(packed->fields.blockMapDataBlocks, header->blockMapDataBlocks); + storeUInt16LE(packed->fields.entryCount, header->entryCount); + + packed->fields.checkByte = header->checkByte; + packed->fields.recoveryCount = header->recoveryCount; + packed->fields.metadataType = header->metadataType; +} + +/** + * Decode the packed representation of a recovery block header. + * + * @param packed The packed header to decode + * @param header The header into which to unpack the values + **/ +static inline void unpackRecoveryBlockHeader(const PackedJournalHeader *packed, + RecoveryBlockHeader *header) +{ + *header = (RecoveryBlockHeader) { + .blockMapHead = getUInt64LE(packed->fields.blockMapHead), + .slabJournalHead = getUInt64LE(packed->fields.slabJournalHead), + .sequenceNumber = getUInt64LE(packed->fields.sequenceNumber), + .nonce = getUInt64LE(packed->fields.nonce), + .logicalBlocksUsed = getUInt64LE(packed->fields.logicalBlocksUsed), + .blockMapDataBlocks = getUInt64LE(packed->fields.blockMapDataBlocks), + .entryCount = getUInt16LE(packed->fields.entryCount), + .checkByte = packed->fields.checkByte, + .recoveryCount = packed->fields.recoveryCount, + .metadataType = packed->fields.metadataType, + }; +} + +#endif // PACKED_RECOVERY_JOURNAL_BLOCK_H diff --git a/vdo/base/packer.c b/vdo/base/packer.c new file mode 100644 index 0000000..efb4dd4 --- /dev/null +++ b/vdo/base/packer.c @@ -0,0 +1,1023 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/packer.c#8 $ + */ + +#include "packerInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminState.h" +#include "allocatingVIO.h" +#include "allocationSelector.h" +#include "compressionState.h" +#include "dataVIO.h" +#include "hashLock.h" +#include "pbnLock.h" +#include "vdo.h" +#include "vdoInternal.h" + +/** + * Check that we are on the packer thread. + * + * @param packer The packer + * @param caller The function which is asserting + **/ +static inline void assertOnPackerThread(Packer *packer, const char *caller) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == packer->threadID), + "%s() called from packer thread", caller); +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static inline InputBin *inputBinFromRingNode(RingNode *node) +{ + STATIC_ASSERT(offsetof(InputBin, ring) == 0); + return (InputBin *) node; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static inline OutputBin *outputBinFromRingNode(RingNode *node) +{ + STATIC_ASSERT(offsetof(OutputBin, ring) == 0); + return (OutputBin *) node; +} + +/**********************************************************************/ +InputBin *nextBin(const Packer *packer, InputBin *bin) +{ + if (bin->ring.next == &packer->inputBins) { + return NULL; + } else { + return inputBinFromRingNode(bin->ring.next); + } +} + +/**********************************************************************/ +InputBin *getFullestBin(const Packer *packer) +{ + if (isRingEmpty(&packer->inputBins)) { + return NULL; + } else { + return inputBinFromRingNode(packer->inputBins.next); + } +} + +/** + * Insert an input bin to the list, which is in ascending order of free space. + * Since all bins are already in the list, this actually moves the bin to the + * correct position in the list. + * + * @param packer The packer + * @param bin The input bin to move to its sorted position + **/ +static void insertInSortedList(Packer *packer, InputBin *bin) +{ + for (InputBin *activeBin = getFullestBin(packer); + activeBin != NULL; + activeBin = nextBin(packer, activeBin)) { + if (activeBin->freeSpace > bin->freeSpace) { + pushRingNode(&activeBin->ring, &bin->ring); + return; + } + } + + pushRingNode(&packer->inputBins, &bin->ring); +} + +/** + * Allocate an input bin and put it into the packer's list. + * + * @param packer The packer + **/ +__attribute__((warn_unused_result)) +static int makeInputBin(Packer *packer) +{ + InputBin *bin; + int result = ALLOCATE_EXTENDED(InputBin, MAX_COMPRESSION_SLOTS, VIO *, + __func__, &bin); + if (result != VDO_SUCCESS) { + return result; + } + + bin->freeSpace = packer->binDataSize; + initializeRing(&bin->ring); + pushRingNode(&packer->inputBins, &bin->ring); + return VDO_SUCCESS; +} + +/** + * Push an output bin onto the stack of idle bins. 
+ * + * @param packer The packer + * @param bin The output bin + **/ +static void pushOutputBin(Packer *packer, OutputBin *bin) +{ + ASSERT_LOG_ONLY(!hasWaiters(&bin->outgoing), + "idle output bin has no waiters"); + packer->idleOutputBins[packer->idleOutputBinCount++] = bin; +} + +/** + * Pop an output bin off the end of the stack of idle bins. + * + * @param packer The packer + * + * @return an idle output bin, or NULL if there are no idle bins + **/ +__attribute__((warn_unused_result)) +static OutputBin *popOutputBin(Packer *packer) +{ + if (packer->idleOutputBinCount == 0) { + return NULL; + } + + size_t index = --packer->idleOutputBinCount; + OutputBin *bin = packer->idleOutputBins[index]; + packer->idleOutputBins[index] = NULL; + return bin; +} + +/** + * Allocate a new output bin and push it onto the packer's stack of idle bins. + * + * @param packer The packer + * @param layer The physical layer that will receive the compressed block + * writes from the output bin + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int makeOutputBin(Packer *packer, PhysicalLayer *layer) +{ + OutputBin *output; + int result = ALLOCATE(1, OutputBin, __func__, &output); + if (result != VDO_SUCCESS) { + return result; + } + + // Add the bin to the stack even before it's fully initialized so it will + // be freed even if we fail to initialize it below. + initializeRing(&output->ring); + pushRingNode(&packer->outputBins, &output->ring); + pushOutputBin(packer, output); + + result = ALLOCATE_EXTENDED(CompressedBlock, packer->binDataSize, char, + "compressed block", &output->block); + if (result != VDO_SUCCESS) { + return result; + } + + return layer->createCompressedWriteVIO(layer, output, (char *) output->block, + &output->writer); +} + +/** + * Free an idle output bin and null out the reference to it. + * + * @param binPtr The reference to the output bin to free + **/ +static void freeOutputBin(OutputBin **binPtr) +{ + OutputBin *bin = *binPtr; + if (bin == NULL) { + return; + } + + unspliceRingNode(&bin->ring); + + VIO *vio = allocatingVIOAsVIO(bin->writer); + freeVIO(&vio); + FREE(bin->block); + FREE(bin); + *binPtr = NULL; +} + +/**********************************************************************/ +int makePacker(PhysicalLayer *layer, + BlockCount inputBinCount, + BlockCount outputBinCount, + const ThreadConfig *threadConfig, + Packer **packerPtr) +{ + Packer *packer; + int result = ALLOCATE_EXTENDED(Packer, outputBinCount, + OutputBin *, __func__, &packer); + if (result != VDO_SUCCESS) { + return result; + } + + packer->threadID = getPackerZoneThread(threadConfig); + packer->binDataSize = VDO_BLOCK_SIZE - sizeof(CompressedBlockHeader); + packer->size = inputBinCount; + packer->maxSlots = MAX_COMPRESSION_SLOTS; + packer->outputBinCount = outputBinCount; + initializeRing(&packer->inputBins); + initializeRing(&packer->outputBins); + + result = makeAllocationSelector(threadConfig->physicalZoneCount, + packer->threadID, &packer->selector); + if (result != VDO_SUCCESS) { + freePacker(&packer); + return result; + } + + for (BlockCount i = 0; i < inputBinCount; i++) { + int result = makeInputBin(packer); + if (result != VDO_SUCCESS) { + freePacker(&packer); + return result; + } + } + + /* + * The canceled bin can hold up to half the number of user VIOs. Every + * canceled VIO in the bin must have a canceler for which it is waiting, and + * any canceler will only have canceled one lock holder at a time. 
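+ * Since each canceler is itself a distinct in-flight user VIO, those two
+ * constraints together bound the bin's population to half of the user VIOs.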
+ */ + result = ALLOCATE_EXTENDED(InputBin, MAXIMUM_USER_VIOS / 2, VIO *, __func__, + &packer->canceledBin); + if (result != VDO_SUCCESS) { + freePacker(&packer); + return result; + } + + for (BlockCount i = 0; i < outputBinCount; i++) { + int result = makeOutputBin(packer, layer); + if (result != VDO_SUCCESS) { + freePacker(&packer); + return result; + } + } + + *packerPtr = packer; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freePacker(Packer **packerPtr) +{ + Packer *packer = *packerPtr; + if (packer == NULL) { + return; + } + + InputBin *input; + while ((input = getFullestBin(packer)) != NULL) { + unspliceRingNode(&input->ring); + FREE(input); + } + + FREE(packer->canceledBin); + + OutputBin *output; + while ((output = popOutputBin(packer)) != NULL) { + freeOutputBin(&output); + } + + freeAllocationSelector(&packer->selector); + FREE(packer); + *packerPtr = NULL; +} + +/** + * Get the Packer from a DataVIO. + * + * @param dataVIO The DataVIO + * + * @return The Packer from the VDO to which the DataVIO belongs + **/ +static inline Packer *getPackerFromDataVIO(DataVIO *dataVIO) +{ + return getVDOFromDataVIO(dataVIO)->packer; +} + +/**********************************************************************/ +bool isSufficientlyCompressible(DataVIO *dataVIO) +{ + Packer *packer = getPackerFromDataVIO(dataVIO); + return (dataVIO->compression.size < packer->binDataSize); +} + +/**********************************************************************/ +ThreadID getPackerThreadID(Packer *packer) +{ + return packer->threadID; +} + +/**********************************************************************/ +PackerStatistics getPackerStatistics(const Packer *packer) +{ + /* + * This is called from getVDOStatistics(), which is called from outside the + * packer thread. These are just statistics with no semantics that could + * rely on memory order, so unfenced reads are sufficient. + */ + return (PackerStatistics) { + .compressedFragmentsWritten = relaxedLoad64(&packer->fragmentsWritten), + .compressedBlocksWritten = relaxedLoad64(&packer->blocksWritten), + .compressedFragmentsInPacker = relaxedLoad64(&packer->fragmentsPending), + }; +} + +/** + * Abort packing a DataVIO. + * + * @param dataVIO The DataVIO to abort + **/ +static void abortPacking(DataVIO *dataVIO) +{ + setCompressionDone(dataVIO); + relaxedAdd64(&getPackerFromDataVIO(dataVIO)->fragmentsPending, -1); + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + continueDataVIO(dataVIO, VDO_SUCCESS); +} + +/** + * This continues the VIO completion without packing the VIO. + * + * @param waiter The wait queue entry of the VIO to continue + * @param unused An argument required so this function may be called + * from notifyAllWaiters + **/ +static void continueVIOWithoutPacking(Waiter *waiter, + void *unused __attribute__((unused))) +{ + abortPacking(waiterAsDataVIO(waiter)); +} + +/** + * Check whether the packer has drained. + * + * @param packer The packer + **/ +static void checkForDrainComplete(Packer *packer) +{ + if (isDraining(&packer->state) + && (packer->canceledBin->slotsUsed == 0) + && (packer->idleOutputBinCount == packer->outputBinCount)) { + finishDraining(&packer->state); + } +} + +/**********************************************************************/ +static void writePendingBatches(Packer *packer); + +/** + * Ensure that a completion is running on the packer thread. 
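+ * If it is running elsewhere, the completion's callback is re-dispatched to
+ * the packer thread and false is returned so the current invocation can
+ * simply return.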
+ * + * @param completion The compressed write VIO + * + * @return true if the completion is on the packer thread + **/ +__attribute__((warn_unused_result)) +static bool switchToPackerThread(VDOCompletion *completion) +{ + VIO *vio = asVIO(completion); + ThreadID threadID = vio->vdo->packer->threadID; + if (completion->callbackThreadID == threadID) { + return true; + } + + completion->callbackThreadID = threadID; + invokeCallback(completion); + return false; +} + +/** + * Finish processing an output bin whose write has completed. If there was + * an error, any DataVIOs waiting on the bin write will be notified. + * + * @param packer The packer which owns the bin + * @param bin The bin which has finished + **/ +static void finishOutputBin(Packer *packer, OutputBin *bin) +{ + if (hasWaiters(&bin->outgoing)) { + notifyAllWaiters(&bin->outgoing, continueVIOWithoutPacking, NULL); + } else { + // No waiters implies no error, so the compressed block was written. + relaxedAdd64(&packer->fragmentsPending, -bin->slotsUsed); + relaxedAdd64(&packer->fragmentsWritten, bin->slotsUsed); + relaxedAdd64(&packer->blocksWritten, 1); + } + + bin->slotsUsed = 0; + pushOutputBin(packer, bin); +} + +/** + * This finishes the bin write process after the bin is written to disk. This + * is the VIO callback function registered by writeOutputBin(). + * + * @param completion The compressed write VIO + **/ +static void completeOutputBin(VDOCompletion *completion) +{ + if (!switchToPackerThread(completion)) { + return; + } + + VIO *vio = asVIO(completion); + if (completion->result != VDO_SUCCESS) { + updateVIOErrorStats(vio, + "Completing compressed write VIO for physical block %" + PRIu64 " with error", + vio->physical); + } + + Packer *packer = vio->vdo->packer; + finishOutputBin(packer, completion->parent); + writePendingBatches(packer); + checkForDrainComplete(packer); +} + +/** + * Implements WaiterCallback. Continues the DataVIO waiter. + **/ +static void continueWaiter(Waiter *waiter, + void *context __attribute__((unused))) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + continueDataVIO(dataVIO, VDO_SUCCESS); +} + +/** + * Implements WaiterCallback. Updates the DataVIO waiter to refer to its slot + * in the compressed block, gives the DataVIO a share of the PBN lock on that + * block, and reserves a reference count increment on the lock. + **/ +static void shareCompressedBlock(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + OutputBin *bin = context; + + dataVIO->newMapped = (ZonedPBN) { + .pbn = bin->writer->allocation, + .zone = bin->writer->zone, + .state = getStateForSlot(dataVIO->compression.slot), + }; + dataVIOAsVIO(dataVIO)->physical = dataVIO->newMapped.pbn; + + shareCompressedWriteLock(dataVIO, bin->writer->allocationLock); + + // Wait again for all the waiters to get a share. + int result = enqueueWaiter(&bin->outgoing, waiter); + // Cannot fail since this waiter was just dequeued. + ASSERT_LOG_ONLY(result == VDO_SUCCESS, "impossible enqueueWaiter error"); +} + +/** + * Finish a compressed block write. This callback is registered in + * continueAfterAllocation(). + * + * @param completion The compressed write completion + **/ +static void finishCompressedWrite(VDOCompletion *completion) +{ + OutputBin *bin = completion->parent; + assertInPhysicalZone(bin->writer); + + if (completion->result != VDO_SUCCESS) { + releaseAllocationLock(bin->writer); + // Invokes completeOutputBin() on the packer thread, which will deal with + // the waiters. 
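+ // (finishOutputBin() will see that waiters are still queued and notify
+ // them of the failure via continueVIOWithoutPacking().)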
+ vioDoneCallback(completion); + return; + } + + // First give every DataVIO/HashLock a share of the PBN lock to ensure it + // can't be released until they've all done their incRefs. + notifyAllWaiters(&bin->outgoing, shareCompressedBlock, bin); + + // The waiters now hold the (downgraded) PBN lock. + bin->writer->allocationLock = NULL; + + // Invokes the callbacks registered before entering the packer. + notifyAllWaiters(&bin->outgoing, continueWaiter, NULL); + + // Invokes completeOutputBin() on the packer thread. + vioDoneCallback(completion); +} + +/** + * Continue the write path for a compressed write AllocatingVIO now that block + * allocation is complete (the AllocatingVIO may or may not have actually + * received an allocation). + * + * @param allocatingVIO The AllocatingVIO which has finished the allocation + * process + **/ +static void continueAfterAllocation(AllocatingVIO *allocatingVIO) +{ + VIO *vio = allocatingVIOAsVIO(allocatingVIO); + VDOCompletion *completion = vioAsCompletion(vio); + if (allocatingVIO->allocation == ZERO_BLOCK) { + completion->requeue = true; + setCompletionResult(completion, VDO_NO_SPACE); + vioDoneCallback(completion); + return; + } + + setPhysicalZoneCallback(allocatingVIO, finishCompressedWrite, + THIS_LOCATION("$F(meta);cb=finishCompressedWrite")); + completion->layer->writeCompressedBlock(allocatingVIO); +} + +/** + * Launch an output bin. + * + * @param packer The packer which owns the bin + * @param bin The output bin to launch + **/ +static void launchCompressedWrite(Packer *packer, OutputBin *bin) +{ + if (isReadOnly(getVDOFromAllocatingVIO(bin->writer)->readOnlyNotifier)) { + finishOutputBin(packer, bin); + return; + } + + VIO *vio = allocatingVIOAsVIO(bin->writer); + resetCompletion(vioAsCompletion(vio)); + vio->callback = completeOutputBin; + vio->priority = VIO_PRIORITY_COMPRESSED_DATA; + allocateDataBlock(bin->writer, packer->selector, VIO_COMPRESSED_WRITE_LOCK, + continueAfterAllocation); +} + +/** + * Consume from the pending queue the next batch of VIOs that can be packed + * together in a single compressed block. VIOs that have been mooted since + * being placed in the pending queue will not be returned. + * + * @param packer The packer + * @param batch The counted array to fill with the next batch of VIOs + **/ +static void getNextBatch(Packer *packer, OutputBatch *batch) +{ + BlockSize spaceRemaining = packer->binDataSize; + batch->slotsUsed = 0; + + DataVIO *dataVIO; + while ((dataVIO = waiterAsDataVIO(getFirstWaiter(&packer->batchedDataVIOs))) + != NULL) { + // If there's not enough space for the next DataVIO, the batch is done. + if ((dataVIO->compression.size > spaceRemaining) + || (batch->slotsUsed == packer->maxSlots)) { + break; + } + + // Remove the next DataVIO from the queue and put it in the output batch. + dequeueNextWaiter(&packer->batchedDataVIOs); + batch->slots[batch->slotsUsed++] = dataVIO; + spaceRemaining -= dataVIO->compression.size; + } +} + +/** + * Pack the next batch of compressed VIOs from the batched queue into an + * output bin and write the output bin. + * + * @param packer The packer + * @param output The output bin to fill + * + * @return true if a write was issued for the output bin + **/ +__attribute__((warn_unused_result)) +static bool writeNextBatch(Packer *packer, OutputBin *output) +{ + OutputBatch batch; + getNextBatch(packer, &batch); + + if (batch.slotsUsed == 0) { + // The pending queue must now be empty (there may have been mooted VIOs). 
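+ // Returning false lets writePendingBatches() push the unused output bin
+ // back onto the idle stack.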
+ return false; + } + + // If the batch contains only a single VIO, then we save nothing by saving + // the compressed form. Continue processing the single VIO in the batch. + if (batch.slotsUsed == 1) { + abortPacking(batch.slots[0]); + return false; + } + + resetCompressedBlockHeader(&output->block->header); + + size_t spaceUsed = 0; + for (SlotNumber slot = 0; slot < batch.slotsUsed; slot++) { + DataVIO *dataVIO = batch.slots[slot]; + dataVIO->compression.slot = slot; + putCompressedBlockFragment(output->block, slot, spaceUsed, + dataVIO->compression.data, + dataVIO->compression.size); + spaceUsed += dataVIO->compression.size; + + int result = enqueueDataVIO(&output->outgoing, dataVIO, + THIS_LOCATION(NULL)); + if (result != VDO_SUCCESS) { + abortPacking(dataVIO); + continue; + } + + output->slotsUsed += 1; + } + + launchCompressedWrite(packer, output); + return true; +} + +/** + * Put a DataVIO in a specific InputBin in which it will definitely fit. + * + * @param bin The bin in which to put the DataVIO + * @param dataVIO The DataVIO to add + **/ +static void addToInputBin(InputBin *bin, DataVIO *dataVIO) +{ + dataVIO->compression.bin = bin; + dataVIO->compression.slot = bin->slotsUsed; + bin->incoming[bin->slotsUsed++] = dataVIO; +} + +/** + * Start a new batch of VIOs in an InputBin, moving the existing batch, if + * any, to the queue of pending batched VIOs in the packer. + * + * @param packer The packer + * @param bin The bin to prepare + **/ +static void startNewBatch(Packer *packer, InputBin *bin) +{ + // Move all the DataVIOs in the current batch to the batched queue so they + // will get packed into the next free output bin. + for (SlotNumber slot = 0; slot < bin->slotsUsed; slot++) { + DataVIO *dataVIO = bin->incoming[slot]; + dataVIO->compression.bin = NULL; + + if (!mayWriteCompressedDataVIO(dataVIO)) { + /* + * Compression of this DataVIO was canceled while it was waiting; put it + * in the canceled bin so it can be rendezvous with the canceling + * DataVIO. + */ + addToInputBin(packer->canceledBin, dataVIO); + continue; + } + + int result = enqueueDataVIO(&packer->batchedDataVIOs, dataVIO, + THIS_LOCATION(NULL)); + if (result != VDO_SUCCESS) { + // Impossible but we're required to check the result from enqueue. + abortPacking(dataVIO); + } + } + + // The bin is now empty. + bin->slotsUsed = 0; + bin->freeSpace = packer->binDataSize; +} + +/** + * Add a DataVIO to a bin's incoming queue, handle logical space change, and + * call physical space processor. + * + * @param packer The packer + * @param bin The bin to which to add the the DataVIO + * @param dataVIO The DataVIO to add to the bin's queue + **/ +static void addDataVIOToInputBin(Packer *packer, + InputBin *bin, + DataVIO *dataVIO) +{ + // If the selected bin doesn't have room, start a new batch to make room. + if (bin->freeSpace < dataVIO->compression.size) { + startNewBatch(packer, bin); + } + + addToInputBin(bin, dataVIO); + bin->freeSpace -= dataVIO->compression.size; + + // If we happen to exactly fill the bin, start a new input batch. + if ((bin->slotsUsed == packer->maxSlots) || (bin->freeSpace == 0)) { + startNewBatch(packer, bin); + } + + // Now that we've finished changing the free space, restore the sort order. + insertInSortedList(packer, bin); +} + +/** + * Move DataVIOs in pending batches from the batchedDataVIOs to all free output + * bins, issuing writes for the output bins as they are packed. 
This will loop + * until either the pending queue is drained or all output bins are busy + * writing a compressed block. + * + * @param packer The packer + **/ +static void writePendingBatches(Packer *packer) +{ + if (packer->writingBatches) { + /* + * We've attempted to re-enter this function recursively due to completion + * handling, which can lead to kernel stack overflow as in VDO-1340. It's + * perfectly safe to break the recursion and do nothing since we know any + * pending batches will eventually be handled by the earlier call. + */ + return; + } + + // Record that we are in this function for the above check. IMPORTANT: never + // return from this function without clearing this flag. + packer->writingBatches = true; + + OutputBin *output; + while (hasWaiters(&packer->batchedDataVIOs) + && ((output = popOutputBin(packer)) != NULL)) { + if (!writeNextBatch(packer, output)) { + // We didn't use the output bin to write, so push it back on the stack. + pushOutputBin(packer, output); + } + } + + packer->writingBatches = false; +} + +/** + * Select the input bin that should be used to pack the compressed data in a + * DataVIO with other DataVIOs. + * + * @param packer The packer + * @param dataVIO The DataVIO + **/ +__attribute__((warn_unused_result)) +static InputBin *selectInputBin(Packer *packer, DataVIO *dataVIO) +{ + // First best fit: select the bin with the least free space that has enough + // room for the compressed data in the DataVIO. + InputBin *fullestBin = getFullestBin(packer); + for (InputBin *bin = fullestBin; bin != NULL; bin = nextBin(packer, bin)) { + if (bin->freeSpace >= dataVIO->compression.size) { + return bin; + } + } + + /* + * None of the bins have enough space for the DataVIO. We're not allowed to + * create new bins, so we have to overflow one of the existing bins. It's + * pretty intuitive to select the fullest bin, since that "wastes" the least + * amount of free space in the compressed block. But if the space currently + * used in the fullest bin is smaller than the compressed size of the + * incoming block, it seems wrong to force that bin to write when giving up + * on compressing the incoming DataVIO would likewise "waste" the the least + * amount of free space. + */ + if (dataVIO->compression.size + >= (packer->binDataSize - fullestBin->freeSpace)) { + return NULL; + } + + // The fullest bin doesn't have room, but writing it out and starting a new + // batch with the incoming DataVIO will increase the packer's free space. + return fullestBin; +} + +/**********************************************************************/ +void attemptPacking(DataVIO *dataVIO) +{ + Packer *packer = getPackerFromDataVIO(dataVIO); + assertOnPackerThread(packer, __func__); + + VIOCompressionState state = getCompressionState(dataVIO); + int result = ASSERT((state.status == VIO_COMPRESSING), + "attempt to pack DataVIO not ready for packing, state: " + "%u", + state.status); + if (result != VDO_SUCCESS) { + return; + } + + /* + * Increment whether or not this DataVIO will be packed or not since + * abortPacking() always decrements the counter. + */ + relaxedAdd64(&packer->fragmentsPending, 1); + + // If packing of this DataVIO is disallowed for administrative reasons, give + // up before making any state changes. 
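+ // (DataVIOs from generations older than the packer's current flush
+ // generation must not be held in the packer; see
+ // incrementPackerFlushGeneration().)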
+ if (!isNormal(&packer->state) + || (dataVIO->flushGeneration < packer->flushGeneration)) { + abortPacking(dataVIO); + return; + } + + /* + * The check of mayBlockInPacker() here will set the DataVIO's compression + * state to VIO_PACKING if the DataVIO is allowed to be compressed (if it has + * already been canceled, we'll fall out here). Once the DataVIO is in the + * VIO_PACKING state, it must be guaranteed to be put in an input bin before + * any more requests can be processed by the packer thread. Otherwise, a + * canceling DataVIO could attempt to remove the canceled DataVIO from the + * packer and fail to rendezvous with it (VDO-2809). We must also make sure + * that we will actually bin the DataVIO and not give up on it as being + * larger than the space used in the fullest bin. Hence we must call + * selectInputBin() before calling mayBlockInPacker() (VDO-2826). + */ + InputBin *bin = selectInputBin(packer, dataVIO); + if ((bin == NULL) || !mayBlockInPacker(dataVIO)) { + abortPacking(dataVIO); + return; + } + + addDataVIOToInputBin(packer, bin, dataVIO); + writePendingBatches(packer); +} + +/** + * Force a pending write for all non-empty bins on behalf of a flush or + * suspend. + * + * @param packer The packer being flushed + **/ +static void writeAllNonEmptyBins(Packer *packer) +{ + for (InputBin *bin = getFullestBin(packer); + bin != NULL; + bin = nextBin(packer, bin)) { + startNewBatch(packer, bin); + // We don't need to re-sort the bin here since this loop will make every + // bin have the same amount of free space, so every ordering is sorted. + } + + writePendingBatches(packer); +} + +/**********************************************************************/ +void flushPacker(Packer *packer) +{ + assertOnPackerThread(packer, __func__); + if (isNormal(&packer->state)) { + writeAllNonEmptyBins(packer); + } +} + +/* + * This method is only exposed for unit tests and should not normally be called + * directly; use removeLockHolderFromPacker() instead. + */ +void removeFromPacker(DataVIO *dataVIO) +{ + InputBin *bin = dataVIO->compression.bin; + ASSERT_LOG_ONLY((bin != NULL), "DataVIO in packer has an input bin"); + + SlotNumber slot = dataVIO->compression.slot; + bin->slotsUsed--; + if (slot < bin->slotsUsed) { + bin->incoming[slot] = bin->incoming[bin->slotsUsed]; + bin->incoming[slot]->compression.slot = slot; + } + + dataVIO->compression.bin = NULL; + dataVIO->compression.slot = 0; + + Packer *packer = getPackerFromDataVIO(dataVIO); + if (bin != packer->canceledBin) { + bin->freeSpace += dataVIO->compression.size; + insertInSortedList(packer, bin); + } + + abortPacking(dataVIO); + checkForDrainComplete(packer); +} + +/**********************************************************************/ +void removeLockHolderFromPacker(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInPackerZone(dataVIO); + + DataVIO *lockHolder = dataVIO->compression.lockHolder; + dataVIO->compression.lockHolder = NULL; + removeFromPacker(lockHolder); +} + +/**********************************************************************/ +void incrementPackerFlushGeneration(Packer *packer) +{ + assertOnPackerThread(packer, __func__); + packer->flushGeneration++; + flushPacker(packer); +} + +/** + * Initiate a drain. + * + * Implements AdminInitiator. 
+ **/ +static void initiateDrain(AdminState *state) +{ + Packer *packer = container_of(state, Packer, state); + writeAllNonEmptyBins(packer); + checkForDrainComplete(packer); +} + +/**********************************************************************/ +void drainPacker(Packer *packer, VDOCompletion *completion) +{ + assertOnPackerThread(packer, __func__); + startDraining(&packer->state, ADMIN_STATE_SUSPENDING, completion, + initiateDrain); +} + +/**********************************************************************/ +void resumePacker(Packer *packer, VDOCompletion *parent) +{ + assertOnPackerThread(packer, __func__); + finishCompletion(parent, resumeIfQuiescent(&packer->state)); +} + +/**********************************************************************/ +void resetSlotCount(Packer *packer, CompressedFragmentCount slots) +{ + if (slots > MAX_COMPRESSION_SLOTS) { + return; + } + + packer->maxSlots = slots; +} + +/**********************************************************************/ +static void dumpInputBin(const InputBin *bin, bool canceled) +{ + if (bin->slotsUsed == 0) { + // Don't dump empty input bins. + return; + } + + logInfo(" %sBin slotsUsed=%u freeSpace=%zu", + (canceled ? "Canceled" : "Input"), bin->slotsUsed, bin->freeSpace); + + // XXX dump VIOs in bin->incoming? The VIOs should have been dumped from the + // VIO pool. Maybe just dump their addresses so it's clear they're here? +} + +/**********************************************************************/ +static void dumpOutputBin(const OutputBin *bin) +{ + size_t count = countWaiters(&bin->outgoing); + if (bin->slotsUsed == 0) { + // Don't dump empty output bins. + return; + } + + logInfo(" OutputBin contains %zu outgoing waiters", count); + + // XXX dump VIOs in bin->outgoing? The VIOs should have been dumped from the + // VIO pool. Maybe just dump their addresses so it's clear they're here? + + // XXX dump writer VIO? +} + +/**********************************************************************/ +void dumpPacker(const Packer *packer) +{ + logInfo("Packer"); + logInfo(" flushGeneration=%llu state %s writingBatches=%s", + packer->flushGeneration, getAdminStateName(&packer->state), + boolToString(packer->writingBatches)); + + logInfo(" inputBinCount=%llu", packer->size); + for (InputBin *bin = getFullestBin(packer); + bin != NULL; + bin = nextBin(packer, bin)) { + dumpInputBin(bin, false); + } + + dumpInputBin(packer->canceledBin, true); + + logInfo(" outputBinCount=%zu idleOutputBinCount=%zu", + packer->outputBinCount, packer->idleOutputBinCount); + const RingNode *head = &packer->outputBins; + for (RingNode *node = head->next; node != head; node = node->next) { + dumpOutputBin(outputBinFromRingNode(node)); + } +} diff --git a/vdo/base/packer.h b/vdo/base/packer.h new file mode 100644 index 0000000..6661552 --- /dev/null +++ b/vdo/base/packer.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/packer.h#3 $ + */ + +#ifndef PACKER_H +#define PACKER_H + +#include "completion.h" +#include "physicalLayer.h" +#include "statistics.h" +#include "threadConfig.h" +#include "types.h" + +enum { + DEFAULT_PACKER_INPUT_BINS = 16, + DEFAULT_PACKER_OUTPUT_BINS = 256, +}; + +typedef struct packer Packer; + +/** + * Make a new block packer. + * + * @param [in] layer The physical layer to which compressed blocks + * will be written + * @param [in] inputBinCount The number of partial bins to keep in memory + * @param [in] outputBinCount The number of compressed blocks that can be + * written concurrently + * @param [in] threadConfig The thread configuration of the VDO + * @param [out] packerPtr A pointer to hold the new packer + * + * @return VDO_SUCCESS or an error + **/ +int makePacker(PhysicalLayer *layer, + BlockCount inputBinCount, + BlockCount outputBinCount, + const ThreadConfig *threadConfig, + Packer **packerPtr) + __attribute__((warn_unused_result)); + +/** + * Free a block packer and null out the reference to it. + * + * @param packerPtr A pointer to the packer to free + **/ +void freePacker(Packer **packerPtr); + +/** + * Check whether the compressed data in a DataVIO will fit in a packer bin. + * + * @param dataVIO The DataVIO + * + * @return true if the DataVIO will fit in a bin + **/ +bool isSufficientlyCompressible(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Get the thread ID of the packer's zone. + * + * @param packer The packer + * + * @return The packer's thread ID + **/ +ThreadID getPackerThreadID(Packer *packer); + +/** + * Get the current statistics from the packer. + * + * @param packer The packer to query + * + * @return a copy of the current statistics for the packer + **/ +PackerStatistics getPackerStatistics(const Packer *packer) + __attribute__((warn_unused_result)); + +/** + * Attempt to rewrite the data in this DataVIO as part of a compressed block. + * + * @param dataVIO The DataVIO to pack + **/ +void attemptPacking(DataVIO *dataVIO); + +/** + * Request that the packer flush asynchronously. All bins with at least two + * compressed data blocks will be written out, and any solitary pending VIOs + * will be released from the packer. While flushing is in progress, any VIOs + * submitted to attemptPacking() will be continued immediately without + * attempting to pack them. + * + * @param packer The packer to flush + **/ +void flushPacker(Packer *packer); + +/** + * Remove a lock holder from the packer. + * + * @param completion The DataVIO which needs a lock held by a DataVIO in the + * packer. The dataVIO's compressedVIO.lockHolder field will + * point to the DataVIO to remove. + **/ +void removeLockHolderFromPacker(VDOCompletion *completion); + +/** + * Increment the flush generation in the packer. This will also cause the + * packer to flush so that any VIOs from previous generations will exit the + * packer. + * + * @param packer The packer + **/ +void incrementPackerFlushGeneration(Packer *packer); + +/** + * Drain the packer by preventing any more VIOs from entering the packer and + * then flushing. 
+ * + * @param packer The packer to drain + * @param completion The completion to finish when the packer has drained + **/ +void drainPacker(Packer *packer, VDOCompletion *completion); + +/** + * Resume a packer which has been suspended. + * + * @param packer The packer to resume + * @param parent The completion to finish when the packer has resumed + * + * @return VDO_SUCCESS or an error + **/ +void resumePacker(Packer *packer, VDOCompletion *parent); + +/** + * Dump the packer, in a thread-unsafe fashion. + * + * @param packer The packer + **/ +void dumpPacker(const Packer *packer); + +#endif /* PACKER_H */ diff --git a/vdo/base/packerInternals.h b/vdo/base/packerInternals.h new file mode 100644 index 0000000..e5aa500 --- /dev/null +++ b/vdo/base/packerInternals.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/packerInternals.h#4 $ + */ + +#ifndef PACKER_INTERNALS_H +#define PACKER_INTERNALS_H + +#include "packer.h" + +#include "atomic.h" + +#include "adminState.h" +#include "compressedBlock.h" +#include "header.h" +#include "types.h" +#include "waitQueue.h" + +/** + * Each InputBin holds an incomplete batch of DataVIOs that only partially fill + * a compressed block. The InputBins are kept in a ring sorted by the amount of + * unused space so the first bin with enough space to hold a newly-compressed + * DataVIO can easily be found. When the bin fills up or is flushed, the + * incoming DataVIOs are moved to the Packer's batchedDataVIOs queue, from + * which they will eventually be routed to an idle OutputBin. + * + * There is one special input bin which is used to hold DataVIOs which have + * been canceled and removed from their input bin by the packer. These DataVIOs + * need to wait for the canceller to rendezvous with them (VDO-2809) and so + * they sit in this special bin. + **/ +struct inputBin { + /** List links for Packer.sortedBins */ + RingNode ring; + /** The number of items in the bin */ + SlotNumber slotsUsed; + /** The number of compressed block bytes remaining in the current batch */ + size_t freeSpace; + /** The current partial batch of DataVIOs, waiting for more */ + DataVIO *incoming[]; +}; + +/** + * Each OutputBin allows a single compressed block to be packed and written. + * When it is not idle, it holds a batch of DataVIOs that have been packed + * into the compressed block, written asynchronously, and are waiting for the + * write to complete. 
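+ * Once that write finishes, completeOutputBin() returns the bin to the
+ * packer's stack of idle output bins so it can be reused.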
+ **/ +typedef struct { + /** List links for Packer.outputBins */ + RingNode ring; + /** The storage for encoding the compressed block representation */ + CompressedBlock *block; + /** The AllocatingVIO wrapping the compressed block for writing */ + AllocatingVIO *writer; + /** The number of compression slots used in the compressed block */ + SlotNumber slotsUsed; + /** The DataVIOs packed into the block, waiting for the write to complete */ + WaitQueue outgoing; +} OutputBin; + +/** + * A counted array holding a batch of DataVIOs that should be packed into an + * output bin. + **/ +typedef struct { + size_t slotsUsed; + DataVIO *slots[MAX_COMPRESSION_SLOTS]; +} OutputBatch; + +struct packer { + /** The ID of the packer's callback thread */ + ThreadID threadID; + /** The selector for determining which physical zone to allocate from */ + AllocationSelector *selector; + /** The number of input bins */ + BlockCount size; + /** The block size minus header size */ + size_t binDataSize; + /** The number of compression slots */ + size_t maxSlots; + /** A ring of all InputBins, kept sorted by freeSpace */ + RingNode inputBins; + /** A ring of all OutputBins */ + RingNode outputBins; + /** + * A bin to hold DataVIOs which were canceled out of the packer and are + * waiting to rendezvous with the canceling DataVIO. + **/ + InputBin *canceledBin; + + /** The current flush generation */ + SequenceNumber flushGeneration; + + /** The administrative state of the packer */ + AdminState state; + /** True when writing batched DataVIOs */ + bool writingBatches; + + // Atomic counters corresponding to the fields of PackerStatistics: + + /** Number of compressed data items written since startup */ + Atomic64 fragmentsWritten; + /** Number of blocks containing compressed items written since startup */ + Atomic64 blocksWritten; + /** Number of DataVIOs that are pending in the packer */ + Atomic64 fragmentsPending; + + /** Queue of batched DataVIOs waiting to be packed */ + WaitQueue batchedDataVIOs; + + /** The total number of output bins allocated */ + size_t outputBinCount; + /** The number of idle output bins on the stack */ + size_t idleOutputBinCount; + /** The stack of idle output bins (0=bottom) */ + OutputBin *idleOutputBins[]; +}; + +/** + * This returns the first bin in the freeSpace-sorted list. + **/ +InputBin *getFullestBin(const Packer *packer); + +/** + * This returns the next bin in the freeSpace-sorted list. + **/ +InputBin *nextBin(const Packer *packer, InputBin *bin); + +/** + * Change the maxiumum number of compression slots the packer will use. The new + * number of slots must be less than or equal to MAX_COMPRESSION_SLOTS. Bins + * which already have fragments will not be resized until they are next written + * out. + * + * @param packer The packer + * @param slots The new number of slots + **/ +void resetSlotCount(Packer *packer, CompressedFragmentCount slots); + +/** + * Remove a DataVIO from the packer. This method is exposed for testing. + * + * @param dataVIO The DataVIO to remove + **/ +void removeFromPacker(DataVIO *dataVIO); + +#endif /* PACKER_INTERNALS_H */ diff --git a/vdo/base/partitionCopy.c b/vdo/base/partitionCopy.c new file mode 100644 index 0000000..d5fa6de --- /dev/null +++ b/vdo/base/partitionCopy.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/partitionCopy.c#2 $
+ */
+
+#include "partitionCopy.h"
+
+#include "memoryAlloc.h"
+
+#include "completion.h"
+#include "constants.h"
+#include "extent.h"
+#include "numUtils.h"
+
+enum {
+  STRIDE_LENGTH = 2048
+};
+
+/**
+ * A partition copy completion.
+ **/
+typedef struct {
+  /** completion header */
+  VDOCompletion completion;
+  /** the source partition to copy from */
+  Partition *source;
+  /** the target partition to copy to */
+  Partition *target;
+  /** the current in-partition PBN the copy is beginning at */
+  PhysicalBlockNumber currentIndex;
+  /** the last block to copy */
+  PhysicalBlockNumber endingIndex;
+  /** the backing data used by the extent */
+  char *data;
+  /** the extent being used to copy */
+  VDOExtent *extent;
+} CopyCompletion;
+
+/**
+ * Convert a VDOCompletion to a CopyCompletion.
+ *
+ * @param completion The completion to convert
+ *
+ * @return the completion as a CopyCompletion
+ **/
+__attribute__((warn_unused_result))
+static inline
+CopyCompletion *asCopyCompletion(VDOCompletion *completion)
+{
+  STATIC_ASSERT(offsetof(CopyCompletion, completion) == 0);
+  assertCompletionType(completion->type, PARTITION_COPY_COMPLETION);
+  return (CopyCompletion *) completion;
+}
+
+/**********************************************************************/
+int makeCopyCompletion(PhysicalLayer *layer, VDOCompletion **completionPtr)
+{
+  CopyCompletion *copy;
+  int result = ALLOCATE(1, CopyCompletion, __func__, &copy);
+  if (result != VDO_SUCCESS) {
+    return result;
+  }
+  initializeCompletion(&copy->completion, PARTITION_COPY_COMPLETION, layer);
+
+  result = ALLOCATE((VDO_BLOCK_SIZE * STRIDE_LENGTH), char,
+                    "partition copy extent", &copy->data);
+  if (result != VDO_SUCCESS) {
+    VDOCompletion *completion = &copy->completion;
+    freeCopyCompletion(&completion);
+    return result;
+  }
+
+  result = createExtent(layer, VIO_TYPE_PARTITION_COPY, VIO_PRIORITY_HIGH,
+                        STRIDE_LENGTH, copy->data, &copy->extent);
+  if (result != VDO_SUCCESS) {
+    VDOCompletion *completion = &copy->completion;
+    freeCopyCompletion(&completion);
+    return result;
+  }
+
+  *completionPtr = &copy->completion;
+  return VDO_SUCCESS;
+}
+
+/**********************************************************************/
+void freeCopyCompletion(VDOCompletion **completionPtr)
+{
+  if (*completionPtr == NULL) {
+    return;
+  }
+
+  CopyCompletion *copy = asCopyCompletion(*completionPtr);
+  freeExtent(&copy->extent);
+  FREE(copy->data);
+  FREE(copy);
+  *completionPtr = NULL;
+}
+
+/**********************************************************************/
+static void copyPartitionStride(CopyCompletion *copy);
+
+/**
+ * Determine the number of blocks to copy in the current stride.
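+ *
+ * For example (illustrative numbers only): with STRIDE_LENGTH at 2048, a
+ * copy spanning 5000 blocks proceeds in strides of 2048, 2048, and finally
+ * 904 blocks, since each stride is capped at endingIndex - currentIndex.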
+ *
+ * @param copy The copy completion
+ *
+ * @return The number of blocks to copy in the current stride
+ **/
+static inline BlockCount getStrideSize(CopyCompletion *copy)
+{
+  return minBlockCount(STRIDE_LENGTH, copy->endingIndex - copy->currentIndex);
+}
+
+/**
+ * Process a completed write during a partition copy.
+ *
+ * @param completion The extent which has just completed writing
+ **/
+static void completeWriteForCopy(VDOCompletion *completion)
+{
+  CopyCompletion *copy = asCopyCompletion(completion->parent);
+  copy->currentIndex += getStrideSize(copy);
+  if (copy->currentIndex >= copy->endingIndex) {
+    // We're done.
+    finishCompletion(completion->parent, VDO_SUCCESS);
+    return;
+  }
+  copyPartitionStride(copy);
+}
+
+/**
+ * Process a completed read during a partition copy, and launch the
+ * corresponding write to the new partition.
+ *
+ * @param completion The extent which has just completed reading
+ **/
+static void completeReadForCopy(VDOCompletion *completion)
+{
+  CopyCompletion *copy = asCopyCompletion(completion->parent);
+  PhysicalBlockNumber layerStartBlock;
+  int result = translateToPBN(copy->target, copy->currentIndex,
+                              &layerStartBlock);
+  if (result != VDO_SUCCESS) {
+    finishCompletion(completion->parent, result);
+    return;
+  }
+
+  completion->callback = completeWriteForCopy;
+  writePartialMetadataExtent(asVDOExtent(completion), layerStartBlock,
+                             getStrideSize(copy));
+}
+
+/**
+ * Copy a stride from one partition to the new partition.
+ *
+ * @param copy The CopyCompletion
+ **/
+static void copyPartitionStride(CopyCompletion *copy)
+{
+  PhysicalBlockNumber layerStartBlock;
+  int result = translateToPBN(copy->source, copy->currentIndex,
+                              &layerStartBlock);
+  if (result != VDO_SUCCESS) {
+    finishCompletion(&copy->completion, result);
+    return;
+  }
+
+  prepareCompletion(&copy->extent->completion, completeReadForCopy,
+                    finishParentCallback, copy->completion.callbackThreadID,
+                    &copy->completion);
+  readPartialMetadataExtent(copy->extent, layerStartBlock,
+                            getStrideSize(copy));
+}
+
+/**
+ * Verify that the source can be copied to the target safely.
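+ *
+ * "Safely" covers the two checks below: the target must be at least as large
+ * as the source, and the two partitions must not overlap. Overlap is rejected
+ * outright rather than depending on the direction of the stride-by-stride
+ * copy.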
+ *
+ * @param source The source partition
+ * @param target The target partition
+ *
+ * @return VDO_SUCCESS or an error code
+ **/
+static int validatePartitionCopy(Partition *source, Partition *target)
+{
+  BlockCount sourceSize = getFixedLayoutPartitionSize(source);
+  BlockCount targetSize = getFixedLayoutPartitionSize(target);
+
+  PhysicalBlockNumber sourceStart = getFixedLayoutPartitionOffset(source);
+  PhysicalBlockNumber sourceEnd = sourceStart + sourceSize;
+  PhysicalBlockNumber targetStart = getFixedLayoutPartitionOffset(target);
+  PhysicalBlockNumber targetEnd = targetStart + targetSize;
+
+  int result = ASSERT(sourceSize <= targetSize,
+                      "target partition must not be smaller than source"
+                      " partition");
+  if (result != UDS_SUCCESS) {
+    return result;
+  }
+
+  return ASSERT(((sourceEnd <= targetStart) || (targetEnd <= sourceStart)),
+                "target partition must not overlap source partition");
+}
+
+/**********************************************************************/
+void copyPartitionAsync(VDOCompletion *completion,
+                        Partition *source,
+                        Partition *target,
+                        VDOCompletion *parent)
+{
+  int result = validatePartitionCopy(source, target);
+  if (result != VDO_SUCCESS) {
+    finishCompletion(parent, result);
+    return;
+  }
+
+  CopyCompletion *copy = asCopyCompletion(completion);
+  prepareToFinishParent(&copy->completion, parent);
+  copy->source = source;
+  copy->target = target;
+  copy->currentIndex = 0;
+  copy->endingIndex = getFixedLayoutPartitionSize(source);
+  copyPartitionStride(copy);
+}
diff --git a/vdo/base/partitionCopy.h b/vdo/base/partitionCopy.h
new file mode 100644
index 0000000..574ac13
--- /dev/null
+++ b/vdo/base/partitionCopy.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/partitionCopy.h#2 $
+ */
+
+#ifndef PARTITION_COPY_H
+#define PARTITION_COPY_H
+
+#include "fixedLayout.h"
+#include "physicalLayer.h"
+#include "types.h"
+
+/**
+ * Make a copy completion.
+ *
+ * @param [in] layer The layer on which the partitions reside
+ * @param [out] completionPtr A pointer to hold the copy completion
+ *
+ * @return VDO_SUCCESS or an error
+ **/
+int makeCopyCompletion(PhysicalLayer *layer, VDOCompletion **completionPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Free a copy completion and NULL out the reference to it.
+ *
+ * @param completionPtr A pointer to the completion to be freed
+ **/
+void freeCopyCompletion(VDOCompletion **completionPtr);
+
+/**
+ * Copy a partition.
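+ *
+ * A purely illustrative call sequence (the names 'layer', 'source', 'target',
+ * and 'parent' are assumed to already exist in the caller):
+ *
+ *   VDOCompletion *copier;
+ *   int result = makeCopyCompletion(layer, &copier);
+ *   if (result != VDO_SUCCESS) {
+ *     finishCompletion(parent, result);
+ *   } else {
+ *     copyPartitionAsync(copier, source, target, parent);
+ *     // 'parent' is finished when the copy completes or fails; the copier
+ *     // can then be released with freeCopyCompletion(&copier).
+ *   }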
+ * + * @param completion The copy completion to use + * @param source The partition to copy from + * @param target The partition to copy to + * @param parent The parent to finish when the copy is complete + **/ +void copyPartitionAsync(VDOCompletion *completion, + Partition *source, + Partition *target, + VDOCompletion *parent); + +#endif /* PARTITION_COPY_H */ diff --git a/vdo/base/pbnLock.c b/vdo/base/pbnLock.c new file mode 100644 index 0000000..5e9a274 --- /dev/null +++ b/vdo/base/pbnLock.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pbnLock.c#3 $ + */ + +#include "pbnLock.h" + +#include "logger.h" + +#include "blockAllocator.h" +#include "referenceBlock.h" + +struct pbnLockImplementation { + PBNLockType type; + const char *name; + const char *releaseReason; +}; + +/** + * This array must have an entry for every PBNLockType value. + **/ +static const PBNLockImplementation LOCK_IMPLEMENTATIONS[] = { + [VIO_READ_LOCK] = { + .type = VIO_READ_LOCK, + .name = "read", + .releaseReason = "candidate duplicate", + }, + [VIO_WRITE_LOCK] = { + .type = VIO_WRITE_LOCK, + .name = "write", + .releaseReason = "newly allocated", + }, + [VIO_COMPRESSED_WRITE_LOCK] = { + .type = VIO_COMPRESSED_WRITE_LOCK, + .name = "compressed write", + .releaseReason = "failed compression", + }, + [VIO_BLOCK_MAP_WRITE_LOCK] = { + .type = VIO_BLOCK_MAP_WRITE_LOCK, + .name = "block map write", + .releaseReason = "block map write", + }, +}; + +/**********************************************************************/ +static inline bool hasLockType(const PBNLock *lock, PBNLockType type) +{ + return (lock->implementation == &LOCK_IMPLEMENTATIONS[type]); +} + +/**********************************************************************/ +bool isPBNReadLock(const PBNLock *lock) +{ + return hasLockType(lock, VIO_READ_LOCK); +} + +/**********************************************************************/ +static inline void setPBNLockType(PBNLock *lock, PBNLockType type) +{ + lock->implementation = &LOCK_IMPLEMENTATIONS[type]; +} + +/**********************************************************************/ +void initializePBNLock(PBNLock *lock, PBNLockType type) +{ + lock->holderCount = 0; + setPBNLockType(lock, type); +} + +/**********************************************************************/ +void downgradePBNWriteLock(PBNLock *lock) +{ + ASSERT_LOG_ONLY(!isPBNReadLock(lock), + "PBN lock must not already have been downgraded"); + ASSERT_LOG_ONLY(!hasLockType(lock, VIO_BLOCK_MAP_WRITE_LOCK), + "must not downgrade block map write locks"); + ASSERT_LOG_ONLY(lock->holderCount == 1, + "PBN write lock should have one holder but has %u", + lock->holderCount); + if (hasLockType(lock, VIO_WRITE_LOCK)) { + // DataVIO write locks are downgraded in 
place--the writer retains the + // hold on the lock. They've already had a single incRef journaled. + lock->incrementLimit = MAXIMUM_REFERENCE_COUNT - 1; + } else { + // Compressed block write locks are downgraded when they are shared with + // all their hash locks. The writer is releasing its hold on the lock. + lock->holderCount = 0; + lock->incrementLimit = MAXIMUM_REFERENCE_COUNT; + } + setPBNLockType(lock, VIO_READ_LOCK); +} + +/**********************************************************************/ +bool claimPBNLockIncrement(PBNLock *lock) +{ + /* + * Claim the next free reference atomically since hash locks from multiple + * hash zone threads might be concurrently deduplicating against a single + * PBN lock on compressed block. As long as hitting the increment limit will + * lead to the PBN lock being released in a sane time-frame, we won't + * overflow a 32-bit claim counter, allowing a simple add instead of a + * compare-and-swap. + */ + uint32_t claimNumber = atomicAdd32(&lock->incrementsClaimed, 1); + return (claimNumber <= lock->incrementLimit); +} + +/**********************************************************************/ +void assignProvisionalReference(PBNLock *lock) +{ + ASSERT_LOG_ONLY(!lock->hasProvisionalReference, + "lock does not have a provisional reference"); + lock->hasProvisionalReference = true; +} + +/**********************************************************************/ +void unassignProvisionalReference(PBNLock *lock) +{ + lock->hasProvisionalReference = false; +} + +/**********************************************************************/ +void releaseProvisionalReference(PBNLock *lock, + PhysicalBlockNumber lockedPBN, + BlockAllocator *allocator) +{ + if (hasProvisionalReference(lock)) { + releaseBlockReference(allocator, lockedPBN, + lock->implementation->releaseReason); + unassignProvisionalReference(lock); + } +} diff --git a/vdo/base/pbnLock.h b/vdo/base/pbnLock.h new file mode 100644 index 0000000..bd6512b --- /dev/null +++ b/vdo/base/pbnLock.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pbnLock.h#3 $ + */ + +#ifndef PBN_LOCK_H +#define PBN_LOCK_H + +#include "atomic.h" +#include "types.h" + +/** + * The type of a PBN lock. + **/ +typedef enum { + VIO_READ_LOCK = 0, + VIO_WRITE_LOCK, + VIO_COMPRESSED_WRITE_LOCK, + VIO_BLOCK_MAP_WRITE_LOCK, +} PBNLockType; + +typedef struct pbnLockImplementation PBNLockImplementation; + +/** + * A PBN lock. + **/ +struct pbnLock { + /** The implementation of the lock */ + const PBNLockImplementation *implementation; + + /** The number of VIOs holding or sharing this lock */ + VIOCount holderCount; + /** + * The number of compressed block writers holding a share of this lock while + * they are acquiring a reference to the PBN. 
+ **/ + uint8_t fragmentLocks; + + /** + * Whether the locked PBN has been provisionally referenced on behalf of the + * lock holder. + **/ + bool hasProvisionalReference; + + /** + * For read locks, the number of references that were known to be available + * on the locked block at the time the lock was acquired. + **/ + uint8_t incrementLimit; + + /** + * For read locks, the number of DataVIOs that have tried to claim one of + * the available increments during the lifetime of the lock. Each claim will + * first increment this counter, so it can exceed the increment limit. + **/ + Atomic32 incrementsClaimed; +}; + +/** + * Initialize a PBNLock. + * + * @param lock The lock to initialize + * @param type The type of the lock + **/ +void initializePBNLock(PBNLock *lock, PBNLockType type); + +/** + * Check whether a PBNLock is a read lock. + * + * @param lock The lock to check + * + * @return true if the lock is a read lock + **/ +bool isPBNReadLock(const PBNLock *lock) + __attribute__((warn_unused_result)); + +/** + * Downgrade a PBN write lock to a PBN read lock. The lock holder count is + * cleared and the caller is responsible for setting the new count. + * + * @param lock The PBN write lock to downgrade + **/ +void downgradePBNWriteLock(PBNLock *lock); + +/** + * Try to claim one of the available reference count increments on a read + * lock. Claims may be attempted from any thread. A claim is only valid until + * the PBN lock is released. + * + * @param lock The PBN read lock from which to claim an increment + * + * @return true if the claim succeeded, guaranteeing one + * increment can be made without overflowing the PBN's reference count + **/ +bool claimPBNLockIncrement(PBNLock *lock) + __attribute__((warn_unused_result)); + +/** + * Check whether a PBN lock has a provisional reference. + * + * @param lock The PBN lock + **/ +static inline bool hasProvisionalReference(PBNLock *lock) +{ + return ((lock != NULL) && lock->hasProvisionalReference); +} + +/** + * Inform a PBN lock that it is responsible for a provisional reference. + * + * @param lock The PBN lock + **/ +void assignProvisionalReference(PBNLock *lock); + +/** + * Inform a PBN lock that it is no longer responsible for a provisional + * reference. + * + * @param lock The PBN lock + **/ +void unassignProvisionalReference(PBNLock *lock); + +/** + * If the lock is responsible for a provisional reference, release that + * reference. This method is called when the lock is released. + * + * @param lock The lock + * @param lockedPBN The PBN covered by the lock + * @param allocator The block allocator from which to release the reference + **/ +void releaseProvisionalReference(PBNLock *lock, + PhysicalBlockNumber lockedPBN, + BlockAllocator *allocator); + +#endif /* PBN_LOCK_H */ diff --git a/vdo/base/pbnLockPool.c b/vdo/base/pbnLockPool.c new file mode 100644 index 0000000..38e2f32 --- /dev/null +++ b/vdo/base/pbnLockPool.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pbnLockPool.c#2 $ + */ + +#include "pbnLockPool.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "ringNode.h" +#include "pbnLock.h" + +/** + * Unused (idle) PBN locks are kept in a ring. Just like in a malloc + * implementation, the lock structure is unused memory, so we can save a bit + * of space (and not pollute the lock structure proper) by using a union to + * overlay the lock structure with the free list. + **/ +typedef union idlePBNLock { + /** Only used while locks are in the pool */ + RingNode node; + /** Only used while locks are not in the pool */ + PBNLock lock; +} IdlePBNLock; + +/** + * The lock pool is little more than the memory allocated for the locks. + **/ +struct pbnLockPool { + /** The number of locks allocated for the pool */ + size_t capacity; + /** The number of locks currently borrowed from the pool */ + size_t borrowed; + /** A ring containing all idle PBN lock instances */ + RingNode idleRing; + /** The memory for all the locks allocated by this pool */ + IdlePBNLock locks[]; +}; + +/**********************************************************************/ +int makePBNLockPool(size_t capacity, PBNLockPool **poolPtr) +{ + PBNLockPool *pool; + int result = ALLOCATE_EXTENDED(PBNLockPool, capacity, IdlePBNLock, __func__, + &pool); + if (result != VDO_SUCCESS) { + return result; + } + + pool->capacity = capacity; + pool->borrowed = capacity; + initializeRing(&pool->idleRing); + + for (size_t i = 0; i < capacity; i++) { + PBNLock *lock = &pool->locks[i].lock; + returnPBNLockToPool(pool, &lock); + } + + *poolPtr = pool; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freePBNLockPool(PBNLockPool **poolPtr) +{ + if (*poolPtr == NULL) { + return; + } + + PBNLockPool *pool = *poolPtr; + ASSERT_LOG_ONLY(pool->borrowed == 0, + "All PBN locks must be returned to the pool before it is" + " freed, but %zu locks are still on loan", + pool->borrowed); + FREE(pool); + *poolPtr = NULL; +} + +/**********************************************************************/ +int borrowPBNLockFromPool(PBNLockPool *pool, + PBNLockType type, + PBNLock **lockPtr) +{ + if (pool->borrowed >= pool->capacity) { + return logErrorWithStringError(VDO_LOCK_ERROR, + "no free PBN locks left to borrow"); + } + pool->borrowed += 1; + + RingNode *idleNode = popRingNode(&pool->idleRing); + // The lock was zeroed when it was placed in the pool, but the overlapping + // ring pointers are non-zero after a pop. + memset(idleNode, 0, sizeof(*idleNode)); + + STATIC_ASSERT(offsetof(IdlePBNLock, node) == offsetof(IdlePBNLock, lock)); + PBNLock *lock = (PBNLock *) idleNode; + initializePBNLock(lock, type); + + *lockPtr = lock; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void returnPBNLockToPool(PBNLockPool *pool, PBNLock **lockPtr) +{ + // Take what should be the last lock reference from the caller + PBNLock *lock = *lockPtr; + *lockPtr = NULL; + + // A bit expensive, but will promptly catch some use-after-free errors. 
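+  // (Zeroing also leaves holderCount at 0 and the implementation pointer
+  // NULL, so a stale reference to this lock will tend to trip assertions
+  // such as the holder-count check in releasePBNLock() rather than silently
+  // corrupting state.)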
+ memset(lock, 0, sizeof(*lock)); + + RingNode *idleNode = (RingNode *) lock; + initializeRing(idleNode); + pushRingNode(&pool->idleRing, idleNode); + + ASSERT_LOG_ONLY(pool->borrowed > 0, "shouldn't return more than borrowed"); + pool->borrowed -= 1; +} diff --git a/vdo/base/pbnLockPool.h b/vdo/base/pbnLockPool.h new file mode 100644 index 0000000..6853f84 --- /dev/null +++ b/vdo/base/pbnLockPool.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pbnLockPool.h#1 $ + */ + +#ifndef PBN_LOCK_POOL_H +#define PBN_LOCK_POOL_H + +#include "pbnLock.h" +#include "types.h" + +typedef struct pbnLockPool PBNLockPool; + +/** + * Create a new PBN lock pool and all the lock instances it can loan out. + * + * @param [in] capacity The number of PBN locks to allocate for the pool + * @param [out] poolPtr A pointer to receive the new pool + * + * @return a VDO_SUCCESS or an error code + **/ +int makePBNLockPool(size_t capacity, PBNLockPool **poolPtr) + __attribute__((warn_unused_result)); + +/** + * Free a PBN lock pool null out the reference to it. This also frees all all + * the PBN locks it allocated, so the caller must ensure that all locks have + * been returned to the pool. + * + * @param [in,out] poolPtr The reference to the lock pool to free + **/ +void freePBNLockPool(PBNLockPool **poolPtr); + +/** + * Borrow a PBN lock from the pool and initialize it with the provided type. + * Pools do not grow on demand or allocate memory, so this will fail if the + * pool is empty. Borrowed locks are still associated with this pool and must + * be returned to only this pool. + * + * @param [in] pool The pool from which to borrow + * @param [in] type The type with which to initialize the lock + * @param [out] lockPtr A pointer to receive the borrowed lock + * + * @return VDO_SUCCESS, or VDO_LOCK_ERROR if the pool is empty + **/ +int borrowPBNLockFromPool(PBNLockPool *pool, + PBNLockType type, + PBNLock **lockPtr) + __attribute__((warn_unused_result)); + +/** + * Return to the pool a lock that was borrowed from it, and null out the + * caller's reference to it. It must be the last live reference, as if the + * memory were being freed (the lock memory will re-initialized or zeroed). + * + * @param [in] pool The pool from which the lock was borrowed + * @param [in,out] lockPtr The last reference to the lock being returned + **/ +void returnPBNLockToPool(PBNLockPool *pool, PBNLock **lockPtr); + +#endif // PBN_LOCK_POOL_H diff --git a/vdo/base/physicalLayer.c b/vdo/base/physicalLayer.c new file mode 100644 index 0000000..231a3bf --- /dev/null +++ b/vdo/base/physicalLayer.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/physicalLayer.c#1 $ + */ + +#include "physicalLayer.h" + +static PhysicalLayerGetter *physicalLayerGetter; + +/**********************************************************************/ +void registerPhysicalLayerGetter(PhysicalLayerGetter *getter) +{ + physicalLayerGetter = getter; +} + +/**********************************************************************/ +PhysicalLayer *getPhysicalLayer(void) +{ + if (physicalLayerGetter != NULL) { + return (*physicalLayerGetter)(); + } + return NULL; +} diff --git a/vdo/base/physicalLayer.h b/vdo/base/physicalLayer.h new file mode 100644 index 0000000..18d6a20 --- /dev/null +++ b/vdo/base/physicalLayer.h @@ -0,0 +1,427 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/physicalLayer.h#2 $ + */ + +#ifndef PHYSICAL_LAYER_H +#define PHYSICAL_LAYER_H + +#include "types.h" + +static const CRC32Checksum INITIAL_CHECKSUM = 0xffffffff; + +enum { + /* The size of a CRC-32 checksum */ + CHECKSUM_SIZE = sizeof(CRC32Checksum), +}; + +/** + * A function to destroy a physical layer and NULL out the reference to it. + * + * @param layerPtr A pointer to the layer to destroy + **/ +typedef void LayerDestructor(PhysicalLayer **layerPtr); + +/** + * A function to update a running CRC-32 checksum. + * + * @param crc The current value of the crc + * @param buffer The data to add to the checksum + * @param length The length of the data + * + * @return The updated value of the checksum + **/ +typedef uint32_t CRC32Updater(CRC32Checksum crc, + const byte *buffer, + size_t length); + +/** + * A function to report the block count of a physicalLayer. + * + * @param layer The layer + * + * @return The block count of the layer + **/ +typedef BlockCount BlockCountGetter(PhysicalLayer *layer); + +/** + * A function which can allocate a buffer suitable for use in an + * ExtentReader or ExtentWriter. + * + * @param [in] layer The physical layer in question + * @param [in] bytes The size of the buffer, in bytes. 
+ * @param [in] why The occasion for allocating the buffer + * @param [out] bufferPtr A pointer to hold the buffer + * + * @return a success or error code + **/ +typedef int BufferAllocator(PhysicalLayer *layer, + size_t bytes, + const char *why, + char **bufferPtr); + +/** + * A function which can read an extent from a physicalLayer. + * + * @param [in] layer The physical layer from which to read + * @param [in] startBlock The physical block number of the start of the + * extent + * @param [in] blockCount The number of blocks in the extent + * @param [out] buffer A buffer to hold the extent + * @param [out] blocksRead A pointer to hold the number of blocks read (may be + * NULL) + * + * @return a success or error code + **/ +typedef int ExtentReader(PhysicalLayer *layer, + PhysicalBlockNumber startBlock, + size_t blockCount, + char *buffer, + size_t *blocksRead); + +/** + * A function which can write an extent to a physicalLayer. + * + * @param [in] layer The physical layer to which to write + * @param [in] startBlock The physical block number of the start of the + * extent + * @param [in] blockCount The number of blocks in the extent + * @param [in] buffer The buffer which contains the data + * @param [out] blocksWritten A pointer to hold the number of blocks written + * (may be NULL) + * + * @return a success or error code + **/ +typedef int ExtentWriter(PhysicalLayer *layer, + PhysicalBlockNumber startBlock, + size_t blockCount, + char *buffer, + size_t *blocksWritten); + +/** + * A function to allocate a metadata VIO. + * + * @param [in] layer The physical layer + * @param [in] vioType The type of VIO to create + * @param [in] priority The relative priority to assign to the VIOs + * @param [in] parent The parent of this VIO + * @param [in] data The buffer + * @param [out] vioPtr A pointer to hold the new VIO + * + * @return VDO_SUCCESS or an error + **/ +typedef int MetadataVIOCreator(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + char *data, + VIO **vioPtr); + +/** + * A function to allocate an AllocatingVIO for compressed writes. + * + * @param [in] layer The physical layer + * @param [in] parent The parent of this VIO + * @param [in] data The buffer + * @param [out] allocatingVIOPtr A pointer to hold the new AllocatingVIO + * + * @return VDO_SUCCESS or an error + **/ +typedef int CompressedWriteVIOCreator(PhysicalLayer *layer, + void *parent, + char *data, + AllocatingVIO **allocatingVIOPtr); + +/** + * A function to destroy a VIO. The pointer to the VIO will be nulled out. + * + * @param vioPtr A pointer to the VIO to destroy + **/ +typedef void VIODestructor(VIO **vioPtr); + +/** + * A function to zero the contents of a DataVIO. + * + * @param dataVIO The DataVIO to zero + **/ +typedef AsyncDataOperation DataVIOZeroer; + +/** + * A function to copy the contents of a DataVIO into another DataVIO. + * + * @param source The dataVIO to copy from + * @param destination The dataVIO to copy to + **/ +typedef void DataCopier(DataVIO *source, DataVIO *destination); + +/** + * A function to apply a partial write to a DataVIO which has completed the + * read portion of a read-modify-write operation. + * + * @param dataVIO The dataVIO to modify + **/ +typedef AsyncDataOperation DataModifier; + +/** + * A function to asynchronously hash the block data, setting the chunk name of + * the DataVIO. This is asynchronous to allow the computation to be done on + * different threads. 
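+ * (As with the other aliases of AsyncDataOperation in this header, the named
+ * typedef only documents the role of the callback; the underlying function
+ * type is the shared AsyncDataOperation.)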
+ * + * @param dataVIO The DataVIO to hash + **/ +typedef AsyncDataOperation DataHasher; + +/** + * A function to determine whether a block is a duplicate. This function + * expects the 'physical' field of the DataVIO to be set to the physical block + * where the block will be written if it is not a duplicate. If the block does + * turn out to be a duplicate, the DataVIO's 'isDuplicate' field will be set to + * true, and the DataVIO's 'advice' field will be set to the physical block and + * mapping state of the already stored copy of the block. + * + * @param dataVIO The DataVIO containing the block to check. + **/ +typedef AsyncDataOperation DuplicationChecker; + +/** + * A function to verify the duplication advice by examining an already-stored + * data block. This function expects the 'physical' field of the DataVIO to be + * set to the physical block where the block will be written if it is not a + * duplicate, and the 'duplicate' field to be set to the physical block and + * mapping state where a copy of the data may already exist. If the block is + * not a duplicate, the DataVIO's 'isDuplicate' field will be cleared. + * + * @param dataVIO The dataVIO containing the block to check. + **/ +typedef AsyncDataOperation DuplicationVerifier; + +/** + * A function to read a single DataVIO from the layer. + * + * If the DataVIO does not describe a read-modify-write operation, the + * physical layer may safely acknowledge the related user I/O request + * as complete. + * + * @param dataVIO The DataVIO to read + **/ +typedef AsyncDataOperation DataReader; + +/** + * A function to read a single metadata VIO from the layer. + * + * @param vio The vio to read + **/ +typedef AsyncOperation MetadataReader; + +/** + * A function to write a single DataVIO to the layer + * + * @param dataVIO The DataVIO to write + **/ +typedef AsyncDataOperation DataWriter; + +/** + * A function to write a single metadata VIO from the layer. + * + * @param vio The vio to write + **/ +typedef AsyncOperation MetadataWriter; + +/** + * A function to inform the layer that a DataVIO's related I/O request can be + * safely acknowledged as complete, even though the DataVIO itself may have + * further processing to do. + * + * @param dataVIO The DataVIO to acknowledge + **/ +typedef AsyncDataOperation DataAcknowledger; + +/** + * A function to compare the contents of a DataVIO to another DataVIO. + * + * @param first The first DataVIO to compare + * @param second The second DataVIO to compare + * + * @return true if the contents of the two DataVIOs are the same + **/ +typedef bool DataVIOComparator(DataVIO *first, DataVIO *second); + +/** + * A function to compress the data in a DataVIO. + * + * @param dataVIO The DataVIO to compress + **/ +typedef AsyncDataOperation DataCompressor; + +/** + * Update albireo. + * + * @param dataVIO The DataVIO which needs to change the entry for its data + **/ +typedef AsyncDataOperation AlbireoUpdater; + +/** + * A function to finish flush requests + * + * @param vdoFlush The flush requests + **/ +typedef void FlushComplete(VDOFlush **vdoFlush); + +/** + * A function to query the write policy of the layer. + * + * @param layer The layer to query + * + * @return the write policy of the layer + **/ +typedef WritePolicy WritePolicyGetter(PhysicalLayer *layer); + +/** + * A function to create an object that can be enqueued to run in a specified + * thread. The Enqueueable will be put into the 'enqueueable' field of the + * supplied completion. 
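+ *
+ * Purely as an illustration (assuming a 'layer' and a 'completion' whose
+ * callback thread has already been set), this creator is used together with
+ * the Enqueuer and EnqueueableDestructor defined below, roughly as:
+ *
+ *   int result = layer->createEnqueueable(completion);
+ *   if (result == VDO_SUCCESS) {
+ *     layer->enqueue(completion->enqueueable);
+ *   }
+ *   // ...and layer->destroyEnqueueable(&completion->enqueueable) later,
+ *   // when the completion is torn down.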
+ * + * @param completion The completion to invoke the callback of + * + * @return VDO_SUCCESS or an error code + **/ +typedef int EnqueueableCreator(VDOCompletion *completion); + +/** + * A function to destroy and deallocate an Enqueueable object. + * + * @param enqueueablePtr Pointer to the object pointer to be destroyed + **/ +typedef void EnqueueableDestructor(Enqueueable **enqueueablePtr); + +/** + * A function to enqueue the Enqueueable object to run on the thread specified + * by its associated completion. + * + * @param enqueueable The object to be enqueued + **/ +typedef void Enqueuer(Enqueueable *enqueueable); + +/** + * A function to wait for an admin operation to complete. This function should + * not be called from a base-code thread. + * + * @param layer The layer on which to wait + **/ +typedef void OperationWaiter(PhysicalLayer *layer); + +/** + * A function to inform the layer of the result of an admin operation. + * + * @param layer The layer to inform + **/ +typedef void OperationComplete(PhysicalLayer *layer); + +/** + * A function to get the id of the current thread. + * + * @return The id of the current thread + **/ +typedef ThreadID ThreadIDGetter(void); + +/** + * A function to return the physical layer pointer for the current thread. + * + * @return The physical layer pointer + **/ +typedef PhysicalLayer *PhysicalLayerGetter(void); + +/** + * An abstraction representing the underlying physical layer. + **/ +struct physicalLayer { + // Management interface + LayerDestructor *destroy; + + // Synchronous interface + CRC32Updater *updateCRC32; + BlockCountGetter *getBlockCount; + + // Synchronous IO interface + BufferAllocator *allocateIOBuffer; + ExtentReader *reader; + ExtentWriter *writer; + + WritePolicyGetter *getWritePolicy; + + // Synchronous interfaces (vio-based) + MetadataVIOCreator *createMetadataVIO; + CompressedWriteVIOCreator *createCompressedWriteVIO; + VIODestructor *freeVIO; + DataVIOZeroer *zeroDataVIO; + DataCopier *copyData; + DataModifier *applyPartialWrite; + + // Asynchronous interface (vio-based) + DataHasher *hashData; + DuplicationChecker *checkForDuplication; + DuplicationVerifier *verifyDuplication; + DataReader *readData; + DataWriter *writeData; + CompressedWriter *writeCompressedBlock; + MetadataReader *readMetadata; + MetadataWriter *writeMetadata; + MetadataWriter *flush; + DataAcknowledger *acknowledgeDataVIO; + DataVIOComparator *compareDataVIOs; + DataCompressor *compressDataVIO; + AlbireoUpdater *updateAlbireo; + + // Asynchronous interface (other) + FlushComplete *completeFlush; + EnqueueableCreator *createEnqueueable; + EnqueueableDestructor *destroyEnqueueable; + Enqueuer *enqueue; + OperationWaiter *waitForAdminOperation; + OperationComplete *completeAdminOperation; + + // Thread specific interface + ThreadIDGetter *getCurrentThreadID; +}; + +/** + * Register the layer-specific implementation of getPhysicalLayer(). + * + * @param getter The function to be called + **/ +void registerPhysicalLayerGetter(PhysicalLayerGetter *getter); + +/** + * Fetch the physical layer pointer for the current thread. + * + * @return The physical layer pointer + **/ +PhysicalLayer *getPhysicalLayer(void); + +/** + * Get the id of the callback thread on which a completion is current running. 
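+ *
+ * Note that this helper assumes registerPhysicalLayerGetter() has already
+ * been called: before registration getPhysicalLayer() returns NULL, and the
+ * inline function below would dereference a NULL layer.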
+ * + * @return the current thread ID + **/ +static inline ThreadID getCallbackThreadID(void) +{ + return getPhysicalLayer()->getCurrentThreadID(); +} + +#endif // PHYSICAL_LAYER_H diff --git a/vdo/base/physicalZone.c b/vdo/base/physicalZone.c new file mode 100644 index 0000000..accb631 --- /dev/null +++ b/vdo/base/physicalZone.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/physicalZone.c#3 $ + */ + +#include "physicalZone.h" + +#include "memoryAlloc.h" + +#include "blockAllocator.h" +#include "blockMap.h" +#include "completion.h" +#include "constants.h" +#include "dataVIO.h" +#include "flush.h" +#include "hashLock.h" +#include "intMap.h" +#include "pbnLock.h" +#include "pbnLockPool.h" +#include "slabDepot.h" +#include "vdoInternal.h" + +enum { + // Each user DataVIO needs a PBN read lock and write lock, and each packer + // output bin has an AllocatingVIO that needs a PBN write lock. + LOCK_POOL_CAPACITY = 2 * MAXIMUM_USER_VIOS + DEFAULT_PACKER_OUTPUT_BINS, +}; + +struct physicalZone { + /** Which physical zone this is */ + ZoneCount zoneNumber; + /** The thread ID for this zone */ + ThreadID threadID; + /** In progress operations keyed by PBN */ + IntMap *pbnOperations; + /** Pool of unused PBNLock instances */ + PBNLockPool *lockPool; + /** The block allocator for this zone */ + BlockAllocator *allocator; +}; + +/**********************************************************************/ +int makePhysicalZone(VDO *vdo, ZoneCount zoneNumber, PhysicalZone **zonePtr) +{ + PhysicalZone *zone; + int result = ALLOCATE(1, PhysicalZone, __func__, &zone); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeIntMap(LOCK_MAP_CAPACITY, 0, &zone->pbnOperations); + if (result != VDO_SUCCESS) { + freePhysicalZone(&zone); + return result; + } + + result = makePBNLockPool(LOCK_POOL_CAPACITY, &zone->lockPool); + if (result != VDO_SUCCESS) { + freePhysicalZone(&zone); + return result; + } + + zone->zoneNumber = zoneNumber; + zone->threadID = getPhysicalZoneThread(getThreadConfig(vdo), zoneNumber); + zone->allocator = getBlockAllocatorForZone(vdo->depot, zoneNumber); + + *zonePtr = zone; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freePhysicalZone(PhysicalZone **zonePtr) +{ + if (*zonePtr == NULL) { + return; + } + + PhysicalZone *zone = *zonePtr; + freePBNLockPool(&zone->lockPool); + freeIntMap(&zone->pbnOperations); + FREE(zone); + *zonePtr = NULL; +} + +/**********************************************************************/ +ZoneCount getPhysicalZoneNumber(const PhysicalZone *zone) +{ + return zone->zoneNumber; +} + +/**********************************************************************/ +ThreadID getPhysicalZoneThreadID(const 
PhysicalZone *zone) +{ + return zone->threadID; +} + +/**********************************************************************/ +BlockAllocator *getBlockAllocator(const PhysicalZone *zone) +{ + return zone->allocator; +} + +/**********************************************************************/ +PBNLock *getPBNLock(PhysicalZone *zone, PhysicalBlockNumber pbn) +{ + return ((zone == NULL) ? NULL : intMapGet(zone->pbnOperations, pbn)); +} + +/**********************************************************************/ +int attemptPBNLock(PhysicalZone *zone, + PhysicalBlockNumber pbn, + PBNLockType type, + PBNLock **lockPtr) +{ + // Borrow and prepare a lock from the pool so we don't have to do two IntMap + // accesses in the common case of no lock contention. + PBNLock *newLock; + int result = borrowPBNLockFromPool(zone->lockPool, type, &newLock); + if (result != VDO_SUCCESS) { + ASSERT_LOG_ONLY(false, "must always be able to borrow a PBN lock"); + return result; + } + + PBNLock *lock; + result = intMapPut(zone->pbnOperations, pbn, newLock, false, + (void **) &lock); + if (result != VDO_SUCCESS) { + returnPBNLockToPool(zone->lockPool, &newLock); + return result; + } + + if (lock != NULL) { + // The lock is already held, so we don't need the borrowed lock. + returnPBNLockToPool(zone->lockPool, &newLock); + + result = ASSERT(lock->holderCount > 0, + "physical block %llu lock held", pbn); + if (result != VDO_SUCCESS) { + return result; + } + *lockPtr = lock; + } else { + *lockPtr = newLock; + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +void releasePBNLock(PhysicalZone *zone, + PhysicalBlockNumber lockedPBN, + PBNLock **lockPtr) +{ + PBNLock *lock = *lockPtr; + if (lock == NULL) { + return; + } + *lockPtr = NULL; + + ASSERT_LOG_ONLY(lock->holderCount > 0, + "should not be releasing a lock that is not held"); + + lock->holderCount -= 1; + if (lock->holderCount > 0) { + // The lock was shared and is still referenced, so don't release it yet. + return; + } + + PBNLock *holder = intMapRemove(zone->pbnOperations, lockedPBN); + ASSERT_LOG_ONLY((lock == holder), + "physical block lock mismatch for block %llu", + lockedPBN); + + releaseProvisionalReference(lock, lockedPBN, zone->allocator); + + returnPBNLockToPool(zone->lockPool, &lock); +} + +/**********************************************************************/ +void dumpPhysicalZone(const PhysicalZone *zone) +{ + dumpBlockAllocator(zone->allocator); +} diff --git a/vdo/base/physicalZone.h b/vdo/base/physicalZone.h new file mode 100644 index 0000000..2c02bbe --- /dev/null +++ b/vdo/base/physicalZone.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/physicalZone.h#1 $ + */ + +#ifndef PHYSICAL_ZONE_H +#define PHYSICAL_ZONE_H + +#include "pbnLock.h" +#include "types.h" + +/** + * Create a physical zone. + * + * @param [in] vdo The VDO to which the zone will belong + * @param [in] zoneNumber The number of the zone to create + * @param [out] zonePtr A pointer to hold the new PhysicalZone + * + * @return VDO_SUCCESS or an error code + **/ +int makePhysicalZone(VDO *vdo, ZoneCount zoneNumber, PhysicalZone **zonePtr) + __attribute__((warn_unused_result)); + +/** + * Free a physical zone and null out the reference to it. + * + * @param zonePtr A pointer to the zone to free + **/ +void freePhysicalZone(PhysicalZone **zonePtr); + +/** + * Get the zone number of a physical zone. + * + * @param zone The zone + * + * @return The number of the zone + **/ +ZoneCount getPhysicalZoneNumber(const PhysicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the ID of a physical zone's thread. + * + * @param zone The zone + * + * @return The zone's thread ID + **/ +ThreadID getPhysicalZoneThreadID(const PhysicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the block allocator from a physical zone. + * + * @param zone The zone + * + * @return The zone's allocator + **/ +BlockAllocator *getBlockAllocator(const PhysicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the lock on a PBN if one exists. + * + * @param zone The physical zone responsible for the PBN + * @param pbn The physical block number whose lock is desired + * + * @return The lock or NULL if the PBN is not locked + **/ +PBNLock *getPBNLock(PhysicalZone *zone, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Attempt to lock a physical block in the zone responsible for it. If the PBN + * is already locked, the existing lock will be returned. Otherwise, a new + * lock instance will be borrowed from the pool, initialized, and returned. + * The lock owner will be NULL for a new lock acquired by the caller, who is + * responsible for setting that field promptly. The lock owner will be + * non-NULL when there is already an existing lock on the PBN. + * + * @param [in] zone The physical zone responsible for the PBN + * @param [in] pbn The physical block number to lock + * @param [in] type The type with which to initialize a new lock + * @param [out] lockPtr A pointer to receive the lock, existing or new + * + * @return VDO_SUCCESS or an error + **/ +int attemptPBNLock(PhysicalZone *zone, + PhysicalBlockNumber pbn, + PBNLockType type, + PBNLock **lockPtr) + __attribute__((warn_unused_result)); + +/** + * Release a physical block lock if it is held, return it to the lock pool, + * and null out the caller's reference to it. It must be the last live + * reference, as if the memory were being freed (the lock memory will + * re-initialized or zeroed). + * + * @param [in] zone The physical zone in which the lock was obtained + * @param [in] lockedPBN The physical block number to unlock + * @param [in,out] lockPtr The last reference to the lock being released + **/ +void releasePBNLock(PhysicalZone *zone, + PhysicalBlockNumber lockedPBN, + PBNLock **lockPtr); + +/** + * Dump information about a physical zone to the log for debugging. 
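+ * In the implementation in this change, the dump simply forwards to
+ * dumpBlockAllocator() for the zone's allocator.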
+ * + * @param zone The zone to dump + **/ +void dumpPhysicalZone(const PhysicalZone *zone); + +#endif // PHYSICAL_ZONE_H diff --git a/vdo/base/pointerMap.c b/vdo/base/pointerMap.c new file mode 100644 index 0000000..395f266 --- /dev/null +++ b/vdo/base/pointerMap.c @@ -0,0 +1,633 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pointerMap.c#1 $ + */ + +/** + * Hash table implementation of a map from integers to pointers, implemented + * using the Hopscotch Hashing algorithm by Herlihy, Shavit, and Tzafrir (see + * http://en.wikipedia.org/wiki/Hopscotch_hashing). This implementation does + * not contain any of the locking/concurrency features of the algorithm, just + * the collision resolution scheme. + * + * Hopscotch Hashing is based on hashing with open addressing and linear + * probing. All the entries are stored in a fixed array of buckets, with no + * dynamic allocation for collisions. Unlike linear probing, all the entries + * that hash to a given bucket are stored within a fixed neighborhood starting + * at that bucket. Chaining is effectively represented as a bit vector + * relative to each bucket instead of as pointers or explicit offsets. + * + * When an empty bucket cannot be found within a given neighborhood, + * subsequent neighborhoods are searched, and one or more entries will "hop" + * into those neighborhoods. When this process works, an empty bucket will + * move into the desired neighborhood, allowing the entry to be added. When + * that process fails (typically when the buckets are around 90% full), the + * table must be resized and the all entries rehashed and added to the + * expanded table. + * + * Unlike linear probing, the number of buckets that must be searched in the + * worst case has a fixed upper bound (the size of the neighborhood). Those + * entries occupy a small number of memory cache lines, leading to improved + * use of the cache (fewer misses on both successful and unsuccessful + * searches). Hopscotch hashing outperforms linear probing at much higher load + * factors, so even with the increased memory burden for maintaining the hop + * vectors, less memory is needed to achieve that performance. Hopscotch is + * also immune to "contamination" from deleting entries since entries are + * genuinely removed instead of being replaced by a placeholder. + * + * The published description of the algorithm used a bit vector, but the paper + * alludes to an offset scheme which is used by this implementation. Since the + * entries in the neighborhood are within N entries of the hash bucket at the + * start of the neighborhood, a pair of small offset fields each log2(N) bits + * wide is all that's needed to maintain the hops as a linked list. In order + * to encode "no next hop" (i.e. 
NULL) as the natural initial value of zero, + * the offsets are biased by one (i.e. 0 => NULL, 1 => offset=0, 2 => + * offset=1, etc.) We can represent neighborhoods of up to 255 entries with + * just 8+8=16 bits per entry. The hop list is sorted by hop offset so the + * first entry in the list is always the bucket closest to the start of the + * neighborhood. + * + * While individual accesses tend to be very fast, the table resize operations + * are very very expensive. If an upper bound on the latency of adding an + * entry to the table is needed, we either need to ensure the table is + * pre-sized to be large enough so no resize is ever needed, or we'll need to + * develop an approach to incrementally resize the table. + **/ + +#include "pointerMap.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "permassert.h" + +enum { + DEFAULT_CAPACITY = 16, // the number of neighborhoods in a new table + NEIGHBORHOOD = 255, // the number of buckets in each neighborhood + MAX_PROBES = 1024, // limit on the number of probes for a free bucket + NULL_HOP_OFFSET = 0, // the hop offset value terminating the hop list + DEFAULT_LOAD = 75 // a compromise between memory use and performance +}; + +/** + * Buckets are packed together to reduce memory usage and improve cache + * efficiency. It would be tempting to encode the hop offsets separately and + * maintain alignment of key/value pairs, but it's crucial to keep the hop + * fields near the buckets that they use them so they'll tend to share cache + * lines. + **/ +typedef struct __attribute__((packed)) bucket { + uint8_t firstHop; // the biased offset of the first entry in the hop list + // of the neighborhood that hashes to this bucket + uint8_t nextHop; // the biased offset of the next bucket in the hop list + + const void *key; // the key stored in this bucket + void *value; // the value stored in this bucket (NULL if empty) +} Bucket; + +/** + * The concrete definition of the opaque PointerMap type. To avoid having to + * wrap the neighborhoods of the last entries back around to the start of the + * bucket array, we allocate a few more buckets at the end of the array + * instead, which is why capacity and bucketCount are different. + **/ +struct pointerMap { + /** the number of entries stored in the map */ + size_t size; + /** the number of neighborhoods in the map */ + size_t capacity; + /** the number of buckets in the bucket array */ + size_t bucketCount; + /** the array of hash buckets */ + Bucket *buckets; + /** the function for comparing keys for equality */ + PointerKeyComparator *comparator; + /** the function for getting a hash code from a key */ + PointerKeyHasher *hasher; +}; + +/** + * Initialize a PointerMap. + * + * @param map the map to initialize + * @param capacity the initial capacity of the map + * + * @return UDS_SUCCESS or an error code + **/ +static int allocateBuckets(PointerMap *map, size_t capacity) +{ + map->size = 0; + map->capacity = capacity; + + // Allocate NEIGHBORHOOD - 1 extra buckets so the last bucket can have a + // full neighborhood without have to wrap back around to element zero. 
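+  // (With NEIGHBORHOOD = 255, that is 254 extra buckets beyond the capacity.)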
+ map->bucketCount = capacity + (NEIGHBORHOOD - 1); + return ALLOCATE(map->bucketCount, Bucket, "PointerMap buckets", + &map->buckets); +} + +/**********************************************************************/ +int makePointerMap(size_t initialCapacity, + unsigned int initialLoad, + PointerKeyComparator comparator, + PointerKeyHasher hasher, + PointerMap **mapPtr) +{ + // Use the default initial load if the caller did not specify one. + if (initialLoad == 0) { + initialLoad = DEFAULT_LOAD; + } + if (initialLoad > 100) { + return UDS_INVALID_ARGUMENT; + } + + PointerMap *map; + int result = ALLOCATE(1, PointerMap, "PointerMap", &map); + if (result != UDS_SUCCESS) { + return result; + } + + map->hasher = hasher; + map->comparator = comparator; + + // Use the default capacity if the caller did not specify one. + size_t capacity = (initialCapacity > 0) ? initialCapacity : DEFAULT_CAPACITY; + + // Scale up the capacity by the specified initial load factor. + // (i.e to hold 1000 entries at 80% load we need a capacity of 1250) + capacity = capacity * 100 / initialLoad; + + result = allocateBuckets(map, capacity); + if (result != UDS_SUCCESS) { + freePointerMap(&map); + return result; + } + + *mapPtr = map; + return UDS_SUCCESS; +} + +/** + * Free the bucket array for the map. + * + * @param map the map whose bucket array is to be freed + **/ +static void freeBuckets(PointerMap *map) +{ + FREE(map->buckets); + map->buckets = NULL; +} + +/**********************************************************************/ +void freePointerMap(PointerMap **mapPtr) +{ + if (*mapPtr != NULL) { + freeBuckets(*mapPtr); + FREE(*mapPtr); + *mapPtr = NULL; + } +} + +/**********************************************************************/ +size_t pointerMapSize(const PointerMap *map) +{ + return map->size; +} + +/** + * Convert a biased hop offset within a neighborhood to a pointer to the + * bucket it references. + * + * @param neighborhood the first bucket in the neighborhood + * @param hopOffset the biased hop offset to the desired bucket + * + * @return NULL if hopOffset is zero, otherwise a pointer to + * the bucket in the neighborhood at hopOffset - 1 + **/ +static Bucket *dereferenceHop(Bucket *neighborhood, unsigned int hopOffset) +{ + if (hopOffset == NULL_HOP_OFFSET) { + return NULL; + } + + STATIC_ASSERT(NULL_HOP_OFFSET == 0); + return &neighborhood[hopOffset - 1]; +} + +/** + * Add a bucket into the hop list for the neighborhood, inserting it into the + * list so the hop list remains sorted by hop offset. + * + * @param neighborhood the first bucket in the neighborhood + * @param newBucket the bucket to add to the hop list + **/ +static void insertInHopList(Bucket *neighborhood, Bucket *newBucket) +{ + // Zero indicates a NULL hop offset, so bias the hop offset by one. + int hopOffset = 1 + (newBucket - neighborhood); + + // Handle the special case of adding a bucket at the start of the list. + int nextHop = neighborhood->firstHop; + if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) { + newBucket->nextHop = nextHop; + neighborhood->firstHop = hopOffset; + return; + } + + // Search the hop list for the insertion point that maintains the sort + // order. + for (;;) { + Bucket *bucket = dereferenceHop(neighborhood, nextHop); + nextHop = bucket->nextHop; + + if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) { + newBucket->nextHop = nextHop; + bucket->nextHop = hopOffset; + return; + } + } +} + +/** + * Select and return the hash bucket for a given search key. 
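+ * The bucket returned is the first bucket of the neighborhood in which any
+ * entry for the key must reside.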
+ * + * @param map the map to search + * @param key the mapping key + **/ +static Bucket *selectBucket(const PointerMap *map, const void *key) +{ + /* + * Scale the 32-bit hash to a bucket index by treating it as a binary + * fraction and multiplying that by the capacity. If the hash is uniformly + * distributed over [0 .. 2^32-1], then (hash * capacity / 2^32) should be + * uniformly distributed over [0 .. capacity-1]. The multiply and shift is + * much faster than a divide (modulus) on X86 CPUs. + */ + uint64_t hash = map->hasher(key); + return &map->buckets[(hash * map->capacity) >> 32]; +} + +/** + * Search the hop list associated with given hash bucket for a given search + * key. If the key is found, returns a pointer to the entry (bucket or + * collision), otherwise returns NULL. + * + * @param [in] map the map being searched + * @param [in] bucket the map bucket to search for the key + * @param [in] key the mapping key + * @param [out] previousPtr if not NULL, a pointer in which to + * store the bucket in the list preceding the one + * that had the matching key + * + * @return an entry that matches the key, or NULL if not found + **/ +static Bucket *searchHopList(PointerMap *map, + Bucket *bucket, + const void *key, + Bucket **previousPtr) +{ + Bucket *previous = NULL; + unsigned int nextHop = bucket->firstHop; + while (nextHop != NULL_HOP_OFFSET) { + // Check the neighboring bucket indexed by the offset for the desired key. + Bucket *entry = dereferenceHop(bucket, nextHop); + if ((entry->value != NULL) && map->comparator(key, entry->key)) { + if (previousPtr != NULL) { + *previousPtr = previous; + } + return entry; + } + nextHop = entry->nextHop; + previous = entry; + } + return NULL; +} + +/**********************************************************************/ +void *pointerMapGet(PointerMap *map, const void *key) +{ + Bucket *match = searchHopList(map, selectBucket(map, key), key, NULL); + return ((match != NULL) ? match->value : NULL); +} + +/** + * Increase the number of hash buckets and rehash all the existing entries, + * storing them in the new buckets. + * + * @param map the map to resize + **/ +static int resizeBuckets(PointerMap *map) +{ + // Copy the top-level map data to the stack. + PointerMap oldMap = *map; + + // Re-initialize the map to be empty and 50% larger. + size_t newCapacity = map->capacity / 2 * 3; + logInfo("%s: attempting resize from %zu to %zu, current size=%zu", + __func__, map->capacity, newCapacity, map->size); + int result = allocateBuckets(map, newCapacity); + if (result != UDS_SUCCESS) { + *map = oldMap; + return result; + } + + // Populate the new hash table from the entries in the old bucket array. + for (size_t i = 0; i < oldMap.bucketCount; i++) { + Bucket *entry = &oldMap.buckets[i]; + if (entry->value == NULL) { + continue; + } + + result = pointerMapPut(map, entry->key, entry->value, true, NULL); + if (result != UDS_SUCCESS) { + // Destroy the new partial map and restore the map from the stack. + freeBuckets(map); + *map = oldMap; + return result; + } + } + + // Destroy the old bucket array. + freeBuckets(&oldMap); + return UDS_SUCCESS; +} + +/** + * Probe the bucket array starting at the given bucket for the next empty + * bucket, returning a pointer to it. NULL will be returned if + * the search reaches the end of the bucket array or if the number of linear + * probes exceeds a specified limit. 
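+ * (The only caller in this file passes MAX_PROBES, so at most 1024 buckets
+ * are examined per call.)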
+ * + * @param map the map containing the buckets to search + * @param bucket the bucket at which to start probing + * @param maxProbes the maximum number of buckets to search + * + * @return the next empty bucket, or NULL if the search failed + **/ +static Bucket *findEmptyBucket(PointerMap *map, + Bucket *bucket, + unsigned int maxProbes) +{ + // Limit the search to either the nearer of the end of the bucket array or a + // fixed distance beyond the initial bucket. + size_t remaining = &map->buckets[map->bucketCount] - bucket; + Bucket *sentinel = &bucket[minSizeT(remaining, maxProbes)]; + + for (Bucket *entry = bucket; entry < sentinel; entry++) { + if (entry->value == NULL) { + return entry; + } + } + return NULL; +} + +/** + * Move an empty bucket closer to the start of the bucket array. This searches + * the neighborhoods that contain the empty bucket for a non-empty bucket + * closer to the start of the array. If such a bucket is found, this swaps the + * two buckets by moving the entry to the empty bucket. + * + * @param map the map containing the bucket + * @param hole the empty bucket to fill with an entry that precedes it in one + * of its enclosing neighborhoods + * + * @return the bucket that was vacated by moving its entry to the provided + * hole, or NULL if no entry could be moved + **/ +static Bucket *moveEmptyBucket(PointerMap *map __attribute__((unused)), + Bucket *hole) +{ + /* + * Examine every neighborhood that the empty bucket is part of, starting + * with the one in which it is the last bucket. No boundary check is needed + * for the negative array arithmetic since this function is only called when + * hole is at least NEIGHBORHOOD cells deeper into the array than a valid + * bucket. + */ + for (Bucket *bucket = &hole[1 - NEIGHBORHOOD]; bucket < hole; bucket++) { + // Find the entry that is nearest to the bucket, which means it will be + // nearest to the hash bucket whose neighborhood is full. + Bucket *newHole = dereferenceHop(bucket, bucket->firstHop); + if (newHole == NULL) { + // There are no buckets in this neighborhood that are in use by this one + // (they must all be owned by overlapping neighborhoods). + continue; + } + + // Skip this bucket if its first entry is actually further away than the + // hole that we're already trying to fill. + if (hole < newHole) { + continue; + } + + /* + * We've found an entry in this neighborhood that we can "hop" further + * away, moving the hole closer to the hash bucket, if not all the way + * into its neighborhood. + */ + + // The entry that will be the new hole is the first bucket in the list, + // so setting firstHop is all that's needed remove it from the list. + bucket->firstHop = newHole->nextHop; + newHole->nextHop = NULL_HOP_OFFSET; + + // Move the entry into the original hole. + hole->key = newHole->key; + hole->value = newHole->value; + newHole->value = NULL; + + // Insert the filled hole into the hop list for the neighborhood. + insertInHopList(bucket, hole); + return newHole; + } + + // We couldn't find an entry to relocate to the hole. + return NULL; +} + +/** + * Find and update any existing mapping for a given key, returning the value + * associated with the key in the provided pointer. 
+ * + * @param [in] map the PointerMap to attempt to modify + * @param [in] neighborhood the first bucket in the neighborhood that + * would contain the search key + * @param [in] key the key with which to associate the new value + * @param [in] newValue the value to be associated with the key + * @param [in] update whether to overwrite an existing value + * @param [out] oldValuePtr a pointer in which to store the old value + * (unmodified if no mapping was found) + * + * @return true if the map contains a mapping for the key + * false if it does not + **/ +static bool updateMapping(PointerMap *map, + Bucket *neighborhood, + const void *key, + void *newValue, + bool update, + void **oldValuePtr) +{ + Bucket *bucket = searchHopList(map, neighborhood, key, NULL); + if (bucket == NULL) { + // There is no bucket containing the key in the neighborhood. + return false; + } + + // Return the value of the current mapping (if desired) and update the + // mapping with the new value (if desired). + if (oldValuePtr != NULL) { + *oldValuePtr = bucket->value; + } + if (update) { + // We're dropping the old key pointer on the floor here, assuming it's a + // property of the value or that it's otherwise safe to just forget. + bucket->key = key; + bucket->value = newValue; + } + return true; +} + +/** + * Find an empty bucket in a specified neighborhood for a new mapping or + * attempt to re-arrange mappings so there is such a bucket. This operation + * may fail (returning NULL) if an empty bucket is not available or could not + * be relocated to the neighborhood. + * + * @param map the PointerMap to search or modify + * @param neighborhood the first bucket in the neighborhood in which + * an empty bucket is needed for a new mapping + * + * @return a pointer to an empty bucket in the desired neighborhood, or + * NULL if a vacancy could not be found or arranged + **/ +static Bucket *findOrMakeVacancy(PointerMap *map, Bucket *neighborhood) +{ + // Probe within and beyond the neighborhood for the first empty bucket. + Bucket *hole = findEmptyBucket(map, neighborhood, MAX_PROBES); + + // Keep trying until the empty bucket is in the bucket's neighborhood or we + // are unable to move it any closer by swapping it with a filled bucket. + while (hole != NULL) { + int distance = hole - neighborhood; + if (distance < NEIGHBORHOOD) { + // We've found or relocated an empty bucket close enough to the initial + // hash bucket to be referenced by its hop vector. + return hole; + } + + // The nearest empty bucket isn't within the neighborhood that must + // contain the new entry, so try to swap it with bucket that is closer. + hole = moveEmptyBucket(map, hole); + } + + return NULL; +} + +/**********************************************************************/ +int pointerMapPut(PointerMap *map, + const void *key, + void *newValue, + bool update, + void **oldValuePtr) +{ + if (newValue == NULL) { + return UDS_INVALID_ARGUMENT; + } + + // Select the bucket at the start of the neighborhood that must contain any + // entry for the provided key. + Bucket *neighborhood = selectBucket(map, key); + + // Check whether the neighborhood already contains an entry for the key, in + // which case we optionally update it, returning the old value. + if (updateMapping(map, neighborhood, key, newValue, update, oldValuePtr)) { + return UDS_SUCCESS; + } + + /* + * Find an empty bucket in the desired neighborhood for the new entry or + * re-arrange entries in the map so there is such a bucket. 
This operation + * will usually succeed; the loop body will only be executed on the rare + * occasions that we have to resize the map. + */ + Bucket *bucket; + while ((bucket = findOrMakeVacancy(map, neighborhood)) == NULL) { + /* + * There is no empty bucket in which to put the new entry in the current + * map, so we're forced to allocate a new bucket array with a larger + * capacity, re-hash all the entries into those buckets, and try again (a + * very expensive operation for large maps). + */ + int result = resizeBuckets(map); + if (result != UDS_SUCCESS) { + return result; + } + + // Resizing the map invalidates all pointers to buckets, so recalculate + // the neighborhood pointer. + neighborhood = selectBucket(map, key); + } + + // Put the new entry in the empty bucket, adding it to the neighborhood. + bucket->key = key; + bucket->value = newValue; + insertInHopList(neighborhood, bucket); + map->size += 1; + + // There was no existing entry, so there was no old value to be returned. + if (oldValuePtr != NULL) { + *oldValuePtr = NULL; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +void *pointerMapRemove(PointerMap *map, const void *key) +{ + // Select the bucket to search and search it for an existing entry. + Bucket *bucket = selectBucket(map, key); + Bucket *previous; + Bucket *victim = searchHopList(map, bucket, key, &previous); + + if (victim == NULL) { + // There is no matching entry to remove. + return NULL; + } + + // We found an entry to remove. Save the mapped value to return later and + // empty the bucket. + map->size -= 1; + void *value = victim->value; + victim->value = NULL; + victim->key = 0; + + // The victim bucket is now empty, but it still needs to be spliced out of + // the hop list. + if (previous == NULL) { + // The victim is the head of the list, so swing firstHop. + bucket->firstHop = victim->nextHop; + } else { + previous->nextHop = victim->nextHop; + } + victim->nextHop = NULL_HOP_OFFSET; + + return value; +} diff --git a/vdo/base/pointerMap.h b/vdo/base/pointerMap.h new file mode 100644 index 0000000..1bd0bd2 --- /dev/null +++ b/vdo/base/pointerMap.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pointerMap.h#1 $ + */ + +#ifndef POINTER_MAP_H +#define POINTER_MAP_H + +#include "common.h" + +/** + * PointerMap associates pointer values (void *) with the data + * referenced by pointer keys (void *). NULL pointer + * values are not supported. A NULL key value is supported when + * the instance's key comparator and hasher functions support it. 
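+ *
+ * A minimal usage sketch (illustrative only; compareKeys(), hashKey(), and
+ * example() are hypothetical client code, not part of this interface, and
+ * strcmp() comes from <string.h>):
+ *
+ *   static bool compareKeys(const void *thisKey, const void *thatKey)
+ *   {
+ *     return (strcmp((const char *) thisKey, (const char *) thatKey) == 0);
+ *   }
+ *
+ *   static uint32_t hashKey(const void *key)
+ *   {
+ *     // FNV-1a, purely for illustration
+ *     uint32_t hash = 2166136261u;
+ *     for (const char *s = key; *s != '\0'; s++) {
+ *       hash = (hash ^ (uint8_t) *s) * 16777619u;
+ *     }
+ *     return hash;
+ *   }
+ *
+ *   static int example(void *value)   // value must be non-NULL
+ *   {
+ *     PointerMap *map;
+ *     int result = makePointerMap(0, 0, compareKeys, hashKey, &map);
+ *     if (result != UDS_SUCCESS) {
+ *       return result;
+ *     }
+ *     result = pointerMapPut(map, "alpha", value, true, NULL);
+ *     // On success, pointerMapGet(map, "alpha") now returns value.
+ *     freePointerMap(&map);
+ *     return result;
+ *   }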
+ *
+ * The map is implemented as a hash table, which should provide constant-time
+ * insert, query, and remove operations, although the insert may occasionally
+ * grow the table, which is linear in the number of entries in the map. The
+ * table will grow as needed to hold new entries, but will not shrink as
+ * entries are removed.
+ *
+ * The key and value pointers passed to the map are retained and used by the
+ * map, but are not owned by the map. Freeing the map does not attempt to free
+ * the pointers. The client is entirely responsible for the memory management
+ * of the keys and values. The current interface and implementation assume
+ * that keys will be properties of the values, or that keys will not be memory
+ * managed, or that keys will not need to be freed as a result of being
+ * replaced when a key is re-mapped.
+ **/
+
+typedef struct pointerMap PointerMap;
+
+/**
+ * The prototype of functions that compare the referents of two pointer keys
+ * for equality. If two keys are equal, then both keys must have the same
+ * hash code associated with them by the hasher function defined below.
+ *
+ * @param thisKey  The first element to compare
+ * @param thatKey  The second element to compare
+ *
+ * @return true if and only if the referents of the two
+ *         key pointers are to be treated as the same key by the map
+ **/
+typedef bool PointerKeyComparator(const void *thisKey, const void *thatKey);
+
+/**
+ * The prototype of functions that get or calculate a hash code associated
+ * with the referent of a pointer key. The hash code must be uniformly
+ * distributed over all uint32_t values. The hash code associated with a given
+ * key must not change while the key is in the map. If the comparator function
+ * says two keys are equal, then this function must return the same hash code
+ * for both keys. This function may be called many times for a key while an
+ * entry is stored for it in the map.
+ *
+ * @param key  The pointer key to hash
+ *
+ * @return the hash code for the key
+ **/
+typedef uint32_t PointerKeyHasher(const void *key);
+
+/**
+ * Allocate and initialize a PointerMap.
+ *
+ * @param [in]  initialCapacity  The number of entries the map should
+ *                               initially be capable of holding (zero tells
+ *                               the map to use its own small default)
+ * @param [in]  initialLoad      The load factor of the map, expressed as an
+ *                               integer percentage (typically in the range
+ *                               50 to 90, with zero telling the map to use
+ *                               its own default)
+ * @param [in]  comparator       The function to use to compare the referents
+ *                               of two pointer keys for equality
+ * @param [in]  hasher           The function to use to obtain the hash code
+ *                               associated with each pointer key
+ * @param [out] mapPtr           A pointer to hold the new PointerMap
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int makePointerMap(size_t initialCapacity,
+                   unsigned int initialLoad,
+                   PointerKeyComparator comparator,
+                   PointerKeyHasher hasher,
+                   PointerMap **mapPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Free a PointerMap and null out the reference to it. NOTE: The map does not
+ * own the pointer keys and values stored in the map and they are not freed by
+ * this call.
+ *
+ * @param [in,out] mapPtr The reference to the PointerMap to free
+ **/
+void freePointerMap(PointerMap **mapPtr);
+
+/**
+ * Get the number of entries stored in a PointerMap.
+ * + * @param map The PointerMap to query + * + * @return the number of entries in the map + **/ +size_t pointerMapSize(const PointerMap *map); + +/** + * Retrieve the value associated with a given key from the PointerMap. + * + * @param map The PointerMap to query + * @param key The key to look up (may be NULL if the + * comparator and hasher functions support it) + * + * @return the value associated with the given key, or NULL + * if the key is not mapped to any value + **/ +void *pointerMapGet(PointerMap *map, const void *key); + +/** + * Try to associate a value (a pointer) with an integer in a PointerMap. + * If the map already contains a mapping for the provided key, the old value is + * only replaced with the specified value if update is true. In either case + * the old value is returned. If the map does not already contain a value for + * the specified key, the new value is added regardless of the value of update. + * + * If the value stored in the map is updated, then the key stored in the map + * will also be updated with the key provided by this call. The old key will + * not be returned due to the memory managment assumptions described in the + * interface header comment. + * + * @param [in] map The PointerMap to attempt to modify + * @param [in] key The key with which to associate the new value + * (may be NULL if the comparator and + * hasher functions support it) + * @param [in] newValue The value to be associated with the key + * @param [in] update Whether to overwrite an existing value + * @param [out] oldValuePtr A pointer in which to store either the old value + * (if the key was already mapped) or + * NULL if the map did not contain the + * key; NULL may be provided if the + * caller does not need to know the old value + * + * @return UDS_SUCCESS or an error code + **/ +int pointerMapPut(PointerMap *map, + const void *key, + void *newValue, + bool update, + void **oldValuePtr) + __attribute__((warn_unused_result)); + +/** + * Remove the mapping for a given key from the PointerMap. + * + * @param map The PointerMap from which to remove the mapping + * @param key The key whose mapping is to be removed (may be NULL + * if the comparator and hasher functions support it) + * + * @return the value that was associated with the key, or + * NULL if it was not mapped + **/ +void *pointerMapRemove(PointerMap *map, const void *key); + +#endif /* POINTER_MAP_H */ diff --git a/vdo/base/priorityTable.c b/vdo/base/priorityTable.c new file mode 100644 index 0000000..deb423b --- /dev/null +++ b/vdo/base/priorityTable.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/priorityTable.c#1 $ + */ + +#include "priorityTable.h" + +#include "errors.h" +#include "memoryAlloc.h" +#include "numUtils.h" + +#include "statusCodes.h" + +/** We use a single 64-bit search vector, so the maximum priority is 63 */ +enum { MAX_PRIORITY = 63 }; + +/** + * All the entries with the same priority are queued in a circular list in a + * bucket for that priority. The table is essentially an array of buckets. + **/ +typedef struct bucket { + /** The head of a queue of table entries, all having the same priority */ + RingNode queue; + /** The priority of all the entries in this bucket */ + unsigned int priority; +} Bucket; + +/** + * A priority table is an array of buckets, indexed by priority. New entries + * are added to the end of the queue in the appropriate bucket. The dequeue + * operation finds the highest-priority non-empty bucket by searching a bit + * vector represented as a single 8-byte word, which is very fast with + * compiler and CPU support. + **/ +struct priorityTable { + /** The maximum priority of entries that may be stored in this table */ + unsigned int maxPriority; + /** A bit vector flagging all buckets that are currently non-empty */ + uint64_t searchVector; + /** The array of all buckets, indexed by priority */ + Bucket buckets[]; +}; + +/** + * Convert a queue head to to the bucket that contains it. + * + * @param head The bucket queue ring head pointer to convert + * + * @return the enclosing bucket + **/ +static inline Bucket *asBucket(RingNode *head) +{ + STATIC_ASSERT(offsetof(Bucket, queue) == 0); + return (Bucket *) head; +} + +/**********************************************************************/ +int makePriorityTable(unsigned int maxPriority, PriorityTable **tablePtr) +{ + if (maxPriority > MAX_PRIORITY) { + return UDS_INVALID_ARGUMENT; + } + + PriorityTable *table; + int result = ALLOCATE_EXTENDED(PriorityTable, maxPriority + 1, Bucket, + __func__, &table); + if (result != VDO_SUCCESS) { + return result; + } + + for (unsigned int priority = 0; priority <= maxPriority; priority++) { + Bucket *bucket = &table->buckets[priority]; + bucket->priority = priority; + initializeRing(&bucket->queue); + } + + table->maxPriority = maxPriority; + table->searchVector = 0; + + *tablePtr = table; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freePriorityTable(PriorityTable **tablePtr) +{ + PriorityTable *table = *tablePtr; + if (table == NULL) { + return; + } + + // Unlink the buckets from any entries still in the table so the entries + // won't be left with dangling pointers to freed memory. + resetPriorityTable(table); + + FREE(table); + *tablePtr = NULL; +} + +/**********************************************************************/ +void resetPriorityTable(PriorityTable *table) +{ + table->searchVector = 0; + for (unsigned int priority = 0; priority <= table->maxPriority; priority++) { + unspliceRingNode(&table->buckets[priority].queue); + } +} + +/**********************************************************************/ +void priorityTableEnqueue(PriorityTable *table, + unsigned int priority, + RingNode *entry) +{ + ASSERT_LOG_ONLY((priority <= table->maxPriority), + "entry priority must be valid for the table"); + + // Append the entry to the queue in the specified bucket. + pushRingNode(&table->buckets[priority].queue, entry); + + // Flag the bucket in the search vector since it must be non-empty. 
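+  // (For example, an entry queued at priority 3 sets bit 3 of the vector.)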
+ table->searchVector |= (1ULL << priority); +} + +/**********************************************************************/ +static inline void markBucketEmpty(PriorityTable *table, Bucket *bucket) +{ + table->searchVector &= ~(1ULL << bucket->priority); +} + +/**********************************************************************/ +RingNode *priorityTableDequeue(PriorityTable *table) +{ + // Find the highest priority non-empty bucket by finding the highest-order + // non-zero bit in the search vector. + int topPriority = logBaseTwo(table->searchVector); + + if (topPriority < 0) { + // All buckets are empty. + return NULL; + } + + // Dequeue the first entry in the bucket. + Bucket *bucket = &table->buckets[topPriority]; + RingNode *entry = unspliceRingNode(bucket->queue.next); + + // Clear the bit in the search vector if the bucket has been emptied. + if (isRingEmpty(&bucket->queue)) { + markBucketEmpty(table, bucket); + } + + return entry; +} + +/**********************************************************************/ +void priorityTableRemove(PriorityTable *table, RingNode *entry) +{ + // We can't guard against calls where the entry is on a ring for a different + // table, but it's easy to deal with an entry not in any table or ring. + if (isRingEmpty(entry)) { + return; + } + + // Remove the entry from the bucket ring, remembering a pointer to another + // entry in the ring. + RingNode *nextNode = entry->next; + unspliceRingNode(entry); + + // If the rest of the ring is now empty, the next node must be the ring head + // in the bucket and we can use it to update the search vector. + if (isRingEmpty(nextNode)) { + markBucketEmpty(table, asBucket(nextNode)); + } +} + +/**********************************************************************/ +bool isPriorityTableEmpty(PriorityTable *table) +{ + return (table->searchVector == 0); +} diff --git a/vdo/base/priorityTable.h b/vdo/base/priorityTable.h new file mode 100644 index 0000000..d48a570 --- /dev/null +++ b/vdo/base/priorityTable.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/priorityTable.h#2 $ + */ + +#ifndef PRIORITY_TABLE_H +#define PRIORITY_TABLE_H + +#include "ringNode.h" + +/** + * A PriorityTable is a simple implementation of a priority queue for entries + * with priorities that are small non-negative integer values. It implements + * the obvious priority queue operations of enqueuing an entry and dequeuing + * an entry with the maximum priority. It also supports removing an arbitrary + * entry. The priority of an entry already in the table can be changed by + * removing it and re-enqueuing it with a different priority. All operations + * have O(1) complexity. 
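+ *
+ * A minimal usage sketch (illustrative only; the Item type and example() are
+ * hypothetical client code), relying on the embedded RingNode link described
+ * below:
+ *
+ *   typedef struct {
+ *     RingNode node;       // link used while the item is queued in a table
+ *     int      payload;
+ *   } Item;
+ *
+ *   static void example(void)
+ *   {
+ *     PriorityTable *table;
+ *     if (makePriorityTable(7, &table) != VDO_SUCCESS) {
+ *       return;
+ *     }
+ *     Item item = { .payload = 1 };
+ *     initializeRing(&item.node);
+ *     priorityTableEnqueue(table, 5, &item.node);
+ *     RingNode *dequeued = priorityTableDequeue(table);
+ *     ASSERT_LOG_ONLY((dequeued == &item.node), "dequeued the only entry");
+ *     freePriorityTable(&table);
+ *   }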
+ * + * The links for the table entries must be embedded in the entries themselves. + * RingNode is used to link entries in the table and no wrapper type is + * declared, so an existing RingNode link in an object can also be used to + * queue it in a PriorityTable, assuming the field is not used for anything + * else while so queued. + * + * The table is implemented as an array of queues (circular lists) indexed by + * priority, along with a hint for which queues are non-empty. Steven Skiena + * calls a very similar structure a "bounded height priority queue", but given + * the resemblance to a hash table, "priority table" seems both shorter and + * more apt, if somewhat novel. + **/ + +typedef struct priorityTable PriorityTable; + +/** + * Allocate and initialize a new PriorityTable. + * + * @param [in] maxPriority The maximum priority value for table entries + * @param [out] tablePtr A pointer to hold the new table + * + * @return VDO_SUCCESS or an error code + **/ +int makePriorityTable(unsigned int maxPriority, PriorityTable **tablePtr) + __attribute__((warn_unused_result)); + +/** + * Free a PriorityTable and null out the reference to it. NOTE: The table does + * not own the entries stored in it and they are not freed by this call. + * + * @param [in,out] tablePtr The reference to the table to free + **/ +void freePriorityTable(PriorityTable **tablePtr); + +/** + * Add a new entry to the priority table, appending it to the queue for + * entries with the specified priority. + * + * @param table The table in which to store the entry + * @param priority The priority of the entry + * @param entry The RingNode embedded in the entry to store in the table + * (the caller must have initialized it) + **/ +void priorityTableEnqueue(PriorityTable *table, + unsigned int priority, + RingNode *entry); + +/** + * Reset a priority table, leaving it in the same empty state as when newly + * constructed. NOTE: The table does not own the entries stored in it and they + * are not freed (or even unlinked from each other) by this call. + * + * @param table The table to reset + **/ +void resetPriorityTable(PriorityTable *table); + +/** + * Find the highest-priority entry in the table, remove it from the table, and + * return it. If there are multiple entries with the same priority, the one + * that has been in the table with that priority the longest will be returned. + * + * @param table The priority table from which to remove an entry + * + * @return the dequeued entry, or NULL if the table is currently empty + **/ +RingNode *priorityTableDequeue(PriorityTable *table) + __attribute__((warn_unused_result)); + +/** + * Remove a specified entry from its priority table. + * + * @param table The table from which to remove the entry + * @param entry The entry to remove from the table + **/ +void priorityTableRemove(PriorityTable *table, RingNode *entry); + +/** + * Return whether the priority table is empty. + * + * @param table The table to check + * + * @return true if the table is empty + **/ +bool isPriorityTableEmpty(PriorityTable *table) + __attribute__((warn_unused_result)); + +#endif /* PRIORITY_TABLE_H */ diff --git a/vdo/base/readOnlyNotifier.c b/vdo/base/readOnlyNotifier.c new file mode 100644 index 0000000..ba837ac --- /dev/null +++ b/vdo/base/readOnlyNotifier.c @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/readOnlyNotifier.c#5 $ + */ + +#include "readOnlyNotifier.h" + +#include "atomic.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "completion.h" +#include "physicalLayer.h" +#include "threadConfig.h" + +/** + * A ReadOnlyNotifier has a single completion which is used to perform + * read-only notifications, however, enterReadOnlyMode() may be called from any + * base thread. A pair of atomic fields are used to control the read-only mode + * entry process. The first field holds the read-only error. The second is the + * state field, which may hold any of the four special values enumerated here. + * + * When enterReadOnlyMode() is called from some base thread, a compare-and-swap + * is done on the readOnlyError, setting it to the supplied error if the value + * was VDO_SUCCESS. If this fails, some other thread has already intiated + * read-only entry or scheduled a pending entry, so the call exits. Otherwise, + * a compare-and-swap is done on the state, setting it to NOTIFYING if the + * value was MAY_NOTIFY. If this succeeds, the caller initiates the + * notification. If this failed due to notifications being disallowed, the + * notifier will be in the MAY_NOT_NOTIFY state but readOnlyError will not be + * VDO_SUCCESS. This configuration will indicate to allowReadOnlyModeEntry() + * that there is a pending notification to perform. + **/ +enum { + /** Notifications are allowed but not in progress */ + MAY_NOTIFY = 0, + /** A notification is in progress */ + NOTIFYING, + /** Notifications are not allowed */ + MAY_NOT_NOTIFY, + /** A notification has completed */ + NOTIFIED, +}; + +/** + * An object to be notified when the VDO enters read-only mode + **/ +typedef struct readOnlyListener ReadOnlyListener; + +struct readOnlyListener { + /** The listener */ + void *listener; + /** The method to call to notifiy the listener */ + ReadOnlyNotification *notify; + /** A pointer to the next listener */ + ReadOnlyListener *next; +}; + +/** + * Data associated with each base code thread. + **/ +typedef struct threadData { + /** + * Each thread maintains its own notion of whether the VDO is read-only so + * that the read-only state can be checked from any base thread without + * worrying about synchronization or thread safety. This does mean that + * knowledge of the VDO going read-only does not occur simultaneously across + * the VDO's threads, but that does not seem to cause any problems. + */ + bool isReadOnly; + /** + * A list of objects waiting to be notified on this thread that the VDO has + * entered read-only mode. 
+ **/ + ReadOnlyListener *listeners; +} ThreadData; + +struct readOnlyNotifier { + /** The completion for entering read-only mode */ + VDOCompletion completion; + /** A completion waiting for notifications to be drained or enabled */ + VDOCompletion *waiter; + /** The code of the error which put the VDO into read-only mode */ + Atomic32 readOnlyError; + /** The current state of the notifier (values described above) */ + Atomic32 state; + /** The thread config of the VDO */ + const ThreadConfig *threadConfig; + /** The array of per-thread data */ + ThreadData threadData[]; +}; + +/** + * Convert a generic VDOCompletion to a ReadOnlyNotifier. + * + * @param completion The completion to convert + * + * @return The completion as a ReadOnlyNotifier + **/ +static inline ReadOnlyNotifier *asNotifier(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(ReadOnlyNotifier, completion) == 0); + assertCompletionType(completion->type, READ_ONLY_MODE_COMPLETION); + return (ReadOnlyNotifier *) completion; +} + +/**********************************************************************/ +int makeReadOnlyNotifier(bool isReadOnly, + const ThreadConfig *threadConfig, + PhysicalLayer *layer, + ReadOnlyNotifier **notifierPtr) +{ + ReadOnlyNotifier *notifier; + int result = ALLOCATE_EXTENDED(ReadOnlyNotifier, + threadConfig->baseThreadCount, ThreadData, + __func__, ¬ifier); + if (result != VDO_SUCCESS) { + return result; + } + + notifier->threadConfig = threadConfig; + if (isReadOnly) { + atomicStore32(¬ifier->readOnlyError, (uint32_t) VDO_READ_ONLY); + atomicStore32(¬ifier->state, NOTIFIED); + } else { + atomicStore32(¬ifier->state, MAY_NOTIFY); + } + result = initializeEnqueueableCompletion(¬ifier->completion, + READ_ONLY_MODE_COMPLETION, layer); + if (result != VDO_SUCCESS) { + freeReadOnlyNotifier(¬ifier); + return result; + } + + for (ThreadCount id = 0; id < threadConfig->baseThreadCount; id++) { + notifier->threadData[id].isReadOnly = isReadOnly; + } + + *notifierPtr = notifier; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeReadOnlyNotifier(ReadOnlyNotifier **notifierPtr) +{ + ReadOnlyNotifier *notifier = *notifierPtr; + if (notifier == NULL) { + return; + } + + for (ThreadCount id = 0; id < notifier->threadConfig->baseThreadCount; + id++) { + ThreadData *threadData = ¬ifier->threadData[id]; + ReadOnlyListener *listener = threadData->listeners; + while (listener != NULL) { + ReadOnlyListener *toFree = listener; + listener = listener->next; + FREE(toFree); + } + } + + destroyEnqueueable(¬ifier->completion); + FREE(notifier); + *notifierPtr = NULL; +} + +/** + * Check that a function was called on the admin thread. 
+ * + * @param notifier The notifier + * @param caller The name of the function (for logging) + **/ +static void assertOnAdminThread(ReadOnlyNotifier *notifier, const char *caller) +{ + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((getAdminThread(notifier->threadConfig) == threadID), + "%s called on admin thread", caller); +} + + +/**********************************************************************/ +void waitUntilNotEnteringReadOnlyMode(ReadOnlyNotifier *notifier, + VDOCompletion *parent) +{ + if (notifier == NULL) { + finishCompletion(parent, VDO_SUCCESS); + return; + } + + assertOnAdminThread(notifier, __func__); + if (notifier->waiter != NULL) { + finishCompletion(parent, VDO_COMPONENT_BUSY); + return; + } + + uint32_t state = atomicLoad32(¬ifier->state); + if ((state == MAY_NOT_NOTIFY) || (state == NOTIFIED)) { + // Notifications are already done or disallowed. + completeCompletion(parent); + return; + } + + if (compareAndSwap32(¬ifier->state, MAY_NOTIFY, MAY_NOT_NOTIFY)) { + // A notification was not in progress, and now they are disallowed. + completeCompletion(parent); + return; + } + + /* + * A notification is in progress, so wait for it to finish. There is no race + * here since the notification can't finish while the admin thread is in this + * method. + */ + notifier->waiter = parent; +} + +/** + * Complete the process of entering read only mode. + * + * @param completion The read-only mode completion + **/ +static void finishEnteringReadOnlyMode(VDOCompletion *completion) +{ + ReadOnlyNotifier *notifier = asNotifier(completion); + assertOnAdminThread(notifier, __func__); + atomicStore32(¬ifier->state, NOTIFIED); + + VDOCompletion *waiter = notifier->waiter; + if (waiter != NULL) { + notifier->waiter = NULL; + finishCompletion(waiter, completion->result); + } +} + +/** + * Inform each thread that the VDO is in read-only mode. + * + * @param completion The read-only mode completion + **/ +static void makeThreadReadOnly(VDOCompletion *completion) +{ + ThreadID threadID = completion->callbackThreadID; + ReadOnlyNotifier *notifier = asNotifier(completion); + ReadOnlyListener *listener = completion->parent; + if (listener == NULL) { + // This is the first call on this thread + ThreadData *threadData = ¬ifier->threadData[threadID]; + threadData->isReadOnly = true; + listener = threadData->listeners; + if (threadID == 0) { + // Note: This message must be recognizable by Permabit::UserMachine. 
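+      // Only thread 0, the first thread in the notification chain, logs the
+      // error that sent the VDO into read-only mode.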
+ logErrorWithStringError((int) atomicLoad32(¬ifier->readOnlyError), + "Unrecoverable error, entering read-only mode"); + } + } else { + // We've just finished notifying a listener + listener = listener->next; + } + + if (listener != NULL) { + // We have a listener to notify + prepareCompletion(completion, makeThreadReadOnly, makeThreadReadOnly, + threadID, listener); + listener->notify(listener->listener, completion); + return; + } + + // We're done with this thread + if (++threadID >= notifier->threadConfig->baseThreadCount) { + // There are no more threads + prepareCompletion(completion, finishEnteringReadOnlyMode, + finishEnteringReadOnlyMode, + getAdminThread(notifier->threadConfig), NULL); + } else { + prepareCompletion(completion, makeThreadReadOnly, makeThreadReadOnly, + threadID, NULL); + } + + invokeCallback(completion); +} + +/**********************************************************************/ +void allowReadOnlyModeEntry(ReadOnlyNotifier *notifier, VDOCompletion *parent) +{ + assertOnAdminThread(notifier, __func__); + if (notifier->waiter != NULL) { + finishCompletion(parent, VDO_COMPONENT_BUSY); + return; + } + + if (!compareAndSwap32(¬ifier->state, MAY_NOT_NOTIFY, MAY_NOTIFY)) { + // Notifications were already allowed or complete + completeCompletion(parent); + return; + } + + if ((int) atomicLoad32(¬ifier->readOnlyError) == VDO_SUCCESS) { + // We're done + completeCompletion(parent); + return; + } + + // There may have been a pending notification + if (!compareAndSwap32(¬ifier->state, MAY_NOTIFY, NOTIFYING)) { + /* + * There wasn't, the error check raced with a thread calling + * enterReadOnlyMode() after we set the state to MAY_NOTIFY. It has already + * started the notification. + */ + completeCompletion(parent); + return; + } + + // Do the pending notification. + notifier->waiter = parent; + makeThreadReadOnly(¬ifier->completion); +} + +/**********************************************************************/ +void enterReadOnlyMode(ReadOnlyNotifier *notifier, int errorCode) +{ + ThreadData *threadData = ¬ifier->threadData[getCallbackThreadID()]; + if (threadData->isReadOnly) { + // This thread has already gone read-only. + return; + } + + // Record for this thread that the VDO is read-only. + threadData->isReadOnly = true; + + if (!compareAndSwap32(¬ifier->readOnlyError, (uint32_t) VDO_SUCCESS, + (uint32_t) errorCode)) { + // The notifier is already aware of a read-only error + return; + } + + if (compareAndSwap32(¬ifier->state, MAY_NOTIFY, NOTIFYING)) { + // Initiate a notification starting on the lowest numbered thread. 
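+    // makeThreadReadOnly() re-launches itself on each successive base thread
+    // and then finishes on the admin thread via finishEnteringReadOnlyMode().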
+ launchCallback(¬ifier->completion, makeThreadReadOnly, 0); + } +} + +/**********************************************************************/ +bool isReadOnly(ReadOnlyNotifier *notifier) +{ + return notifier->threadData[getCallbackThreadID()].isReadOnly; +} + +/**********************************************************************/ +bool isOrWillBeReadOnly(ReadOnlyNotifier *notifier) +{ + return (((int) relaxedLoad32(¬ifier->readOnlyError)) != VDO_SUCCESS); +} + +/**********************************************************************/ +int registerReadOnlyListener(ReadOnlyNotifier *notifier, + void *listener, + ReadOnlyNotification *notification, + ThreadID threadID) +{ + ReadOnlyListener *readOnlyListener; + int result = ALLOCATE(1, ReadOnlyListener, __func__, &readOnlyListener); + if (result != VDO_SUCCESS) { + return result; + } + + ThreadData *threadData = ¬ifier->threadData[threadID]; + *readOnlyListener = (ReadOnlyListener) { + .listener = listener, + .notify = notification, + .next = threadData->listeners, + }; + + threadData->listeners = readOnlyListener; + return VDO_SUCCESS; +} diff --git a/vdo/base/readOnlyNotifier.h b/vdo/base/readOnlyNotifier.h new file mode 100644 index 0000000..b5eb322 --- /dev/null +++ b/vdo/base/readOnlyNotifier.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/readOnlyNotifier.h#3 $ + */ + +/* + * A ReadOnlyNotifier is responsible for propogating the fact that the VDO + * has encountered an unrecoverable error to all base threads. It also persists + * the read-only state to the super block. + * + * The notifier also provides the ability to wait for any notifications to be + * complete in order to not cause super block write races when shutting down + * the VDO. + */ + +#ifndef READ_ONLY_NOTIFIER_H +#define READ_ONLY_NOTIFIER_H + +#include "completion.h" + +/** + * A function to notify a listener that the VDO has gone read-only. + * + * @param listener The object to notify + * @param parent The completion to notify in order to acknowledge the + * notification + **/ +typedef void ReadOnlyNotification(void *listener, VDOCompletion *parent); + +/** + * Create a read-only notifer. + * + * @param [in] isReadOnly Whether the VDO is already read-only + * @param [in] threadConfig The thread configuration of the VDO + * @param [in] layer The physical layer of the VDO + * @param [out] notifierPtr A pointer to receive the new notifier + * + * @return VDO_SUCCESS or an error + **/ +int makeReadOnlyNotifier(bool isReadOnly, + const ThreadConfig *threadConfig, + PhysicalLayer *layer, + ReadOnlyNotifier **notifierPtr) + __attribute__((warn_unused_result)); + +/** + * Free a ReadOnlyNotifier and null out the reference to it. 
+ * + * @param notifierPtr The reference to the notifier to free + **/ +void freeReadOnlyNotifier(ReadOnlyNotifier **notifierPtr); + +/** + * Wait until no read-only notifications are in progress and prevent any + * subsequent notifications. Notifications may be re-enabled by calling + * allowReadOnlyModeEntry(). + * + * @param notifier The read-only notifier on which to wait + * @param parent The completion to notify when no threads are entering + * read-only mode + **/ +void waitUntilNotEnteringReadOnlyMode(ReadOnlyNotifier *notifier, + VDOCompletion *parent); + +/** + * Allow the notifier to put the VDO into read-only mode, reversing the effects + * of waitUntilNotEnteringReadOnlyMode(). If some thread tried to put the VDO + * into read-only mode while notifications were disallowed, it will be done + * when this method is called. If that happens, the parent will not be notified + * until the VDO has actually entered read-only mode and attempted to save the + * super block. + * + *

This method may only be called from the admin thread. + * + * @param notifier The notifier + * @param parent The object to notify once the operation is complete + **/ +void allowReadOnlyModeEntry(ReadOnlyNotifier *notifier, + VDOCompletion *parent); + +/** + * Put a VDO into read-only mode and save the read-only state in the super + * block. This method is a no-op if the VDO is already read-only. + * + * @param notifier The read-only notifier of the VDO + * @param errorCode The error which caused the VDO to enter read-only + * mode + **/ +void enterReadOnlyMode(ReadOnlyNotifier *notifier, int errorCode); + +/** + * Check whether the VDO is read-only. This method may be called from any + * thread, as opposed to examining the VDO's state field which is only safe + * to check from the admin thread. + * + * @param notifier The read-only notifier of the VDO + * + * @return true if the VDO is read-only + **/ +bool isReadOnly(ReadOnlyNotifier *notifier) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO is or will be read-only (i.e. some thread has started + * the process of entering read-only mode, but not all threads have been + * notified yet). This method should only be called in cases where the expense + * of reading atomic state is not a problem. It was introduced in order to allow + * suppresion of spurious error messages resulting from VIO cleanup racing with + * read-only notification. + * + * @param notifier The read-only notifier of the VDO + * + * @return true if the VDO has started (and possibly finished) + * the process of entering read-only mode + **/ +bool isOrWillBeReadOnly(ReadOnlyNotifier *notifier) + __attribute__((warn_unused_result)); + +/** + * Register a listener to be notified when the VDO goes read-only. + * + * @param notifier The notifier to register with + * @param listener The object to notify + * @param notification The function to call to send the notification + * @param threadID The id of the thread on which to send the notification + * + * @return VDO_SUCCESS or an error + **/ +int registerReadOnlyListener(ReadOnlyNotifier *notifier, + void *listener, + ReadOnlyNotification *notification, + ThreadID threadID); + +#endif /* READ_ONLY_NOTIFIER_H */ diff --git a/vdo/base/readOnlyRebuild.c b/vdo/base/readOnlyRebuild.c new file mode 100644 index 0000000..7e9df0c --- /dev/null +++ b/vdo/base/readOnlyRebuild.c @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/readOnlyRebuild.c#9 $ + */ + +#include "readOnlyRebuild.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMapInternals.h" +#include "blockMapRecovery.h" +#include "completion.h" +#include "numUtils.h" +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournalInternals.h" +#include "recoveryUtils.h" +#include "referenceCountRebuild.h" +#include "slabDepot.h" +#include "vdoInternal.h" +#include "vdoPageCache.h" + +typedef struct { + /** The completion header */ + VDOCompletion completion; + /** A sub task completion */ + VDOCompletion subTaskCompletion; + /** The VDO in question */ + VDO *vdo; + /** A buffer to hold the data read off disk */ + char *journalData; + /** The entry data for the block map rebuild */ + NumberedBlockMapping *entries; + /** The number of entries in the entry array */ + size_t entryCount; + /** The sequence number of the first valid block of the journal (if known) */ + SequenceNumber head; + /** The sequence number of the last valid block of the journal (if known) */ + SequenceNumber tail; + /** The number of logical blocks in use */ + BlockCount logicalBlocksUsed; + /** The number of allocated block map pages */ + BlockCount blockMapDataBlocks; +} ReadOnlyRebuildCompletion; + +/** + * Convert a generic completion to a ReadOnlyRebuildCompletion. + * + * @param completion The completion to convert + * + * @return the journal rebuild completion + **/ +__attribute__((warn_unused_result)) +static inline ReadOnlyRebuildCompletion * +asReadOnlyRebuildCompletion(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(ReadOnlyRebuildCompletion, completion) == 0); + assertCompletionType(completion->type, READ_ONLY_REBUILD_COMPLETION); + return (ReadOnlyRebuildCompletion *) completion; +} + +/** + * Free a rebuild completion and all underlying structures. + * + * @param rebuildPtr A pointer to the rebuild completion to free + */ +static void freeRebuildCompletion(ReadOnlyRebuildCompletion **rebuildPtr) +{ + ReadOnlyRebuildCompletion *rebuild = *rebuildPtr; + if (rebuild == NULL) { + return; + } + + destroyEnqueueable(&rebuild->subTaskCompletion); + FREE(rebuild->journalData); + FREE(rebuild->entries); + FREE(rebuild); + *rebuildPtr = NULL; +} + +/** + * Allocate and initialize a read only rebuild completion. + * + * @param [in] vdo The VDO in question + * @param [out] rebuildPtr A pointer to return the created rebuild completion + * + * @return VDO_SUCCESS or an error code + **/ +static int makeRebuildCompletion(VDO *vdo, + ReadOnlyRebuildCompletion **rebuildPtr) +{ + ReadOnlyRebuildCompletion *rebuild; + int result = ALLOCATE(1, ReadOnlyRebuildCompletion, __func__, &rebuild); + if (result != VDO_SUCCESS) { + return result; + } + + initializeCompletion(&rebuild->completion, READ_ONLY_REBUILD_COMPLETION, + vdo->layer); + + result = initializeEnqueueableCompletion(&rebuild->subTaskCompletion, + SUB_TASK_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + freeRebuildCompletion(&rebuild); + return result; + } + + rebuild->vdo = vdo; + *rebuildPtr = rebuild; + return VDO_SUCCESS; +} + +/** + * Clean up the rebuild process, whether or not it succeeded, by freeing the + * rebuild completion and notifying the parent of the outcome. 
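+ * (This also turns off rebuild mode in the block map's page cache before
+ * notifying the parent.)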
+ * + * @param completion The rebuild completion + **/ +static void completeRebuild(VDOCompletion *completion) +{ + VDOCompletion *parent = completion->parent; + int result = completion->result; + ReadOnlyRebuildCompletion *rebuild = asReadOnlyRebuildCompletion(completion); + VDO *vdo = rebuild->vdo; + setVDOPageCacheRebuildMode(getBlockMap(vdo)->zones[0].pageCache, false); + freeRebuildCompletion(&rebuild); + finishCompletion(parent, result); +} + +/** + * Finish rebuilding, free the rebuild completion and notify the parent. + * + * @param completion The rebuild completion + **/ +static void finishRebuild(VDOCompletion *completion) +{ + ReadOnlyRebuildCompletion *rebuild = asReadOnlyRebuildCompletion(completion); + initializeRecoveryJournalPostRebuild(rebuild->vdo->recoveryJournal, + rebuild->vdo->completeRecoveries, + rebuild->tail, + rebuild->logicalBlocksUsed, + rebuild->blockMapDataBlocks); + logInfo("Read-only rebuild complete"); + completeRebuild(completion); +} + +/** + * Handle a rebuild error. + * + * @param completion The rebuild completion + **/ +static void abortRebuild(VDOCompletion *completion) +{ + logInfo("Read-only rebuild aborted"); + completeRebuild(completion); +} + +/** + * Abort a rebuild if there is an error. + * + * @param result The result to check + * @param rebuild The journal rebuild completion + * + * @return true if the result was an error + **/ +__attribute__((warn_unused_result)) +static bool abortRebuildOnError(int result, + ReadOnlyRebuildCompletion *rebuild) +{ + if (result == VDO_SUCCESS) { + return false; + } + + finishCompletion(&rebuild->completion, result); + return true; +} + +/** + * Clean up after finishing the reference count rebuild. This callback is + * registered in launchReferenceCountRebuild(). + * + * @param completion The sub-task completion + **/ +static void finishReferenceCountRebuild(VDOCompletion *completion) +{ + ReadOnlyRebuildCompletion *rebuild = completion->parent; + VDO *vdo = rebuild->vdo; + assertOnAdminThread(vdo, __func__); + if (vdo->loadState != VDO_REBUILD_FOR_UPGRADE) { + // A "rebuild" for upgrade should not increment this count. + vdo->completeRecoveries++; + } + + logInfo("Saving rebuilt state"); + prepareToFinishParent(completion, &rebuild->completion); + drainSlabDepot(vdo->depot, ADMIN_STATE_REBUILDING, completion); +} + +/** + * Rebuild the reference counts from the block map now that all journal entries + * have been applied to the block map. This callback is registered in + * applyJournalEntries(). + * + * @param completion The sub-task completion + **/ +static void launchReferenceCountRebuild(VDOCompletion *completion) +{ + ReadOnlyRebuildCompletion *rebuild = completion->parent; + VDO *vdo = rebuild->vdo; + + // We must allocate RefCounts before we can rebuild them. + int result = allocateSlabRefCounts(vdo->depot); + if (abortRebuildOnError(result, rebuild)) { + return; + } + + prepareCompletion(completion, finishReferenceCountRebuild, + finishParentCallback, getAdminThread(getThreadConfig(vdo)), + completion->parent); + rebuildReferenceCounts(vdo, completion, &rebuild->logicalBlocksUsed, + &rebuild->blockMapDataBlocks); +} + +/** + * Append an array of recovery journal entries from a journal block sector to + * the array of numbered mappings in the rebuild completion, numbering each + * entry in the order they are appended. 
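+ *
+ * Only increment entries are transcribed; decrement entries need not be
+ * replayed here, since the reference counts are later rebuilt directly from
+ * the block map (see launchReferenceCountRebuild()), so only the final
+ * mapping for each slot matters.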
+ *
+ * @param rebuild     The journal rebuild completion
+ * @param sector      The recovery journal sector with entries
+ * @param entryCount  The number of entries to append
+ **/
+static void appendSectorEntries(ReadOnlyRebuildCompletion *rebuild,
+                                PackedJournalSector       *sector,
+                                JournalEntryCount          entryCount)
+{
+  for (JournalEntryCount i = 0; i < entryCount; i++) {
+    RecoveryJournalEntry entry
+      = unpackRecoveryJournalEntry(&sector->entries[i]);
+    int result = validateRecoveryJournalEntry(rebuild->vdo, &entry);
+    if (result != VDO_SUCCESS) {
+      // When recovering from read-only mode, ignore damaged entries.
+      continue;
+    }
+
+    if (isIncrementOperation(entry.operation)) {
+      rebuild->entries[rebuild->entryCount] = (NumberedBlockMapping) {
+        .blockMapSlot  = entry.slot,
+        .blockMapEntry = packPBN(entry.mapping.pbn, entry.mapping.state),
+        .number        = rebuild->entryCount,
+      };
+      rebuild->entryCount++;
+    }
+  }
+}
+
+/**
+ * Create an array of all valid journal entries, in order, and store
+ * it in the rebuild completion.
+ *
+ * @param rebuild  The journal rebuild completion
+ *
+ * @return VDO_SUCCESS or an error code
+ **/
+static int extractJournalEntries(ReadOnlyRebuildCompletion *rebuild)
+{
+  VDO             *vdo      = rebuild->vdo;
+  RecoveryJournal *journal  = vdo->recoveryJournal;
+  SequenceNumber   first    = rebuild->head;
+  SequenceNumber   last     = rebuild->tail;
+  BlockCount       maxCount = ((last - first + 1) * journal->entriesPerBlock);
+
+  // Allocate a NumberedBlockMapping array large enough to transcribe every
+  // PackedRecoveryJournalEntry from every valid journal block.
+  int result = ALLOCATE(maxCount, NumberedBlockMapping, __func__,
+                        &rebuild->entries);
+  if (result != VDO_SUCCESS) {
+    return result;
+  }
+
+  for (SequenceNumber i = first; i <= last; i++) {
+    PackedJournalHeader *packedHeader
+      = getJournalBlockHeader(journal, rebuild->journalData, i);
+    RecoveryBlockHeader header;
+    unpackRecoveryBlockHeader(packedHeader, &header);
+
+    if (!isExactRecoveryJournalBlock(journal, &header, i)) {
+      // This block is invalid, so skip it.
+      continue;
+    }
+
+    // Don't extract more than the expected maximum entries per block.
+    JournalEntryCount blockEntries = minBlock(journal->entriesPerBlock,
+                                              header.entryCount);
+    for (uint8_t j = 1; j < SECTORS_PER_BLOCK; j++) {
+      // Stop when all entries counted in the header are applied or skipped.
+      if (blockEntries == 0) {
+        break;
+      }
+
+      PackedJournalSector *sector = getJournalBlockSector(packedHeader, j);
+      if (!isValidRecoveryJournalSector(&header, sector)) {
+        blockEntries -= minBlock(blockEntries,
+                                 RECOVERY_JOURNAL_ENTRIES_PER_SECTOR);
+        continue;
+      }
+
+      // Don't extract more than the expected maximum entries per sector.
+      JournalEntryCount sectorEntries
+        = minBlock(sector->entryCount, RECOVERY_JOURNAL_ENTRIES_PER_SECTOR);
+      // Only extract as many as the block header calls for.
+      sectorEntries = minBlock(sectorEntries, blockEntries);
+      appendSectorEntries(rebuild, sector, sectorEntries);
+      // Even if the sector wasn't full, count it as full when counting up
+      // to the entry count the block header claims.
+      blockEntries -= minBlock(blockEntries,
+                               RECOVERY_JOURNAL_ENTRIES_PER_SECTOR);
+    }
+  }
+
+  return VDO_SUCCESS;
+}
+
+/**
+ * Determine the limits of the valid recovery journal and apply all
+ * valid entries to the block map. This callback is registered in
+ * loadJournal().
+ * + * @param completion The sub-task completion + **/ +static void applyJournalEntries(VDOCompletion *completion) +{ + ReadOnlyRebuildCompletion *rebuild + = asReadOnlyRebuildCompletion(completion->parent); + VDO *vdo = rebuild->vdo; + + logInfo("Finished reading recovery journal"); + assertOnLogicalZoneThread(vdo, 0, __func__); + + bool foundEntries = findHeadAndTail(vdo->recoveryJournal, + rebuild->journalData, &rebuild->tail, + &rebuild->head, NULL); + if (foundEntries) { + int result = extractJournalEntries(rebuild); + if (abortRebuildOnError(result, rebuild)) { + return; + } + } + + // Suppress block map errors. + setVDOPageCacheRebuildMode(getBlockMap(vdo)->zones[0].pageCache, true); + + // Play the recovery journal into the block map. + prepareCompletion(completion, launchReferenceCountRebuild, + finishParentCallback, completion->callbackThreadID, + completion->parent); + recoverBlockMap(vdo, rebuild->entryCount, rebuild->entries, completion); +} + +/** + * Begin loading the journal. + * + * @param completion The sub task completion + **/ +static void loadJournal(VDOCompletion *completion) +{ + ReadOnlyRebuildCompletion *rebuild + = asReadOnlyRebuildCompletion(completion->parent); + VDO *vdo = rebuild->vdo; + assertOnLogicalZoneThread(vdo, 0, __func__); + + prepareCompletion(completion, applyJournalEntries, finishParentCallback, + completion->callbackThreadID, completion->parent); + loadJournalAsync(vdo->recoveryJournal, completion, &rebuild->journalData); +} + +/**********************************************************************/ +void launchRebuild(VDO *vdo, VDOCompletion *parent) +{ + // Note: These messages must be recognizable by Permabit::VDODeviceBase. + if (vdo->loadState == VDO_REBUILD_FOR_UPGRADE) { + logWarning("Rebuilding reference counts for upgrade"); + } else { + logWarning("Rebuilding reference counts to clear read-only mode"); + vdo->readOnlyRecoveries++; + } + + ReadOnlyRebuildCompletion *rebuild; + int result = makeRebuildCompletion(vdo, &rebuild); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + VDOCompletion *completion = &rebuild->completion; + prepareCompletion(completion, finishRebuild, abortRebuild, + parent->callbackThreadID, parent); + + VDOCompletion *subTaskCompletion = &rebuild->subTaskCompletion; + prepareCompletion(subTaskCompletion, loadJournal, finishParentCallback, + getLogicalZoneThread(getThreadConfig(vdo), 0), + completion); + loadSlabDepot(vdo->depot, ADMIN_STATE_LOADING_FOR_REBUILD, + subTaskCompletion, NULL); +} diff --git a/vdo/base/readOnlyRebuild.h b/vdo/base/readOnlyRebuild.h new file mode 100644 index 0000000..9f40ce6 --- /dev/null +++ b/vdo/base/readOnlyRebuild.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/readOnlyRebuild.h#1 $ + */ + +#ifndef READ_ONLY_REBUILD_H +#define READ_ONLY_REBUILD_H + +#include "completion.h" +#include "vdo.h" + +/** + * Construct a ReadOnlyRebuildCompletion and launch it. Apply all valid journal + * block entries to all VDO structures. Must be launched from logical zone 0. + * + * @param vdo The VDO to rebuild + * @param parent The completion to notify when the rebuild is complete + **/ +void launchRebuild(VDO *vdo, VDOCompletion *parent); + +#endif // READ_ONLY_REBUILD_H diff --git a/vdo/base/recoveryJournal.c b/vdo/base/recoveryJournal.c new file mode 100644 index 0000000..c44053c --- /dev/null +++ b/vdo/base/recoveryJournal.c @@ -0,0 +1,1403 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournal.c#30 $ + */ + +#include "recoveryJournal.h" +#include "recoveryJournalInternals.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMap.h" +#include "constants.h" +#include "dataVIO.h" +#include "extent.h" +#include "header.h" +#include "numUtils.h" +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournalBlock.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "waitQueue.h" + +typedef struct { + SequenceNumber journalStart; // Sequence number to start the journal + BlockCount logicalBlocksUsed; // Number of logical blocks used by VDO + BlockCount blockMapDataBlocks; // Number of block map pages allocated +} __attribute__((packed)) RecoveryJournalState7_0; + +static const Header RECOVERY_JOURNAL_HEADER_7_0 = { + .id = RECOVERY_JOURNAL, + .version = { + .majorVersion = 7, + .minorVersion = 0, + }, + .size = sizeof(RecoveryJournalState7_0), +}; + +static const uint64_t RECOVERY_COUNT_MASK = 0xff; + +enum { + /* + * The number of reserved blocks must be large enough to prevent a + * new recovery journal block write from overwriting a block which + * appears to still be a valid head block of the journal. Currently, + * that means reserving enough space for all 2048 VIOs, or 8 blocks. + */ + RECOVERY_JOURNAL_RESERVED_BLOCKS = 8, +}; + +/**********************************************************************/ +const char *getJournalOperationName(JournalOperation operation) +{ + switch (operation) { + case DATA_DECREMENT: + return "data decrement"; + + case DATA_INCREMENT: + return "data increment"; + + case BLOCK_MAP_DECREMENT: + return "block map decrement"; + + case BLOCK_MAP_INCREMENT: + return "block map increment"; + + default: + return "unknown journal operation"; + } +} + +/** + * Get a block from the end of the free list. 
+ * + * @param journal The journal + * + * @return The block or NULL if the list is empty + **/ +static RecoveryJournalBlock *popFreeList(RecoveryJournal *journal) +{ + return blockFromRingNode(popRingNode(&journal->freeTailBlocks)); +} + +/** + * Get a block from the end of the active list. + * + * @param journal The journal + * + * @return The block or NULL if the list is empty + **/ +static RecoveryJournalBlock *popActiveList(RecoveryJournal *journal) +{ + return blockFromRingNode(popRingNode(&journal->activeTailBlocks)); +} + +/** + * Assert that we are running on the journal thread. + * + * @param journal The journal + * @param functionName The function doing the check (for logging) + **/ +static void assertOnJournalThread(RecoveryJournal *journal, + const char *functionName) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == journal->threadID), + "%s() called on journal thread", functionName); +} + +/** + * WaiterCallback implementation invoked whenever a DataVIO is to be released + * from the journal, either because its entry was committed to disk, + * or because there was an error. + **/ +static void continueWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + dataVIOAddTraceRecord(dataVIO, + THIS_LOCATION("$F($j-$js);" + "cb=continueJournalWaiter($j-$js)")); + int waitResult = *((int *) context); + continueDataVIO(dataVIO, waitResult); +} + +/** + * Check whether the journal has any waiters on any blocks. + * + * @param journal The journal in question + * + * @return true if any block has a waiter + **/ +static inline bool hasBlockWaiters(RecoveryJournal *journal) +{ + // Either the first active tail block (if it exists) has waiters, + // or no active tail block has waiters. + if (isRingEmpty(&journal->activeTailBlocks)) { + return false; + } + + RecoveryJournalBlock *block + = blockFromRingNode(journal->activeTailBlocks.next); + return (hasWaiters(&block->entryWaiters) + || hasWaiters(&block->commitWaiters)); +} + +/**********************************************************************/ +static void recycleJournalBlocks(RecoveryJournal *block); +static void recycleJournalBlock(RecoveryJournalBlock *block); +static void notifyCommitWaiters(RecoveryJournal *journal); + +/** + * Check whether the journal has drained. + * + * @param journal The journal which may have just drained + **/ +static void checkForDrainComplete(RecoveryJournal *journal) +{ + int result = VDO_SUCCESS; + if (isReadOnly(journal->readOnlyNotifier)) { + result = VDO_READ_ONLY; + /* + * Clean up any full active blocks which were not written due to being + * in read-only mode. + * + * XXX: This would probably be better as a short-circuit in writeBlock(). + */ + notifyCommitWaiters(journal); + recycleJournalBlocks(journal); + + // Release any DataVIOs waiting to be assigned entries. 
notifyAllWaiters(&journal->decrementWaiters, continueWaiter, &result);
+    notifyAllWaiters(&journal->incrementWaiters, continueWaiter, &result);
+  }
+
+  if (!isDraining(&journal->state)
+      || journal->reaping || hasBlockWaiters(journal)
+      || hasWaiters(&journal->incrementWaiters)
+      || hasWaiters(&journal->decrementWaiters)
+      || !suspendLockCounter(journal->lockCounter)) {
+    return;
+  }
+
+  if (isSaving(&journal->state)) {
+    if (journal->activeBlock != NULL) {
+      ASSERT_LOG_ONLY(((result == VDO_READ_ONLY)
+                       || !isRecoveryBlockDirty(journal->activeBlock)),
+                      "journal being saved has clean active block");
+      recycleJournalBlock(journal->activeBlock);
+    }
+
+    ASSERT_LOG_ONLY(isRingEmpty(&journal->activeTailBlocks),
+                    "all blocks in a journal being saved must be inactive");
+  }
+
+  finishDrainingWithResult(&journal->state, result);
+}
+
+/**
+ * Notify a recovery journal that the VDO has gone read-only.
+ *
+ *
Implements ReadOnlyNotification. + * + * @param listener The journal + * @param parent The completion to notify in order to acknowledge the + * notification + **/ +static void notifyRecoveryJournalOfReadOnlyMode(void *listener, + VDOCompletion *parent) +{ + checkForDrainComplete(listener); + completeCompletion(parent); +} + +/** + * Put the journal in read-only mode. All attempts to add entries after + * this function is called will fail. All VIOs waiting for commits will be + * awakened with an error. + * + * @param journal The journal which has failed + * @param errorCode The error result triggering this call + **/ +static void enterJournalReadOnlyMode(RecoveryJournal *journal, int errorCode) +{ + enterReadOnlyMode(journal->readOnlyNotifier, errorCode); + checkForDrainComplete(journal); +} + +/**********************************************************************/ +SequenceNumber getCurrentJournalSequenceNumber(RecoveryJournal *journal) +{ + return journal->tail; +} + +/** + * Get the head of the recovery journal, which is the lowest sequence number of + * the block map head and the slab journal head. + * + * @param journal The journal + * + * @return the head of the journal + **/ +static inline SequenceNumber getRecoveryJournalHead(RecoveryJournal *journal) +{ + return minSequenceNumber(journal->blockMapHead, journal->slabJournalHead); +} + +/** + * Compute the recovery count byte for a given recovery count. + * + * @param recoveryCount The recovery count + * + * @return The byte corresponding to the recovery count + **/ +__attribute__((warn_unused_result)) +static inline uint8_t computeRecoveryCountByte(uint64_t recoveryCount) +{ + return (uint8_t) (recoveryCount & RECOVERY_COUNT_MASK); +} + +/** + * Check whether the journal is over the threshold, and if so, force the oldest + * slab journal tail block to commit. + * + * @param journal The journal + **/ +static void checkSlabJournalCommitThreshold(RecoveryJournal *journal) +{ + BlockCount currentLength = journal->tail - journal->slabJournalHead; + if (currentLength > journal->slabJournalCommitThreshold) { + journal->events.slabJournalCommitsRequested++; + commitOldestSlabJournalTailBlocks(journal->depot, + journal->slabJournalHead); + } +} + +/**********************************************************************/ +static void reapRecoveryJournal(RecoveryJournal *journal); +static void assignEntries(RecoveryJournal *journal); + +/** + * Finish reaping the journal. + * + * @param journal The journal being reaped + **/ +static void finishReaping(RecoveryJournal *journal) +{ + SequenceNumber oldHead = getRecoveryJournalHead(journal); + journal->blockMapHead = journal->blockMapReapHead; + journal->slabJournalHead = journal->slabJournalReapHead; + BlockCount blocksReaped = getRecoveryJournalHead(journal) - oldHead; + journal->availableSpace += blocksReaped * journal->entriesPerBlock; + journal->reaping = false; + checkSlabJournalCommitThreshold(journal); + assignEntries(journal); + checkForDrainComplete(journal); +} + +/** + * Finish reaping the journal after flushing the lower layer. This is the + * callback registered in reapRecoveryJournal(). + * + * @param completion The journal's flush VIO + **/ +static void completeReaping(VDOCompletion *completion) +{ + RecoveryJournal *journal = completion->parent; + finishReaping(journal); + + // Try reaping again in case more locks were released while flush was out. + reapRecoveryJournal(journal); +} + +/** + * Handle an error when flushing the lower layer due to reaping. 
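+ *
+ * Reaping is abandoned and the journal is forced into read-only mode, since
+ * the reap heads cannot safely be advanced without a successful flush.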
+ * + * @param completion The journal's flush VIO + **/ +static void handleFlushError(VDOCompletion *completion) +{ + RecoveryJournal *journal = completion->parent; + journal->reaping = false; + enterJournalReadOnlyMode(journal, completion->result); +} + +/** + * Set all journal fields appropriately to start journaling from the current + * active block. + * + * @param journal The journal to be reset based on its active block + **/ +static void initializeJournalState(RecoveryJournal *journal) +{ + journal->appendPoint.sequenceNumber = journal->tail; + journal->lastWriteAcknowledged = journal->tail; + journal->blockMapHead = journal->tail; + journal->slabJournalHead = journal->tail; + journal->blockMapReapHead = journal->tail; + journal->slabJournalReapHead = journal->tail; + journal->blockMapHeadBlockNumber + = getRecoveryJournalBlockNumber(journal, journal->blockMapHead); + journal->slabJournalHeadBlockNumber + = getRecoveryJournalBlockNumber(journal, journal->slabJournalHead); +} + +/**********************************************************************/ +BlockCount getRecoveryJournalLength(BlockCount journalSize) +{ + BlockCount reservedBlocks = journalSize / 4; + if (reservedBlocks > RECOVERY_JOURNAL_RESERVED_BLOCKS) { + reservedBlocks = RECOVERY_JOURNAL_RESERVED_BLOCKS; + } + return (journalSize - reservedBlocks); +} + +/** + * Attempt to reap the journal now that all the locks on some journal block + * have been released. This is the callback registered with the lock counter. + * + * @param completion The lock counter completion + **/ +static void reapRecoveryJournalCallback(VDOCompletion *completion) +{ + RecoveryJournal *journal = (RecoveryJournal *) completion->parent; + // The acknowledgement must be done before reaping so that there is no + // race between acknowledging the notification and unlocks wishing to notify. + acknowledgeUnlock(journal->lockCounter); + + if (isQuiescing(&journal->state)) { + // Don't start reaping when the journal is trying to quiesce. Do check if + // this notification is the last thing the drain is waiting on. + checkForDrainComplete(journal); + return; + } + + reapRecoveryJournal(journal); + checkSlabJournalCommitThreshold(journal); +} + +/********************************************************************** + * Set the journal's tail sequence number. + * + * @param journal The journal whose tail is to be set + * @param tail The new tail value + **/ +static void setJournalTail(RecoveryJournal *journal, SequenceNumber tail) +{ + // VDO does not support sequence numbers above 1 << 48 in the slab journal. 
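+  // (Presumably because a journal point is packed together with a 16-bit
+  // entry count into a single 64-bit field, leaving 48 bits for the
+  // sequence number.)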
+ if (tail >= (1ULL << 48)) { + enterJournalReadOnlyMode(journal, VDO_JOURNAL_OVERFLOW); + } + + journal->tail = tail; +} + +/**********************************************************************/ +int makeRecoveryJournal(Nonce nonce, + PhysicalLayer *layer, + Partition *partition, + uint64_t recoveryCount, + BlockCount journalSize, + BlockCount tailBufferSize, + ReadOnlyNotifier *readOnlyNotifier, + const ThreadConfig *threadConfig, + RecoveryJournal **journalPtr) +{ + RecoveryJournal *journal; + int result = ALLOCATE(1, RecoveryJournal, __func__, &journal); + if (result != VDO_SUCCESS) { + return result; + } + + initializeRing(&journal->freeTailBlocks); + initializeRing(&journal->activeTailBlocks); + initializeWaitQueue(&journal->pendingWrites); + + journal->threadID = getJournalZoneThread(threadConfig); + journal->partition = partition; + journal->nonce = nonce; + journal->recoveryCount = computeRecoveryCountByte(recoveryCount); + journal->size = journalSize; + journal->readOnlyNotifier = readOnlyNotifier; + journal->tail = 1; + journal->slabJournalCommitThreshold = (journalSize * 2) / 3; + initializeJournalState(journal); + + journal->entriesPerBlock = RECOVERY_JOURNAL_ENTRIES_PER_BLOCK; + BlockCount journalLength = getRecoveryJournalLength(journalSize); + journal->availableSpace = journal->entriesPerBlock * journalLength; + + // Only make the tail buffer and VIO in normal operation since the formatter + // doesn't need them. + if (layer->createMetadataVIO != NULL) { + for (BlockCount i = 0; i < tailBufferSize; i++) { + RecoveryJournalBlock *block; + result = makeRecoveryBlock(layer, journal, &block); + if (result != VDO_SUCCESS) { + freeRecoveryJournal(&journal); + return result; + } + + pushRingNode(&journal->freeTailBlocks, &block->ringNode); + } + + result = makeLockCounter(layer, journal, reapRecoveryJournalCallback, + journal->threadID, threadConfig->logicalZoneCount, + threadConfig->physicalZoneCount, journal->size, + &journal->lockCounter); + if (result != VDO_SUCCESS) { + freeRecoveryJournal(&journal); + return result; + } + + result = ALLOCATE(VDO_BLOCK_SIZE, char, "journal flush data", + &journal->unusedFlushVIOData); + if (result != VDO_SUCCESS) { + freeRecoveryJournal(&journal); + return result; + } + + result = createVIO(layer, VIO_TYPE_RECOVERY_JOURNAL, VIO_PRIORITY_HIGH, + journal, journal->unusedFlushVIOData, + &journal->flushVIO); + if (result != VDO_SUCCESS) { + freeRecoveryJournal(&journal); + return result; + } + + result = registerReadOnlyListener(readOnlyNotifier, journal, + notifyRecoveryJournalOfReadOnlyMode, + journal->threadID); + if (result != VDO_SUCCESS) { + freeRecoveryJournal(&journal); + return result; + } + + journal->flushVIO->completion.callbackThreadID = journal->threadID; + } + + *journalPtr = journal; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeRecoveryJournal(RecoveryJournal **journalPtr) +{ + RecoveryJournal *journal = *journalPtr; + if (journal == NULL) { + return; + } + + freeLockCounter(&journal->lockCounter); + freeVIO(&journal->flushVIO); + FREE(journal->unusedFlushVIOData); + + // XXX: eventually, the journal should be constructed in a quiescent state + // which requires opening before use. 
+ if (!isQuiescent(&journal->state)) { + ASSERT_LOG_ONLY(isRingEmpty(&journal->activeTailBlocks), + "journal being freed has no active tail blocks"); + } else if (!isSaved(&journal->state) + && !isRingEmpty(&journal->activeTailBlocks)) { + logWarning("journal being freed has uncommited entries"); + } + + RecoveryJournalBlock *block; + while ((block = popActiveList(journal)) != NULL) { + freeRecoveryBlock(&block); + } + + while ((block = popFreeList(journal)) != NULL) { + freeRecoveryBlock(&block); + } + + FREE(journal); + *journalPtr = NULL; +} + +/**********************************************************************/ +void setRecoveryJournalPartition(RecoveryJournal *journal, + Partition *partition) +{ + journal->partition = partition; +} + +/**********************************************************************/ +void initializeRecoveryJournalPostRecovery(RecoveryJournal *journal, + uint64_t recoveryCount, + SequenceNumber tail) +{ + setJournalTail(journal, tail + 1); + journal->recoveryCount = computeRecoveryCountByte(recoveryCount); + initializeJournalState(journal); +} + +/**********************************************************************/ +void initializeRecoveryJournalPostRebuild(RecoveryJournal *journal, + uint64_t recoveryCount, + SequenceNumber tail, + BlockCount logicalBlocksUsed, + BlockCount blockMapDataBlocks) +{ + initializeRecoveryJournalPostRecovery(journal, recoveryCount, tail); + journal->logicalBlocksUsed = logicalBlocksUsed; + journal->blockMapDataBlocks = blockMapDataBlocks; +} + +/**********************************************************************/ +BlockCount getJournalBlockMapDataBlocksUsed(RecoveryJournal *journal) +{ + return journal->blockMapDataBlocks; +} + +/**********************************************************************/ +void setJournalBlockMapDataBlocksUsed(RecoveryJournal *journal, + BlockCount pages) +{ + journal->blockMapDataBlocks = pages; +} + +/**********************************************************************/ +ThreadID getRecoveryJournalThreadID(RecoveryJournal *journal) +{ + return journal->threadID; +} + +/**********************************************************************/ +void openRecoveryJournal(RecoveryJournal *journal, + SlabDepot *depot, + BlockMap *blockMap) +{ + journal->depot = depot; + journal->blockMap = blockMap; + journal->state.state = ADMIN_STATE_NORMAL_OPERATION; +} + +/**********************************************************************/ +size_t getRecoveryJournalEncodedSize(void) +{ + return ENCODED_HEADER_SIZE + sizeof(RecoveryJournalState7_0); +} + +/**********************************************************************/ +int encodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) +{ + SequenceNumber journalStart; + if (isSaved(&journal->state)) { + // If the journal is saved, we should start one past the active block + // (since the active block is not guaranteed to be empty). + journalStart = journal->tail; + } else { + // When we're merely suspended or have gone read-only, we must record the + // first block that might have entries that need to be applied. 
+ journalStart = getRecoveryJournalHead(journal); + } + + int result = encodeHeader(&RECOVERY_JOURNAL_HEADER_7_0, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + size_t initialLength = contentLength(buffer); + + result = putUInt64LEIntoBuffer(buffer, journalStart); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, journal->logicalBlocksUsed); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, journal->blockMapDataBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + size_t encodedSize = contentLength(buffer) - initialLength; + return ASSERT(RECOVERY_JOURNAL_HEADER_7_0.size == encodedSize, + "encoded recovery journal component size" + " must match header size"); +} + +/** + * Decode recovery journal component state version 7.0 from a buffer. + * + * @param buffer A buffer positioned at the start of the encoding + * @param state The state structure to receive the decoded values + * + * @return UDS_SUCCESS or an error code + **/ +static int decodeRecoveryJournalState_7_0(Buffer *buffer, + RecoveryJournalState7_0 *state) +{ + size_t initialLength = contentLength(buffer); + + SequenceNumber journalStart; + int result = getUInt64LEFromBuffer(buffer, &journalStart); + if (result != UDS_SUCCESS) { + return result; + } + + BlockCount logicalBlocksUsed; + result = getUInt64LEFromBuffer(buffer, &logicalBlocksUsed); + if (result != UDS_SUCCESS) { + return result; + } + + BlockCount blockMapDataBlocks; + result = getUInt64LEFromBuffer(buffer, &blockMapDataBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + *state = (RecoveryJournalState7_0) { + .journalStart = journalStart, + .logicalBlocksUsed = logicalBlocksUsed, + .blockMapDataBlocks = blockMapDataBlocks, + }; + + size_t decodedSize = initialLength - contentLength(buffer); + return ASSERT(RECOVERY_JOURNAL_HEADER_7_0.size == decodedSize, + "decoded slab depot component size must match header size"); +} + +/**********************************************************************/ +int decodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) +{ + Header header; + int result = decodeHeader(buffer, &header); + if (result != VDO_SUCCESS) { + return result; + } + + result = validateHeader(&RECOVERY_JOURNAL_HEADER_7_0, &header, + true, __func__); + if (result != VDO_SUCCESS) { + return result; + } + + RecoveryJournalState7_0 state; + result = decodeRecoveryJournalState_7_0(buffer, &state); + if (result != VDO_SUCCESS) { + return result; + } + + // Update recovery journal in-memory information. + setJournalTail(journal, state.journalStart); + journal->logicalBlocksUsed = state.logicalBlocksUsed; + journal->blockMapDataBlocks = state.blockMapDataBlocks; + initializeJournalState(journal); + + // XXX: this is a hack until we make initial resume of a VDO a real resume + journal->state.state = ADMIN_STATE_SUSPENDED; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int decodeSodiumRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) +{ + // Sodium uses version 7.0, same as head, currently. + return decodeRecoveryJournal(journal, buffer); +} + +/** + * Advance the tail of the journal. 
+ * + * @param journal The journal whose tail should be advanced + * + * @return true if the tail was advanced + **/ +static bool advanceTail(RecoveryJournal *journal) +{ + journal->activeBlock = popFreeList(journal); + if (journal->activeBlock == NULL) { + return false; + } + + pushRingNode(&journal->activeTailBlocks, &journal->activeBlock->ringNode); + initializeRecoveryBlock(journal->activeBlock); + setJournalTail(journal, journal->tail + 1); + advanceBlockMapEra(journal->blockMap, journal->tail); + return true; +} + +/** + * Check whether there is space to make a given type of entry. + * + * @param journal The journal to check + * @param increment Set to true if the desired entry is an + * increment + * + * @return true if there is space in the journal to make an + * entry of the specified type + **/ +static bool checkForEntrySpace(RecoveryJournal *journal, bool increment) +{ + if (increment) { + return ((journal->availableSpace - journal->pendingDecrementCount) > 1); + } + + return (journal->availableSpace > 0); +} + +/** + * Prepare the currently active block to receive an entry and check whether + * an entry of the given type may be assigned at this time. + * + * @param journal The journal receiving an entry + * @param increment Set to true if the desired entry is an + * increment + * + * @return true if there is space in the journal to store an + * entry of the specified type + **/ +static bool prepareToAssignEntry(RecoveryJournal *journal, bool increment) +{ + if (!checkForEntrySpace(journal, increment)) { + if (!increment) { + // There must always be room to make a decrement entry. + logError("No space for decrement entry in recovery journal"); + enterJournalReadOnlyMode(journal, VDO_RECOVERY_JOURNAL_FULL); + } + return false; + } + + if (isRecoveryBlockFull(journal->activeBlock) && !advanceTail(journal)) { + return false; + } + + if (!isRecoveryBlockEmpty(journal->activeBlock)) { + return true; + } + + if ((journal->tail - getRecoveryJournalHead(journal)) > journal->size) { + // Cannot use this block since the journal is full. + journal->events.diskFull++; + return false; + } + + /* + * Don't allow the new block to be reaped until all of its entries have been + * committed to the block map and until the journal block has been fully + * committed as well. Because the block map update is done only after any + * slab journal entries have been made, the per-entry lock for the block map + * entry serves to protect those as well. + */ + initializeLockCount(journal->lockCounter, journal->activeBlock->blockNumber, + journal->entriesPerBlock + 1); + return true; +} + +/**********************************************************************/ +static void writeBlocks(RecoveryJournal *journal); + +/** + * Queue a block for writing. The block is expected to be full. If the block + * is currently writing, this is a noop as the block will be queued for + * writing when the write finishes. The block must not currently be queued + * for writing. 
+ * + * @param journal The journal in question + * @param block The block which is now ready to write + **/ +static void scheduleBlockWrite(RecoveryJournal *journal, + RecoveryJournalBlock *block) +{ + if (block->committing) { + return; + } + + int result = enqueueWaiter(&journal->pendingWrites, &block->writeWaiter); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(journal, result); + return; + } + + PhysicalLayer *layer = vioAsCompletion(journal->flushVIO)->layer; + if ((layer->getWritePolicy(layer) == WRITE_POLICY_ASYNC)) { + /* + * At the end of adding entries, or discovering this partial block + * is now full and ready to rewrite, we will call writeBlocks() and + * write a whole batch. + */ + return; + } + writeBlocks(journal); +} + +/** + * Release a reference to a journal block. + * + * @param block The journal block from which to release a reference + **/ +static void releaseJournalBlockReference(RecoveryJournalBlock *block) +{ + releaseJournalZoneReference(block->journal->lockCounter, block->blockNumber); +} + +/** + * Implements WaiterCallback. Assign an entry waiter to the active block. + **/ +static void assignEntry(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + RecoveryJournalBlock *block = (RecoveryJournalBlock *) context; + RecoveryJournal *journal = block->journal; + + // Record the point at which we will make the journal entry. + dataVIO->recoveryJournalPoint = (JournalPoint) { + .sequenceNumber = block->sequenceNumber, + .entryCount = block->entryCount, + }; + + switch (dataVIO->operation.type) { + case DATA_INCREMENT: + if (dataVIO->operation.state != MAPPING_STATE_UNMAPPED) { + journal->logicalBlocksUsed++; + } + journal->pendingDecrementCount++; + break; + + case DATA_DECREMENT: + if (dataVIO->operation.state != MAPPING_STATE_UNMAPPED) { + journal->logicalBlocksUsed--; + } + + // Per-entry locks need not be held for decrement entries since the lock + // held for the incref entry will protect this entry as well. + releaseJournalBlockReference(block); + ASSERT_LOG_ONLY((journal->pendingDecrementCount != 0), + "decrement follows increment"); + journal->pendingDecrementCount--; + break; + + case BLOCK_MAP_INCREMENT: + journal->blockMapDataBlocks++; + break; + + default: + logError("Invalid journal operation %u", dataVIO->operation.type); + enterJournalReadOnlyMode(journal, VDO_NOT_IMPLEMENTED); + continueDataVIO(dataVIO, VDO_NOT_IMPLEMENTED); + return; + } + + journal->availableSpace--; + int result = enqueueRecoveryBlockEntry(block, dataVIO); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(journal, result); + continueDataVIO(dataVIO, result); + } + + if (isRecoveryBlockFull(block)) { + // The block is full, so we can write it anytime henceforth. If it is + // already committing, we'll queue it for writing when it comes back. + scheduleBlockWrite(journal, block); + } + + // Force out slab journal tail blocks when threshold is reached. 
+ checkSlabJournalCommitThreshold(journal); +} + +/**********************************************************************/ +static bool assignEntriesFromQueue(RecoveryJournal *journal, + WaitQueue *queue, + bool increment) +{ + while (hasWaiters(queue)) { + if (!prepareToAssignEntry(journal, increment)) { + return false; + } + + notifyNextWaiter(queue, assignEntry, journal->activeBlock); + } + + return true; +} + +/**********************************************************************/ +static void assignEntries(RecoveryJournal *journal) +{ + if (journal->addingEntries) { + // Protect against re-entrancy. + return; + } + + journal->addingEntries = true; + if (assignEntriesFromQueue(journal, &journal->decrementWaiters, false)) { + assignEntriesFromQueue(journal, &journal->incrementWaiters, true); + } + + // Now that we've finished with entries, see if we have a batch of blocks to + // write. + writeBlocks(journal); + journal->addingEntries = false; +} + +/** + * Prepare an in-memory journal block to be reused now that it has been fully + * committed. + * + * @param block The block to be recycled + **/ +static void recycleJournalBlock(RecoveryJournalBlock *block) +{ + RecoveryJournal *journal = block->journal; + pushRingNode(&journal->freeTailBlocks, &block->ringNode); + + // Release any unused entry locks. + for (BlockCount i = block->entryCount; i < journal->entriesPerBlock; i++) { + releaseJournalBlockReference(block); + } + + // Release our own lock against reaping now that the block is completely + // committed, or we're giving up because we're in read-only mode. + if (block->entryCount > 0) { + releaseJournalBlockReference(block); + } + + if (block == journal->activeBlock) { + journal->activeBlock = NULL; + } +} + +/** + * WaiterCallback implementation invoked whenever a VIO is to be released + * from the journal because its entry was committed to disk. + **/ +static void continueCommittedWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + RecoveryJournal *journal = (RecoveryJournal *) context; + ASSERT_LOG_ONLY(beforeJournalPoint(&journal->commitPoint, + &dataVIO->recoveryJournalPoint), + "DataVIOs released from recovery journal in order. " + "Recovery journal point is (%llu, %" PRIu16 "), " + "but commit waiter point is (%llu, %" PRIu16 ")", + journal->commitPoint.sequenceNumber, + journal->commitPoint.entryCount, + dataVIO->recoveryJournalPoint.sequenceNumber, + dataVIO->recoveryJournalPoint.entryCount); + journal->commitPoint = dataVIO->recoveryJournalPoint; + + int result + = (isReadOnly(journal->readOnlyNotifier) ? VDO_READ_ONLY : VDO_SUCCESS); + continueWaiter(waiter, &result); +} + +/** + * Notify any VIOs whose entries have now committed. + * + * @param journal The recovery journal to update + **/ +static void notifyCommitWaiters(RecoveryJournal *journal) +{ + if (isRingEmpty(&journal->activeTailBlocks)) { + return; + } + + for (RingNode *node = journal->activeTailBlocks.next; + node != &journal->activeTailBlocks; + node = node->next) { + RecoveryJournalBlock *block = blockFromRingNode(node); + + if (block->committing) { + return; + } + + notifyAllWaiters(&block->commitWaiters, continueCommittedWaiter, journal); + if (isReadOnly(journal->readOnlyNotifier)) { + notifyAllWaiters(&block->entryWaiters, continueCommittedWaiter, journal); + } else if (isRecoveryBlockDirty(block) || !isRecoveryBlockFull(block)) { + // Stop at partially-committed or partially-filled blocks. 
+ return; + } + } +} + +/** + * Recycle any journal blocks which have been fully committed. + * + * @param journal The recovery journal to update + **/ +static void recycleJournalBlocks(RecoveryJournal *journal) +{ + while (!isRingEmpty(&journal->activeTailBlocks)) { + RecoveryJournalBlock *block + = blockFromRingNode(journal->activeTailBlocks.next); + + if (block->committing) { + // Don't recycle committing blocks. + return; + } + + if (!isReadOnly(journal->readOnlyNotifier) + && (isRecoveryBlockDirty(block) + || !isRecoveryBlockFull(block))) { + // Don't recycle partially written or partially full + // blocks, except in read-only mode. + return; + } + recycleJournalBlock(block); + } +} + +/** + * Handle post-commit processing. This is the callback registered by + * writeBlock(). If more entries accumulated in the block being committed while + * the commit was in progress, another commit will be initiated. + * + * @param completion The completion of the VIO writing this block + **/ +static void completeWrite(VDOCompletion *completion) +{ + RecoveryJournalBlock *block = completion->parent; + RecoveryJournal *journal = block->journal; + assertOnJournalThread(journal, __func__); + + journal->pendingWriteCount -= 1; + journal->events.blocks.committed += 1; + journal->events.entries.committed += block->entriesInCommit; + block->uncommittedEntryCount -= block->entriesInCommit; + block->entriesInCommit = 0; + block->committing = false; + + // If this block is the latest block to be acknowledged, record that fact. + if (block->sequenceNumber > journal->lastWriteAcknowledged) { + journal->lastWriteAcknowledged = block->sequenceNumber; + } + + RecoveryJournalBlock *lastActiveBlock + = blockFromRingNode(journal->activeTailBlocks.next); + ASSERT_LOG_ONLY((block->sequenceNumber >= lastActiveBlock->sequenceNumber), + "completed journal write is still active"); + + notifyCommitWaiters(journal); + + // Is this block now full? Reaping, and adding entries, might have already + // sent it off for rewriting; else, queue it for rewrite. + if (isRecoveryBlockDirty(block) && isRecoveryBlockFull(block)) { + scheduleBlockWrite(journal, block); + } + + recycleJournalBlocks(journal); + writeBlocks(journal); + + checkForDrainComplete(journal); +} + +/**********************************************************************/ +static void handleWriteError(VDOCompletion *completion) +{ + RecoveryJournalBlock *block = completion->parent; + RecoveryJournal *journal = block->journal; + logErrorWithStringError(completion->result, + "cannot write recovery journal block %llu", + block->sequenceNumber); + enterJournalReadOnlyMode(journal, completion->result); + completeWrite(completion); +} + +/** + * Issue a block for writing. Implements WaiterCallback. + **/ +static void writeBlock(Waiter *waiter, void *context __attribute__((unused))) +{ + RecoveryJournalBlock *block = blockFromWaiter(waiter); + if (isReadOnly(block->journal->readOnlyNotifier)) { + return; + } + + int result = commitRecoveryBlock(block, completeWrite, handleWriteError); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(block->journal, result); + } +} + +/** + * Attempt to commit blocks, according to write policy. 
+ * + * @param journal The recovery journal + **/ +static void writeBlocks(RecoveryJournal *journal) +{ + assertOnJournalThread(journal, __func__); + /* + * In sync and async-unsafe modes, we call this function each time we queue + * a full block on pending writes; in addition, in all cases we call this + * function after adding entries to the journal and finishing a block write. + * Thus, when this function terminates we must either have no VIOs waiting + * in the journal or have some outstanding IO to provide a future wakeup. + * + * In all modes, if there are no outstanding writes and some unwritten + * entries, we must issue a block, even if it's the active block and it + * isn't full. Otherwise, in sync/async-unsafe modes, we want to issue + * all full blocks every time; since we call it each time we fill a block, + * this is equivalent to issuing every full block as soon as its full. In + * async mode, we want to only issue full blocks if there are no + * pending writes. + */ + + PhysicalLayer *layer = vioAsCompletion(journal->flushVIO)->layer; + if ((layer->getWritePolicy(layer) != WRITE_POLICY_ASYNC) + || (journal->pendingWriteCount == 0)) { + // Write all the full blocks. + notifyAllWaiters(&journal->pendingWrites, writeBlock, NULL); + } + + // Do we need to write the active block? Only if we have no outstanding + // writes, even after issuing all of the full writes. + if ((journal->pendingWriteCount == 0) + && canCommitRecoveryBlock(journal->activeBlock)) { + writeBlock(&journal->activeBlock->writeWaiter, NULL); + } +} + +/**********************************************************************/ +void addRecoveryJournalEntry(RecoveryJournal *journal, DataVIO *dataVIO) +{ + assertOnJournalThread(journal, __func__); + if (!isNormal(&journal->state)) { + continueDataVIO(dataVIO, VDO_INVALID_ADMIN_STATE); + return; + } + + if (isReadOnly(journal->readOnlyNotifier)) { + continueDataVIO(dataVIO, VDO_READ_ONLY); + return; + } + + bool increment = isIncrementOperation(dataVIO->operation.type); + ASSERT_LOG_ONLY((!increment || (dataVIO->recoverySequenceNumber == 0)), + "journal lock not held for increment"); + + advanceJournalPoint(&journal->appendPoint, journal->entriesPerBlock); + int result = enqueueDataVIO((increment + ? &journal->incrementWaiters + : &journal->decrementWaiters), dataVIO, + THIS_LOCATION("$F($j-$js);io=journal($j-$js)")); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(journal, result); + continueDataVIO(dataVIO, result); + return; + } + + assignEntries(journal); +} + +/** + * Conduct a sweep on a recovery journal to reclaim unreferenced blocks. + * + * @param journal The recovery journal + **/ +static void reapRecoveryJournal(RecoveryJournal *journal) +{ + if (journal->reaping) { + // We already have an outstanding reap in progress. We need to wait for it + // to finish. + return; + } + + if (isQuiescent(&journal->state)) { + // We are supposed to not do IO. Don't botch it by reaping. + return; + } + + // Start reclaiming blocks only when the journal head has no references. Then + // stop when a block is referenced. 
+ while ((journal->blockMapReapHead < journal->lastWriteAcknowledged) + && !isLocked(journal->lockCounter, journal->blockMapHeadBlockNumber, + ZONE_TYPE_LOGICAL)) { + journal->blockMapReapHead++; + if (++journal->blockMapHeadBlockNumber == journal->size) { + journal->blockMapHeadBlockNumber = 0; + } + } + + while ((journal->slabJournalReapHead < journal->lastWriteAcknowledged) + && !isLocked(journal->lockCounter, + journal->slabJournalHeadBlockNumber, + ZONE_TYPE_PHYSICAL)) { + journal->slabJournalReapHead++; + if (++journal->slabJournalHeadBlockNumber == journal->size) { + journal->slabJournalHeadBlockNumber = 0; + } + } + + if ((journal->blockMapReapHead == journal->blockMapHead) + && (journal->slabJournalReapHead == journal->slabJournalHead)) { + // Nothing happened. + return; + } + + PhysicalLayer *layer = vioAsCompletion(journal->flushVIO)->layer; + if (layer->getWritePolicy(layer) != WRITE_POLICY_SYNC) { + /* + * If the block map head will advance, we must flush any block map page + * modified by the entries we are reaping. If the slab journal head will + * advance, we must flush the slab summary update covering the slab journal + * that just released some lock. + * + * In sync mode, this is unnecessary because we won't record these numbers + * on disk until the next journal block write, and in sync mode every + * journal block write is preceded by a flush, which does the block map + * page and slab summary update flushing itself. + */ + journal->reaping = true; + launchFlush(journal->flushVIO, completeReaping, handleFlushError); + return; + } + + finishReaping(journal); +} + +/**********************************************************************/ +void acquireRecoveryJournalBlockReference(RecoveryJournal *journal, + SequenceNumber sequenceNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + if (sequenceNumber == 0) { + return; + } + + BlockCount blockNumber + = getRecoveryJournalBlockNumber(journal, sequenceNumber); + acquireLockCountReference(journal->lockCounter, blockNumber, zoneType, + zoneID); +} + +/**********************************************************************/ +void releaseRecoveryJournalBlockReference(RecoveryJournal *journal, + SequenceNumber sequenceNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + if (sequenceNumber == 0) { + return; + } + + BlockCount blockNumber + = getRecoveryJournalBlockNumber(journal, sequenceNumber); + releaseLockCountReference(journal->lockCounter, blockNumber, zoneType, + zoneID); +} + +/**********************************************************************/ +void releasePerEntryLockFromOtherZone(RecoveryJournal *journal, + SequenceNumber sequenceNumber) +{ + if (sequenceNumber == 0) { + return; + } + + BlockCount blockNumber + = getRecoveryJournalBlockNumber(journal, sequenceNumber); + releaseJournalZoneReferenceFromOtherZone(journal->lockCounter, blockNumber); +} + +/** + * Initiate a drain. + * + * Implements AdminInitiator. 
+ **/ +static void initiateDrain(AdminState *state) +{ + checkForDrainComplete(container_of(state, RecoveryJournal, state)); +} + +/**********************************************************************/ +void drainRecoveryJournal(RecoveryJournal *journal, + AdminStateCode operation, + VDOCompletion *parent) +{ + assertOnJournalThread(journal, __func__); + startDraining(&journal->state, operation, parent, initiateDrain); +} + +/**********************************************************************/ +void resumeRecoveryJournal(RecoveryJournal *journal, VDOCompletion *parent) +{ + assertOnJournalThread(journal, __func__); + bool saved = isSaved(&journal->state); + setCompletionResult(parent, resumeIfQuiescent(&journal->state)); + + if (isReadOnly(journal->readOnlyNotifier)) { + finishCompletion(parent, VDO_READ_ONLY); + return; + } + + if (saved) { + initializeJournalState(journal); + } + + if (resumeLockCounter(journal->lockCounter)) { + // We might have missed a notification. + reapRecoveryJournal(journal); + } + + completeCompletion(parent); +} + +/**********************************************************************/ +BlockCount getJournalLogicalBlocksUsed(const RecoveryJournal *journal) +{ + return journal->logicalBlocksUsed; +} + +/**********************************************************************/ +RecoveryJournalStatistics +getRecoveryJournalStatistics(const RecoveryJournal *journal) +{ + return journal->events; +} + +/**********************************************************************/ +void dumpRecoveryJournalStatistics(const RecoveryJournal *journal) +{ + RecoveryJournalStatistics stats = getRecoveryJournalStatistics(journal); + logInfo("Recovery Journal"); + logInfo(" blockMapHead=%llu slabJournalHead=%" PRIu64 + " lastWriteAcknowledged=%llu tail=%" PRIu64 + " blockMapReapHead=%llu slabJournalReapHead=%" PRIu64 + " diskFull=%llu slabJournalCommitsRequested=%" PRIu64 + " incrementWaiters=%zu decrementWaiters=%zu", + journal->blockMapHead, journal->slabJournalHead, + journal->lastWriteAcknowledged, journal->tail, + journal->blockMapReapHead, journal->slabJournalReapHead, + stats.diskFull, stats.slabJournalCommitsRequested, + countWaiters(&journal->incrementWaiters), + countWaiters(&journal->decrementWaiters)); + logInfo(" entries: started=%llu written=%llu committed=%" + PRIu64, + stats.entries.started, stats.entries.written, + stats.entries.committed); + logInfo(" blocks: started=%llu written=%llu committed=%" + PRIu64, + stats.blocks.started, stats.blocks.written, + stats.blocks.committed); + + logInfo(" active blocks:"); + const RingNode *head = &journal->activeTailBlocks; + for (RingNode *node = head->next; node != head; node = node->next) { + dumpRecoveryBlock(blockFromRingNode(node)); + } +} diff --git a/vdo/base/recoveryJournal.h b/vdo/base/recoveryJournal.h new file mode 100644 index 0000000..8ae7de0 --- /dev/null +++ b/vdo/base/recoveryJournal.h @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournal.h#5 $ + */ + +#ifndef RECOVERY_JOURNAL_H +#define RECOVERY_JOURNAL_H + +#include "buffer.h" + +#include "adminState.h" +#include "completion.h" +#include "fixedLayout.h" +#include "flush.h" +#include "readOnlyNotifier.h" +#include "statistics.h" +#include "trace.h" +#include "types.h" + +/** + * The RecoveryJournal provides a log of all block mapping changes + * which have not yet been stably written to the block map. It exists + * to help provide resiliency guarantees by allowing synchronous + * writes to be acknowledged as soon as the corresponding journal + * entry is committed instead of having to wait for the block map + * update. For asynchronous writes, the journal aids in meeting the + * five second data loss window by ensuring that writes will not be + * lost as long as they are committed to the journal before the window + * expires. This should be less work than committing all of the + * required block map pages. + * + * The journal consists of a set of on-disk blocks arranged as a + * circular log with monotonically increasing sequence numbers. Three + * sequence numbers serve to define the active extent of the + * journal. The 'head' is the oldest active block in the journal. The + * 'tail' is the end of the half-open interval containing the active + * blocks. 'active' is the number of the block actively receiving + * entries. In an empty journal, head == active == tail. Once any + * entries are added, tail = active + 1, and head may be any value in + * the interval [tail - size, active]. + * + * The journal also contains a set of in-memory blocks which are used + * to buffer up entries until they can be committed. In general the + * number of in-memory blocks ('tailBufferCount') will be less than + * the on-disk size. Each in-memory block is also a VDOCompletion. + * Each in-memory block has a VDOExtent which is used to commit that + * block to disk. The extent's data is a PackedJournalBlock (which is a + * formatted journal block). In addition each in-memory block has a + * buffer which is used to accumulate entries while a partial commit + * of the block is in progress. In-memory blocks are kept on two + * rings. Free blocks live on the 'freeTailBlocks' ring. When a block + * becomes active (see below) it is moved to the 'activeTailBlocks' + * ring. When a block is fully committed, it is moved back to the + * 'freeTailBlocks' ring. + * + * When entries are added to the journal, they are added to the active + * in-memory block, as indicated by the 'activeBlock' field. If the + * caller wishes to wait for the entry to be committed, the requesting + * VIO will be attached to the in-memory block to which the caller's + * entry was added. If the caller does wish to wait, or if the entry + * filled the active block, an attempt will be made to commit that + * block to disk. If there is already another commit in progress, the + * attempt will be ignored and then automatically retried when the + * in-progress commit completes. If there is no commit in progress, + * any VIOs waiting on the block are transferred to the extent. The + * extent is then written, automatically waking all of the waiters + * when it completes. 
When the extent completes, any entries which + * accumulated in the block are copied to the extent's data buffer. + * + * Finally, the journal maintains a set of counters, one for each on + * disk journal block. These counters are used as locks to prevent + * premature reaping of journal blocks. Each time a new sequence + * number is used, the counter for the corresponding block is + * incremented. The counter is subsequently decremented when that + * block is filled and then committed for the last time. This prevents + * blocks from being reaped while they are still being updated. The + * counter is also incremented once for each entry added to a block, + * and decremented once each time the block map is updated in memory + * for that request. This prevents blocks from being reaped while + * their VIOs are still active. Finally, each in-memory block map page + * tracks the oldest journal block that contains entries corresponding to + * uncommitted updates to that block map page. Each time an in-memory block + * map page is updated, it checks if the journal block for the VIO + * is earlier than the one it references, in which case it increments + * the count on the earlier journal block and decrements the count on the + * later journal block, maintaining a lock on the oldest journal block + * containing entries for that page. When a block map page has been flushed + * from the cache, the counter for the journal block it references is + * decremented. Whenever the counter for the head block goes to 0, the + * head is advanced until it comes to a block whose counter is not 0 + * or until it reaches the active block. This is the mechanism for + * reclaiming journal space on disk. + * + * If there is no in-memory space when a VIO attempts to add an entry, + * the VIO will be attached to the 'commitCompletion' and will be + * woken the next time a full block has committed. If there is no + * on-disk space when a VIO attempts to add an entry, the VIO will be + * attached to the 'reapCompletion', and will be woken the next time a + * journal block is reaped. + **/ + +/** + * Return whether a given JournalOperation is an increment type. + * + * @param operation The operation in question + * + * @return true if the type is an increment type + **/ +static inline bool isIncrementOperation(JournalOperation operation) +{ + return ((operation == DATA_INCREMENT) || (operation == BLOCK_MAP_INCREMENT)); +} + +/** + * Get the name of a journal operation. + * + * @param operation The operation to name + * + * @return The name of the operation + **/ +const char *getJournalOperationName(JournalOperation operation) + __attribute__((warn_unused_result)); + +/** + * Create a recovery journal. 
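+ *
+ * Note: on-disk journal block numbers are derived by masking the low-order
+ * bits of a sequence number (see getRecoveryJournalBlockNumber() in
+ * recoveryJournalInternals.h), which assumes the on-disk journal size is a
+ * power of two.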
+ * + * @param [in] nonce the nonce of the VDO + * @param [in] layer the physical layer for the journal + * @param [in] partition the partition for the journal + * @param [in] recoveryCount The VDO's number of completed recoveries + * @param [in] journalSize the number of blocks in the journal on disk + * @param [in] tailBufferSize the number of blocks for tail buffer + * @param [in] readOnlyNotifier the read-only mode notifier + * @param [in] threadConfig the thread configuration of the VDO + * @param [out] journalPtr the pointer to hold the new recovery journal + * + * @return a success or error code + **/ +int makeRecoveryJournal(Nonce nonce, + PhysicalLayer *layer, + Partition *partition, + uint64_t recoveryCount, + BlockCount journalSize, + BlockCount tailBufferSize, + ReadOnlyNotifier *readOnlyNotifier, + const ThreadConfig *threadConfig, + RecoveryJournal **journalPtr) + __attribute__((warn_unused_result)); + +/** + * Free a recovery journal and null out the reference to it. + * + * @param [in,out] journalPtr The reference to the recovery journal to free + **/ +void freeRecoveryJournal(RecoveryJournal **journalPtr); + +/** + * Move the backing partition pointer of the recovery journal. + * Assumes that the data in the old and the new partitions is identical. + * + * @param journal the journal being moved + * @param partition the new journal partition + **/ +void setRecoveryJournalPartition(RecoveryJournal *journal, + Partition *partition); + +/** + * Initialize the journal after a recovery. + * + * @param journal The journal in question + * @param recoveryCount The number of completed recoveries + * @param tail The new tail block sequence number + **/ +void initializeRecoveryJournalPostRecovery(RecoveryJournal *journal, + uint64_t recoveryCount, + SequenceNumber tail); + +/** + * Initialize the journal after a rebuild. + * + * @param journal The journal in question + * @param recoveryCount The number of completed recoveries + * @param tail The new tail block sequence number + * @param logicalBlocksUsed The new number of logical blocks used + * @param blockMapDataBlocks The new number of block map data blocks + **/ +void initializeRecoveryJournalPostRebuild(RecoveryJournal *journal, + uint64_t recoveryCount, + SequenceNumber tail, + BlockCount logicalBlocksUsed, + BlockCount blockMapDataBlocks); + +/** + * Get the number of block map pages, allocated from data blocks, currently + * in use. + * + * @param journal The journal in question + * + * @return The number of block map pages allocated from slabs + **/ +BlockCount getJournalBlockMapDataBlocksUsed(RecoveryJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Set the number of block map pages, allocated from data blocks, currently + * in use. + * + * @param journal The journal in question + * @param pages The number of block map pages allocated from slabs + **/ +void setJournalBlockMapDataBlocksUsed(RecoveryJournal *journal, + BlockCount pages); + +/** + * Get the ID of a recovery journal's thread. + * + * @param journal The journal to query + * + * @return The ID of the journal's thread. + **/ +ThreadID getRecoveryJournalThreadID(RecoveryJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Prepare the journal for new entries. 
+ * + * @param journal The journal in question + * @param depot The slab depot for this VDO + * @param blockMap The block map for this VDO + **/ +void openRecoveryJournal(RecoveryJournal *journal, + SlabDepot *depot, + BlockMap *blockMap); + +/** + * Obtain the recovery journal's current sequence number. Exposed only so + * the block map can be initialized therefrom. + * + * @param journal The journal in question + * + * @return the sequence number of the tail block + **/ +SequenceNumber getCurrentJournalSequenceNumber(RecoveryJournal *journal); + +/** + * Get the number of usable recovery journal blocks. + * + * @param journalSize The size of the recovery journal in blocks + * + * @return the number of recovery journal blocks usable for entries + **/ +BlockCount getRecoveryJournalLength(BlockCount journalSize) + __attribute__((warn_unused_result)); + +/** + * Get the size of the encoded state of a recovery journal. + * + * @return the encoded size of the journal's state + **/ +size_t getRecoveryJournalEncodedSize(void) + __attribute__((warn_unused_result)); + +/** + * Encode the state of a recovery journal. + * + * @param journal the recovery journal + * @param buffer the buffer to encode into + * + * @return VDO_SUCCESS or an error code + **/ +int encodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Decode the state of a recovery journal saved in a buffer. + * + * @param journal the recovery journal + * @param buffer the buffer containing the saved state + * + * @return VDO_SUCCESS or an error code + **/ +int decodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Decode the state of a Sodium recovery journal saved in a buffer. + * + * @param journal the recovery journal + * @param buffer the buffer containing the saved state + * + * @return VDO_SUCCESS or an error code + **/ +int decodeSodiumRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Add an entry to a recovery journal. This method is asynchronous. The DataVIO + * will not be called back until the entry is committed to the on-disk journal. + * + * @param journal The journal in which to make an entry + * @param dataVIO The DataVIO for which to add the entry. The entry will be + * taken from the logical and newMapped fields of the + * DataVIO. The DataVIO's recoverySequenceNumber field will + * be set to the sequence number of the journal block in + * which the entry was made. + **/ +void addRecoveryJournalEntry(RecoveryJournal *journal, DataVIO *dataVIO); + +/** + * Acquire a reference to a recovery journal block from somewhere other than + * the journal itself. + * + * @param journal The recovery journal + * @param sequenceNumber The journal sequence number of the referenced block + * @param zoneType The type of the zone making the adjustment + * @param zoneID The ID of the zone making the adjustment + **/ +void acquireRecoveryJournalBlockReference(RecoveryJournal *journal, + SequenceNumber sequenceNumber, + ZoneType zoneType, + ZoneCount zoneID); + + +/** + * Release a reference to a recovery journal block from somewhere other than + * the journal itself. If this is the last reference for a given zone type, + * an attempt will be made to reap the journal. 
+ *
+ * @param journal         The recovery journal
+ * @param sequenceNumber  The journal sequence number of the referenced block
+ * @param zoneType        The type of the zone making the adjustment
+ * @param zoneID          The ID of the zone making the adjustment
+ **/
+void releaseRecoveryJournalBlockReference(RecoveryJournal *journal,
+                                          SequenceNumber   sequenceNumber,
+                                          ZoneType         zoneType,
+                                          ZoneCount        zoneID);
+
+/**
+ * Release a single per-entry reference count for a recovery journal block.
+ * This method may be called from any zone (but shouldn't be called from the
+ * journal zone as it would be inefficient).
+ *
+ * @param journal         The recovery journal
+ * @param sequenceNumber  The journal sequence number of the referenced block
+ **/
+void releasePerEntryLockFromOtherZone(RecoveryJournal *journal,
+                                      SequenceNumber   sequenceNumber);
+
+/**
+ * Drain recovery journal I/O. All uncommitted entries will be written out.
+ *
+ * @param journal    The journal to drain
+ * @param operation  The drain operation (suspend or save)
+ * @param parent     The completion to finish once the journal is drained
+ **/
+void drainRecoveryJournal(RecoveryJournal *journal,
+                          AdminStateCode   operation,
+                          VDOCompletion   *parent);
+
+/**
+ * Resume a recovery journal which has been drained.
+ *
+ * @param journal  The journal to resume
+ * @param parent   The completion to finish once the journal is resumed
+ *                 (any error is reported through this completion)
+ **/
+void resumeRecoveryJournal(RecoveryJournal *journal, VDOCompletion *parent);
+
+/**
+ * Get the number of logical blocks in use by the VDO.
+ *
+ * @param journal  the journal
+ *
+ * @return the number of logical blocks in use by the VDO
+ **/
+BlockCount getJournalLogicalBlocksUsed(const RecoveryJournal *journal)
+  __attribute__((warn_unused_result));
+
+/**
+ * Get the current statistics from the recovery journal.
+ *
+ * @param journal  The recovery journal to query
+ *
+ * @return a copy of the current statistics for the journal
+ **/
+RecoveryJournalStatistics
+getRecoveryJournalStatistics(const RecoveryJournal *journal)
+  __attribute__((warn_unused_result));
+
+/**
+ * Dump some current statistics and other debug info from the recovery
+ * journal.
+ *
+ * @param journal  The recovery journal to dump
+ **/
+void dumpRecoveryJournalStatistics(const RecoveryJournal *journal);
+
+#endif // RECOVERY_JOURNAL_H
diff --git a/vdo/base/recoveryJournalBlock.c b/vdo/base/recoveryJournalBlock.c
new file mode 100644
index 0000000..1bbacfc
--- /dev/null
+++ b/vdo/base/recoveryJournalBlock.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournalBlock.c#13 $ + */ + +#include "recoveryJournalBlock.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "dataVIO.h" +#include "fixedLayout.h" +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournalEntry.h" +#include "recoveryJournalInternals.h" +#include "ringNode.h" +#include "vio.h" +#include "waitQueue.h" + +/**********************************************************************/ +int makeRecoveryBlock(PhysicalLayer *layer, + RecoveryJournal *journal, + RecoveryJournalBlock **blockPtr) +{ + // Ensure that a block is large enough to store + // RECOVERY_JOURNAL_ENTRIES_PER_BLOCK entries. + STATIC_ASSERT(RECOVERY_JOURNAL_ENTRIES_PER_BLOCK + <= ((VDO_BLOCK_SIZE - sizeof(PackedJournalHeader)) + / sizeof(PackedRecoveryJournalEntry))); + + RecoveryJournalBlock *block; + int result = ALLOCATE(1, RecoveryJournalBlock, __func__, &block); + if (result != VDO_SUCCESS) { + return result; + } + + // Allocate a full block for the journal block even though not all of the + // space is used since the VIO needs to write a full disk block. + result = ALLOCATE(VDO_BLOCK_SIZE, char, "PackedJournalBlock", &block->block); + if (result != VDO_SUCCESS) { + freeRecoveryBlock(&block); + return result; + } + + result = createVIO(layer, VIO_TYPE_RECOVERY_JOURNAL, VIO_PRIORITY_HIGH, + block, block->block, &block->vio); + if (result != VDO_SUCCESS) { + freeRecoveryBlock(&block); + return result; + } + + block->vio->completion.callbackThreadID = journal->threadID; + initializeRing(&block->ringNode); + block->journal = journal; + + *blockPtr = block; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeRecoveryBlock(RecoveryJournalBlock **blockPtr) +{ + RecoveryJournalBlock *block = *blockPtr; + if (block == NULL) { + return; + } + + FREE(block->block); + freeVIO(&block->vio); + FREE(block); + *blockPtr = NULL; +} + +/** + * Get a pointer to the packed journal block header in the block buffer. + * + * @param block The recovery block + * + * @return The block's header + **/ +static inline +PackedJournalHeader *getBlockHeader(const RecoveryJournalBlock *block) +{ + return (PackedJournalHeader *) block->block; +} + +/** + * Set the current sector of the current block and initialize it. 
+ * + * @param block The block to update + * @param sector A pointer to the first byte of the new sector + **/ +static void setActiveSector(RecoveryJournalBlock *block, void *sector) +{ + block->sector = (PackedJournalSector *) sector; + block->sector->checkByte = getBlockHeader(block)->fields.checkByte; + block->sector->recoveryCount = block->journal->recoveryCount; + block->sector->entryCount = 0; +} + +/**********************************************************************/ +void initializeRecoveryBlock(RecoveryJournalBlock *block) +{ + memset(block->block, 0x0, VDO_BLOCK_SIZE); + + RecoveryJournal *journal = block->journal; + block->sequenceNumber = journal->tail; + block->entryCount = 0; + block->uncommittedEntryCount = 0; + + block->blockNumber = getRecoveryJournalBlockNumber(journal, journal->tail); + + RecoveryBlockHeader unpacked = { + .metadataType = VDO_METADATA_RECOVERY_JOURNAL, + .blockMapDataBlocks = journal->blockMapDataBlocks, + .logicalBlocksUsed = journal->logicalBlocksUsed, + .nonce = journal->nonce, + .recoveryCount = journal->recoveryCount, + .sequenceNumber = journal->tail, + .checkByte = computeRecoveryCheckByte(journal, journal->tail), + }; + PackedJournalHeader *header = getBlockHeader(block); + packRecoveryBlockHeader(&unpacked, header); + + setActiveSector(block, getJournalBlockSector(header, 1)); +} + +/**********************************************************************/ +int enqueueRecoveryBlockEntry(RecoveryJournalBlock *block, DataVIO *dataVIO) +{ + // First queued entry indicates this is a journal block we've just opened + // or a committing block we're extending and will have to write again. + bool newBatch = !hasWaiters(&block->entryWaiters); + + // Enqueue the DataVIO to wait for its entry to commit. + int result = enqueueDataVIO(&block->entryWaiters, dataVIO, + THIS_LOCATION("$F($j-$js)")); + if (result != VDO_SUCCESS) { + return result; + } + + block->entryCount++; + block->uncommittedEntryCount++; + + // Update stats to reflect the journal entry we're going to write. + if (newBatch) { + block->journal->events.blocks.started++; + } + block->journal->events.entries.started++; + + return VDO_SUCCESS; +} + +/** + * Check whether the current sector of a block is full. + * + * @param block The block to check + * + * @return true if the sector is full + **/ +__attribute__((warn_unused_result)) +static bool isSectorFull(const RecoveryJournalBlock *block) +{ + return (block->sector->entryCount == RECOVERY_JOURNAL_ENTRIES_PER_SECTOR); +} + +/** + * Actually add entries from the queue to the given block. + * + * @param block The journal block + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int addQueuedRecoveryEntries(RecoveryJournalBlock *block) +{ + while (hasWaiters(&block->entryWaiters)) { + DataVIO *dataVIO + = waiterAsDataVIO(dequeueNextWaiter(&block->entryWaiters)); + if (dataVIO->operation.type == DATA_INCREMENT) { + // In order to not lose committed sectors of this partial write, we must + // flush before the partial write entries are committed. + block->hasPartialWriteEntry = (block->hasPartialWriteEntry + || dataVIO->isPartialWrite); + /* + * In order to not lose acknowledged writes with the FUA flag set, we + * must issue a flush to cover the data write and also all previous + * journal writes, and we must issue a FUA on the journal write. + */ + block->hasFUAEntry = (block->hasFUAEntry + || vioRequiresFlushAfter(dataVIOAsVIO(dataVIO))); + } + + // Compose and encode the entry. 
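+    // Each packed entry records the block map slot being updated (taken
+    // from the DataVIO's tree lock at its current height), the data
+    // location (PBN and mapping state), and the increment or decrement
+    // operation, in the on-disk PackedRecoveryJournalEntry format.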
+ PackedRecoveryJournalEntry *packedEntry + = &block->sector->entries[block->sector->entryCount++]; + TreeLock *lock = &dataVIO->treeLock; + RecoveryJournalEntry newEntry = { + .mapping = { + .pbn = dataVIO->operation.pbn, + .state = dataVIO->operation.state, + }, + .operation = dataVIO->operation.type, + .slot = lock->treeSlots[lock->height].blockMapSlot, + }; + *packedEntry = packRecoveryJournalEntry(&newEntry); + + if (isIncrementOperation(dataVIO->operation.type)) { + dataVIO->recoverySequenceNumber = block->sequenceNumber; + } + + // Enqueue the DataVIO to wait for its entry to commit. + int result = enqueueDataVIO(&block->commitWaiters, dataVIO, + THIS_LOCATION("$F($j-$js)")); + if (result != VDO_SUCCESS) { + continueDataVIO(dataVIO, result); + return result; + } + + if (isSectorFull(block)) { + setActiveSector(block, (char *) block->sector + VDO_SECTOR_SIZE); + } + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int getRecoveryBlockPBN(RecoveryJournalBlock *block, + PhysicalBlockNumber *pbnPtr) +{ + RecoveryJournal *journal = block->journal; + int result = translateToPBN(journal->partition, block->blockNumber, pbnPtr); + if (result != VDO_SUCCESS) { + logErrorWithStringError(result, + "Error translating recovery journal block " + "number %llu", block->blockNumber); + } + return result; +} + +/**********************************************************************/ +bool canCommitRecoveryBlock(RecoveryJournalBlock *block) +{ + // Cannot commit in read-only mode, if already committing the block, or + // if there are no entries to commit. + return ((block != NULL) + && !block->committing + && hasWaiters(&block->entryWaiters) + && !isReadOnly(block->journal->readOnlyNotifier)); +} + +/**********************************************************************/ +int commitRecoveryBlock(RecoveryJournalBlock *block, + VDOAction *callback, + VDOAction *errorHandler) +{ + int result = ASSERT(canCommitRecoveryBlock(block), "should never call %s" + " when the block can't be committed", __func__); + if (result != VDO_SUCCESS) { + return result; + } + + PhysicalBlockNumber blockPBN; + result = getRecoveryBlockPBN(block, &blockPBN); + if (result != VDO_SUCCESS) { + return result; + } + + block->entriesInCommit = countWaiters(&block->entryWaiters); + result = addQueuedRecoveryEntries(block); + if (result != VDO_SUCCESS) { + return result; + } + + RecoveryJournal *journal = block->journal; + PackedJournalHeader *header = getBlockHeader(block); + + // Update stats to reflect the block and entries we're about to write. + journal->pendingWriteCount += 1; + journal->events.blocks.written += 1; + journal->events.entries.written += block->entriesInCommit; + + storeUInt64LE(header->fields.blockMapHead, journal->blockMapHead); + storeUInt64LE(header->fields.slabJournalHead, journal->slabJournalHead); + storeUInt16LE(header->fields.entryCount, block->entryCount); + + block->committing = true; + + /* + * In sync or async mode, when we are writing an increment entry for a + * request with FUA, or when making the increment entry for a partial + * write, we need to make sure all the data being mapped to by this block + * is stable on disk and also that the recovery journal is stable up to + * the current block, so we must flush before writing. + * + * In sync mode, and for FUA, we also need to make sure that the write we + * are doing is stable, so we issue the write with FUA. 
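+   *
+   * Summarizing the checks below: the write is preceded by a flush unless
+   * the policy is async-unsafe and the block has neither FUA nor
+   * partial-write entries; the write itself is FUA only in sync mode or
+   * when the block has a FUA entry.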
+ */ + PhysicalLayer *layer = vioAsCompletion(block->vio)->layer; + bool fua = (block->hasFUAEntry + || (layer->getWritePolicy(layer) == WRITE_POLICY_SYNC)); + bool flush = (block->hasFUAEntry + || (layer->getWritePolicy(layer) != WRITE_POLICY_ASYNC_UNSAFE) + || block->hasPartialWriteEntry); + block->hasFUAEntry = false; + block->hasPartialWriteEntry = false; + launchWriteMetadataVIOWithFlush(block->vio, blockPBN, callback, errorHandler, + flush, fua); + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void dumpRecoveryBlock(const RecoveryJournalBlock *block) +{ + logInfo(" sequence number %llu; entries %" PRIu16 + "; %s; %zu entry waiters; %zu commit waiters", + block->sequenceNumber, + block->entryCount, + (block->committing ? "committing" : "waiting"), + countWaiters(&block->entryWaiters), + countWaiters(&block->commitWaiters)); +} diff --git a/vdo/base/recoveryJournalBlock.h b/vdo/base/recoveryJournalBlock.h new file mode 100644 index 0000000..f26f8e8 --- /dev/null +++ b/vdo/base/recoveryJournalBlock.h @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournalBlock.h#8 $ + */ + +#ifndef RECOVERY_JOURNAL_BLOCK_H +#define RECOVERY_JOURNAL_BLOCK_H + +#include "permassert.h" + +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournalInternals.h" +#include "ringNode.h" +#include "types.h" +#include "waitQueue.h" + +struct recoveryJournalBlock { + /** The doubly linked pointers for the free or active lists */ + RingNode ringNode; + /** The waiter for the pending full block list */ + Waiter writeWaiter; + /** The journal to which this block belongs */ + RecoveryJournal *journal; + /** A pointer to a block-sized buffer holding the packed block data */ + char *block; + /** A pointer to the current sector in the packed block buffer */ + PackedJournalSector *sector; + /** The VIO for writing this block */ + VIO *vio; + /** The sequence number for this block */ + SequenceNumber sequenceNumber; + /** The location of this block in the on-disk journal */ + PhysicalBlockNumber blockNumber; + /** Whether this block is being committed */ + bool committing; + /** Whether this block has an uncommitted increment for a partial write */ + bool hasPartialWriteEntry; + /** Whether this block has an uncommitted increment for a write with FUA */ + bool hasFUAEntry; + /** The total number of entries in this block */ + JournalEntryCount entryCount; + /** The total number of uncommitted entries (queued or committing) */ + JournalEntryCount uncommittedEntryCount; + /** The number of new entries in the current commit */ + JournalEntryCount entriesInCommit; + /** The queue of VIOs which will make entries for the next commit */ + WaitQueue entryWaiters; + /** The queue of VIOs waiting for the current commit */ + WaitQueue commitWaiters; +}; + +/** + * Return the block associated with a ring node. + * + * @param node The ring node to recast as a block + * + * @return The block + **/ +static inline RecoveryJournalBlock *blockFromRingNode(RingNode *node) +{ + STATIC_ASSERT(offsetof(RecoveryJournalBlock, ringNode) == 0); + return (RecoveryJournalBlock *) node; +} + +/** + * Return the block associated with a waiter + * + * @param waiter The waiter to recast as a block + * + * @return The block + **/ +static inline RecoveryJournalBlock *blockFromWaiter(Waiter *waiter) +{ + return (RecoveryJournalBlock *) + ((uintptr_t) waiter - offsetof(RecoveryJournalBlock, writeWaiter)); +} + +/** + * Check whether a recovery block is dirty, indicating it has any uncommitted + * entries, which includes both entries not written and entries written but + * not yet acknowledged. + * + * @param block The block to check + * + * @return true if the block has any uncommitted entries + **/ +__attribute__((warn_unused_result)) +static inline bool isRecoveryBlockDirty(const RecoveryJournalBlock *block) +{ + return (block->uncommittedEntryCount > 0); +} + +/** + * Check whether a journal block is empty. + * + * @param block The block to check + * + * @return true if the block has no entries + **/ +__attribute__((warn_unused_result)) +static inline bool isRecoveryBlockEmpty(const RecoveryJournalBlock *block) +{ + return (block->entryCount == 0); +} + +/** + * Check whether a journal block is full. + * + * @param block The block to check + * + * @return true if the the block is full + **/ +__attribute__((warn_unused_result)) +static inline bool isRecoveryBlockFull(const RecoveryJournalBlock *block) +{ + return ((block == NULL) + || (block->journal->entriesPerBlock == block->entryCount)); +} + +/** + * Construct a journal block. 
+ * + * @param [in] layer The layer from which to construct VIOs + * @param [in] journal The journal to which the block will belong + * @param [out] blockPtr A pointer to receive the new block + * + * @return VDO_SUCCESS or an error + **/ +int makeRecoveryBlock(PhysicalLayer *layer, + RecoveryJournal *journal, + RecoveryJournalBlock **blockPtr) + __attribute__((warn_unused_result)); + +/** + * Free a tail block and null out the reference to it. + * + * @param blockPtr The reference to the tail block to free + **/ +void freeRecoveryBlock(RecoveryJournalBlock **blockPtr); + +/** + * Initialize the next active recovery journal block. + * + * @param block The journal block to initialize + **/ +void initializeRecoveryBlock(RecoveryJournalBlock *block); + +/** + * Enqueue a DataVIO to asynchronously encode and commit its next recovery + * journal entry in this block. The DataVIO will not be continued until the + * entry is committed to the on-disk journal. The caller is responsible for + * ensuring the block is not already full. + * + * @param block The journal block in which to make an entry + * @param dataVIO The DataVIO to enqueue + * + * @return VDO_SUCCESS or an error code if the DataVIO could not be enqueued + **/ +int enqueueRecoveryBlockEntry(RecoveryJournalBlock *block, DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Attempt to commit a block. If the block is not the oldest block with + * uncommitted entries or if it is already being committed, nothing will be + * done. + * + * @param block The block to write + * @param callback The function to call when the write completes + * @param errorHandler The handler for flush or write errors + * + * @return VDO_SUCCESS, or an error if the write could not be launched + **/ +int commitRecoveryBlock(RecoveryJournalBlock *block, + VDOAction *callback, + VDOAction *errorHandler) + __attribute__((warn_unused_result)); + +/** + * Dump the contents of the recovery block to the log. + * + * @param block The block to dump + **/ +void dumpRecoveryBlock(const RecoveryJournalBlock *block); + +/** + * Check whether a journal block can be committed. + * + * @param block The journal block in question + * + * @return true if the block can be committed now + **/ +bool canCommitRecoveryBlock(RecoveryJournalBlock *block) + __attribute__((warn_unused_result)); + +#endif // RECOVERY_JOURNAL_BLOCK_H diff --git a/vdo/base/recoveryJournalEntry.h b/vdo/base/recoveryJournalEntry.h new file mode 100644 index 0000000..bf2a3e0 --- /dev/null +++ b/vdo/base/recoveryJournalEntry.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournalEntry.h#1 $ + */ + +#ifndef RECOVERY_JOURNAL_ENTRY_H +#define RECOVERY_JOURNAL_ENTRY_H + +#include "numeric.h" + +#include "blockMapEntry.h" +#include "journalPoint.h" +#include "types.h" + +/** + * A recovery journal entry stores two physical locations: a data location + * that is the value of a single mapping in the block map tree, and the + * location of the block map page and and slot that is either acquiring or + * releasing a reference to the data location. The journal entry also stores + * an operation code that says whether the reference is being acquired (an + * increment) or released (a decrement), and whether the mapping is for a + * logical block or for the block map tree itself. + **/ +typedef struct { + BlockMapSlot slot; + DataLocation mapping; + JournalOperation operation; +} RecoveryJournalEntry; + +/** The packed, on-disk representation of a recovery journal entry. */ +typedef union __attribute__((packed)) { + struct __attribute__((packed)) { + /** + * In little-endian bit order: + * Bits 15..12: The four highest bits of the 36-bit physical block number + * of the block map tree page + * Bits 11..2: The 10-bit block map page slot number + * Bits 1..0: The 2-bit JournalOperation of the entry + **/ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned operation : 2; + unsigned slotLow : 6; + unsigned slotHigh : 4; + unsigned pbnHighNibble : 4; +#else + unsigned slotLow : 6; + unsigned operation : 2; + unsigned pbnHighNibble : 4; + unsigned slotHigh : 4; +#endif + + /** + * Bits 47..16: The 32 low-order bits of the block map page PBN, + * in little-endian byte order + **/ + byte pbnLowWord[4]; + + /** + * Bits 87..48: The five-byte block map entry encoding the location that + * was or will be stored in the block map page slot + **/ + BlockMapEntry blockMapEntry; + } fields; + + // A raw view of the packed encoding. + uint8_t raw[11]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining packed entries in GDB. + struct __attribute__((packed)) { + unsigned operation : 2; + unsigned slot : 10; + unsigned pbnHighNibble : 4; + uint32_t pbnLowWord; + BlockMapEntry blockMapEntry; + } littleEndian; +#endif +} PackedRecoveryJournalEntry; + +/** + * Return the packed, on-disk representation of a recovery journal entry. + * + * @param entry The journal entry to pack + * + * @return The packed representation of the journal entry + **/ +static inline PackedRecoveryJournalEntry +packRecoveryJournalEntry(const RecoveryJournalEntry *entry) +{ + PackedRecoveryJournalEntry packed = { + .fields = { + .operation = entry->operation, + .slotLow = entry->slot.slot & 0x3F, + .slotHigh = (entry->slot.slot >> 6) & 0x0F, + .pbnHighNibble = (entry->slot.pbn >> 32) & 0x0F, + .blockMapEntry = packPBN(entry->mapping.pbn, entry->mapping.state), + } + }; + storeUInt32LE(packed.fields.pbnLowWord, entry->slot.pbn & UINT_MAX); + return packed; +} + +/** + * Unpack the on-disk representation of a recovery journal entry. 
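+ *
+ * This is the inverse of packRecoveryJournalEntry() above.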
+ * + * @param entry The recovery journal entry to unpack + * + * @return The unpacked entry + **/ +static inline RecoveryJournalEntry +unpackRecoveryJournalEntry(const PackedRecoveryJournalEntry *entry) +{ + PhysicalBlockNumber low32 = getUInt32LE(entry->fields.pbnLowWord); + PhysicalBlockNumber high4 = entry->fields.pbnHighNibble; + return (RecoveryJournalEntry) { + .operation = entry->fields.operation, + .slot = { + .pbn = ((high4 << 32) | low32), + .slot = (entry->fields.slotLow | (entry->fields.slotHigh << 6)), + }, + .mapping = unpackBlockMapEntry(&entry->fields.blockMapEntry), + }; +} + +#endif // RECOVERY_JOURNAL_ENTRY_H diff --git a/vdo/base/recoveryJournalInternals.h b/vdo/base/recoveryJournalInternals.h new file mode 100644 index 0000000..0266990 --- /dev/null +++ b/vdo/base/recoveryJournalInternals.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournalInternals.h#10 $ + */ + +#ifndef RECOVERY_JOURNAL_INTERNALS_H +#define RECOVERY_JOURNAL_INTERNALS_H + +#include "numeric.h" + +#include "adminState.h" +#include "fixedLayout.h" +#include "journalPoint.h" +#include "lockCounter.h" +#include "recoveryJournal.h" +#include "ringNode.h" +#include "statistics.h" +#include "types.h" +#include "waitQueue.h" + +typedef struct recoveryJournalBlock RecoveryJournalBlock; + +struct recoveryJournal { + /** The thread ID of the journal zone */ + ThreadID threadID; + /** The slab depot which can hold locks on this journal */ + SlabDepot *depot; + /** The block map which can hold locks on this journal */ + BlockMap *blockMap; + /** The queue of VIOs waiting to make increment entries */ + WaitQueue incrementWaiters; + /** The queue of VIOs waiting to make decrement entries */ + WaitQueue decrementWaiters; + /** The number of free entries in the journal */ + uint64_t availableSpace; + /** The number of decrement entries which need to be made */ + VIOCount pendingDecrementCount; + /** + * Whether the journal is adding entries from the increment or + * decrement waiters queues + **/ + bool addingEntries; + /** The notifier for read-only mode */ + ReadOnlyNotifier *readOnlyNotifier; + /** The administrative state of the journal */ + AdminState state; + /** Whether a reap is in progress */ + bool reaping; + /** The partition which holds the journal on disk */ + Partition *partition; + /** The oldest active block in the journal on disk for block map rebuild */ + SequenceNumber blockMapHead; + /** The oldest active block in the journal on disk for slab journal replay */ + SequenceNumber slabJournalHead; + /** The newest block in the journal on disk to which a write has finished */ + SequenceNumber lastWriteAcknowledged; + /** The end of the half-open interval of the active journal */ + SequenceNumber 
tail; + /** The point at which the last entry will have been added */ + JournalPoint appendPoint; + /** The journal point of the VIO most recently released from the journal */ + JournalPoint commitPoint; + /** The nonce of the VDO */ + Nonce nonce; + /** The number of recoveries completed by the VDO */ + uint8_t recoveryCount; + /** The number of entries which fit in a single block */ + JournalEntryCount entriesPerBlock; + /** Unused in-memory journal blocks */ + RingNode freeTailBlocks; + /** In-memory journal blocks with records */ + RingNode activeTailBlocks; + /** A pointer to the active block (the one we are adding entries to now) */ + RecoveryJournalBlock *activeBlock; + /** Journal blocks that need writing */ + WaitQueue pendingWrites; + /** The new block map reap head after reaping */ + SequenceNumber blockMapReapHead; + /** The head block number for the block map rebuild range */ + BlockCount blockMapHeadBlockNumber; + /** The new slab journal reap head after reaping */ + SequenceNumber slabJournalReapHead; + /** The head block number for the slab journal replay range */ + BlockCount slabJournalHeadBlockNumber; + /** The VIO on which we can call flush (less ick, but still ick) */ + VIO *flushVIO; + /** The data block which must live in the VIO in the flush extent */ + char *unusedFlushVIOData; + /** The number of blocks in the on-disk journal */ + BlockCount size; + /** The number of logical blocks that are in-use */ + BlockCount logicalBlocksUsed; + /** The number of block map pages that are allocated */ + BlockCount blockMapDataBlocks; + /** The number of journal blocks written but not yet acknowledged */ + BlockCount pendingWriteCount; + /** The threshold at which slab journal tail blocks will be written out */ + BlockCount slabJournalCommitThreshold; + /** Counters for events in the journal that are reported as statistics */ + RecoveryJournalStatistics events; + /** The locks for each on-disk block */ + LockCounter *lockCounter; +}; + +/** + * Get the physical block number for a given sequence number. + * + * @param journal The journal + * @param sequence The sequence number of the desired block + * + * @return The block number corresponding to the sequence number + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber +getRecoveryJournalBlockNumber(const RecoveryJournal *journal, + SequenceNumber sequence) +{ + // Since journal size is a power of two, the block number modulus can just + // be extracted from the low-order bits of the sequence. + return (sequence & (journal->size - 1)); +} + +/** + * Compute the checkByte for a given sequence number. + * + * @param journal The journal + * @param sequence The sequence number + * + * @return The check byte corresponding to the sequence number + **/ +__attribute__((warn_unused_result)) +static inline uint8_t computeRecoveryCheckByte(const RecoveryJournal *journal, + SequenceNumber sequence) +{ + // The check byte must change with each trip around the journal. + return (((sequence / journal->size) & 0x7F) | 0x80); +} + +#endif // RECOVERY_JOURNAL_INTERNALS_H diff --git a/vdo/base/recoveryUtils.c b/vdo/base/recoveryUtils.c new file mode 100644 index 0000000..44f16ee --- /dev/null +++ b/vdo/base/recoveryUtils.c @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryUtils.c#4 $
+ */
+
+#include "recoveryUtils.h"
+
+#include "logger.h"
+#include "memoryAlloc.h"
+
+#include "completion.h"
+#include "extent.h"
+#include "packedRecoveryJournalBlock.h"
+#include "recoveryJournalEntry.h"
+#include "recoveryJournalInternals.h"
+#include "slabDepot.h"
+#include "vdoInternal.h"
+
+/**
+ * Finish loading the journal by freeing the extent and notifying the parent.
+ * This callback is registered in loadJournalAsync().
+ *
+ * @param completion  The load extent
+ **/
+static void finishJournalLoad(VDOCompletion *completion)
+{
+  int result = completion->result;
+  VDOCompletion *parent = completion->parent;
+  VDOExtent *extent = asVDOExtent(completion);
+  freeExtent(&extent);
+  finishCompletion(parent, result);
+}
+
+/**********************************************************************/
+void loadJournalAsync(RecoveryJournal *journal,
+                      VDOCompletion   *parent,
+                      char           **journalDataPtr)
+{
+  int result = ALLOCATE(journal->size * VDO_BLOCK_SIZE, char, __func__,
+                        journalDataPtr);
+  if (result != VDO_SUCCESS) {
+    finishCompletion(parent, result);
+    return;
+  }
+
+  VDOExtent *extent;
+  result = createExtent(parent->layer, VIO_TYPE_RECOVERY_JOURNAL,
+                        VIO_PRIORITY_METADATA, journal->size,
+                        *journalDataPtr, &extent);
+  if (result != VDO_SUCCESS) {
+    finishCompletion(parent, result);
+    return;
+  }
+
+  prepareCompletion(&extent->completion, finishJournalLoad, finishJournalLoad,
+                    parent->callbackThreadID, parent);
+  readMetadataExtent(extent,
+                     getFixedLayoutPartitionOffset(journal->partition));
+}
+
+/**
+ * Determine whether the given header describes a valid block for the
+ * given journal that could appear at the given offset in the journal.
+ * + * @param journal The journal to use + * @param header The unpacked block header to check + * @param offset An offset indicating where the block was in the journal + * + * @return True if the header matches + **/ +__attribute__((warn_unused_result)) +static bool isCongruentRecoveryJournalBlock(RecoveryJournal *journal, + const RecoveryBlockHeader *header, + PhysicalBlockNumber offset) +{ + PhysicalBlockNumber expectedOffset + = getRecoveryJournalBlockNumber(journal, header->sequenceNumber); + return ((expectedOffset == offset) + && isValidRecoveryJournalBlock(journal, header)); +} + +/**********************************************************************/ +bool findHeadAndTail(RecoveryJournal *journal, + char *journalData, + SequenceNumber *tailPtr, + SequenceNumber *blockMapHeadPtr, + SequenceNumber *slabJournalHeadPtr) +{ + SequenceNumber highestTail = journal->tail; + SequenceNumber blockMapHeadMax = 0; + SequenceNumber slabJournalHeadMax = 0; + bool foundEntries = false; + for (PhysicalBlockNumber i = 0; i < journal->size; i++) { + PackedJournalHeader *packedHeader + = getJournalBlockHeader(journal, journalData, i); + RecoveryBlockHeader header; + unpackRecoveryBlockHeader(packedHeader, &header); + + if (!isCongruentRecoveryJournalBlock(journal, &header, i)) { + // This block is old, unformatted, or doesn't belong at this location. + continue; + } + + if (header.sequenceNumber >= highestTail) { + foundEntries = true; + highestTail = header.sequenceNumber; + } + if (header.blockMapHead > blockMapHeadMax) { + blockMapHeadMax = header.blockMapHead; + } + if (header.slabJournalHead > slabJournalHeadMax) { + slabJournalHeadMax = header.slabJournalHead; + } + } + + *tailPtr = highestTail; + if (!foundEntries) { + return false; + } + + *blockMapHeadPtr = blockMapHeadMax; + if (slabJournalHeadPtr != NULL) { + *slabJournalHeadPtr = slabJournalHeadMax; + } + return true; +} + +/**********************************************************************/ +int validateRecoveryJournalEntry(const VDO *vdo, + const RecoveryJournalEntry *entry) +{ + if ((entry->slot.pbn >= vdo->config.physicalBlocks) + || (entry->slot.slot >= BLOCK_MAP_ENTRIES_PER_PAGE) + || !isValidLocation(&entry->mapping) + || !isPhysicalDataBlock(vdo->depot, entry->mapping.pbn)) { + return logErrorWithStringError(VDO_CORRUPT_JOURNAL, "Invalid entry:" + " (%llu, %" PRIu16 ") to %" PRIu64 + " (%s) is not within bounds", + entry->slot.pbn, entry->slot.slot, + entry->mapping.pbn, + getJournalOperationName(entry->operation)); + } + + if ((entry->operation == BLOCK_MAP_INCREMENT) + && (isCompressed(entry->mapping.state) + || (entry->mapping.pbn == ZERO_BLOCK))) { + return logErrorWithStringError(VDO_CORRUPT_JOURNAL, "Invalid entry:" + " (%llu, %" PRIu16 ") to %" PRIu64 + " (%s) is not a valid tree mapping", + entry->slot.pbn, entry->slot.slot, + entry->mapping.pbn, + getJournalOperationName(entry->operation)); + } + + return VDO_SUCCESS; +} diff --git a/vdo/base/recoveryUtils.h b/vdo/base/recoveryUtils.h new file mode 100644 index 0000000..6778af9 --- /dev/null +++ b/vdo/base/recoveryUtils.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryUtils.h#5 $ + */ + +#ifndef RECOVERY_UTILS_H +#define RECOVERY_UTILS_H + +#include "constants.h" +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournalEntry.h" +#include "recoveryJournalInternals.h" +#include "types.h" + +/** + * Get the block header for a block at a position in the journal data. + * + * @param journal The recovery journal + * @param journalData The recovery journal data + * @param sequence The sequence number + * + * @return A pointer to a packed recovery journal block header. + **/ +__attribute__((warn_unused_result)) +static inline +PackedJournalHeader *getJournalBlockHeader(RecoveryJournal *journal, + char *journalData, + SequenceNumber sequence) +{ + off_t blockOffset = (getRecoveryJournalBlockNumber(journal, sequence) + * VDO_BLOCK_SIZE); + return (PackedJournalHeader *) &journalData[blockOffset]; +} + +/** + * Determine whether the given header describes a valid block for the + * given journal. A block is not valid if it is unformatted, or if it + * is older than the last successful recovery or reformat. + * + * @param journal The journal to use + * @param header The unpacked block header to check + * + * @return True if the header is valid + **/ +__attribute__((warn_unused_result)) +static inline +bool isValidRecoveryJournalBlock(const RecoveryJournal *journal, + const RecoveryBlockHeader *header) +{ + return ((header->metadataType == VDO_METADATA_RECOVERY_JOURNAL) + && (header->nonce == journal->nonce) + && (header->recoveryCount == journal->recoveryCount)); +} + +/** + * Determine whether the given header describes the exact block indicated. + * + * @param journal The journal to use + * @param header The unpacked block header to check + * @param sequence The expected sequence number + * + * @return True if the block matches + **/ +__attribute__((warn_unused_result)) +static inline +bool isExactRecoveryJournalBlock(const RecoveryJournal *journal, + const RecoveryBlockHeader *header, + SequenceNumber sequence) +{ + return ((header->sequenceNumber == sequence) + && isValidRecoveryJournalBlock(journal, header)); +} + +/** + * Determine whether the header of the given sector could describe a + * valid sector for the given journal block header. + * + * @param header The unpacked block header to compare against + * @param sector The packed sector to check + * + * @return True if the sector matches the block header + **/ +__attribute__((warn_unused_result)) +static inline +bool isValidRecoveryJournalSector(const RecoveryBlockHeader *header, + const PackedJournalSector *sector) +{ + return ((header->checkByte == sector->checkByte) + && (header->recoveryCount == sector->recoveryCount)); +} + +/** + * Load the journal data off the disk. 
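+ *
+ * The entire on-disk journal (journal->size blocks) is read into a single
+ * newly allocated buffer.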
+ * + * @param [in] journal The recovery journal to load + * @param [in] parent The completion to notify when the load is + * complete + * @param [out] journalDataPtr A pointer to the journal data buffer (it is the + * caller's responsibility to free this buffer) + **/ +void loadJournalAsync(RecoveryJournal *journal, + VDOCompletion *parent, + char **journalDataPtr); + +/** + * Find the tail and the head of the journal by searching for the highest + * sequence number in a block with a valid nonce, and the highest head value + * among the blocks with valid nonces. + * + * @param [in] journal The recovery journal + * @param [in] journalData The journal data read from disk + * @param [out] tailPtr A pointer to return the tail found, or if + * no higher block is found, the value + * currently in the journal + * @param [out] blockMapHeadPtr A pointer to return the block map head + * @param [out] slabJournalHeadPtr An optional pointer to return the slab + * journal head + * + * @return True if there were valid journal blocks + **/ +bool findHeadAndTail(RecoveryJournal *journal, + char *journalData, + SequenceNumber *tailPtr, + SequenceNumber *blockMapHeadPtr, + SequenceNumber *slabJournalHeadPtr); + +/** + * Validate a recovery journal entry. + * + * @param vdo The VDO + * @param entry The entry to validate + * + * @return VDO_SUCCESS or an error + **/ +int validateRecoveryJournalEntry(const VDO *vdo, + const RecoveryJournalEntry *entry) + __attribute__((warn_unused_result)); + +#endif // RECOVERY_UTILS_H diff --git a/vdo/base/refCounts.c b/vdo/base/refCounts.c new file mode 100644 index 0000000..daf04c4 --- /dev/null +++ b/vdo/base/refCounts.c @@ -0,0 +1,1451 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/refCounts.c#9 $ + */ + +#include "refCounts.h" +#include "refCountsInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "permassert.h" + +#include "adminState.h" +#include "blockAllocatorInternals.h" +#include "completion.h" +#include "extent.h" +#include "header.h" +#include "journalPoint.h" +#include "numUtils.h" +#include "pbnLock.h" +#include "readOnlyNotifier.h" +#include "referenceBlock.h" +#include "referenceOperation.h" +#include "slab.h" +#include "slabJournal.h" +#include "slabJournalInternals.h" +#include "slabSummary.h" +#include "statusCodes.h" +#include "stringUtils.h" +#include "vdo.h" +#include "vioPool.h" +#include "waitQueue.h" + +static const uint64_t BYTES_PER_WORD = sizeof(uint64_t); +static const bool NORMAL_OPERATION = true; + +/** + * Return the RefCounts from the RefCounts waiter. 
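+ *
+ * The waiter, when not NULL, is expected to be the slabSummaryWaiter field
+ * embedded in a RefCounts; the containing structure is recovered by
+ * subtracting that field's offset.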
+ *
+ * @param waiter  The waiter to convert
+ *
+ * @return The RefCounts
+ **/
+__attribute__((warn_unused_result))
+static inline RefCounts *refCountsFromWaiter(Waiter *waiter)
+{
+  if (waiter == NULL) {
+    return NULL;
+  }
+  return (RefCounts *)
+    ((uintptr_t) waiter - offsetof(RefCounts, slabSummaryWaiter));
+}
+
+/**
+ * Convert the index of a reference counter back to the block number of the
+ * physical block for which it is counting references. The index is assumed to
+ * be valid and in-range.
+ *
+ * @param refCounts  The reference counts object
+ * @param index      The array index of the reference counter
+ *
+ * @return the physical block number corresponding to the index
+ **/
+static PhysicalBlockNumber indexToPBN(const RefCounts *refCounts,
+                                      uint64_t index)
+{
+  return (refCounts->slab->start + index);
+}
+
+/**
+ * Convert a block number to the index of a reference counter for that block.
+ * Out of range values are pinned to the beginning or one past the end of the
+ * array.
+ *
+ * @param refCounts  The reference counts object
+ * @param pbn        The physical block number
+ *
+ * @return the index corresponding to the physical block number
+ **/
+static uint64_t pbnToIndex(const RefCounts *refCounts, PhysicalBlockNumber pbn)
+{
+  if (pbn < refCounts->slab->start) {
+    return 0;
+  }
+  uint64_t index = (pbn - refCounts->slab->start);
+  return minBlock(index, refCounts->blockCount);
+}
+
+/**********************************************************************/
+ReferenceStatus referenceCountToStatus(ReferenceCount count)
+{
+  if (count == EMPTY_REFERENCE_COUNT) {
+    return RS_FREE;
+  } else if (count == 1) {
+    return RS_SINGLE;
+  } else if (count == PROVISIONAL_REFERENCE_COUNT) {
+    return RS_PROVISIONAL;
+  } else {
+    return RS_SHARED;
+  }
+}
+
+/**
+ * Reset the free block search back to the first reference counter
+ * in the first reference block.
+ *
+ * @param refCounts  The RefCounts object containing the search cursor
+ **/
+static void resetSearchCursor(RefCounts *refCounts)
+{
+  SearchCursor *cursor = &refCounts->searchCursor;
+
+  cursor->block = cursor->firstBlock;
+  cursor->index = 0;
+  // Unit tests have slabs with only one reference block (and it's a runt).
+  cursor->endIndex = minBlock(COUNTS_PER_BLOCK, refCounts->blockCount);
+}
+
+/**
+ * Advance the search cursor to the start of the next reference block,
+ * wrapping around to the first reference block if the current block is the
+ * last reference block.
+ *
+ * @param refCounts  The RefCounts object containing the search cursor
+ *
+ * @return true unless the cursor was at the last reference block
+ **/
+static bool advanceSearchCursor(RefCounts *refCounts)
+{
+  SearchCursor *cursor = &refCounts->searchCursor;
+
+  // If we just finished searching the last reference block, then wrap back
+  // around to the start of the array.
+  if (cursor->block == cursor->lastBlock) {
+    resetSearchCursor(refCounts);
+    return false;
+  }
+
+  // We're not already at the end, so advance the cursor to the next block.
+  cursor->block++;
+  cursor->index = cursor->endIndex;
+
+  if (cursor->block == cursor->lastBlock) {
+    // The last reference block will usually be a runt.
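+    // A runt covers fewer than COUNTS_PER_BLOCK counters, so clamp endIndex
+    // to the slab's total number of counters instead of advancing by a full
+    // block's worth.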
+ cursor->endIndex = refCounts->blockCount; + } else { + cursor->endIndex += COUNTS_PER_BLOCK; + } + return true; +} + +/**********************************************************************/ +int makeRefCounts(BlockCount blockCount, + Slab *slab, + PhysicalBlockNumber origin, + ReadOnlyNotifier *readOnlyNotifier, + RefCounts **refCountsPtr) +{ + BlockCount refBlockCount = getSavedReferenceCountSize(blockCount); + RefCounts *refCounts; + int result = ALLOCATE_EXTENDED(RefCounts, refBlockCount, ReferenceBlock, + "ref counts structure", &refCounts); + if (result != UDS_SUCCESS) { + return result; + } + + // Allocate such that the runt slab has a full-length memory array, + // plus a little padding so we can word-search even at the very end. + size_t bytes = ((refBlockCount * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD)); + result = ALLOCATE(bytes, ReferenceCount, "ref counts array", + &refCounts->counters); + if (result != UDS_SUCCESS) { + freeRefCounts(&refCounts); + return result; + } + + refCounts->slab = slab; + refCounts->blockCount = blockCount; + refCounts->freeBlocks = blockCount; + refCounts->origin = origin; + refCounts->referenceBlockCount = refBlockCount; + refCounts->readOnlyNotifier = readOnlyNotifier; + refCounts->statistics = &slab->allocator->refCountStatistics; + refCounts->searchCursor.firstBlock = &refCounts->blocks[0]; + refCounts->searchCursor.lastBlock = &refCounts->blocks[refBlockCount - 1]; + resetSearchCursor(refCounts); + + for (size_t index = 0; index < refBlockCount; index++) { + refCounts->blocks[index] = (ReferenceBlock) { + .refCounts = refCounts, + }; + } + + *refCountsPtr = refCounts; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeRefCounts(RefCounts **refCountsPtr) +{ + RefCounts *refCounts = *refCountsPtr; + if (refCounts == NULL) { + return; + } + + FREE(refCounts->counters); + FREE(refCounts); + *refCountsPtr = NULL; +} + +/** + * Check whether a RefCounts has active I/O. + * + * @param refCounts The RefCounts to check + * + * @return true if there is reference block I/O or a summary + * update in progress + **/ +__attribute__((warn_unused_result)) +static bool hasActiveIO(RefCounts *refCounts) +{ + return ((refCounts->activeCount > 0) || refCounts->updatingSlabSummary); +} + +/**********************************************************************/ +bool areRefCountsActive(RefCounts *refCounts) +{ + if (hasActiveIO(refCounts)) { + return true; + } + + // When not suspending or recovering, the refCounts must be clean. + AdminStateCode code = refCounts->slab->state.state; + return (hasWaiters(&refCounts->dirtyBlocks) + && (code != ADMIN_STATE_SUSPENDING) + && (code != ADMIN_STATE_RECOVERING)); +} + +/**********************************************************************/ +static void enterRefCountsReadOnlyMode(RefCounts *refCounts, int result) +{ + enterReadOnlyMode(refCounts->readOnlyNotifier, result); + checkIfSlabDrained(refCounts->slab); +} + +/** + * Enqueue a block on the dirty queue. + * + * @param block The block to enqueue + **/ +static void enqueueDirtyBlock(ReferenceBlock *block) +{ + int result = enqueueWaiter(&block->refCounts->dirtyBlocks, &block->waiter); + if (result != VDO_SUCCESS) { + // This should never happen. + enterRefCountsReadOnlyMode(block->refCounts, result); + } +} + +/** + * Mark a reference count block as dirty, potentially adding it to the dirty + * queue if it wasn't already dirty. 
+ * + * @param block The reference block to mark as dirty + **/ +static void dirtyBlock(ReferenceBlock *block) +{ + if (block->isDirty) { + return; + } + + block->isDirty = true; + if (block->isWriting) { + // The conclusion of the current write will enqueue the block again. + return; + } + + enqueueDirtyBlock(block); +} + +/**********************************************************************/ +BlockCount getUnreferencedBlockCount(RefCounts *refCounts) +{ + return refCounts->freeBlocks; +} + +/**********************************************************************/ +ReferenceBlock *getReferenceBlock(RefCounts *refCounts, SlabBlockNumber index) +{ + return &refCounts->blocks[index / COUNTS_PER_BLOCK]; +} + +/** + * Get the reference counter that covers the given physical block number. + * + * @param [in] refCounts The refcounts object + * @param [in] pbn The physical block number + * @param [out] counterPtr A pointer to the reference counter + + **/ +static int getReferenceCounter(RefCounts *refCounts, + PhysicalBlockNumber pbn, + ReferenceCount **counterPtr) +{ + SlabBlockNumber index; + int result = slabBlockNumberFromPBN(refCounts->slab, pbn, &index); + if (result != VDO_SUCCESS) { + return result; + } + + *counterPtr = &refCounts->counters[index]; + + return VDO_SUCCESS; +} + +/**********************************************************************/ +uint8_t getAvailableReferences(RefCounts *refCounts, PhysicalBlockNumber pbn) +{ + ReferenceCount *counterPtr = NULL; + int result = getReferenceCounter(refCounts, pbn, &counterPtr); + if (result != VDO_SUCCESS) { + return 0; + } + + if (*counterPtr == PROVISIONAL_REFERENCE_COUNT) { + return (MAXIMUM_REFERENCE_COUNT - 1); + } + + return (MAXIMUM_REFERENCE_COUNT - *counterPtr); +} + +/** + * Increment the reference count for a data block. + * + * @param [in] refCounts The refCounts responsible for the block + * @param [in] block The reference block which contains the + * block being updated + * @param [in] slabBlockNumber The block to update + * @param [in] oldStatus The reference status of the data block + * before this increment + * @param [in] lock The PBNLock associated with this + * increment (may be NULL) + * @param [in,out] counterPtr A pointer to the count for the data block + * @param [out] freeStatusChanged A pointer which will be set to true if + * this update changed the free status of + * the block + * + * @return VDO_SUCCESS or an error + **/ +static int incrementForData(RefCounts *refCounts, + ReferenceBlock *block, + SlabBlockNumber slabBlockNumber, + ReferenceStatus oldStatus, + PBNLock *lock, + ReferenceCount *counterPtr, + bool *freeStatusChanged) +{ + switch (oldStatus) { + case RS_FREE: + *counterPtr = 1; + block->allocatedCount++; + refCounts->freeBlocks--; + *freeStatusChanged = true; + break; + + case RS_PROVISIONAL: + *counterPtr = 1; + *freeStatusChanged = false; + break; + + default: + // Single or shared + if (*counterPtr >= MAXIMUM_REFERENCE_COUNT) { + return logErrorWithStringError(VDO_REF_COUNT_INVALID, + "Incrementing a block already having" + " 254 references (slab %u, offset %" + PRIu32 ")", + refCounts->slab->slabNumber, + slabBlockNumber); + } + (*counterPtr)++; + *freeStatusChanged = false; + } + + if (lock != NULL) { + unassignProvisionalReference(lock); + } + return VDO_SUCCESS; +} + +/** + * Decrement the reference count for a data block. 
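+ * If a read lock is held on the block, decrementing the last reference leaves
+ * the count provisional rather than free, so the block cannot be reallocated
+ * while the lock is held.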
+ * + * @param [in] refCounts The refCounts responsible for the block + * @param [in] block The reference block which contains the + * block being updated + * @param [in] slabBlockNumber The block to update + * @param [in] oldStatus The reference status of the data block + * before this decrement + * @param [in] lock The PBNLock associated with the block + * being decremented (may be NULL) + * @param [in,out] counterPtr A pointer to the count for the data block + * @param [out] freeStatusChanged A pointer which will be set to true if + * this update changed the free status of + * the block + * + * @return VDO_SUCCESS or an error + **/ +static int decrementForData(RefCounts *refCounts, + ReferenceBlock *block, + SlabBlockNumber slabBlockNumber, + ReferenceStatus oldStatus, + PBNLock *lock, + ReferenceCount *counterPtr, + bool *freeStatusChanged) +{ + switch (oldStatus) { + case RS_FREE: + return logErrorWithStringError(VDO_REF_COUNT_INVALID, + "Decrementing free block at offset %" + PRIu32 " in slab %u", slabBlockNumber, + refCounts->slab->slabNumber); + + case RS_PROVISIONAL: + case RS_SINGLE: + if (lock != NULL) { + // There is a read lock on this block, so the block must not become + // unreferenced. + *counterPtr = PROVISIONAL_REFERENCE_COUNT; + *freeStatusChanged = false; + assignProvisionalReference(lock); + } else { + *counterPtr = EMPTY_REFERENCE_COUNT; + block->allocatedCount--; + refCounts->freeBlocks++; + *freeStatusChanged = true; + } + break; + + default: + // Shared + (*counterPtr)--; + *freeStatusChanged = false; + } + + return VDO_SUCCESS; +} + +/** + * Increment the reference count for a block map page. All block map increments + * should be from provisional to MAXIMUM_REFERENCE_COUNT. Since block map blocks + * never dedupe they should never be adjusted from any other state. The + * adjustment always results in MAXIMUM_REFERENCE_COUNT as this value is used to + * prevent dedupe against block map blocks. + * + * @param [in] refCounts The refCounts responsible for the block + * @param [in] block The reference block which contains the + * block being updated + * @param [in] slabBlockNumber The block to update + * @param [in] oldStatus The reference status of the block + * before this increment + * @param [in] lock The PBNLock associated with this + * increment (may be NULL) + * @param [in] normalOperation Whether we are in normal operation vs. 
+ * recovery or rebuild + * @param [in,out] counterPtr A pointer to the count for the block + * @param [out] freeStatusChanged A pointer which will be set to true if + * this update changed the free status of the + * block + * + * @return VDO_SUCCESS or an error + **/ +static int incrementForBlockMap(RefCounts *refCounts, + ReferenceBlock *block, + SlabBlockNumber slabBlockNumber, + ReferenceStatus oldStatus, + PBNLock *lock, + bool normalOperation, + ReferenceCount *counterPtr, + bool *freeStatusChanged) +{ + switch (oldStatus) { + case RS_FREE: + if (normalOperation) { + return logErrorWithStringError(VDO_REF_COUNT_INVALID, + "Incrementing unallocated block map block" + " (slab %u, offset %" PRIu32 ")", + refCounts->slab->slabNumber, + slabBlockNumber); + } + + *counterPtr = MAXIMUM_REFERENCE_COUNT; + block->allocatedCount++; + refCounts->freeBlocks--; + *freeStatusChanged = true; + return VDO_SUCCESS; + + case RS_PROVISIONAL: + if (!normalOperation) { + return logErrorWithStringError(VDO_REF_COUNT_INVALID, + "Block map block had provisional " + "reference during replay" + " (slab %u, offset %" PRIu32 ")", + refCounts->slab->slabNumber, + slabBlockNumber); + } + + *counterPtr = MAXIMUM_REFERENCE_COUNT; + *freeStatusChanged = false; + if (lock != NULL) { + unassignProvisionalReference(lock); + } + return VDO_SUCCESS; + + default: + return logErrorWithStringError(VDO_REF_COUNT_INVALID, + "Incrementing a block map block which is " + "already referenced %u times (slab %u, " + "offset %" PRIu32 ")", + *counterPtr, + refCounts->slab->slabNumber, + slabBlockNumber); + } +} + +/** + * Update the reference count of a block. + * + * @param [in] refCounts The refCounts responsible for the + * block + * @param [in] block The reference block which contains the + * block being updated + * @param [in] slabBlockNumber The block to update + * @param [in] slabJournalPoint The slab journal point at which this + * update is journaled + * @param [in] operation How to update the count + * @param [in] normalOperation Whether we are in normal operation vs. 
+ * recovery or rebuild + * @param [out] freeStatusChanged A pointer which will be set to true if + * this update changed the free status of + * the block + * @param [out] provisionalDecrementPtr A pointer which will be set to true if + * this update was a decrement of a + * provisional reference + * + * @return VDO_SUCCESS or an error + **/ +static int updateReferenceCount(RefCounts *refCounts, + ReferenceBlock *block, + SlabBlockNumber slabBlockNumber, + const JournalPoint *slabJournalPoint, + ReferenceOperation operation, + bool normalOperation, + bool *freeStatusChanged, + bool *provisionalDecrementPtr) +{ + ReferenceCount *counterPtr = &refCounts->counters[slabBlockNumber]; + ReferenceStatus oldStatus = referenceCountToStatus(*counterPtr); + PBNLock *lock = getReferenceOperationPBNLock(operation); + int result; + + switch (operation.type) { + case DATA_INCREMENT: + result = incrementForData(refCounts, block, slabBlockNumber, oldStatus, + lock, counterPtr, freeStatusChanged); + break; + + case DATA_DECREMENT: + result = decrementForData(refCounts, block, slabBlockNumber, oldStatus, + lock, counterPtr, freeStatusChanged); + if ((result == VDO_SUCCESS) && (oldStatus == RS_PROVISIONAL)) { + if (provisionalDecrementPtr != NULL) { + *provisionalDecrementPtr = true; + } + return VDO_SUCCESS; + } + break; + + case BLOCK_MAP_INCREMENT: + result = incrementForBlockMap(refCounts, block, slabBlockNumber, oldStatus, + lock, normalOperation, counterPtr, + freeStatusChanged); + break; + + default: + logError("Unknown reference count operation: %u", operation.type); + enterRefCountsReadOnlyMode(refCounts, VDO_NOT_IMPLEMENTED); + result = VDO_NOT_IMPLEMENTED; + } + + if (result != VDO_SUCCESS) { + return result; + } + + if (isValidJournalPoint(slabJournalPoint)) { + refCounts->slabJournalPoint = *slabJournalPoint; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int adjustReferenceCount(RefCounts *refCounts, + ReferenceOperation operation, + const JournalPoint *slabJournalPoint, + bool *freeStatusChanged) +{ + if (!isSlabOpen(refCounts->slab)) { + return VDO_INVALID_ADMIN_STATE; + } + + SlabBlockNumber slabBlockNumber; + int result = slabBlockNumberFromPBN(refCounts->slab, operation.pbn, + &slabBlockNumber); + if (result != VDO_SUCCESS) { + return result; + } + + ReferenceBlock *block = getReferenceBlock(refCounts, slabBlockNumber); + bool provisionalDecrement = false; + result = updateReferenceCount(refCounts, block, slabBlockNumber, + slabJournalPoint, operation, + NORMAL_OPERATION, freeStatusChanged, + &provisionalDecrement); + if ((result != VDO_SUCCESS) || provisionalDecrement) { + return result; + } + + if (block->isDirty && (block->slabJournalLock > 0)) { + /* + * This block is already dirty and a slab journal entry has been made + * for it since the last time it was clean. We must release the per-entry + * slab journal lock for the entry associated with the update we are now + * doing. + */ + result = ASSERT(isValidJournalPoint(slabJournalPoint), + "Reference count adjustments need slab journal points."); + if (result != VDO_SUCCESS) { + return result; + } + + SequenceNumber entryLock = slabJournalPoint->sequenceNumber; + adjustSlabJournalBlockReference(refCounts->slab->journal, entryLock, -1); + return VDO_SUCCESS; + } + + /* + * This may be the first time we are applying an update for which there + * is a slab journal entry to this block since the block was + * cleaned. 
Therefore, we convert the per-entry slab journal lock to an + * uncommitted reference block lock, if there is a per-entry lock. + */ + if (isValidJournalPoint(slabJournalPoint)) { + block->slabJournalLock = slabJournalPoint->sequenceNumber; + } else { + block->slabJournalLock = 0; + } + + dirtyBlock(block); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int adjustReferenceCountForRebuild(RefCounts *refCounts, + PhysicalBlockNumber pbn, + JournalOperation operation) +{ + SlabBlockNumber slabBlockNumber; + int result = slabBlockNumberFromPBN(refCounts->slab, pbn, &slabBlockNumber); + if (result != VDO_SUCCESS) { + return result; + } + + ReferenceBlock *block = getReferenceBlock(refCounts, slabBlockNumber); + bool unusedFreeStatus; + ReferenceOperation physicalOperation = { + .type = operation, + }; + result = updateReferenceCount(refCounts, block, slabBlockNumber, NULL, + physicalOperation, !NORMAL_OPERATION, + &unusedFreeStatus, NULL); + if (result != VDO_SUCCESS) { + return result; + } + + dirtyBlock(block); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int replayReferenceCountChange(RefCounts *refCounts, + const JournalPoint *entryPoint, + SlabJournalEntry entry) +{ + ReferenceBlock *block = getReferenceBlock(refCounts, entry.sbn); + SectorCount sector + = (entry.sbn % COUNTS_PER_BLOCK) / COUNTS_PER_SECTOR; + if (!beforeJournalPoint(&block->commitPoints[sector], entryPoint)) { + // This entry is already reflected in the existing counts, so do nothing. + return VDO_SUCCESS; + } + + // This entry is not yet counted in the reference counts. + bool unusedFreeStatus; + ReferenceOperation operation = { + .type = entry.operation + }; + int result = updateReferenceCount(refCounts, block, entry.sbn, + entryPoint, operation, !NORMAL_OPERATION, + &unusedFreeStatus, NULL); + if (result != VDO_SUCCESS) { + return result; + } + + dirtyBlock(block); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int getReferenceStatus(RefCounts *refCounts, + PhysicalBlockNumber pbn, + ReferenceStatus *statusPtr) +{ + ReferenceCount *counterPtr = NULL; + int result = getReferenceCounter(refCounts, pbn, &counterPtr); + if (result != VDO_SUCCESS) { + return result; + } + + *statusPtr = referenceCountToStatus(*counterPtr); + return VDO_SUCCESS; +} + +/**********************************************************************/ +bool areEquivalentReferenceCounters(RefCounts *counterA, RefCounts *counterB) +{ + if ((counterA->blockCount != counterB->blockCount) + || (counterA->freeBlocks != counterB->freeBlocks) + || (counterA->referenceBlockCount != counterB->referenceBlockCount)) { + return false; + } + + for (size_t i = 0; i < counterA->referenceBlockCount; i++) { + ReferenceBlock *blockA = &counterA->blocks[i]; + ReferenceBlock *blockB = &counterB->blocks[i]; + if (blockA->allocatedCount != blockB->allocatedCount) { + return false; + } + } + + return (memcmp(counterA->counters, counterB->counters, + sizeof(ReferenceCount) * counterA->blockCount) == 0); +} + +/** + * Find the array index of the first zero byte in word-sized range of + * reference counters. The search does no bounds checking; the function relies + * on the array being sufficiently padded. 
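+ * (The counters array is allocated with two words of trailing padding in
+ * makeRefCounts() precisely so that this is safe.)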
+ * + * @param wordPtr A pointer to the eight counter bytes to check + * @param startIndex The array index corresponding to wordPtr[0] + * @param failIndex The array index to return if no zero byte is found + + * @return the array index of the first zero byte in the word, or + * the value passed as failIndex if no zero byte was found + **/ +static inline SlabBlockNumber findZeroByteInWord(const byte *wordPtr, + SlabBlockNumber startIndex, + SlabBlockNumber failIndex) +{ + uint64_t word = getUInt64LE(wordPtr); + + // This looks like a loop, but GCC will unroll the eight iterations for us. + for (unsigned int offset = 0; offset < BYTES_PER_WORD; offset++) { + // Assumes little-endian byte order, which we have on X86. + if ((word & 0xFF) == 0) { + return (startIndex + offset); + } + word >>= 8; + } + + return failIndex; +} + +/**********************************************************************/ +bool findFreeBlock(const RefCounts *refCounts, + SlabBlockNumber startIndex, + SlabBlockNumber endIndex, + SlabBlockNumber *indexPtr) +{ + SlabBlockNumber zeroIndex; + SlabBlockNumber nextIndex = startIndex; + byte *nextCounter = &refCounts->counters[nextIndex]; + byte *endCounter = &refCounts->counters[endIndex]; + + // Search every byte of the first unaligned word. (Array is padded so + // reading past end is safe.) + zeroIndex = findZeroByteInWord(nextCounter, nextIndex, endIndex); + if (zeroIndex < endIndex) { + *indexPtr = zeroIndex; + return true; + } + + // On architectures where unaligned word access is expensive, this + // would be a good place to advance to an alignment boundary. + nextIndex += BYTES_PER_WORD; + nextCounter += BYTES_PER_WORD; + + // Now we're word-aligned; check an word at a time until we find a word + // containing a zero. (Array is padded so reading past end is safe.) + while (nextCounter < endCounter) { + /* + * The following code is currently an exact copy of the code preceding the + * loop, but if you try to merge them by using a do loop, it runs slower + * because a jump instruction gets added at the start of the iteration. + */ + zeroIndex = findZeroByteInWord(nextCounter, nextIndex, endIndex); + if (zeroIndex < endIndex) { + *indexPtr = zeroIndex; + return true; + } + + nextIndex += BYTES_PER_WORD; + nextCounter += BYTES_PER_WORD; + } + + return false; +} + +/** + * Search the reference block currently saved in the search cursor for a + * reference count of zero, starting at the saved counter index. + * + * @param [in] refCounts The RefCounts object to search + * @param [out] freeIndexPtr A pointer to receive the array index of the + * zero reference count + * + * @return true if an unreferenced counter was found + **/ +static bool searchCurrentReferenceBlock(const RefCounts *refCounts, + SlabBlockNumber *freeIndexPtr) +{ + // Don't bother searching if the current block is known to be full. + return ((refCounts->searchCursor.block->allocatedCount < COUNTS_PER_BLOCK) + && findFreeBlock(refCounts, refCounts->searchCursor.index, + refCounts->searchCursor.endIndex, freeIndexPtr)); +} + +/** + * Search each reference block for a reference count of zero, starting at the + * reference block and counter index saved in the search cursor and searching + * up to the end of the last reference block. The search does not wrap. 
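+ * When the last reference block has been searched, the cursor wraps back to
+ * the first block, but the search itself stops there.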
+ * + * @param [in] refCounts The RefCounts object to search + * @param [out] freeIndexPtr A pointer to receive the array index of the + * zero reference count + * + * @return true if an unreferenced counter was found + **/ +static bool searchReferenceBlocks(RefCounts *refCounts, + SlabBlockNumber *freeIndexPtr) +{ + // Start searching at the saved search position in the current block. + if (searchCurrentReferenceBlock(refCounts, freeIndexPtr)) { + return true; + } + + // Search each reference block up to the end of the slab. + while (advanceSearchCursor(refCounts)) { + if (searchCurrentReferenceBlock(refCounts, freeIndexPtr)) { + return true; + } + } + + return false; +} + +/** + * Do the bookkeeping for making a provisional reference. + * + * @param refCounts The RefCounts + * @param slabBlockNumber The block to reference + **/ +static void makeProvisionalReference(RefCounts *refCounts, + SlabBlockNumber slabBlockNumber) +{ + // Make the initial transition from an unreferenced block to a provisionally + // allocated block. + refCounts->counters[slabBlockNumber] = PROVISIONAL_REFERENCE_COUNT; + + // Account for the allocation. + ReferenceBlock *block = getReferenceBlock(refCounts, slabBlockNumber); + block->allocatedCount++; + refCounts->freeBlocks--; +} + +/**********************************************************************/ +int allocateUnreferencedBlock(RefCounts *refCounts, + PhysicalBlockNumber *allocatedPtr) +{ + if (!isSlabOpen(refCounts->slab)) { + return VDO_INVALID_ADMIN_STATE; + } + + SlabBlockNumber freeIndex; + if (!searchReferenceBlocks(refCounts, &freeIndex)) { + return VDO_NO_SPACE; + } + + ASSERT_LOG_ONLY((refCounts->counters[freeIndex] == EMPTY_REFERENCE_COUNT), + "free block must have refCount of zero"); + makeProvisionalReference(refCounts, freeIndex); + + // Update the search hint so the next search will start at the array + // index just past the free block we just found. + refCounts->searchCursor.index = (freeIndex + 1); + + *allocatedPtr = indexToPBN(refCounts, freeIndex); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int provisionallyReferenceBlock(RefCounts *refCounts, + PhysicalBlockNumber pbn, + PBNLock *lock) +{ + if (!isSlabOpen(refCounts->slab)) { + return VDO_INVALID_ADMIN_STATE; + } + + SlabBlockNumber slabBlockNumber; + int result = slabBlockNumberFromPBN(refCounts->slab, pbn, &slabBlockNumber); + if (result != VDO_SUCCESS) { + return result; + } + + if (refCounts->counters[slabBlockNumber] == EMPTY_REFERENCE_COUNT) { + makeProvisionalReference(refCounts, slabBlockNumber); + if (lock != NULL) { + assignProvisionalReference(lock); + } + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +BlockCount countUnreferencedBlocks(RefCounts *refCounts, + PhysicalBlockNumber startPBN, + PhysicalBlockNumber endPBN) +{ + BlockCount freeBlocks = 0; + SlabBlockNumber startIndex = pbnToIndex(refCounts, startPBN); + SlabBlockNumber endIndex = pbnToIndex(refCounts, endPBN); + for (SlabBlockNumber index = startIndex; index < endIndex; index++) { + if (refCounts->counters[index] == EMPTY_REFERENCE_COUNT) { + freeBlocks++; + } + } + + return freeBlocks; +} + +/** + * Convert a ReferenceBlock's generic wait queue entry back into the + * ReferenceBlock. 
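+ * This depends on the Waiter being the first field of a ReferenceBlock.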
+ * + * @param waiter The wait queue entry to convert + * + * @return The wrapping ReferenceBlock + **/ +static inline ReferenceBlock *waiterAsReferenceBlock(Waiter *waiter) +{ + STATIC_ASSERT(offsetof(ReferenceBlock, waiter) == 0); + return (ReferenceBlock *) waiter; +} + +/** + * WaitCallback to clean dirty reference blocks when resetting. + * + * @param blockWaiter The dirty block + * @param context Unused + **/ +static void +clearDirtyReferenceBlocks(Waiter *blockWaiter, + void *context __attribute__((unused))) +{ + waiterAsReferenceBlock(blockWaiter)->isDirty = false; +} + +/**********************************************************************/ +void resetReferenceCounts(RefCounts *refCounts) +{ + // We can just use memset() since each ReferenceCount is exactly one byte. + STATIC_ASSERT(sizeof(ReferenceCount) == 1); + memset(refCounts->counters, 0, refCounts->blockCount); + refCounts->freeBlocks = refCounts->blockCount; + refCounts->slabJournalPoint = (JournalPoint) { + .sequenceNumber = 0, + .entryCount = 0, + }; + + for (size_t i = 0; i < refCounts->referenceBlockCount; i++) { + refCounts->blocks[i].allocatedCount = 0; + } + + notifyAllWaiters(&refCounts->dirtyBlocks, clearDirtyReferenceBlocks, NULL); +} + +/**********************************************************************/ +BlockCount getSavedReferenceCountSize(BlockCount blockCount) +{ + return computeBucketCount(blockCount, COUNTS_PER_BLOCK); +} + +/** + * A waiter callback that resets the writing state of refCounts. + **/ +static void finishSummaryUpdate(Waiter *waiter, void *context) +{ + RefCounts *refCounts = refCountsFromWaiter(waiter); + refCounts->updatingSlabSummary = false; + + int result = *((int *) context); + if ((result == VDO_SUCCESS) || (result == VDO_READ_ONLY)) { + checkIfSlabDrained(refCounts->slab); + return; + } + + logErrorWithStringError(result, "failed to update slab summary"); + enterRefCountsReadOnlyMode(refCounts, result); +} + +/** + * Update slab summary that the RefCounts is clean. + * + * @param refCounts The RefCounts object that is being written + **/ +static void updateSlabSummaryAsClean(RefCounts *refCounts) +{ + SlabSummaryZone *summary = getSlabSummaryZone(refCounts->slab->allocator); + if (summary == NULL) { + return; + } + + // Update the slab summary to indicate this refCounts is clean. + TailBlockOffset offset + = getSummarizedTailBlockOffset(summary, refCounts->slab->slabNumber); + refCounts->updatingSlabSummary = true; + refCounts->slabSummaryWaiter.callback = finishSummaryUpdate; + updateSlabSummaryEntry(summary, &refCounts->slabSummaryWaiter, + refCounts->slab->slabNumber, offset, true, true, + getSlabFreeBlockCount(refCounts->slab)); +} + +/** + * Handle an I/O error reading or writing a reference count block. + * + * @param completion The VIO doing the I/O as a completion + **/ +static void handleIOError(VDOCompletion *completion) +{ + int result = completion->result; + VIOPoolEntry *entry = completion->parent; + RefCounts *refCounts = ((ReferenceBlock *) entry->parent)->refCounts; + returnVIO(refCounts->slab->allocator, entry); + refCounts->activeCount--; + enterRefCountsReadOnlyMode(refCounts, result); +} + +/** + * After a reference block has written, clean it, release its locks, and return + * its VIO to the pool. 
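+ * If the block was redirtied while its write was in flight, it is requeued;
+ * once there is no reference block I/O and no dirty blocks remain, the slab
+ * summary is updated to record that the RefCounts is clean.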
+ * + * @param completion The VIO that just finished writing + **/ +static void finishReferenceBlockWrite(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + ReferenceBlock *block = entry->parent; + RefCounts *refCounts = block->refCounts; + refCounts->activeCount--; + + // Release the slab journal lock. + adjustSlabJournalBlockReference(refCounts->slab->journal, + block->slabJournalLockToRelease, -1); + returnVIO(refCounts->slab->allocator, entry); + + /* + * We can't clear the isWriting flag earlier as releasing the slab journal + * lock may cause us to be dirtied again, but we don't want to double + * enqueue. + */ + block->isWriting = false; + + if (isReadOnly(refCounts->readOnlyNotifier)) { + checkIfSlabDrained(refCounts->slab); + return; + } + + // Re-queue the block if it was re-dirtied while it was writing. + if (block->isDirty) { + enqueueDirtyBlock(block); + if (isSlabDraining(refCounts->slab)) { + // We must be saving, and this block will otherwise not be relaunched. + saveDirtyReferenceBlocks(refCounts); + } + + return; + } + + // Mark the RefCounts as clean in the slab summary if there are no dirty + // or writing blocks and no summary update in progress. + if (!hasActiveIO(refCounts) && !hasWaiters(&refCounts->dirtyBlocks)) { + updateSlabSummaryAsClean(refCounts); + } +} + +/**********************************************************************/ +ReferenceCount *getReferenceCountersForBlock(ReferenceBlock *block) +{ + size_t blockIndex = block - block->refCounts->blocks; + return &block->refCounts->counters[blockIndex * COUNTS_PER_BLOCK]; +} + +/**********************************************************************/ +void packReferenceBlock(ReferenceBlock *block, void *buffer) +{ + PackedJournalPoint commitPoint; + packJournalPoint(&block->refCounts->slabJournalPoint, &commitPoint); + + PackedReferenceBlock *packed = buffer; + ReferenceCount *counters = getReferenceCountersForBlock(block); + for (SectorCount i = 0; i < SECTORS_PER_BLOCK; i++) { + packed->sectors[i].commitPoint = commitPoint; + memcpy(packed->sectors[i].counts, counters + (i * COUNTS_PER_SECTOR), + (sizeof(ReferenceCount) * COUNTS_PER_SECTOR)); + } +} + +/** + * After a dirty block waiter has gotten a VIO from the VIO pool, copy its + * counters and associated data into the VIO, and launch the write. + * + * @param blockWaiter The waiter of the dirty block + * @param vioContext The VIO returned by the pool + **/ +static void writeReferenceBlock(Waiter *blockWaiter, void *vioContext) +{ + VIOPoolEntry *entry = vioContext; + ReferenceBlock *block = waiterAsReferenceBlock(blockWaiter); + packReferenceBlock(block, entry->buffer); + + size_t blockOffset = (block - block->refCounts->blocks); + PhysicalBlockNumber pbn = (block->refCounts->origin + blockOffset); + block->slabJournalLockToRelease = block->slabJournalLock; + entry->parent = block; + + /* + * Mark the block as clean, since we won't be committing any updates that + * happen after this moment. As long as VIO order is preserved, two + * VIOs updating this block at once will not cause complications. + */ + block->isDirty = false; + + // Flush before writing to ensure that the recovery journal and slab journal + // entries which cover this reference update are stable (VDO-2331). 
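+ // (The flush is requested via the launchWriteMetadataVIOWithFlush() call below.)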
+ relaxedAdd64(&block->refCounts->statistics->blocksWritten, 1); + entry->vio->completion.callbackThreadID + = block->refCounts->slab->allocator->threadID; + launchWriteMetadataVIOWithFlush(entry->vio, pbn, finishReferenceBlockWrite, + handleIOError, true, false); +} + +/** + * Launch the write of a dirty reference block by first acquiring a VIO for it + * from the pool. This can be asynchronous since the writer will have to wait + * if all VIOs in the pool are currently in use. + * + * @param blockWaiter The waiter of the block which is starting to write + * @param context The parent refCounts of the block + **/ +static void launchReferenceBlockWrite(Waiter *blockWaiter, void *context) +{ + RefCounts *refCounts = context; + if (isReadOnly(refCounts->readOnlyNotifier)) { + return; + } + + refCounts->activeCount++; + ReferenceBlock *block = waiterAsReferenceBlock(blockWaiter); + block->isWriting = true; + blockWaiter->callback = writeReferenceBlock; + int result = acquireVIO(refCounts->slab->allocator, blockWaiter); + if (result != VDO_SUCCESS) { + // This should never happen. + refCounts->activeCount--; + enterRefCountsReadOnlyMode(refCounts, result); + } +} + +/**********************************************************************/ +void saveOldestReferenceBlock(RefCounts *refCounts) +{ + notifyNextWaiter(&refCounts->dirtyBlocks, launchReferenceBlockWrite, + refCounts); +} + +/**********************************************************************/ +void saveSeveralReferenceBlocks(RefCounts *refCounts, size_t flushDivisor) +{ + BlockCount dirtyBlockCount = countWaiters(&refCounts->dirtyBlocks); + if (dirtyBlockCount == 0) { + return; + } + + BlockCount blocksToWrite = dirtyBlockCount / flushDivisor; + // Always save at least one block. + if (blocksToWrite == 0) { + blocksToWrite = 1; + } + + for (BlockCount written = 0; written < blocksToWrite; written++) { + saveOldestReferenceBlock(refCounts); + } +} + +/**********************************************************************/ +void saveDirtyReferenceBlocks(RefCounts *refCounts) +{ + notifyAllWaiters(&refCounts->dirtyBlocks, launchReferenceBlockWrite, + refCounts); + checkIfSlabDrained(refCounts->slab); +} + +/**********************************************************************/ +void dirtyAllReferenceBlocks(RefCounts *refCounts) +{ + for (BlockCount i = 0; i < refCounts->referenceBlockCount; i++) { + dirtyBlock(&refCounts->blocks[i]); + } +} + +/** + * Clear the provisional reference counts from a reference block. + * + * @param block The block to clear + **/ +static void clearProvisionalReferences(ReferenceBlock *block) +{ + ReferenceCount *counters = getReferenceCountersForBlock(block); + for (BlockCount j = 0; j < COUNTS_PER_BLOCK; j++) { + if (counters[j] == PROVISIONAL_REFERENCE_COUNT) { + counters[j] = EMPTY_REFERENCE_COUNT; + block->allocatedCount--; + } + } +} + +/** + * Unpack reference counts blocks into the internal memory structure. 
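+ * Unpacking also recomputes the block's allocated count and compares the
+ * sector commit points to detect torn writes.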
+ * + * @param packed The written reference block to be unpacked + * @param block The internal reference block to be loaded + **/ +static void unpackReferenceBlock(PackedReferenceBlock *packed, + ReferenceBlock *block) +{ + RefCounts *refCounts = block->refCounts; + ReferenceCount *counters = getReferenceCountersForBlock(block); + for (SectorCount i = 0; i < SECTORS_PER_BLOCK; i++) { + PackedReferenceSector *sector = &packed->sectors[i]; + unpackJournalPoint(§or->commitPoint, &block->commitPoints[i]); + memcpy(counters + (i * COUNTS_PER_SECTOR), sector->counts, + (sizeof(ReferenceCount) * COUNTS_PER_SECTOR)); + // The slabJournalPoint must be the latest point found in any sector. + if (beforeJournalPoint(&refCounts->slabJournalPoint, + &block->commitPoints[i])) { + refCounts->slabJournalPoint = block->commitPoints[i]; + } + + if ((i > 0) && !areEquivalentJournalPoints(&block->commitPoints[0], + &block->commitPoints[i])) { + size_t blockIndex = block - block->refCounts->blocks; + logWarning("Torn write detected in sector %u of reference block" + " %zu of slab %" PRIu16, + i, blockIndex, block->refCounts->slab->slabNumber); + } + } + + block->allocatedCount = 0; + for (BlockCount i = 0; i < COUNTS_PER_BLOCK; i++) { + if (counters[i] != EMPTY_REFERENCE_COUNT) { + block->allocatedCount++; + } + } +} + +/** + * After a reference block has been read, unpack it. + * + * @param completion The VIO that just finished reading + **/ +static void finishReferenceBlockLoad(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + ReferenceBlock *block = entry->parent; + unpackReferenceBlock((PackedReferenceBlock *) entry->buffer, block); + + RefCounts *refCounts = block->refCounts; + returnVIO(refCounts->slab->allocator, entry); + refCounts->activeCount--; + clearProvisionalReferences(block); + + refCounts->freeBlocks -= block->allocatedCount; + checkIfSlabDrained(block->refCounts->slab); +} + +/** + * After a block waiter has gotten a VIO from the VIO pool, load the block. + * + * @param blockWaiter The waiter of the block to load + * @param vioContext The VIO returned by the pool + **/ +static void loadReferenceBlock(Waiter *blockWaiter, void *vioContext) +{ + VIOPoolEntry *entry = vioContext; + ReferenceBlock *block = waiterAsReferenceBlock(blockWaiter); + size_t blockOffset = (block - block->refCounts->blocks); + PhysicalBlockNumber pbn = (block->refCounts->origin + blockOffset); + entry->parent = block; + + entry->vio->completion.callbackThreadID + = block->refCounts->slab->allocator->threadID; + launchReadMetadataVIO(entry->vio, pbn, finishReferenceBlockLoad, + handleIOError); +} + +/** + * Load reference blocks from the underlying storage into a pre-allocated + * reference counter. + * + * @param refCounts The reference counter to be loaded + **/ +static void loadReferenceBlocks(RefCounts *refCounts) +{ + refCounts->freeBlocks = refCounts->blockCount; + refCounts->activeCount = refCounts->referenceBlockCount; + for (BlockCount i = 0; i < refCounts->referenceBlockCount; i++) { + Waiter *blockWaiter = &refCounts->blocks[i].waiter; + blockWaiter->callback = loadReferenceBlock; + int result = acquireVIO(refCounts->slab->allocator, blockWaiter); + if (result != VDO_SUCCESS) { + // This should never happen. 
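+ // The remaining blocks will never be launched, so remove them from the
+ // active count.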
+ refCounts->activeCount -= (refCounts->referenceBlockCount - i); + enterRefCountsReadOnlyMode(refCounts, result); + return; + } + } +} + +/**********************************************************************/ +void drainRefCounts(RefCounts *refCounts) +{ + Slab *slab = refCounts->slab; + bool save = false; + switch (slab->state.state) { + case ADMIN_STATE_SCRUBBING: + if (mustLoadRefCounts(slab->allocator->summary, slab->slabNumber)) { + loadReferenceBlocks(refCounts); + return; + } + + break; + + case ADMIN_STATE_SAVE_FOR_SCRUBBING: + if (!mustLoadRefCounts(slab->allocator->summary, slab->slabNumber)) { + // These reference counts were never written, so mark them all dirty. + dirtyAllReferenceBlocks(refCounts); + } + save = true; + break; + + case ADMIN_STATE_REBUILDING: + if (shouldSaveFullyBuiltSlab(slab)) { + dirtyAllReferenceBlocks(refCounts); + save = true; + } + break; + + case ADMIN_STATE_SAVING: + save = !isUnrecoveredSlab(slab); + break; + + case ADMIN_STATE_RECOVERING: + case ADMIN_STATE_SUSPENDING: + break; + + default: + notifyRefCountsAreDrained(slab, VDO_SUCCESS); + return; + } + + if (save) { + saveDirtyReferenceBlocks(refCounts); + } +} + +/**********************************************************************/ +void acquireDirtyBlockLocks(RefCounts *refCounts) +{ + dirtyAllReferenceBlocks(refCounts); + for (BlockCount i = 0; i < refCounts->referenceBlockCount; i++) { + refCounts->blocks[i].slabJournalLock = 1; + } + + adjustSlabJournalBlockReference(refCounts->slab->journal, 1, + refCounts->referenceBlockCount); +} + +/**********************************************************************/ +void dumpRefCounts(const RefCounts *refCounts) +{ + // Terse because there are a lot of slabs to dump and syslog is lossy. + logInfo(" refCounts: free=%" PRIu32 "/%" PRIu32 " blocks=%" PRIu32 + " dirty=%zu active=%zu journal@(%llu,%" PRIu16 ")%s", + refCounts->freeBlocks, refCounts->blockCount, + refCounts->referenceBlockCount, + countWaiters(&refCounts->dirtyBlocks), + refCounts->activeCount, + refCounts->slabJournalPoint.sequenceNumber, + refCounts->slabJournalPoint.entryCount, + (refCounts->updatingSlabSummary ? " updating" : "")); +} diff --git a/vdo/base/refCounts.h b/vdo/base/refCounts.h new file mode 100644 index 0000000..f140c8c --- /dev/null +++ b/vdo/base/refCounts.h @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/refCounts.h#7 $ + */ + +#ifndef REF_COUNTS_H +#define REF_COUNTS_H + +#include "completion.h" +#include "journalPoint.h" +#include "slab.h" +#include "types.h" + +/** + * Create a reference counting object. + * + *
* <p>
A reference counting object can keep a reference count for every physical + * block in the VDO configuration. Since we expect the vast majority of the + * blocks to have 0 or 1 reference counts, the structure is optimized for that + * situation. + * + * @param [in] blockCount The number of physical blocks that can be + * referenced + * @param [in] slab The slab of the ref counts object + * @param [in] origin The layer PBN at which to save RefCounts + * @param [in] readOnlyNotifier The context for tracking read-only mode + * @param [out] refCountsPtr The pointer to hold the new ref counts object + * + * @return a success or error code + **/ +int makeRefCounts(BlockCount blockCount, + Slab *slab, + PhysicalBlockNumber origin, + ReadOnlyNotifier *readOnlyNotifier, + RefCounts **refCountsPtr) + __attribute__((warn_unused_result)); + +/** + * Free a reference counting object and null out the reference to it. + * + * @param refCountsPtr The reference to the reference counting object to free + **/ +void freeRefCounts(RefCounts **refCountsPtr); + +/** + * Check whether a RefCounts is active. + * + * @param refCounts The RefCounts to check + **/ +bool areRefCountsActive(RefCounts *refCounts) + __attribute__((warn_unused_result)); + +/** + * Get the stored count of the number of blocks that are currently free. + * + * @param refCounts The RefCounts object + * + * @return the number of blocks with a reference count of zero + **/ +BlockCount getUnreferencedBlockCount(RefCounts *refCounts) + __attribute__((warn_unused_result)); + +/** + * Determine how many times a reference count can be incremented without + * overflowing. + * + * @param refCounts The RefCounts object + * @param pbn The physical block number + * + * @return the number of increments that can be performed + **/ +uint8_t getAvailableReferences(RefCounts *refCounts, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Adjust the reference count of a block. + * + * @param [in] refCounts The refcounts object + * @param [in] operation The operation to perform + * @param [in] slabJournalPoint The slab journal entry for this adjustment + * @param [out] freeStatusChanged A pointer which will be set to true if the + * free status of the block changed + * + * + * @return A success or error code, specifically: + * VDO_REF_COUNT_INVALID if a decrement would result in a negative + * reference count, or an increment in a + * count greater than MAXIMUM_REFS + * + **/ +int adjustReferenceCount(RefCounts *refCounts, + ReferenceOperation operation, + const JournalPoint *slabJournalPoint, + bool *freeStatusChanged) + __attribute__((warn_unused_result)); + +/** + * Adjust the reference count of a block during rebuild. + * + * @param refCounts The refcounts object + * @param pbn The number of the block to adjust + * @param operation The operation to perform on the count + * + * @return VDO_SUCCESS or an error + **/ +int adjustReferenceCountForRebuild(RefCounts *refCounts, + PhysicalBlockNumber pbn, + JournalOperation operation) + __attribute__((warn_unused_result)); + +/** + * Replay the reference count adjustment from a slab journal entry into the + * reference count for a block. The adjustment will be ignored if it was already + * recorded in the reference count. 
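+ * An adjustment is considered already recorded when the commit point of the
+ * sector covering the block is not before the entry's journal point.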
+ * + * @param refCounts The refcounts object + * @param entryPoint The slab journal point for the entry + * @param entry The slab journal entry being replayed + * + * @return VDO_SUCCESS or an error code + **/ +int replayReferenceCountChange(RefCounts *refCounts, + const JournalPoint *entryPoint, + SlabJournalEntry entry) + __attribute__((warn_unused_result)); + +/** + * Check whether two reference counters are equivalent. This method is + * used for unit testing. + * + * @param counterA The first counter to compare + * @param counterB The second counter to compare + * + * @return true if the two counters are equivalent + **/ +bool areEquivalentReferenceCounters(RefCounts *counterA, RefCounts *counterB) + __attribute__((warn_unused_result)); + +/** + * Find a block with a reference count of zero in the range of physical block + * numbers tracked by the reference counter. If a free block is found, that + * block is allocated by marking it as provisionally referenced, and the + * allocated block number is returned. + * + * @param [in] refCounts The reference counters to scan + * @param [out] allocatedPtr A pointer to hold the physical block number of + * the block that was found and allocated + * + * @return VDO_SUCCESS if a free block was found and allocated; + * VDO_NO_SPACE if there are no unreferenced blocks; + * otherwise an error code + **/ +int allocateUnreferencedBlock(RefCounts *refCounts, + PhysicalBlockNumber *allocatedPtr) + __attribute__((warn_unused_result)); + +/** + * Provisionally reference a block if it is unreferenced. + * + * @param refCounts The reference counters + * @param pbn The PBN to reference + * @param lock The PBNLock on the block (may be NULL) + * + * @return VDO_SUCCESS or an error + **/ +int provisionallyReferenceBlock(RefCounts *refCounts, + PhysicalBlockNumber pbn, + PBNLock *lock) + __attribute__((warn_unused_result)); + +/** + * Count all unreferenced blocks in a range [startBlock, endBlock) of physical + * block numbers. + * + * @param refCounts The reference counters to scan + * @param startPBN The physical block number at which to start + * scanning (included in the scan) + * @param endPBN The physical block number at which to stop + * scanning (excluded from the scan) + * + * @return The number of unreferenced blocks + **/ +BlockCount countUnreferencedBlocks(RefCounts *refCounts, + PhysicalBlockNumber startPBN, + PhysicalBlockNumber endPBN) + __attribute__((warn_unused_result)); + +/** + * Get the number of blocks required to save a reference counts state covering + * the specified number of data blocks. + * + * @param blockCount The number of physical data blocks that can be referenced + * + * @return The number of blocks required to save reference counts with the + * given block count + **/ +BlockCount getSavedReferenceCountSize(BlockCount blockCount) + __attribute__((warn_unused_result)); + +/** + * Request a RefCounts save several dirty blocks asynchronously. This function + * currently writes 1 / flushDivisor of the dirty blocks. + * + * @param refCounts The RefCounts object to notify + * @param flushDivisor The inverse fraction of the dirty blocks to write + **/ +void saveSeveralReferenceBlocks(RefCounts *refCounts, size_t flushDivisor); + +/** + * Ask a RefCounts to save all its dirty blocks asynchronously. + * + * @param refCounts The RefCounts object to notify + **/ +void saveDirtyReferenceBlocks(RefCounts *refCounts); + +/** + * Mark all reference count blocks as dirty. 
+ * + * @param refCounts The RefCounts of the reference blocks + **/ +void dirtyAllReferenceBlocks(RefCounts *refCounts); + +/** + * Drain all reference count I/O. Depending upon the type of drain being + * performed (as recorded in the RefCount's Slab), the reference blocks may + * be loaded from disk or dirty reference blocks may be written out. + * + * @param refCounts The reference counts to drain + **/ +void drainRefCounts(RefCounts *refCounts); + +/** + * Mark all reference count blocks dirty and cause them to hold locks on slab + * journal block 1. + * + * @param refCounts The RefCounts of the reference blocks + **/ +void acquireDirtyBlockLocks(RefCounts *refCounts); + +/** + * Dump information about this RefCounts structure. + * + * @param refCounts The RefCounts to dump + **/ +void dumpRefCounts(const RefCounts *refCounts); + +#endif // REF_COUNTS_H diff --git a/vdo/base/refCountsInternals.h b/vdo/base/refCountsInternals.h new file mode 100644 index 0000000..a1bd1db --- /dev/null +++ b/vdo/base/refCountsInternals.h @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/refCountsInternals.h#4 $ + */ + +#ifndef REF_COUNTS_INTERNALS_H +#define REF_COUNTS_INTERNALS_H + +#include "refCounts.h" + +#include "journalPoint.h" +#include "referenceBlock.h" +#include "slab.h" +#include "blockAllocatorInternals.h" +#include "waitQueue.h" + +/** + * Represents the possible status of a block. + **/ +typedef enum referenceStatus { + RS_FREE, // this block is free + RS_SINGLE, // this block is singly-referenced + RS_SHARED, // this block is shared + RS_PROVISIONAL // this block is provisionally allocated +} ReferenceStatus; + +/** + * The SearchCursor represents the saved position of a free block search. + **/ +typedef struct searchCursor { + /** The reference block containing the current search index */ + ReferenceBlock *block; + /** The position at which to start searching for the next free counter */ + SlabBlockNumber index; + /** The position just past the last valid counter in the current block */ + SlabBlockNumber endIndex; + + /** A pointer to the first reference block in the slab */ + ReferenceBlock *firstBlock; + /** A pointer to the last reference block in the slab */ + ReferenceBlock *lastBlock; +} SearchCursor; + +/* + * RefCounts structure + * + * A reference count is maintained for each PhysicalBlockNumber. The vast + * majority of blocks have a very small reference count (usually 0 or 1). + * For references less than or equal to MAXIMUM_REFS (254) the reference count + * is stored in counters[pbn]. 
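+ * A count of EMPTY_REFERENCE_COUNT (0) marks a free block, and
+ * PROVISIONAL_REFERENCE_COUNT (255) marks a block which has been provisionally
+ * allocated (see referenceBlock.h).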
+ * + */ +struct refCounts { + /** The slab of this reference block */ + Slab *slab; + + /** The size of the counters array */ + uint32_t blockCount; + /** The number of free blocks */ + uint32_t freeBlocks; + /** The array of reference counts */ + ReferenceCount *counters; // use ALLOCATE to align data ptr + + /** The saved block pointer and array indexes for the free block search */ + SearchCursor searchCursor; + + /** A list of the dirty blocks waiting to be written out */ + WaitQueue dirtyBlocks; + /** The number of blocks which are currently writing */ + size_t activeCount; + + /** A waiter object for updating the slab summary */ + Waiter slabSummaryWaiter; + /** Whether slab summary update is in progress */ + bool updatingSlabSummary; + + /** The notifier for read-only mode */ + ReadOnlyNotifier *readOnlyNotifier; + /** The refcount statistics, shared by all refcounts in our physical zone */ + AtomicRefCountStatistics *statistics; + /** The layer PBN for the first ReferenceBlock */ + PhysicalBlockNumber origin; + /** The latest slab journal entry this RefCounts has been updated with */ + JournalPoint slabJournalPoint; + + /** The number of reference count blocks */ + uint32_t referenceBlockCount; + /** reference count block array */ + ReferenceBlock blocks[]; +}; + +/** + * Convert a reference count to a reference status. + * + * @param count The count to convert + * + * @return The appropriate reference status + **/ +__attribute__((warn_unused_result)) +ReferenceStatus referenceCountToStatus(ReferenceCount count); + +/** + * Convert a generic VDOCompletion to a RefCounts. + * + * @param completion The completion to convert + * + * @return The completion as a RefCounts + **/ +RefCounts *asRefCounts(VDOCompletion *completion) + __attribute__((warn_unused_result)); + +/** + * Get the reference block that covers the given block index (exposed for + * testing). + * + * @param refCounts The refcounts object + * @param index The block index + **/ +ReferenceBlock *getReferenceBlock(RefCounts *refCounts, SlabBlockNumber index) + __attribute__((warn_unused_result)); + +/** + * Find the reference counters for a given block (exposed for testing). + * + * @param block The ReferenceBlock in question + * + * @return A pointer to the reference counters for this block + **/ +ReferenceCount *getReferenceCountersForBlock(ReferenceBlock *block) + __attribute__((warn_unused_result)); + +/** + * Copy data from a reference block to a buffer ready to be written out + * (exposed for testing). + * + * @param block The block to copy + * @param buffer The char buffer to fill with the packed block + **/ +void packReferenceBlock(ReferenceBlock *block, void *buffer); + +/** + * Get the reference status of a block. Exposed only for unit testing. + * + * @param [in] refCounts The refcounts object + * @param [in] pbn The physical block number + * @param [out] statusPtr Where to put the status of the block + * + * @return A success or error code, specifically: + * VDO_OUT_OF_RANGE if the pbn is out of range. + **/ +int getReferenceStatus(RefCounts *refCounts, + PhysicalBlockNumber pbn, + ReferenceStatus *statusPtr) + __attribute__((warn_unused_result)); + +/** + * Find the first block with a reference count of zero in the specified range + * of reference counter indexes. Exposed for unit testing. 
+ * + * @param [in] refCounts The reference counters to scan + * @param [in] startIndex The array index at which to start scanning + * (included in the scan) + * @param [in] endIndex The array index at which to stop scanning + * (excluded from the scan) + * @param [out] indexPtr A pointer to hold the array index of the free block + * + * @return true if a free block was found in the specified range + **/ +bool findFreeBlock(const RefCounts *refCounts, + SlabBlockNumber startIndex, + SlabBlockNumber endIndex, + SlabBlockNumber *indexPtr) + __attribute__((warn_unused_result)); + +/** + * Request a RefCounts save its oldest dirty block asynchronously. + * + * @param refCounts The RefCounts object to notify + **/ +void saveOldestReferenceBlock(RefCounts *refCounts); + +/** + * Reset all reference counts back to RS_FREE. + * + * @param refCounts The reference counters to reset + **/ +void resetReferenceCounts(RefCounts *refCounts); + +#endif // REF_COUNTS_INTERNALS_H diff --git a/vdo/base/referenceBlock.h b/vdo/base/referenceBlock.h new file mode 100644 index 0000000..8014c3b --- /dev/null +++ b/vdo/base/referenceBlock.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceBlock.h#1 $ + */ + +#ifndef REFERENCE_BLOCK_H +#define REFERENCE_BLOCK_H + +#include "constants.h" +#include "journalPoint.h" +#include "types.h" +#include "waitQueue.h" + +/** + * A type representing a reference count. + **/ +typedef uint8_t ReferenceCount; + +/** + * Special ReferenceCount values. + **/ +enum { + EMPTY_REFERENCE_COUNT = 0, + MAXIMUM_REFERENCE_COUNT = 254, + PROVISIONAL_REFERENCE_COUNT = 255, +}; + +enum { + COUNTS_PER_SECTOR = ((VDO_SECTOR_SIZE - sizeof(PackedJournalPoint)) + / sizeof(ReferenceCount)), + COUNTS_PER_BLOCK = COUNTS_PER_SECTOR * SECTORS_PER_BLOCK, +}; + +/** + * The format of a ReferenceSector on disk. + **/ +typedef struct { + PackedJournalPoint commitPoint; + ReferenceCount counts[COUNTS_PER_SECTOR]; +} __attribute__((packed)) PackedReferenceSector; + +typedef struct { + PackedReferenceSector sectors[SECTORS_PER_BLOCK]; +} PackedReferenceBlock; + +/* + * ReferenceBlock structure + * + * Blocks are used as a proxy, permitting saves of partial refcounts. 
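+ * Each ReferenceBlock covers COUNTS_PER_BLOCK counters and is written to disk
+ * as a PackedReferenceBlock.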
+ **/ +typedef struct { + /** This block waits on the refCounts to tell it to write */ + Waiter waiter; + /** The parent RefCount structure */ + RefCounts *refCounts; + /** The number of references in this block that represent allocations */ + BlockSize allocatedCount; + /** The slab journal block on which this block must hold a lock */ + SequenceNumber slabJournalLock; + /** + * The slab journal block which should be released when this block + * is committed + **/ + SequenceNumber slabJournalLockToRelease; + /** The point up to which each sector is accurate on disk */ + JournalPoint commitPoints[SECTORS_PER_BLOCK]; + /** Whether this block has been modified since it was written to disk */ + bool isDirty; + /** Whether this block is currently writing */ + bool isWriting; +} ReferenceBlock; + +#endif // REFERENCE_BLOCK_H diff --git a/vdo/base/referenceCountRebuild.c b/vdo/base/referenceCountRebuild.c new file mode 100644 index 0000000..a3d91ac --- /dev/null +++ b/vdo/base/referenceCountRebuild.c @@ -0,0 +1,491 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceCountRebuild.c#6 $ + */ + +#include "referenceCountRebuild.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMap.h" +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "forest.h" +#include "constants.h" +#include "numUtils.h" +#include "refCounts.h" +#include "slabDepot.h" +#include "vdoInternal.h" +#include "vdoPageCache.h" + +/** + * A reference count rebuild completion. + * Note that the page completions kept in this structure are not immediately + * freed, so the corresponding pages will be locked down in the page cache + * until the rebuild frees them. 
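+ * The number of page completions, and thus of locked pages, is limited to
+ * half the configured cache size and never exceeds
+ * MAXIMUM_SIMULTANEOUS_BLOCK_MAP_RESTORATION_READS (see
+ * makeRebuildCompletion()).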
+ **/ +typedef struct { + /** completion header */ + VDOCompletion completion; + /** the completion for flushing the block map */ + VDOCompletion subTaskCompletion; + /** the thread on which all block map operations must be done */ + ThreadID logicalThreadID; + /** the admin thread */ + ThreadID adminThreadID; + /** the block map */ + BlockMap *blockMap; + /** the slab depot */ + SlabDepot *depot; + /** whether this recovery has been aborted */ + bool aborted; + /** whether we are currently launching the initial round of requests */ + bool launching; + /** The number of logical blocks observed used */ + BlockCount *logicalBlocksUsed; + /** The number of block map data blocks */ + BlockCount *blockMapDataBlocks; + /** the next page to fetch */ + PageCount pageToFetch; + /** the number of leaf pages in the block map */ + PageCount leafPages; + /** the last slot of the block map */ + BlockMapSlot lastSlot; + /** number of pending (non-ready) requests*/ + PageCount outstanding; + /** number of page completions */ + PageCount pageCount; + /** array of requested, potentially ready page completions */ + VDOPageCompletion pageCompletions[]; +} RebuildCompletion; + +/** + * Convert a VDOCompletion to a RebuildCompletion. + * + * @param completion The completion to convert + * + * @return The completion as a RebuildCompletion + **/ +__attribute__((warn_unused_result)) +static inline RebuildCompletion *asRebuildCompletion(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(RebuildCompletion, completion) == 0); + assertCompletionType(completion->type, REFERENCE_COUNT_REBUILD_COMPLETION); + return (RebuildCompletion *) completion; +} + +/** + * Free a RebuildCompletion and null out the reference to it. + * + * @param completionPtr a pointer to the completion to free + **/ +static void freeRebuildCompletion(VDOCompletion **completionPtr) +{ + VDOCompletion *completion = *completionPtr; + if (completion == NULL) { + return; + } + + RebuildCompletion *rebuild = asRebuildCompletion(completion); + destroyEnqueueable(&rebuild->subTaskCompletion); + destroyEnqueueable(completion); + FREE(rebuild); + *completionPtr = NULL; +} + +/** + * Free the RebuildCompletion and notify the parent that the block map + * rebuild is done. This callback is registered in rebuildBlockMap(). + * + * @param completion The RebuildCompletion + **/ +static void finishRebuild(VDOCompletion *completion) +{ + int result = completion->result; + VDOCompletion *parent = completion->parent; + freeRebuildCompletion(&completion); + finishCompletion(parent, result); +} + +/** + * Make a new rebuild completion. 
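+ * This must be called on the thread for logical zone zero.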
+ * + * @param [in] vdo The VDO + * @param [in] logicalBlocksUsed A pointer to hold the logical blocks used + * @param [in] blockMapDataBlocks A pointer to hold the number of block map + * data blocks + * @param [in] parent The parent of the rebuild completion + * @param [out] rebuildPtr The new block map rebuild completion + * + * @return a success or error code + **/ +static int makeRebuildCompletion(VDO *vdo, + BlockCount *logicalBlocksUsed, + BlockCount *blockMapDataBlocks, + VDOCompletion *parent, + RebuildCompletion **rebuildPtr) +{ + BlockMap *blockMap = getBlockMap(vdo); + PageCount pageCount + = minPageCount(getConfiguredCacheSize(vdo) >> 1, + MAXIMUM_SIMULTANEOUS_BLOCK_MAP_RESTORATION_READS); + + RebuildCompletion *rebuild; + int result = ALLOCATE_EXTENDED(RebuildCompletion, pageCount, + VDOPageCompletion, __func__, &rebuild); + if (result != UDS_SUCCESS) { + return result; + } + + result = initializeEnqueueableCompletion(&rebuild->completion, + REFERENCE_COUNT_REBUILD_COMPLETION, + vdo->layer); + if (result != VDO_SUCCESS) { + VDOCompletion *completion = &rebuild->completion; + freeRebuildCompletion(&completion); + return result; + } + + result = initializeEnqueueableCompletion(&rebuild->subTaskCompletion, + SUB_TASK_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + VDOCompletion *completion = &rebuild->completion; + freeRebuildCompletion(&completion); + return result; + } + + rebuild->blockMap = blockMap; + rebuild->depot = vdo->depot; + rebuild->logicalBlocksUsed = logicalBlocksUsed; + rebuild->blockMapDataBlocks = blockMapDataBlocks; + rebuild->pageCount = pageCount; + rebuild->leafPages = computeBlockMapPageCount(blockMap->entryCount); + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + rebuild->logicalThreadID = getLogicalZoneThread(threadConfig, 0); + rebuild->adminThreadID = getAdminThread(threadConfig); + + ASSERT_LOG_ONLY((getCallbackThreadID() == rebuild->logicalThreadID), + "%s must be called on logical thread %u (not %u)", __func__, + rebuild->logicalThreadID, getCallbackThreadID()); + prepareCompletion(&rebuild->completion, finishRebuild, finishRebuild, + rebuild->logicalThreadID, parent); + + *rebuildPtr = rebuild; + return VDO_SUCCESS; +} + +/** + * Flush the block map now that all the reference counts are rebuilt. This + * callback is registered in finishIfDone(). + * + * @param completion The sub-task completion + **/ +static void flushBlockMapUpdates(VDOCompletion *completion) +{ + logInfo("Flushing block map changes"); + prepareToFinishParent(completion, completion->parent); + drainBlockMap(asRebuildCompletion(completion->parent)->blockMap, + ADMIN_STATE_RECOVERING, completion); +} + +/** + * Check whether the rebuild is done. If it succeeded, continue by flushing the + * block map. + * + * @param rebuild The rebuild completion + * + * @return true if the rebuild is complete + **/ +static bool finishIfDone(RebuildCompletion *rebuild) +{ + if (rebuild->launching || (rebuild->outstanding > 0)) { + return false; + } + + if (rebuild->aborted) { + completeCompletion(&rebuild->completion); + return true; + } + + if (rebuild->pageToFetch < rebuild->leafPages) { + return false; + } + + prepareCompletion(&rebuild->subTaskCompletion, flushBlockMapUpdates, + finishParentCallback, rebuild->adminThreadID, rebuild); + invokeCallback(&rebuild->subTaskCompletion); + return true; +} + +/** + * Record that there has been an error during the rebuild. 
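+ * Only the first error is recorded; setCompletionResult() leaves any previously saved error in place.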
+ * + * @param rebuild The rebuild completion + * @param result The error result to use, if one is not already saved + **/ +static void abortRebuild(RebuildCompletion *rebuild, int result) +{ + rebuild->aborted = true; + setCompletionResult(&rebuild->completion, result); +} + +/** + * Handle an error loading a page. + * + * @param completion The VDOPageCompletion + **/ +static void handlePageLoadError(VDOCompletion *completion) +{ + RebuildCompletion *rebuild = asRebuildCompletion(completion->parent); + rebuild->outstanding--; + abortRebuild(rebuild, completion->result); + releaseVDOPageCompletion(completion); + finishIfDone(rebuild); +} + +/** + * Rebuild reference counts from a block map page. + * + * @param rebuild The rebuild completion + * @param completion The page completion holding the page + * + * @return VDO_SUCCESS or an error + **/ +static int rebuildReferenceCountsFromPage(RebuildCompletion *rebuild, + VDOCompletion *completion) +{ + BlockMapPage *page = dereferenceWritableVDOPage(completion); + int result = ASSERT(page != NULL, "page available"); + if (result != VDO_SUCCESS) { + return result; + } + + if (!isBlockMapPageInitialized(page)) { + return VDO_SUCCESS; + } + + // Remove any bogus entries which exist beyond the end of the logical space. + if (getBlockMapPagePBN(page) == rebuild->lastSlot.pbn) { + for (SlotNumber slot = rebuild->lastSlot.slot; + slot < BLOCK_MAP_ENTRIES_PER_PAGE; slot++) { + DataLocation mapping = unpackBlockMapEntry(&page->entries[slot]); + if (isMappedLocation(&mapping)) { + page->entries[slot] = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + requestVDOPageWrite(completion); + } + } + } + + // Inform the slab depot of all entries on this page. + for (SlotNumber slot = 0; slot < BLOCK_MAP_ENTRIES_PER_PAGE; slot++) { + DataLocation mapping = unpackBlockMapEntry(&page->entries[slot]); + if (!isValidLocation(&mapping)) { + // This entry is invalid, so remove it from the page. + page->entries[slot] = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + requestVDOPageWrite(completion); + continue; + } + + if (!isMappedLocation(&mapping)) { + continue; + } + + (*rebuild->logicalBlocksUsed)++; + if (mapping.pbn == ZERO_BLOCK) { + continue; + } + + if (!isPhysicalDataBlock(rebuild->depot, mapping.pbn)) { + // This is a nonsense mapping. Remove it from the map so we're at least + // consistent and mark the page dirty. + page->entries[slot] = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + requestVDOPageWrite(completion); + continue; + } + + Slab *slab = getSlab(rebuild->depot, mapping.pbn); + int result = adjustReferenceCountForRebuild(slab->referenceCounts, + mapping.pbn, DATA_INCREMENT); + if (result != VDO_SUCCESS) { + logErrorWithStringError(result, + "Could not adjust reference count for PBN" + " %llu, slot %u mapped to PBN %llu", + getBlockMapPagePBN(page), slot, mapping.pbn); + page->entries[slot] = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + requestVDOPageWrite(completion); + } + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void fetchPage(RebuildCompletion *rebuild, VDOCompletion *completion); + +/** + * Process a page which has just been loaded. This callback is registered by + * fetchPage(). 
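+ * The page completion is released here and then immediately reused to fetch the next leaf page that has not yet been requested.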
+ * + * @param completion The VDOPageCompletion for the fetched page + **/ +static void pageLoaded(VDOCompletion *completion) +{ + RebuildCompletion *rebuild = asRebuildCompletion(completion->parent); + rebuild->outstanding--; + + int result = rebuildReferenceCountsFromPage(rebuild, completion); + if (result != VDO_SUCCESS) { + abortRebuild(rebuild, result); + } + + releaseVDOPageCompletion(completion); + if (finishIfDone(rebuild)) { + return; + } + + // Advance progress to the next page, and fetch the next page we + // haven't yet requested. + fetchPage(rebuild, completion); +} + +/** + * Fetch a page from the block map. + * + * @param rebuild the RebuildCompletion + * @param completion the page completion to use + **/ +static void fetchPage(RebuildCompletion *rebuild, VDOCompletion *completion) +{ + while (rebuild->pageToFetch < rebuild->leafPages) { + PhysicalBlockNumber pbn = findBlockMapPagePBN(rebuild->blockMap, + rebuild->pageToFetch++); + if (pbn == ZERO_BLOCK) { + continue; + } + + if (!isPhysicalDataBlock(rebuild->depot, pbn)) { + abortRebuild(rebuild, VDO_BAD_MAPPING); + if (finishIfDone(rebuild)) { + return; + } + continue; + } + + initVDOPageCompletion(((VDOPageCompletion *) completion), + rebuild->blockMap->zones[0].pageCache, + pbn, true, &rebuild->completion, + pageLoaded, handlePageLoadError); + rebuild->outstanding++; + getVDOPageAsync(completion); + return; + } +} + +/** + * Rebuild reference counts from the leaf block map pages now that reference + * counts have been rebuilt from the interior tree pages (which have been + * loaded in the process). This callback is registered in + * rebuildReferenceCounts(). + * + * @param completion The sub-task completion + **/ +static void rebuildFromLeaves(VDOCompletion *completion) +{ + RebuildCompletion *rebuild = asRebuildCompletion(completion->parent); + *rebuild->logicalBlocksUsed = 0; + + // The PBN calculation doesn't work until the tree pages have been loaded, + // so we can't set this value at the start of rebuild. + rebuild->lastSlot = (BlockMapSlot) { + .slot = rebuild->blockMap->entryCount % BLOCK_MAP_ENTRIES_PER_PAGE, + .pbn = findBlockMapPagePBN(rebuild->blockMap, rebuild->leafPages - 1), + }; + + // Prevent any page from being processed until all pages have been launched. + rebuild->launching = true; + for (PageCount i = 0; i < rebuild->pageCount; i++) { + fetchPage(rebuild, &rebuild->pageCompletions[i].completion); + } + rebuild->launching = false; + finishIfDone(rebuild); +} + +/** + * Process a single entry from the block map tree. + * + *
Implements EntryCallback. + * + * @param pbn A pbn which holds a block map tree page + * @param completion The parent completion of the traversal + * + * @return VDO_SUCCESS or an error + **/ +static int processEntry(PhysicalBlockNumber pbn, VDOCompletion *completion) +{ + RebuildCompletion *rebuild = asRebuildCompletion(completion->parent); + if ((pbn == ZERO_BLOCK) || !isPhysicalDataBlock(rebuild->depot, pbn)) { + return logErrorWithStringError(VDO_BAD_CONFIGURATION, + "PBN %llu out of range", + pbn); + } + + Slab *slab = getSlab(rebuild->depot, pbn); + int result = adjustReferenceCountForRebuild(slab->referenceCounts, pbn, + BLOCK_MAP_INCREMENT); + if (result != VDO_SUCCESS) { + return logErrorWithStringError(result, + "Could not adjust reference count for " + "block map tree PBN %llu", + pbn); + } + + (*rebuild->blockMapDataBlocks)++; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void rebuildReferenceCounts(VDO *vdo, + VDOCompletion *parent, + BlockCount *logicalBlocksUsed, + BlockCount *blockMapDataBlocks) +{ + RebuildCompletion *rebuild; + int result = makeRebuildCompletion(vdo, logicalBlocksUsed, + blockMapDataBlocks, parent, &rebuild); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + // Completion chaining from page cache hits can lead to stack overflow + // during the rebuild, so clear out the cache before this rebuild phase. + result = invalidateVDOPageCache(rebuild->blockMap->zones[0].pageCache); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + // First traverse the block map trees. + *rebuild->blockMapDataBlocks = 0; + VDOCompletion *completion = &rebuild->subTaskCompletion; + prepareCompletion(completion, rebuildFromLeaves, finishParentCallback, + rebuild->logicalThreadID, rebuild); + traverseForest(rebuild->blockMap, processEntry, completion); +} diff --git a/vdo/base/referenceCountRebuild.h b/vdo/base/referenceCountRebuild.h new file mode 100644 index 0000000..59363ac --- /dev/null +++ b/vdo/base/referenceCountRebuild.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceCountRebuild.h#1 $ + */ + +#ifndef REFERENCE_COUNT_REBUILD_H +#define REFERENCE_COUNT_REBUILD_H + +#include "types.h" + +/** + * Rebuild the reference counts from the block map (read-only rebuild). 
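+ * The block map tree pages are traversed first, then the leaf pages; the logical block and block map data block tallies are accumulated into the caller's variables as the pages are processed.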
+ * + * @param [in] vdo The VDO + * @param [in] parent The completion to notify when the rebuild is + * complete + * @param [out] logicalBlocksUsed A pointer to hold the logical blocks used + * @param [out] blockMapDataBlocks A pointer to hold the number of block map + * data blocks + **/ +void rebuildReferenceCounts(VDO *vdo, + VDOCompletion *parent, + BlockCount *logicalBlocksUsed, + BlockCount *blockMapDataBlocks); + +#endif // REFERENCE_COUNT_REBUILD_H diff --git a/vdo/base/referenceOperation.c b/vdo/base/referenceOperation.c new file mode 100644 index 0000000..a8ea9a0 --- /dev/null +++ b/vdo/base/referenceOperation.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceOperation.c#1 $ + */ + +#include "referenceOperation.h" + +#include "physicalZone.h" +#include "types.h" + +/**********************************************************************/ +static PBNLock *returnPBNLock(ReferenceOperation operation) +{ + return (PBNLock *) operation.context; +} + +/**********************************************************************/ +void setUpReferenceOperationWithLock(JournalOperation type, + PhysicalBlockNumber pbn, + BlockMappingState state, + PBNLock *lock, + ReferenceOperation *operation) +{ + *operation = (ReferenceOperation) { + .type = type, + .pbn = pbn, + .state = state, + .lockGetter = returnPBNLock, + .context = lock, + }; +} + +/**********************************************************************/ +static PBNLock *lookUpPBNLock(ReferenceOperation operation) +{ + return ((operation.context == NULL) + ? NULL : getPBNLock(operation.context, operation.pbn)); +} + +/**********************************************************************/ +void setUpReferenceOperationWithZone(JournalOperation type, + PhysicalBlockNumber pbn, + BlockMappingState state, + PhysicalZone *zone, + ReferenceOperation *operation) +{ + *operation = (ReferenceOperation) { + .type = type, + .pbn = pbn, + .state = state, + .lockGetter = lookUpPBNLock, + .context = zone, + }; +} diff --git a/vdo/base/referenceOperation.h b/vdo/base/referenceOperation.h new file mode 100644 index 0000000..c846ec6 --- /dev/null +++ b/vdo/base/referenceOperation.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceOperation.h#1 $ + */ + +#ifndef REFERENCE_OPERATION_H +#define REFERENCE_OPERATION_H + +#include "types.h" + +typedef struct referenceOperation ReferenceOperation; + +/** + * Get the PBNLock associated with a ReferenceOperation. + * + * @param operation The ReferenceOperation + * + * @return The PBNLock on the block of a ReferenceOperation or NULL if there + * isn't one + **/ +typedef PBNLock *PBNLockGetter(ReferenceOperation operation); + +/** + * The current operation on a physical block (from the point of view of the + * DataVIO doing the operation) + **/ +struct referenceOperation { + /** The operation being performed */ + JournalOperation type; + /** The PBN of the block being operated on */ + PhysicalBlockNumber pbn; + /** The mapping state of the block being operated on */ + BlockMappingState state; + /** A function to use to get any PBNLock associated with this operation */ + PBNLockGetter *lockGetter; + /** The context to pass to the PBNLockGetter */ + void *context; +}; + +/** + * Get the PBNLock associated with the current ReferenceOperation. + * + * @param operation The reference operation + * + * @return The PBNLock on the block of the current operation or NULL if there + * isn't one + **/ +__attribute__((warn_unused_result)) +static inline +PBNLock *getReferenceOperationPBNLock(ReferenceOperation operation) +{ + return ((operation.lockGetter == NULL) + ? NULL : operation.lockGetter(operation)); +} + +/** + * Set up a ReferenceOperation for which we already have the lock. + * + * @param type The type of operation + * @param pbn The PBN of the block on which to operate + * @param state The mapping state of the block on which to operate + * @param lock The PBNLock to associate with the operation + * @param operation The ReferenceOperation to set up + **/ +void setUpReferenceOperationWithLock(JournalOperation type, + PhysicalBlockNumber pbn, + BlockMappingState state, + PBNLock *lock, + ReferenceOperation *operation); + +/** + * Set up a ReferenceOperation for which we will need to look up the lock later. + * + * @param type The type of operation + * @param pbn The PBN of the block on which to operate + * @param state The mapping state of the block on which to operate + * @param zone The PhysicalZone from which the PBNLock can be retrieved + * when needed + * @param operation The ReferenceOperation to set up + **/ +void setUpReferenceOperationWithZone(JournalOperation type, + PhysicalBlockNumber pbn, + BlockMappingState state, + PhysicalZone *zone, + ReferenceOperation *operation); + +#endif // REFERENCE_OPERATION_H diff --git a/vdo/base/releaseVersions.h b/vdo/base/releaseVersions.h new file mode 100644 index 0000000..7620f17 --- /dev/null +++ b/vdo/base/releaseVersions.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef RELEASE_VERSIONS_H +#define RELEASE_VERSIONS_H + +enum { + OXYGEN_RELEASE_VERSION_NUMBER = 109583, + FLUORINE_RELEASE_VERSION_NUMBER = 115838, + NEON_RELEASE_VERSION_NUMBER = 120965, + SODIUM_RELEASE_VERSION_NUMBER = 127441, + MAGNESIUM_RELEASE_VERSION_NUMBER = 131337, + ALUMINUM_RELEASE_VERSION_NUMBER = 133524, + HEAD_RELEASE_VERSION_NUMBER = 0, + CURRENT_RELEASE_VERSION_NUMBER = ALUMINUM_RELEASE_VERSION_NUMBER, +}; + +#endif /* not RELEASE_VERSIONS_H */ diff --git a/vdo/base/ringNode.h b/vdo/base/ringNode.h new file mode 100644 index 0000000..5f389f4 --- /dev/null +++ b/vdo/base/ringNode.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/ringNode.h#1 $ + */ + +#ifndef RING_NODE_H +#define RING_NODE_H + +#include "types.h" + +/** + * A ring node is a member of a doubly-linked circular list. + * + * Each node is usually embedded within a data structure that contains the + * relevant payload. In addition the ring head is also represented by a + * node where the next field designates the first element of the ring and the + * prev field designates the last. + * + * An empty ring contains next and prev fields that point back to the ring + * head itself. + * + * Typical iteration over a ring, from the front and back: + * + * for (RingNode *n = head->next; n != head; n = n->next) { ... } + * for (RingNode *p = head->prev; p != head; p = p->prev) { ... } + **/ +typedef struct ringNode RingNode; + +struct ringNode { + RingNode *next; + RingNode *prev; +}; + +/** + * Initialize a ring to be empty. + * + * @param head The head of the ring + **/ +static inline void initializeRing(RingNode *head) +{ + head->next = head->prev = head; +} + +/** + * Check whether a ring is empty. + * + * @param head The head of the ring + * + * @return true if the ring is empty + **/ +static inline bool isRingEmpty(const RingNode *head) +{ + return (head->next == head); +} + +/** + * Check whether a ring contains exactly one node. + * + * @param head The head of the ring + * + * @return true if the ring contains exactly one member + **/ +static inline bool isRingSingleton(const RingNode *head) +{ + return (!isRingEmpty(head) && (head->prev == head->next)); +} + +/** + * Unsplice a contiguous chain of at least one node from its ring. 
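+ * For example, if the ring at ``head`` is not empty, unspliceRingChain(head->next, head->prev) empties ``head`` and leaves its former members linked together as a separate ring.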
+ * + * @param first the first entry in the ring to unsplice + * @param last the last entry in the ring to unsplice, + * may be the same as ``first`` + * + * The effect of this is to create two rings, the one designated + * by first through last, and the other consisting of anything remaining. + **/ +static inline void unspliceRingChain(RingNode *first, + RingNode *last) +{ + first->prev->next = last->next; + last->next->prev = first->prev; + first->prev = last; + last->next = first; +} + +/** + * Remove a ring node from its ring. + * + * @param node the ring node + * + * @return the removed node, for convenience + **/ +static inline RingNode *unspliceRingNode(RingNode *node) +{ + unspliceRingChain(node, node); + return node; +} + +/** + * Splice a contiguous chain of at least one node after the specified entry, + * which may be the head of a ring. + * + * @param first the first entry in a contiguous span of nodes + * @param last the last entry in a contiguous span of nodes, + * may be the same as ``first`` + * @param where the entry after which ``first`` through ``last`` + * shall appear + * + * The effect of this is to unsplice first through last (if necessary) and + * insert them after ``where`` so that the previous nodes after ``where`` + * now appear after ``last``. + **/ +static inline void spliceRingChainAfter(RingNode *first, + RingNode *last, + RingNode *where) +{ + if (last->next != first) { + unspliceRingChain(first, last); + } + last->next = where->next; + first->prev = where; + where->next->prev = last; + where->next = first; +} + +/** + * Splice a contiguous chain of at least one node before the specified entry, + * which may be the tail of a list. + * + * @param first the first entry in a contiguous span of nodes + * @param last the last entry in a contiguous span of nodes, + * may be the same as ``first`` + * @param where the entry before which ``first`` through ``last`` + * shall appear + * + * The effect of this is to unsplice first through last (if necessary) and + * insert them before ``where`` so that the previous nodes before ``where`` + * now appear before ``first``. + **/ +static inline void spliceRingChainBefore(RingNode *first, + RingNode *last, + RingNode *where) +{ + if (last->next != first) { + unspliceRingChain(first, last); + } + first->prev = where->prev; + last->next = where; + where->prev->next = first; + where->prev = last; +} + +/** + * Push a single node on the end of a ring. + * + * @param head The ring head + * @param node The node to push + **/ +static inline void pushRingNode(RingNode *head, RingNode *node) +{ + spliceRingChainBefore(node, node, head); +} + +/** + * Pop a single node off the end of a ring. + * + * @param head The ring head + * + * @return NULL if the ring was empty, otherwise the node that was + * removed from the ring (``head->prev``) + **/ +static inline RingNode *popRingNode(RingNode *head) +{ + return (isRingEmpty(head) ? NULL : unspliceRingNode(head->prev)); +} + +/** + * Remove a single node off the front of the list + **/ +static inline RingNode *chopRingNode(RingNode *head) +{ + return (isRingEmpty(head) ? NULL : unspliceRingNode(head->next)); +} + +#endif // RING_NODE_H diff --git a/vdo/base/slab.c b/vdo/base/slab.c new file mode 100644 index 0000000..f2903d6 --- /dev/null +++ b/vdo/base/slab.c @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slab.c#9 $ + */ + +#include "slab.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminState.h" +#include "blockAllocatorInternals.h" +#include "completion.h" +#include "constants.h" +#include "numUtils.h" +#include "pbnLock.h" +#include "recoveryJournal.h" +#include "refCounts.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "slabJournalInternals.h" +#include "slabSummary.h" + +/**********************************************************************/ +int configureSlab(BlockCount slabSize, + BlockCount slabJournalBlocks, + SlabConfig *slabConfig) +{ + if (slabJournalBlocks >= slabSize) { + return VDO_BAD_CONFIGURATION; + } + + /* + * This calculation should technically be a recurrence, but the total number + * of metadata blocks is currently less than a single block of refCounts, so + * we'd gain at most one data block in each slab with more iteration. + */ + BlockCount refBlocks + = getSavedReferenceCountSize(slabSize - slabJournalBlocks); + BlockCount metaBlocks = (refBlocks + slabJournalBlocks); + + // Make sure test code hasn't configured slabs to be too small. + if (metaBlocks >= slabSize) { + return VDO_BAD_CONFIGURATION; + } + + /* + * If the slab size is very small, assume this must be a unit test and + * override the number of data blocks to be a power of two (wasting blocks + * in the slab). Many tests need their dataBlocks fields to be the exact + * capacity of the configured volume, and that used to fall out since they + * use a power of two for the number of data blocks, the slab size was a + * power of two, and every block in a slab was a data block. + * + * XXX Try to figure out some way of structuring testParameters and unit + * tests so this hack isn't needed without having to edit several unit tests + * every time the metadata size changes by one block. + */ + BlockCount dataBlocks = slabSize - metaBlocks; + if ((slabSize < 1024) && !isPowerOfTwo(dataBlocks)) { + dataBlocks = ((BlockCount) 1 << logBaseTwo(dataBlocks)); + } + + /* + * Configure the slab journal thresholds. The flush threshold is 168 of 224 + * blocks in production, or 3/4ths, so we use this ratio for all sizes. + */ + BlockCount flushingThreshold = ((slabJournalBlocks * 3) + 3) / 4; + /* + * The blocking threshold should be far enough from the flushing + * threshold to not produce delays, but far enough from the end of the + * journal to allow multiple successive recovery failures. + */ + BlockCount remaining = slabJournalBlocks - flushingThreshold; + BlockCount blockingThreshold = flushingThreshold + ((remaining * 5) / 7); + /* + * The scrubbing threshold should be at least 2048 entries before the end of + * the journal.
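+ * (With the production 224-block journal, the formulas above give flushingThreshold = 168, remaining = 56, and blockingThreshold = 168 + 40 = 208.)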
+ */ + BlockCount minimalExtraSpace + = 1 + (MAXIMUM_USER_VIOS / SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK); + BlockCount scrubbingThreshold = blockingThreshold; + if (slabJournalBlocks > minimalExtraSpace) { + scrubbingThreshold = slabJournalBlocks - minimalExtraSpace; + } + if (blockingThreshold > scrubbingThreshold) { + blockingThreshold = scrubbingThreshold; + } + + *slabConfig = (SlabConfig) { + .slabBlocks = slabSize, + .dataBlocks = dataBlocks, + .referenceCountBlocks = refBlocks, + .slabJournalBlocks = slabJournalBlocks, + .slabJournalFlushingThreshold = flushingThreshold, + .slabJournalBlockingThreshold = blockingThreshold, + .slabJournalScrubbingThreshold = scrubbingThreshold + }; + return VDO_SUCCESS; +} + +/**********************************************************************/ +PhysicalBlockNumber getSlabJournalStartBlock(const SlabConfig *slabConfig, + PhysicalBlockNumber origin) +{ + return origin + slabConfig->dataBlocks + slabConfig->referenceCountBlocks; +} + +/**********************************************************************/ +int makeSlab(PhysicalBlockNumber slabOrigin, + BlockAllocator *allocator, + PhysicalBlockNumber translation, + RecoveryJournal *recoveryJournal, + SlabCount slabNumber, + bool isNew, + Slab **slabPtr) +{ + Slab *slab; + int result = ALLOCATE(1, Slab, __func__, &slab); + if (result != VDO_SUCCESS) { + return result; + } + + const SlabConfig *slabConfig = getSlabConfig(allocator->depot); + + slab->allocator = allocator; + slab->start = slabOrigin; + slab->end = slab->start + slabConfig->slabBlocks; + slab->slabNumber = slabNumber; + initializeRing(&slab->ringNode); + + slab->refCountsOrigin = slabOrigin + slabConfig->dataBlocks + translation; + slab->journalOrigin = (getSlabJournalStartBlock(slabConfig, slabOrigin) + + translation); + + result = makeSlabJournal(allocator, slab, recoveryJournal, &slab->journal); + if (result != VDO_SUCCESS) { + freeSlab(&slab); + return result; + } + + if (isNew) { + slab->state.state = ADMIN_STATE_NEW; + result = allocateRefCountsForSlab(slab); + if (result != VDO_SUCCESS) { + freeSlab(&slab); + return result; + } + } + + *slabPtr = slab; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int allocateRefCountsForSlab(Slab *slab) +{ + BlockAllocator *allocator = slab->allocator; + const SlabConfig *slabConfig = getSlabConfig(allocator->depot); + + int result = ASSERT(slab->referenceCounts == NULL, + "Slab %u doesn't allocate refcounts twice", + slab->slabNumber); + if (result != VDO_SUCCESS) { + return result; + } + + return makeRefCounts(slabConfig->dataBlocks, slab, slab->refCountsOrigin, + allocator->readOnlyNotifier, &slab->referenceCounts); +} + +/**********************************************************************/ +void freeSlab(Slab **slabPtr) +{ + Slab *slab = *slabPtr; + if (slab == NULL) { + return; + } + + unspliceRingNode(&slab->ringNode); + freeSlabJournal(&slab->journal); + freeRefCounts(&slab->referenceCounts); + FREE(slab); + *slabPtr = NULL; +} + +/**********************************************************************/ +ZoneCount getSlabZoneNumber(Slab *slab) +{ + return slab->allocator->zoneNumber; +} + +/**********************************************************************/ +void markSlabReplaying(Slab *slab) +{ + if (slab->status == SLAB_REBUILT) { + slab->status = SLAB_REPLAYING; + } +} + +/**********************************************************************/ +void markSlabUnrecovered(Slab *slab) +{ + slab->status = SLAB_REQUIRES_SCRUBBING; 
+} + +/**********************************************************************/ +BlockCount getSlabFreeBlockCount(const Slab *slab) +{ + return getUnreferencedBlockCount(slab->referenceCounts); +} + +/**********************************************************************/ +int modifySlabReferenceCount(Slab *slab, + const JournalPoint *journalPoint, + ReferenceOperation operation) +{ + if (slab == NULL) { + return VDO_SUCCESS; + } + + /* + * If the slab is unrecovered, preserve the refCount state and let scrubbing + * correct the refCount. Note that the slab journal has already captured all + * refCount updates. + */ + if (isUnrecoveredSlab(slab)) { + SequenceNumber entryLock = journalPoint->sequenceNumber; + adjustSlabJournalBlockReference(slab->journal, entryLock, -1); + return VDO_SUCCESS; + } + + bool freeStatusChanged; + int result = adjustReferenceCount(slab->referenceCounts, operation, + journalPoint, &freeStatusChanged); + if (result != VDO_SUCCESS) { + return result; + } + + if (freeStatusChanged) { + adjustFreeBlockCount(slab, !isIncrementOperation(operation.type)); + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int acquireProvisionalReference(Slab *slab, + PhysicalBlockNumber pbn, + PBNLock *lock) +{ + if (hasProvisionalReference(lock)) { + return VDO_SUCCESS; + } + + int result = provisionallyReferenceBlock(slab->referenceCounts, pbn, lock); + if (result != VDO_SUCCESS) { + return result; + } + + if (hasProvisionalReference(lock)) { + adjustFreeBlockCount(slab, false); + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int slabBlockNumberFromPBN(Slab *slab, + PhysicalBlockNumber physicalBlockNumber, + SlabBlockNumber *slabBlockNumberPtr) +{ + if (physicalBlockNumber < slab->start) { + return VDO_OUT_OF_RANGE; + } + + uint64_t slabBlockNumber = physicalBlockNumber - slab->start; + if (slabBlockNumber >= getSlabConfig(slab->allocator->depot)->dataBlocks) { + return VDO_OUT_OF_RANGE; + } + + *slabBlockNumberPtr = slabBlockNumber; + return VDO_SUCCESS; +} + +/**********************************************************************/ +bool shouldSaveFullyBuiltSlab(const Slab *slab) +{ + // Write out the refCounts if the slab has written them before, or it has + // any non-zero reference counts, or there are any slab journal blocks. + BlockCount dataBlocks = getSlabConfig(slab->allocator->depot)->dataBlocks; + return (mustLoadRefCounts(slab->allocator->summary, slab->slabNumber) + || (getSlabFreeBlockCount(slab) != dataBlocks) + || !isSlabJournalBlank(slab->journal)); +} + +/** + * Initiate a slab action. + * + * Implements AdminInitiator. 
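+ * (Drain operations mark a scrubbing slab as rebuilding and drain its slab journal and reference counts; load operations decode the slab journal; resume operations requeue the slab; anything else finishes with VDO_INVALID_ADMIN_STATE.)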
+ **/ +static void initiateSlabAction(AdminState *state) +{ + Slab *slab = container_of(state, Slab, state); + if (isDraining(state)) { + if (state->state == ADMIN_STATE_SCRUBBING) { + slab->status = SLAB_REBUILDING; + } + + drainSlabJournal(slab->journal); + + if (slab->referenceCounts != NULL) { + drainRefCounts(slab->referenceCounts); + } + + checkIfSlabDrained(slab); + return; + } + + if (isLoading(state)) { + decodeSlabJournal(slab->journal); + return; + } + + if (isResuming(state)) { + queueSlab(slab); + finishResuming(state); + return; + } + + finishOperationWithResult(state, VDO_INVALID_ADMIN_STATE); +} + +/**********************************************************************/ +void startSlabAction(Slab *slab, + AdminStateCode operation, + VDOCompletion *parent) +{ + startOperationWithWaiter(&slab->state, operation, parent, + initiateSlabAction); +} + +/**********************************************************************/ +void notifySlabJournalIsLoaded(Slab *slab, int result) +{ + if ((result == VDO_SUCCESS) && isCleanLoad(&slab->state)) { + // Since this is a normal or new load, we don't need the memory to read and + // process the recovery journal, so we can allocate reference counts now. + result = allocateRefCountsForSlab(slab); + } + + finishLoadingWithResult(&slab->state, result); +} + +/**********************************************************************/ +bool isSlabOpen(Slab *slab) +{ + return (!isQuiescing(&slab->state) && !isQuiescent(&slab->state)); +} + +/**********************************************************************/ +bool isSlabDraining(Slab *slab) +{ + return isDraining(&slab->state); +} + +/**********************************************************************/ +void checkIfSlabDrained(Slab *slab) +{ + if (isDraining(&slab->state) + && !isSlabJournalActive(slab->journal) + && ((slab->referenceCounts == NULL) + || !areRefCountsActive(slab->referenceCounts))) { + finishDrainingWithResult(&slab->state, + (isReadOnly(slab->allocator->readOnlyNotifier) + ? VDO_READ_ONLY : VDO_SUCCESS)); + } +} + +/**********************************************************************/ +void notifySlabJournalIsDrained(Slab *slab, int result) +{ + if (slab->referenceCounts == NULL) { + // This can happen when shutting down a VDO that was in read-only mode when + // loaded. 
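+ // There are no reference counts to drain in that case, so report them as drained immediately.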
+ notifyRefCountsAreDrained(slab, result); + return; + } + + setOperationResult(&slab->state, result); + drainRefCounts(slab->referenceCounts); +} + +/**********************************************************************/ +void notifyRefCountsAreDrained(Slab *slab, int result) +{ + finishDrainingWithResult(&slab->state, result); +} + +/**********************************************************************/ +bool isSlabResuming(Slab *slab) +{ + return isResuming(&slab->state); +} + +/**********************************************************************/ +void finishScrubbingSlab(Slab *slab) +{ + slab->status = SLAB_REBUILT; + queueSlab(slab); + reopenSlabJournal(slab->journal); +} + +/**********************************************************************/ +static const char *statusToString(SlabRebuildStatus status) +{ + switch (status) { + case SLAB_REBUILT: + return "REBUILT"; + case SLAB_REQUIRES_SCRUBBING: + return "SCRUBBING"; + case SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING: + return "PRIORITY_SCRUBBING"; + case SLAB_REBUILDING: + return "REBUILDING"; + case SLAB_REPLAYING: + return "REPLAYING"; + default: + return "UNKNOWN"; + } +} + +/**********************************************************************/ +void dumpSlab(const Slab *slab) +{ + if (slab->referenceCounts != NULL) { + // Terse because there are a lot of slabs to dump and syslog is lossy. + logInfo("slab %u: P%u, %llu free", + slab->slabNumber, slab->priority, getSlabFreeBlockCount(slab)); + } else { + logInfo("slab %u: status %s", slab->slabNumber, + statusToString(slab->status)); + } + + dumpSlabJournal(slab->journal); + + if (slab->referenceCounts != NULL) { + dumpRefCounts(slab->referenceCounts); + } else { + logInfo("refCounts is null"); + } +} diff --git a/vdo/base/slab.h b/vdo/base/slab.h new file mode 100644 index 0000000..c7f204b --- /dev/null +++ b/vdo/base/slab.h @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slab.h#8 $ + */ + +#ifndef VDO_SLAB_H +#define VDO_SLAB_H + +#include "permassert.h" + +#include "adminState.h" +#include "fixedLayout.h" +#include "journalPoint.h" +#include "referenceOperation.h" +#include "ringNode.h" +#include "types.h" + +typedef uint32_t SlabBlockNumber; + +typedef enum { + SLAB_REBUILT = 0, + SLAB_REPLAYING, + SLAB_REQUIRES_SCRUBBING, + SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING, + SLAB_REBUILDING, +} SlabRebuildStatus; + +/** + * This is the type declaration for the Slab type. (The struct tag is named + * vdoSlab to avoid a conflict with the linux kernel type). A Slab currently + * consists of a run of 2^23 data blocks, but that will soon change to + * dedicate a small number of those blocks for metadata storage for the + * reference counts and slab journal for the slab. 
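+ * Within each slab the data blocks come first, followed by the reference count blocks and then the slab journal blocks (see refCountsOrigin and journalOrigin below).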
+ **/ +struct vdoSlab { + /** A RingNode to queue this slab in a BlockAllocator ring */ + RingNode ringNode; + + /** The BlockAllocator that owns this slab */ + BlockAllocator *allocator; + + /** The reference counts for the data blocks in this slab */ + RefCounts *referenceCounts; + /** The journal for this slab */ + SlabJournal *journal; + + /** The slab number of this slab */ + SlabCount slabNumber; + /** The offset in the allocator partition of the first block in this slab */ + PhysicalBlockNumber start; + /** The offset of the first block past the end of this slab */ + PhysicalBlockNumber end; + /** The starting translated PBN of the slab journal */ + PhysicalBlockNumber journalOrigin; + /** The starting translated PBN of the reference counts */ + PhysicalBlockNumber refCountsOrigin; + + /** The administrative state of the slab */ + AdminState state; + /** The status of the slab */ + SlabRebuildStatus status; + /** Whether the slab was ever queued for scrubbing */ + bool wasQueuedForScrubbing; + + /** The priority at which this slab has been queued for allocation */ + uint8_t priority; +}; + +/** + * Measure and initialize the configuration to use for each slab. + * + * @param [in] slabSize The number of blocks per slab + * @param [in] slabJournalBlocks The number of blocks for the slab journal + * @param [out] slabConfig The slab configuration to initialize + * + * @return VDO_SUCCESS or an error code + **/ +int configureSlab(BlockCount slabSize, + BlockCount slabJournalBlocks, + SlabConfig *slabConfig) + __attribute__((warn_unused_result)); + +/** + * Convert a Slab's RingNode back to the Slab. + * + * @param ringNode The RingNode to convert + * + * @return The RingNode as a Slab + **/ +static inline Slab *slabFromRingNode(RingNode *ringNode) +{ + STATIC_ASSERT(offsetof(Slab, ringNode) == 0); + return (Slab *) ringNode; +} + +/** + * Get the physical block number of the start of the slab journal + * relative to the start block allocator partition. + * + * @param slabConfig The slab configuration of the VDO + * @param origin The first block of the slab + **/ +__attribute__((warn_unused_result)) +PhysicalBlockNumber getSlabJournalStartBlock(const SlabConfig *slabConfig, + PhysicalBlockNumber origin); + +/** + * Construct a new, empty slab. + * + * @param [in] slabOrigin The physical block number within the block + * allocator partition of the first block in the + * slab + * @param [in] allocator The block allocator to which the slab belongs + * @param [in] translation The translation from the depot's partition to + * the physical storage + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [in] slabNumber The slab number of the slab + * @param [in] isNew true if this slab is being + * allocated as part of a resize + * @param [out] slabPtr A pointer to receive the new slab + * + * @return VDO_SUCCESS or an error code + **/ +int makeSlab(PhysicalBlockNumber slabOrigin, + BlockAllocator *allocator, + PhysicalBlockNumber translation, + RecoveryJournal *recoveryJournal, + SlabCount slabNumber, + bool isNew, + Slab **slabPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate the reference counts for a slab. + * + * @param slab The slab whose reference counts need allocation. + * + * @return VDO_SUCCESS or an error code + **/ +int allocateRefCountsForSlab(Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Destroy a slab and null out the reference to it. 
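+ * The slab is unspliced from any allocator ring it is on, and its slab journal and reference counts are freed along with it.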
+ * + * @param slabPtr The reference to the slab to destroy + **/ +void freeSlab(Slab **slabPtr); + +/** + * Get the physical zone number of a slab. + * + * @param slab The slab + * + * @return The number of the slab's physical zone + **/ +ZoneCount getSlabZoneNumber(Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Check whether a slab is unrecovered. + * + * @param slab The slab to check + * + * @return true if the slab is unrecovered + **/ +static inline bool isUnrecoveredSlab(const Slab *slab) +{ + return (slab->status != SLAB_REBUILT); +} + +/** + * Check whether a slab is being replayed into. + * + * @param slab The slab to check + * + * @return true if the slab is replaying + **/ +static inline bool isReplayingSlab(const Slab *slab) +{ + return (slab->status == SLAB_REPLAYING); +} + +/** + * Check whether a slab is being rebuilt. + * + * @param slab The slab to check + * + * @return true if the slab is being rebuilt + **/ +static inline bool slabIsRebuilding(const Slab *slab) +{ + return (slab->status == SLAB_REBUILDING); +} + +/** + * Mark a slab as replaying, during offline recovery. + * + * @param slab The slab to mark + **/ +void markSlabReplaying(Slab *slab); + +/** + * Mark a slab as unrecovered, for online recovery. + * + * @param slab The slab to mark + **/ +void markSlabUnrecovered(Slab *slab); + +/** + * Get the current number of free blocks in a slab. + * + * @param slab The slab to query + * + * @return the number of free blocks in the slab + **/ +BlockCount getSlabFreeBlockCount(const Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Increment or decrement the reference count of a block in a slab. + * + * @param slab The slab containing the block (may be NULL when + * referencing the zero block) + * @param journalPoint The slab journal entry corresponding to this change + * @param operation The operation to perform on the reference count + * + * @return VDO_SUCCESS or an error + **/ +int modifySlabReferenceCount(Slab *slab, + const JournalPoint *journalPoint, + ReferenceOperation operation) + __attribute__((warn_unused_result)); + +/** + * Acquire a provisional reference on behalf of a PBN lock if the block it + * locks is unreferenced. + * + * @param slab The slab which contains the block + * @param pbn The physical block to reference + * @param lock The lock + * + * @return VDO_SUCCESS or an error + **/ +int acquireProvisionalReference(Slab *slab, + PhysicalBlockNumber pbn, + PBNLock *lock) + __attribute__((warn_unused_result)); + +/** + * Determine the index within the slab of a particular physical block number. + * + * @param [in] slab The slab + * @param [in] physicalBlockNumber The physical block number + * @param [out] slabBlockNumberPtr A pointer to the slab block number + * + * @return VDO_SUCCESS or an error code + **/ +int slabBlockNumberFromPBN(Slab *slab, + PhysicalBlockNumber physicalBlockNumber, + SlabBlockNumber *slabBlockNumberPtr) + __attribute__((warn_unused_result)); + +/** + * Check whether the reference counts for a given rebuilt slab should be saved. + * Implements SlabStatusChecker. + * + * @param slab The slab to check + * + * @return true if the slab should be saved + **/ +bool shouldSaveFullyBuiltSlab(const Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Start an administrative operation on a slab. 
+ * + * @param slab The slab to load + * @param operation The type of load to perform + * @param parent The object to notify when the operation is complete + **/ +void startSlabAction(Slab *slab, + AdminStateCode operation, + VDOCompletion *parent); + +/** + * Inform a slab that its journal has been loaded. + * + * @param slab The slab whose journal has been loaded + * @param result The result of the load operation + **/ +void notifySlabJournalIsLoaded(Slab *slab, int result); + +/** + * Check whether a slab is open, i.e. is neither quiescent nor quiescing. + * + * @param slab The slab to check + * + * @return true if the slab is open + **/ +bool isSlabOpen(Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Check whether a slab is currently draining. + * + * @param slab The slab to check + * + * @return true if the slab is performing a drain operation + **/ +bool isSlabDraining(Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Check whether a slab has drained, and if so, send a notification thereof. + * + * @param slab The slab to check + **/ +void checkIfSlabDrained(Slab *slab); + +/** + * Inform a slab that its journal has finished draining. + * + * @param slab The slab whose journal has been drained + * @param result The result of the drain operation + **/ +void notifySlabJournalIsDrained(Slab *slab, int result); + +/** + * Inform a slab that its RefCounts have finished draining. + * + * @param slab The slab whose RefCounts has been drained + * @param result The result of the drain operation + **/ +void notifyRefCountsAreDrained(Slab *slab, int result); + +/** + * Check whether a slab is currently resuming. + * + * @param slab The slab to check + * + * @return true if the slab is performing a resume operation + **/ +bool isSlabResuming(Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Finish scrubbing a slab now that it has been rebuilt by updating its status, + * queueing it for allocation, and reopening its journal. + * + * @param slab The slab whose reference counts have been rebuilt from its + * journal + **/ +void finishScrubbingSlab(Slab *slab); + +/** + * Dump information about a slab to the log for debugging. + * + * @param slab The slab to dump + **/ +void dumpSlab(const Slab *slab); + +#endif // VDO_SLAB_H diff --git a/vdo/base/slabDepot.c b/vdo/base/slabDepot.c new file mode 100644 index 0000000..6c10c29 --- /dev/null +++ b/vdo/base/slabDepot.c @@ -0,0 +1,1145 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabDepot.c#23 $ + */ + +#include "slabDepot.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "actionManager.h" +#include "adminState.h" +#include "blockAllocatorInternals.h" +#include "constants.h" +#include "header.h" +#include "numUtils.h" +#include "readOnlyNotifier.h" +#include "refCounts.h" +#include "slab.h" +#include "slabDepotInternals.h" +#include "slabJournal.h" +#include "slabIterator.h" +#include "slabSummary.h" +#include "threadConfig.h" +#include "types.h" + +typedef struct { + SlabConfig slabConfig; + PhysicalBlockNumber firstBlock; + PhysicalBlockNumber lastBlock; + ZoneCount zoneCount; +} __attribute__((packed)) SlabDepotState2_0; + +static const Header SLAB_DEPOT_HEADER_2_0 = { + .id = SLAB_DEPOT, + .version = { + .majorVersion = 2, + .minorVersion = 0, + }, + .size = sizeof(SlabDepotState2_0), +}; + +/** + * Compute the number of slabs a depot with given parameters would have. + * + * @param firstBlock PBN of the first data block + * @param lastBlock PBN of the last data block + * @param slabSizeShift Exponent for the number of blocks per slab + * + * @return The number of slabs + **/ +__attribute__((warn_unused_result)) +static SlabCount computeSlabCount(PhysicalBlockNumber firstBlock, + PhysicalBlockNumber lastBlock, + unsigned int slabSizeShift) +{ + BlockCount dataBlocks = lastBlock - firstBlock; + return (SlabCount) (dataBlocks >> slabSizeShift); +} + +/**********************************************************************/ +SlabCount calculateSlabCount(SlabDepot *depot) +{ + return computeSlabCount(depot->firstBlock, depot->lastBlock, + depot->slabSizeShift); +} + +/** + * Get an iterator over all the slabs in the depot. + * + * @param depot The depot + * + * @return An iterator over the depot's slabs + **/ +static SlabIterator getSlabIterator(SlabDepot *depot) +{ + return iterateSlabs(depot->slabs, depot->slabCount - 1, 0, 1); +} + +/** + * Allocate a new slab pointer array. Any existing slab pointers will be + * copied into the new array, and slabs will be allocated as needed. The + * newly allocated slabs will not be distributed for use by the block + * allocators. + * + * @param depot The depot + * @param slabCount The number of slabs the depot should have in the new + * array + * + * @return VDO_SUCCESS or an error code + **/ +static int allocateSlabs(SlabDepot *depot, SlabCount slabCount) +{ + int result = ALLOCATE(slabCount, Slab *, "slab pointer array", + &depot->newSlabs); + if (result != VDO_SUCCESS) { + return result; + } + + bool resizing = false; + if (depot->slabs != NULL) { + memcpy(depot->newSlabs, depot->slabs, depot->slabCount * sizeof(Slab *)); + resizing = true; + } + + BlockCount slabSize = getSlabConfig(depot)->slabBlocks; + PhysicalBlockNumber slabOrigin + = depot->firstBlock + (depot->slabCount * slabSize); + + // The translation between allocator partition PBNs and layer PBNs. + BlockCount translation = depot->origin - depot->firstBlock; + depot->newSlabCount = depot->slabCount; + while (depot->newSlabCount < slabCount) { + BlockAllocator *allocator + = depot->allocators[depot->newSlabCount % depot->zoneCount]; + Slab **slabPtr = &depot->newSlabs[depot->newSlabCount]; + result = makeSlab(slabOrigin, allocator, translation, depot->journal, + depot->newSlabCount, resizing, slabPtr); + if (result != VDO_SUCCESS) { + return result; + } + // Increment here to ensure that abandonNewSlabs will clean up correctly. 
+ depot->newSlabCount++; + + slabOrigin += slabSize; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void abandonNewSlabs(SlabDepot *depot) +{ + if (depot->newSlabs == NULL) { + return; + } + for (SlabCount i = depot->slabCount; i < depot->newSlabCount; i++) { + freeSlab(&depot->newSlabs[i]); + } + depot->newSlabCount = 0; + FREE(depot->newSlabs); + depot->newSlabs = NULL; + depot->newSize = 0; +} + +/** + * Get the ID of the thread on which a given allocator operates. + * + *
Implements ZoneThreadGetter. + **/ +static ThreadID getAllocatorThreadID(void *context, ZoneCount zoneNumber) +{ + return getBlockAllocatorForZone(context, zoneNumber)->threadID; +} + +/** + * Prepare to commit oldest tail blocks. + * + *
Implements ActionPreamble. + **/ +static void prepareForTailBlockCommit(void *context, VDOCompletion *parent) +{ + SlabDepot *depot = context; + depot->activeReleaseRequest = depot->newReleaseRequest; + completeCompletion(parent); +} + +/** + * Schedule a tail block commit if necessary. This method should not be called + * directly. Rather, call scheduleDefaultAction() on the depot's action + * manager. + * + *
Implements ActionScheduler, + **/ +static bool scheduleTailBlockCommit(void *context) +{ + SlabDepot *depot = context; + if (depot->newReleaseRequest == depot->activeReleaseRequest) { + return false; + } + + return scheduleAction(depot->actionManager, prepareForTailBlockCommit, + releaseTailBlockLocks, NULL, NULL); +} + +/** + * Allocate those components of the slab depot which are needed only at load + * time, not at format time. + * + * @param depot The depot + * @param nonce The nonce of the VDO + * @param threadConfig The thread config of the VDO + * @param vioPoolSize The size of the VIO pool + * @param layer The physical layer below this depot + * @param summaryPartition The partition which holds the slab summary + * + * @return VDO_SUCCESS or an error + **/ +static int allocateComponents(SlabDepot *depot, + Nonce nonce, + const ThreadConfig *threadConfig, + BlockCount vioPoolSize, + PhysicalLayer *layer, + Partition *summaryPartition) +{ + /* + * If createVIO is NULL, the slab depot is only being used to format + * or audit the VDO. These only require the SuperBlock component, so we can + * just skip allocating all the memory needed for runtime components. + */ + if (layer->createMetadataVIO == NULL) { + return VDO_SUCCESS; + } + + int result = initializeEnqueueableCompletion(&depot->scrubbingCompletion, + SUB_TASK_COMPLETION, layer); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeActionManager(depot->zoneCount, getAllocatorThreadID, + getJournalZoneThread(threadConfig), depot, + scheduleTailBlockCommit, layer, + &depot->actionManager); + if (result != VDO_SUCCESS) { + return result; + } + + depot->origin = depot->firstBlock; + + result = makeSlabSummary(layer, summaryPartition, threadConfig, + depot->slabSizeShift, depot->slabConfig.dataBlocks, + depot->readOnlyNotifier, &depot->slabSummary); + if (result != VDO_SUCCESS) { + return result; + } + + SlabCount slabCount = calculateSlabCount(depot); + if (threadConfig->physicalZoneCount > slabCount) { + return logErrorWithStringError(VDO_BAD_CONFIGURATION, + "%u physical zones exceeds slab count %u", + threadConfig->physicalZoneCount, slabCount); + } + + // Allocate the block allocators. + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + ThreadID threadID = getPhysicalZoneThread(threadConfig, zone); + result = makeBlockAllocator(depot, zone, threadID, nonce, vioPoolSize, + layer, depot->readOnlyNotifier, + &depot->allocators[zone]); + if (result != VDO_SUCCESS) { + return result; + } + } + + // Allocate slabs. + result = allocateSlabs(depot, slabCount); + if (result != VDO_SUCCESS) { + return result; + } + + // Use the new slabs. + for (SlabCount i = depot->slabCount; i < depot->newSlabCount; i++) { + Slab *slab = depot->newSlabs[i]; + registerSlabWithAllocator(slab->allocator, slab); + depot->slabCount++; + } + + depot->slabs = depot->newSlabs; + depot->newSlabs = NULL; + depot->newSlabCount = 0; + + return VDO_SUCCESS; +} + +/** + * Allocate a slab depot. 
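+ * The slab size must be a power of two so that physical block numbers can be mapped to slabs with a shift (slabSizeShift) rather than a division.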
+ * + * @param [in] state The parameters for the new depot + * @param [in] threadConfig The thread config of the VDO + * @param [in] nonce The nonce of the VDO + * @param [in] vioPoolSize The size of the VIO pool + * @param [in] layer The physical layer below this depot + * @param [in] summaryPartition The partition which holds the slab summary + * (if NULL, the depot is format-only) + * @param [in] readOnlyNotifier The context for entering read-only mode + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [out] depotPtr A pointer to hold the depot + * + * @return A success or error code + **/ +__attribute__((warn_unused_result)) +static int allocateDepot(const SlabDepotState2_0 *state, + const ThreadConfig *threadConfig, + Nonce nonce, + BlockCount vioPoolSize, + PhysicalLayer *layer, + Partition *summaryPartition, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *recoveryJournal, + SlabDepot **depotPtr) +{ + // Calculate the bit shift for efficiently mapping block numbers to slabs. + // Using a shift requires that the slab size be a power of two. + BlockCount slabSize = state->slabConfig.slabBlocks; + if (!isPowerOfTwo(slabSize)) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "slab size must be a power of two"); + } + unsigned int slabSizeShift = logBaseTwo(slabSize); + + SlabDepot *depot; + int result = ALLOCATE_EXTENDED(SlabDepot, threadConfig->physicalZoneCount, + BlockAllocator *, __func__, &depot); + if (result != VDO_SUCCESS) { + return result; + } + + depot->oldZoneCount = state->zoneCount; + depot->zoneCount = threadConfig->physicalZoneCount; + depot->slabConfig = state->slabConfig; + depot->readOnlyNotifier = readOnlyNotifier; + depot->firstBlock = state->firstBlock; + depot->lastBlock = state->lastBlock; + depot->slabSizeShift = slabSizeShift; + depot->journal = recoveryJournal; + + result = allocateComponents(depot, nonce, threadConfig, vioPoolSize, + layer, summaryPartition); + if (result != VDO_SUCCESS) { + freeSlabDepot(&depot); + return result; + } + + *depotPtr = depot; + return VDO_SUCCESS; +} + +/** + * Configure the SlabDepot for the specified storage capacity, finding the + * number of data blocks that will fit and still leave room for the depot + * metadata, then return the saved state for that configuration. + * + * @param [in] blockCount The number of blocks in the underlying storage + * @param [in] firstBlock The number of the first block that may be allocated + * @param [in] slabConfig The configuration of a single slab + * @param [in] zoneCount The number of zones the depot will use + * @param [out] state The state structure to be configured + * + * @return VDO_SUCCESS or an error code + **/ +static int configureState(BlockCount blockCount, + PhysicalBlockNumber firstBlock, + SlabConfig slabConfig, + ZoneCount zoneCount, + SlabDepotState2_0 *state) +{ + BlockCount slabSize = slabConfig.slabBlocks; + logDebug("slabDepot configureState(blockCount=%" PRIu64 + ", firstBlock=%llu, slabSize=%llu, zoneCount=%u)", + blockCount, firstBlock, slabSize, zoneCount); + + // We do not allow runt slabs, so we waste up to a slab's worth. 
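+ //
+ // As a purely illustrative example (the numbers here are hypothetical):
+ // with slabSize == 8192 and blockCount == 20000, slabCount below comes out
+ // to 2, those slabs cover 16384 blocks, and the remaining 3616 blocks are
+ // simply left over rather than forming a runt slab.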
+ size_t slabCount = (blockCount / slabSize); + if (slabCount == 0) { + return VDO_NO_SPACE; + } + + if (slabCount > MAX_SLABS) { + return VDO_TOO_MANY_SLABS; + } + + BlockCount totalSlabBlocks = slabCount * slabConfig.slabBlocks; + BlockCount totalDataBlocks = slabCount * slabConfig.dataBlocks; + PhysicalBlockNumber lastBlock = firstBlock + totalSlabBlocks; + + *state = (SlabDepotState2_0) { + .slabConfig = slabConfig, + .firstBlock = firstBlock, + .lastBlock = lastBlock, + .zoneCount = zoneCount, + }; + + logDebug("slabDepot lastBlock=%llu, totalDataBlocks=%" PRIu64 + ", slabCount=%zu, leftOver=%llu", + lastBlock, totalDataBlocks, slabCount, + blockCount - (lastBlock - firstBlock)); + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeSlabDepot(BlockCount blockCount, + PhysicalBlockNumber firstBlock, + SlabConfig slabConfig, + const ThreadConfig *threadConfig, + Nonce nonce, + BlockCount vioPoolSize, + PhysicalLayer *layer, + Partition *summaryPartition, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *recoveryJournal, + SlabDepot **depotPtr) +{ + SlabDepotState2_0 state; + int result = configureState(blockCount, firstBlock, slabConfig, 0, &state); + if (result != VDO_SUCCESS) { + return result; + } + + SlabDepot *depot = NULL; + result = allocateDepot(&state, threadConfig, nonce, vioPoolSize, layer, + summaryPartition, readOnlyNotifier, recoveryJournal, + &depot); + if (result != VDO_SUCCESS) { + return result; + } + + *depotPtr = depot; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeSlabDepot(SlabDepot **depotPtr) +{ + SlabDepot *depot = *depotPtr; + if (depot == NULL) { + return; + } + + abandonNewSlabs(depot); + + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + freeBlockAllocator(&depot->allocators[zone]); + } + + if (depot->slabs != NULL) { + for (SlabCount i = 0; i < depot->slabCount; i++) { + freeSlab(&depot->slabs[i]); + } + } + + FREE(depot->slabs); + freeActionManager(&depot->actionManager); + freeSlabSummary(&depot->slabSummary); + destroyEnqueueable(&depot->scrubbingCompletion); + FREE(depot); + *depotPtr = NULL; +} + +/**********************************************************************/ +size_t getSlabDepotEncodedSize(void) +{ + return ENCODED_HEADER_SIZE + sizeof(SlabDepotState2_0); +} + +/** + * Decode a slab config from a buffer. 
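+ *
+ * The encoding is a fixed sequence of 64-bit little-endian fields, read in
+ * this order: slabBlocks, dataBlocks, referenceCountBlocks,
+ * slabJournalBlocks, slabJournalFlushingThreshold,
+ * slabJournalBlockingThreshold, and slabJournalScrubbingThreshold.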
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param config The config structure to receive the decoded values + * + * @return UDS_SUCCESS or an error code + **/ +static int decodeSlabConfig(Buffer *buffer, SlabConfig *config) +{ + BlockCount count; + int result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->slabBlocks = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->dataBlocks = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->referenceCountBlocks = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->slabJournalBlocks = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->slabJournalFlushingThreshold = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->slabJournalBlockingThreshold = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->slabJournalScrubbingThreshold = count; + + return UDS_SUCCESS; +} + +/** + * Encode a slab config into a buffer. + * + * @param config The config structure to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error code + **/ +static int encodeSlabConfig(const SlabConfig *config, Buffer *buffer) +{ + int result = putUInt64LEIntoBuffer(buffer, config->slabBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->dataBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->referenceCountBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->slabJournalBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->slabJournalFlushingThreshold); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->slabJournalBlockingThreshold); + if (result != UDS_SUCCESS) { + return result; + } + + return putUInt64LEIntoBuffer(buffer, config->slabJournalScrubbingThreshold); +} + +/**********************************************************************/ +int encodeSlabDepot(const SlabDepot *depot, Buffer *buffer) +{ + int result = encodeHeader(&SLAB_DEPOT_HEADER_2_0, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + size_t initialLength = contentLength(buffer); + + result = encodeSlabConfig(&depot->slabConfig, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, depot->firstBlock); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, depot->lastBlock); + if (result != UDS_SUCCESS) { + return result; + } + + /* + * If this depot is currently using 0 zones, it must have been + * synchronously loaded by a tool and is now being saved. We + * did not load and combine the slab summary, so we still need + * to do that next time we load with the old zone count rather + * than 0. 
+ */
+ ZoneCount zonesToRecord = depot->zoneCount;
+ if (depot->zoneCount == 0) {
+ zonesToRecord = depot->oldZoneCount;
+ }
+ result = putByte(buffer, zonesToRecord);
+ if (result != UDS_SUCCESS) {
+ return result;
+ }
+
+ size_t encodedSize = contentLength(buffer) - initialLength;
+ return ASSERT(SLAB_DEPOT_HEADER_2_0.size == encodedSize,
+ "encoded slab depot component size must match header size");
+}
+
+/**
+ * Decode slab depot component state version 2.0 from a buffer.
+ *
+ * @param buffer A buffer positioned at the start of the encoding
+ * @param state The state structure to receive the decoded values
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+static int decodeSlabDepotState_2_0(Buffer *buffer, SlabDepotState2_0 *state)
+{
+ size_t initialLength = contentLength(buffer);
+
+ int result = decodeSlabConfig(buffer, &state->slabConfig);
+ if (result != UDS_SUCCESS) {
+ return result;
+ }
+
+ PhysicalBlockNumber firstBlock;
+ result = getUInt64LEFromBuffer(buffer, &firstBlock);
+ if (result != UDS_SUCCESS) {
+ return result;
+ }
+ state->firstBlock = firstBlock;
+
+ PhysicalBlockNumber lastBlock;
+ result = getUInt64LEFromBuffer(buffer, &lastBlock);
+ if (result != UDS_SUCCESS) {
+ return result;
+ }
+ state->lastBlock = lastBlock;
+
+ result = getByte(buffer, &state->zoneCount);
+ if (result != UDS_SUCCESS) {
+ return result;
+ }
+
+ size_t decodedSize = initialLength - contentLength(buffer);
+ return ASSERT(SLAB_DEPOT_HEADER_2_0.size == decodedSize,
+ "decoded slab depot component size must match header size");
+}
+
+/**********************************************************************/
+int decodeSlabDepot(Buffer *buffer,
+ const ThreadConfig *threadConfig,
+ Nonce nonce,
+ PhysicalLayer *layer,
+ Partition *summaryPartition,
+ ReadOnlyNotifier *readOnlyNotifier,
+ RecoveryJournal *recoveryJournal,
+ SlabDepot **depotPtr)
+{
+ Header header;
+ int result = decodeHeader(buffer, &header);
+ if (result != VDO_SUCCESS) {
+ return result;
+ }
+
+ result = validateHeader(&SLAB_DEPOT_HEADER_2_0, &header, true, __func__);
+ if (result != VDO_SUCCESS) {
+ return result;
+ }
+
+ SlabDepotState2_0 state;
+ result = decodeSlabDepotState_2_0(buffer, &state);
+ if (result != UDS_SUCCESS) {
+ return result;
+ }
+
+ return allocateDepot(&state, threadConfig, nonce, VIO_POOL_SIZE, layer,
+ summaryPartition, readOnlyNotifier, recoveryJournal,
+ depotPtr);
+}
+
+/**********************************************************************/
+int decodeSodiumSlabDepot(Buffer *buffer,
+ const ThreadConfig *threadConfig,
+ Nonce nonce,
+ PhysicalLayer *layer,
+ Partition *summaryPartition,
+ ReadOnlyNotifier *readOnlyNotifier,
+ RecoveryJournal *recoveryJournal,
+ SlabDepot **depotPtr)
+{
+ // Sodium uses version 2.0 of the slab depot state.
+ return decodeSlabDepot(buffer, threadConfig, nonce, layer, summaryPartition, + readOnlyNotifier, recoveryJournal, depotPtr); +} + +/**********************************************************************/ +int allocateSlabRefCounts(SlabDepot *depot) +{ + SlabIterator iterator = getSlabIterator(depot); + while (hasNextSlab(&iterator)) { + int result = allocateRefCountsForSlab(nextSlab(&iterator)); + if (result != VDO_SUCCESS) { + return result; + } + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +BlockAllocator *getBlockAllocatorForZone(SlabDepot *depot, + ZoneCount zoneNumber) +{ + return depot->allocators[zoneNumber]; +} + +/**********************************************************************/ +int getSlabNumber(const SlabDepot *depot, + PhysicalBlockNumber pbn, + SlabCount *slabNumberPtr) +{ + if (pbn < depot->firstBlock) { + return VDO_OUT_OF_RANGE; + } + + SlabCount slabNumber = (pbn - depot->firstBlock) >> depot->slabSizeShift; + if (slabNumber >= depot->slabCount) { + return VDO_OUT_OF_RANGE; + } + + *slabNumberPtr = slabNumber; + return VDO_SUCCESS; +} + +/**********************************************************************/ +Slab *getSlab(const SlabDepot *depot, PhysicalBlockNumber pbn) +{ + if (pbn == ZERO_BLOCK) { + return NULL; + } + + SlabCount slabNumber; + int result = getSlabNumber(depot, pbn, &slabNumber); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(depot->readOnlyNotifier, result); + return NULL; + } + + return depot->slabs[slabNumber]; + +} + +/**********************************************************************/ +SlabJournal *getSlabJournal(const SlabDepot *depot, PhysicalBlockNumber pbn) +{ + Slab *slab = getSlab(depot, pbn); + return ((slab != NULL) ? slab->journal : NULL); +} + +/**********************************************************************/ +uint8_t getIncrementLimit(SlabDepot *depot, PhysicalBlockNumber pbn) +{ + Slab *slab = getSlab(depot, pbn); + if ((slab == NULL) || isUnrecoveredSlab(slab)) { + return 0; + } + + return getAvailableReferences(slab->referenceCounts, pbn); +} + +/**********************************************************************/ +bool isPhysicalDataBlock(const SlabDepot *depot, PhysicalBlockNumber pbn) +{ + if (pbn == ZERO_BLOCK) { + return true; + } + + SlabCount slabNumber; + if (getSlabNumber(depot, pbn, &slabNumber) != VDO_SUCCESS) { + return false; + } + + SlabBlockNumber sbn; + int result = slabBlockNumberFromPBN(depot->slabs[slabNumber], pbn, &sbn); + return (result == VDO_SUCCESS); +} + +/**********************************************************************/ +BlockCount getDepotAllocatedBlocks(const SlabDepot *depot) +{ + BlockCount total = 0; + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + // The allocators are responsible for thread safety. + total += getAllocatedBlocks(depot->allocators[zone]); + } + return total; +} + +/**********************************************************************/ +BlockCount getDepotDataBlocks(const SlabDepot *depot) +{ + // XXX This needs to be thread safe, but resize changes the slab count. It + // does so on the admin thread (our usual caller), so it's usually safe. 
+ return (depot->slabCount * depot->slabConfig.dataBlocks);
+}
+
+/**********************************************************************/
+BlockCount getDepotFreeBlocks(const SlabDepot *depot)
+{
+ /*
+ * We can't ever shrink a volume except when resize fails, and we can't
+ * allocate from the new slabs until after the resize succeeds, so by
+ * getting the number of allocated blocks first, we ensure the allocated
+ * count is always less than the capacity. Doing it in the other order on a
+ * full volume could lose a race with a successful resize, resulting in a
+ * nonsensical negative/underflow result.
+ */
+ BlockCount allocated = getDepotAllocatedBlocks(depot);
+ memoryFence();
+ return (getDepotDataBlocks(depot) - allocated);
+}
+
+/**********************************************************************/
+SlabCount getDepotSlabCount(const SlabDepot *depot)
+{
+ return depot->slabCount;
+}
+
+/**********************************************************************/
+SlabCount getDepotUnrecoveredSlabCount(const SlabDepot *depot)
+{
+ SlabCount total = 0;
+ for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) {
+ // The allocators are responsible for thread safety.
+ total += getUnrecoveredSlabCount(depot->allocators[zone]);
+ }
+ return total;
+}
+
+/**
+ * The preamble of a load operation which loads the slab summary.
+ *
+ *
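+ * In the sequence scheduled by loadSlabDepot() below, this preamble runs
+ * first to load the slab summary, after which loadBlockAllocator() runs in
+ * each physical zone.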

Implements ActionPreamble. + **/ +static void startDepotLoad(void *context, VDOCompletion *parent) +{ + SlabDepot *depot = context; + loadSlabSummary(depot->slabSummary, + getCurrentManagerOperation(depot->actionManager), + depot->oldZoneCount, parent); +} + +/**********************************************************************/ +void loadSlabDepot(SlabDepot *depot, + AdminStateCode operation, + VDOCompletion *parent, + void *context) +{ + if (assertLoadOperation(operation, parent)) { + scheduleOperationWithContext(depot->actionManager, operation, + startDepotLoad, loadBlockAllocator, NULL, + context, parent); + } +} + +/**********************************************************************/ +void prepareToAllocate(SlabDepot *depot, + SlabDepotLoadType loadType, + VDOCompletion *parent) +{ + depot->loadType = loadType; + atomicStore32(&depot->zonesToScrub, depot->zoneCount); + scheduleAction(depot->actionManager, NULL, prepareAllocatorToAllocate, + NULL, parent); +} + +/**********************************************************************/ +void updateSlabDepotSize(SlabDepot *depot) +{ + depot->lastBlock = depot->newLastBlock; +} + +/**********************************************************************/ +int prepareToGrowSlabDepot(SlabDepot *depot, BlockCount newSize) +{ + if ((newSize >> depot->slabSizeShift) <= depot->slabCount) { + return VDO_INCREMENT_TOO_SMALL; + } + + // Generate the depot configuration for the new block count. + SlabDepotState2_0 newState; + int result = configureState(newSize, depot->firstBlock, depot->slabConfig, + depot->zoneCount, &newState); + if (result != VDO_SUCCESS) { + return result; + } + + SlabCount newSlabCount = computeSlabCount(depot->firstBlock, + newState.lastBlock, + depot->slabSizeShift); + if (newSlabCount <= depot->slabCount) { + return logErrorWithStringError(VDO_INCREMENT_TOO_SMALL, + "Depot can only grow"); + } + if (newSlabCount == depot->newSlabCount) { + // Check it out, we've already got all the new slabs allocated! + return VDO_SUCCESS; + } + + abandonNewSlabs(depot); + result = allocateSlabs(depot, newSlabCount); + if (result != VDO_SUCCESS) { + abandonNewSlabs(depot); + return result; + } + + depot->newSize = newSize; + depot->oldLastBlock = depot->lastBlock; + depot->newLastBlock = newState.lastBlock; + + return VDO_SUCCESS; +} + +/** + * Finish registering new slabs now that all of the allocators have received + * their new slabs. + * + *
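+ * In the grow sequence, prepareToGrowSlabDepot() allocates the new slabs,
+ * useNewSlabs() distributes them via registerNewSlabsForAllocator() with
+ * this conclusion, and updateSlabDepotSize() records the new last block.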

Implements ActionConclusion. + **/ +static int finishRegistration(void *context) +{ + SlabDepot *depot = context; + depot->slabCount = depot->newSlabCount; + FREE(depot->slabs); + depot->slabs = depot->newSlabs; + depot->newSlabs = NULL; + depot->newSlabCount = 0; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void useNewSlabs(SlabDepot *depot, VDOCompletion *parent) +{ + ASSERT_LOG_ONLY(depot->newSlabs != NULL, "Must have new slabs to use"); + scheduleOperation(depot->actionManager, ADMIN_STATE_SUSPENDED_OPERATION, + NULL, registerNewSlabsForAllocator, finishRegistration, + parent); +} + +/**********************************************************************/ +void drainSlabDepot(SlabDepot *depot, + AdminStateCode operation, + VDOCompletion *parent) +{ + scheduleOperation(depot->actionManager, operation, NULL, drainBlockAllocator, + NULL, parent); +} + +/**********************************************************************/ +void resumeSlabDepot(SlabDepot *depot, VDOCompletion *parent) +{ + if (isReadOnly(depot->readOnlyNotifier)) { + finishCompletion(parent, VDO_READ_ONLY); + return; + } + + scheduleOperation(depot->actionManager, ADMIN_STATE_RESUMING, NULL, + resumeBlockAllocator, NULL, parent); +} + +/**********************************************************************/ +void commitOldestSlabJournalTailBlocks(SlabDepot *depot, + SequenceNumber recoveryBlockNumber) +{ + if (depot == NULL) { + return; + } + + depot->newReleaseRequest = recoveryBlockNumber; + scheduleDefaultAction(depot->actionManager); +} + +/**********************************************************************/ +const SlabConfig *getSlabConfig(const SlabDepot *depot) +{ + return &depot->slabConfig; +} + +/**********************************************************************/ +SlabSummary *getSlabSummary(const SlabDepot *depot) +{ + return depot->slabSummary; +} + +/**********************************************************************/ +SlabSummaryZone *getSlabSummaryForZone(const SlabDepot *depot, ZoneCount zone) +{ + if (depot->slabSummary == NULL) { + return NULL; + } + return getSummaryForZone(depot->slabSummary, zone); +} + +/**********************************************************************/ +void scrubAllUnrecoveredSlabs(SlabDepot *depot, + void *parent, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID, + VDOCompletion *launchParent) +{ + prepareCompletion(&depot->scrubbingCompletion, callback, errorHandler, + threadID, parent); + scheduleAction(depot->actionManager, NULL, scrubAllUnrecoveredSlabsInZone, + NULL, launchParent); +} + +/**********************************************************************/ +void notifyZoneFinishedScrubbing(VDOCompletion *completion) +{ + SlabDepot *depot = completion->parent; + if (atomicAdd32(&depot->zonesToScrub, -1) == 0) { + // We're the last! + completeCompletion(&depot->scrubbingCompletion); + } +} + +/**********************************************************************/ +bool hasUnrecoveredSlabs(SlabDepot *depot) +{ + return (atomicLoad32(&depot->zonesToScrub) > 0); +} + +/**********************************************************************/ +BlockCount getNewDepotSize(const SlabDepot *depot) +{ + return (depot->newSlabs == NULL) ? 
0 : depot->newSize; +} + +/**********************************************************************/ +bool areEquivalentDepots(SlabDepot *depotA, SlabDepot *depotB) +{ + if ((depotA->firstBlock != depotB->firstBlock) + || (depotA->lastBlock != depotB->lastBlock) + || (depotA->slabCount != depotB->slabCount) + || (depotA->slabSizeShift != depotB->slabSizeShift) + || (getDepotAllocatedBlocks(depotA) + != getDepotAllocatedBlocks(depotB))) { + return false; + } + + for (size_t i = 0; i < depotA->slabCount; i++) { + Slab *slabA = depotA->slabs[i]; + Slab *slabB = depotB->slabs[i]; + if ((slabA->start != slabB->start) + || (slabA->end != slabB->end) + || !areEquivalentReferenceCounters(slabA->referenceCounts, + slabB->referenceCounts)) { + return false; + } + } + + return true; +} + +/**********************************************************************/ +void allocateFromLastSlab(SlabDepot *depot) +{ + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + allocateFromAllocatorLastSlab(depot->allocators[zone]); + } +} + +/**********************************************************************/ +BlockAllocatorStatistics +getDepotBlockAllocatorStatistics(const SlabDepot *depot) +{ + BlockAllocatorStatistics totals; + memset(&totals, 0, sizeof(totals)); + + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + BlockAllocator *allocator = depot->allocators[zone]; + BlockAllocatorStatistics stats = getBlockAllocatorStatistics(allocator); + totals.slabCount += stats.slabCount; + totals.slabsOpened += stats.slabsOpened; + totals.slabsReopened += stats.slabsReopened; + } + + return totals; +} + +/**********************************************************************/ +RefCountsStatistics getDepotRefCountsStatistics(const SlabDepot *depot) +{ + RefCountsStatistics depotStats; + memset(&depotStats, 0, sizeof(depotStats)); + + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + BlockAllocator *allocator = depot->allocators[zone]; + RefCountsStatistics stats = getRefCountsStatistics(allocator); + depotStats.blocksWritten += stats.blocksWritten; + } + + return depotStats; +} + +/**********************************************************************/ +SlabJournalStatistics getDepotSlabJournalStatistics(const SlabDepot *depot) +{ + SlabJournalStatistics depotStats; + memset(&depotStats, 0, sizeof(depotStats)); + + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + BlockAllocator *allocator = depot->allocators[zone]; + SlabJournalStatistics stats = getSlabJournalStatistics(allocator); + depotStats.diskFullCount += stats.diskFullCount; + depotStats.flushCount += stats.flushCount; + depotStats.blockedCount += stats.blockedCount; + depotStats.blocksWritten += stats.blocksWritten; + depotStats.tailBusyCount += stats.tailBusyCount; + } + + return depotStats; +} + +/**********************************************************************/ +void dumpSlabDepot(const SlabDepot *depot) +{ + logInfo("Slab Depot"); + logInfo(" zoneCount=%u oldZoneCount=%u slabCount=%" PRIu32 + " activeReleaseRequest=%llu newReleaseRequest=%llu", + (unsigned int) depot->zoneCount, (unsigned int) depot->oldZoneCount, + depot->slabCount, depot->activeReleaseRequest, + depot->newReleaseRequest); +} diff --git a/vdo/base/slabDepot.h b/vdo/base/slabDepot.h new file mode 100644 index 0000000..b439470 --- /dev/null +++ b/vdo/base/slabDepot.h @@ -0,0 +1,515 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabDepot.h#12 $ + */ + +#ifndef SLAB_DEPOT_H +#define SLAB_DEPOT_H + +#include "buffer.h" + +#include "adminState.h" +#include "completion.h" +#include "fixedLayout.h" +#include "journalPoint.h" +#include "statistics.h" +#include "types.h" +#include "waitQueue.h" + +/** + * A SlabDepot is responsible for managing all of the slabs and block + * allocators of a VDO. It has a single array of slabs in order to eliminate + * the need for additional math in order to compute which physical zone a PBN + * is in. It also has a BlockAllocator per zone. + * + * Load operations are required to be performed on a single thread. Normal + * operations are assumed to be performed in the appropriate zone. Allocations + * and reference count updates must be done from the thread of their physical + * zone. Requests to commit slab journal tail blocks from the recovery journal + * must be done on the journal zone thread. Save operations are required to be + * launched from the same thread as the original load operation. + **/ + +typedef enum { + NORMAL_LOAD, + RECOVERY_LOAD, + REBUILD_LOAD +} SlabDepotLoadType; + +/** + * Calculate the number of slabs a depot would have. + * + * @param depot The depot + * + * @return The number of slabs + **/ +SlabCount calculateSlabCount(SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Create a slab depot. + * + * @param [in] blockCount The number of blocks initially available + * @param [in] firstBlock The number of the first block which may be + * allocated + * @param [in] slabConfig The slab configuration + * @param [in] threadConfig The thread configuration of the VDO + * @param [in] nonce The nonce of the VDO + * @param [in] vioPoolSize The size of the VIO pool + * @param [in] layer The physical layer below this depot + * @param [in] summaryPartition The partition which holds the slab summary + * @param [in] readOnlyNotifier The context for entering read-only mode + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [out] depotPtr A pointer to hold the depot + * + * @return A success or error code + **/ +int makeSlabDepot(BlockCount blockCount, + PhysicalBlockNumber firstBlock, + SlabConfig slabConfig, + const ThreadConfig *threadConfig, + Nonce nonce, + BlockCount vioPoolSize, + PhysicalLayer *layer, + Partition *summaryPartition, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *recoveryJournal, + SlabDepot **depotPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a slab depot and null out the reference to it. + * + * @param depotPtr The reference to the depot to destroy + **/ +void freeSlabDepot(SlabDepot **depotPtr); + +/** + * Get the size of the encoded state of a slab depot. 
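+ *
+ * As an illustrative sketch only (makeBuffer() here stands in for whichever
+ * Buffer constructor the caller actually uses), a caller saving the depot
+ * might reserve exactly this many bytes before encoding:
+ *
+ *   Buffer *buffer;
+ *   int result = makeBuffer(getSlabDepotEncodedSize(), &buffer);
+ *   if (result == UDS_SUCCESS) {
+ *     result = encodeSlabDepot(depot, buffer);
+ *   }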
+ * + * @return The encoded size of the depot's state + **/ +size_t getSlabDepotEncodedSize(void) + __attribute__((warn_unused_result)); + +/** + * Encode the state of a slab depot into a buffer. + * + * @param depot The depot to encode + * @param buffer The buffer to encode into + * + * @return UDS_SUCCESS or an error + **/ +int encodeSlabDepot(const SlabDepot *depot, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Decode the state of a slab depot saved in a buffer. + * + * @param [in] buffer The buffer containing the saved state + * @param [in] threadConfig The thread config of the VDO + * @param [in] nonce The nonce of the VDO + * @param [in] layer The physical layer below this depot + * @param [in] summaryPartition The partition which holds the slab summary + * @param [in] readOnlyNotifier The context for entering read-only mode + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [out] depotPtr A pointer to hold the depot + * + * @return A success or error code + **/ +int decodeSodiumSlabDepot(Buffer *buffer, + const ThreadConfig *threadConfig, + Nonce nonce, + PhysicalLayer *layer, + Partition *summaryPartition, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *recoveryJournal, + SlabDepot **depotPtr) + __attribute__((warn_unused_result)); + +/** + * Decode the state of a slab depot saved in a buffer. + * + * @param [in] buffer The buffer containing the saved state + * @param [in] threadConfig The thread config of the VDO + * @param [in] nonce The nonce of the VDO + * @param [in] layer The physical layer below this depot + * @param [in] summaryPartition The partition which holds the slab summary + * @param [in] readOnlyNotifier The context for entering read-only mode + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [out] depotPtr A pointer to hold the depot + * + * @return A success or error code + **/ +int decodeSlabDepot(Buffer *buffer, + const ThreadConfig *threadConfig, + Nonce nonce, + PhysicalLayer *layer, + Partition *summaryPartition, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *recoveryJournal, + SlabDepot **depotPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate the RefCounts for all slabs in the depot. This method may be called + * only before entering normal operation from the load thread. + * + * @param depot The depot whose RefCounts need allocation + * + * @return VDO_SUCCESS or an error + **/ +int allocateSlabRefCounts(SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the block allocator for a specified physical zone from a depot. + * + * @param depot The depot + * @param zoneNumber The physical zone + * + * @return The block allocator for the specified zone + **/ +BlockAllocator *getBlockAllocatorForZone(SlabDepot *depot, + ZoneCount zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Get the number of the slab that contains a specified block. + * + * @param depot The slab depot + * @param pbn The physical block number + * @param slabNumberPtr A pointer to hold the slab number + * + * @return VDO_SUCCESS or an error + **/ +int getSlabNumber(const SlabDepot *depot, + PhysicalBlockNumber pbn, + SlabCount *slabNumberPtr) + __attribute__((warn_unused_result)); + +/** + * Get the slab object for the slab that contains a specified block. Will put + * the VDO in read-only mode if the PBN is not a valid data block nor the zero + * block. 
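+ *
+ * The containing slab is located arithmetically: because the slab size is a
+ * power of two, the slab number is (pbn - firstBlock) >> slabSizeShift (see
+ * getSlabNumber()). As a purely hypothetical example, with 8192-block slabs
+ * (slabSizeShift == 13) and firstBlock == 1024, pbn 20000 falls in slab
+ * (20000 - 1024) >> 13 == 2.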
+ * + * @param depot The slab depot + * @param pbn The physical block number + * + * @return The slab containing the block, or NULL if the block number is the + * zero block or otherwise out of range + **/ +Slab *getSlab(const SlabDepot *depot, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Get the slab journal for the slab that contains a specified block. + * + * @param depot The slab depot + * @param pbn The physical block number within the block depot partition + * of any block in the slab + * + * @return The slab journal of the slab containing the block, or NULL if the + * block number is for the zero block or otherwise out of range + **/ +SlabJournal *getSlabJournal(const SlabDepot *depot, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Determine how many new references a block can acquire. This method must be + * called from the the physical zone thread of the PBN. + * + * @param depot The slab depot + * @param pbn The physical block number that is being queried + * + * @return the number of available references + **/ +uint8_t getIncrementLimit(SlabDepot *depot, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Determine whether the given PBN refers to a data block. + * + * @param depot The depot + * @param pbn The physical block number to ask about + * + * @return True if the PBN corresponds to a data block + **/ +bool isPhysicalDataBlock(const SlabDepot *depot, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Get the total number of data blocks allocated across all the slabs in the + * depot, which is the total number of blocks with a non-zero reference count. + * This may be called from any thread. + * + * @param depot The slab depot + * + * @return The total number of blocks with a non-zero reference count + **/ +BlockCount getDepotAllocatedBlocks(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the total of the statistics from all the block allocators in the depot. + * + * @param depot The slab depot + * + * @return The statistics from all block allocators in the depot + **/ +BlockAllocatorStatistics +getDepotBlockAllocatorStatistics(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the total number of data blocks in all the slabs in the depot. This may + * be called from any thread. + * + * @param depot The slab depot + * + * @return The total number of data blocks in all slabs + **/ +BlockCount getDepotDataBlocks(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the total number of free blocks remaining in all the slabs in the + * depot, which is the total number of blocks that have a zero reference + * count. This may be called from any thread. + * + * @param depot The slab depot + * + * @return The total number of blocks with a zero reference count + **/ +BlockCount getDepotFreeBlocks(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the total number of slabs in the depot + * + * @param depot The slab depot + * + * @return The total number of slabs + **/ +SlabCount getDepotSlabCount(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the total number of unrecovered slabs in the depot, which is the total + * number of unrecovered slabs from all zones. This may be called from any + * thread. 
+ * + * @param depot The slab depot + * + * @return The total number of slabs that are unrecovered + **/ +SlabCount getDepotUnrecoveredSlabCount(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the aggregated slab journal statistics for the depot. + * + * @param depot The slab depot + * + * @return The aggregated statistics for all slab journals in the depot + **/ +SlabJournalStatistics getDepotSlabJournalStatistics(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the cumulative RefCounts statistics for the depot. + * + * @param depot The slab depot + * + * @return The cumulative statistics for all RefCounts in the depot + **/ +RefCountsStatistics getDepotRefCountsStatistics(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Asynchronously load any slab depot state that isn't included in the + * SuperBlock component. This method may be called only before entering normal + * operation from the load thread. + * + * @param depot The depot to load + * @param operation The type of load to perform + * @param parent The completion to finish when the load is complete + * @param context Additional context for the load operation; may be NULL + **/ +void loadSlabDepot(SlabDepot *depot, + AdminStateCode operation, + VDOCompletion *parent, + void *context); + +/** + * Prepare the slab depot to come online and start allocating blocks. This + * method may be called only before entering normal operation from the load + * thread. It must be called before allocation may proceed. + * + * @param depot The depot to prepare + * @param loadType The load type + * @param parent The completion to finish when the operation is complete + **/ +void prepareToAllocate(SlabDepot *depot, + SlabDepotLoadType loadType, + VDOCompletion *parent); + +/** + * Update the slab depot to reflect its new size in memory. This size is saved + * to disk as part of the super block. + * + * @param depot The depot to update + **/ +void updateSlabDepotSize(SlabDepot *depot); + +/** + * Allocate new memory needed for a resize of a slab depot to the given size. + * + * @param depot The depot to prepare to resize + * @param newSize The number of blocks in the new depot + * + * @return VDO_SUCCESS or an error + **/ +int prepareToGrowSlabDepot(SlabDepot *depot, BlockCount newSize) + __attribute__((warn_unused_result)); + +/** + * Use the new slabs allocated for resize. + * + * @param depot The depot + * @param parent The object to notify when complete + **/ +void useNewSlabs(SlabDepot *depot, VDOCompletion *parent); + +/** + * Abandon any new slabs in this depot, freeing them as needed. + * + * @param depot The depot + **/ +void abandonNewSlabs(SlabDepot *depot); + +/** + * Drain all slab depot I/O. If saving, or flushing, all dirty depot metadata + * will be written out. If saving or suspending, the depot will be left in a + * suspended state. + * + * @param depot The depot to drain + * @param operation The drain operation (flush, rebuild, suspend, or save) + * @param parent The completion to finish when the drain is complete + **/ +void drainSlabDepot(SlabDepot *depot, + AdminStateCode operation, + VDOCompletion *parent); + +/** + * Resume a suspended slab depot. + * + * @param depot The depot to resume + * @param parent The completion to finish when the depot has resumed + **/ +void resumeSlabDepot(SlabDepot *depot, VDOCompletion *parent); + +/** + * Commit all dirty tail blocks which are locking a given recovery journal + * block. 
This method must be called from the journal zone thread. + * + * @param depot The depot + * @param recoveryBlockNumber The sequence number of the recovery journal + * block whose locks should be released + **/ +void commitOldestSlabJournalTailBlocks(SlabDepot *depot, + SequenceNumber recoveryBlockNumber); + +/** + * Get the SlabConfig of a depot. + * + * @param depot The slab depot + * + * @return The slab configuration of the specified depot + **/ +const SlabConfig *getSlabConfig(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the slab summary. + * + * @param depot The slab depot + * + * @return The slab summary + **/ +SlabSummary *getSlabSummary(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the portion of the slab summary for a given physical zone. + * + * @param depot The slab depot + * @param zone The zone + * + * @return The portion of the slab summary for the specified zone + **/ +SlabSummaryZone *getSlabSummaryForZone(const SlabDepot *depot, ZoneCount zone) + __attribute__((warn_unused_result)); + +/** + * Scrub all unrecovered slabs. + * + * @param depot The depot to scrub + * @param parent The object to notify when scrubbing is complete + * @param callback The function to call when scrubbing is complete + * @param errorHandler The handler for scrubbing errors + * @param threadID The thread on which to run the callback + * @param launchParent The object to notify when scrubbing has been launched + * for all zones + **/ +void scrubAllUnrecoveredSlabs(SlabDepot *depot, + void *parent, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID, + VDOCompletion *launchParent); + +/** + * Check whether there are outstanding unrecovered slabs. + * + * @param depot The slab depot + * + * @return Whether there are outstanding unrecovered slabs + **/ +bool hasUnrecoveredSlabs(SlabDepot *depot); + +/** + * Get the physical size to which this depot is prepared to grow. + * + * @param depot The slab depot + * + * @return The new number of blocks the depot will be grown to, or 0 if the + * depot is not prepared to grow + **/ +BlockCount getNewDepotSize(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Dump the slab depot, in a thread-unsafe fashion. + * + * @param depot The slab depot + **/ +void dumpSlabDepot(const SlabDepot *depot); + +#endif // SLAB_DEPOT_H diff --git a/vdo/base/slabDepotInternals.h b/vdo/base/slabDepotInternals.h new file mode 100644 index 0000000..7dfe57b --- /dev/null +++ b/vdo/base/slabDepotInternals.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabDepotInternals.h#13 $ + */ + +#ifndef SLAB_DEPOT_INTERNALS_H +#define SLAB_DEPOT_INTERNALS_H + +#include "slabDepot.h" + +#include "atomic.h" + +#include "actionManager.h" + +struct slabDepot { + ZoneCount zoneCount; + ZoneCount oldZoneCount; + SlabConfig slabConfig; + SlabSummary *slabSummary; + ReadOnlyNotifier *readOnlyNotifier; + ActionManager *actionManager; + + PhysicalBlockNumber firstBlock; + PhysicalBlockNumber lastBlock; + PhysicalBlockNumber origin; + + /** slabSize == (1 << slabSizeShift) */ + unsigned int slabSizeShift; + + /** Determines how slabs should be queued during load */ + SlabDepotLoadType loadType; + + /** The state for notifying slab journals to release recovery journal */ + SequenceNumber activeReleaseRequest; + SequenceNumber newReleaseRequest; + + /** The completion for scrubbing */ + VDOCompletion scrubbingCompletion; + Atomic32 zonesToScrub; + + /** Cached journal pointer for slab creation */ + RecoveryJournal *journal; + + /** Array of pointers to individually allocated slabs */ + Slab **slabs; + /** The number of slabs currently allocated and stored in 'slabs' */ + SlabCount slabCount; + + /** Array of pointers to a larger set of slabs (used during resize) */ + Slab **newSlabs; + /** The number of slabs currently allocated and stored in 'newSlabs' */ + SlabCount newSlabCount; + /** The size that 'newSlabs' was allocated for */ + BlockCount newSize; + + /** The last block before resize, for rollback */ + PhysicalBlockNumber oldLastBlock; + /** The last block after resize, for resize */ + PhysicalBlockNumber newLastBlock; + + /** The block allocators for this depot */ + BlockAllocator *allocators[]; +}; + +/** + * Destroy a slab. + * + * @param slab The slab to destroy + **/ +void destroySlab(Slab *slab); + +/** + * Inform a slab's depot that the slab has been created. + * + * @param slab The slab to register + **/ +void registerSlabWithDepot(Slab *slab); + +/** + * Notify a slab depot that one of its allocators has finished scrubbing slabs. + * This method should only be called if the scrubbing was successful. This + * callback is registered by each block allocator in + * scrubAllUnrecoveredSlabsInZone(). + * + * @param completion A completion whose parent must be a slab depot + **/ +void notifyZoneFinishedScrubbing(VDOCompletion *completion); + +/** + * Check whether two depots are equivalent (i.e. represent the same + * state and have the same reference counter). This method is used for unit + * testing. + * + * @param depotA The first depot to compare + * @param depotB The second depot to compare + * + * @return true if the two depots are equivalent + **/ +bool areEquivalentDepots(SlabDepot *depotA, SlabDepot *depotB) + __attribute__((warn_unused_result)); + +/** + * Start allocating from the highest numbered slab in each zone. + * + * @param depot The depot + **/ +void allocateFromLastSlab(SlabDepot *depot); + +#endif /* SLAB_DEPOT_INTERNALS_H */ diff --git a/vdo/base/slabIterator.h b/vdo/base/slabIterator.h new file mode 100644 index 0000000..e977c2d --- /dev/null +++ b/vdo/base/slabIterator.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabIterator.h#1 $ + */ + +#ifndef SLAB_ITERATOR_H +#define SLAB_ITERATOR_H + +#include "slab.h" +#include "types.h" + +/** + * SlabIterator is a structure for iterating over a set of slabs. + **/ +typedef struct { + Slab **slabs; + Slab *next; + SlabCount end; + SlabCount stride; +} SlabIterator; + +/** + * Return a SlabIterator initialized to iterate over an array of slabs + * with a given stride. Iteration always occurs from higher to lower numbered + * slabs. + * + * @param slabs The array of slabs + * @param start The number of the slab to start iterating from + * @param end The number of the last slab which may be returned + * @param stride The difference in slab number between successive slabs + * + * @return an initialized iterator structure + **/ +static inline SlabIterator iterateSlabs(Slab **slabs, + SlabCount start, + SlabCount end, + SlabCount stride) +{ + return (SlabIterator) { + .slabs = slabs, + .next = (((slabs == NULL) || (start < end)) ? NULL : slabs[start]), + .end = end, + .stride = stride, + }; +} + +/** + * Check whether another Slab would be returned by the iterator. + * + * @param iterator The iterator to poll + * + * @return true if the next call to nextSlab + * will return a Slab + **/ +static inline bool hasNextSlab(const SlabIterator *iterator) +{ + return (iterator->next != NULL); +} + +/** + * Get the next Slab, advancing the iterator. + * + * @param iterator The iterator over the Slab chain + * + * @return the next Slab or NULL if the array of slabs is empty + * or if all the appropriate Slabs have been returned + **/ +static inline Slab *nextSlab(SlabIterator *iterator) +{ + Slab *slab = iterator->next; + if ((slab == NULL) + || (slab->slabNumber < iterator->end + iterator->stride)) { + iterator->next = NULL; + } else { + iterator->next = iterator->slabs[slab->slabNumber - iterator->stride]; + } + return slab; +} + +#endif // SLAB_ITERATOR_H diff --git a/vdo/base/slabJournal.c b/vdo/base/slabJournal.c new file mode 100644 index 0000000..1895f80 --- /dev/null +++ b/vdo/base/slabJournal.c @@ -0,0 +1,1321 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournal.c#18 $ + */ + +#include "slabJournalInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "stringUtils.h" + +#include "adminState.h" +#include "blockAllocatorInternals.h" +#include "dataVIO.h" +#include "recoveryJournal.h" +#include "refCounts.h" +#include "slabDepot.h" +#include "slabSummary.h" + +/** + * Return the slab journal from the resource waiter. + * + * @param waiter The waiter + * + * @return The slab journal + **/ +__attribute__((warn_unused_result)) +static inline SlabJournal *slabJournalFromResourceWaiter(Waiter *waiter) +{ + STATIC_ASSERT(offsetof(SlabJournal, resourceWaiter) == 0); + return (SlabJournal *) waiter; +} + +/** + * Return the slab journal from the flush waiter. + * + * @param waiter The waiter + * + * @return The slab journal + **/ +__attribute__((warn_unused_result)) +static inline SlabJournal *slabJournalFromFlushWaiter(Waiter *waiter) +{ + if (waiter == NULL) { + return NULL; + } + return (SlabJournal *) + ((uintptr_t) waiter - offsetof(SlabJournal, flushWaiter)); +} + +/**********************************************************************/ +SlabJournal *slabJournalFromDirtyNode(RingNode *node) +{ + if (node == NULL) { + return NULL; + } + return (SlabJournal *) ((uintptr_t) node - offsetof(SlabJournal, dirtyNode)); +} + +/** + * Return the slab journal from the slab summary waiter. + * + * @param waiter The waiter + * + * @return The slab journal + **/ +__attribute__((warn_unused_result)) +static inline SlabJournal *slabJournalFromSlabSummaryWaiter(Waiter *waiter) +{ + if (waiter == NULL) { + return NULL; + } + return (SlabJournal *) + ((uintptr_t) waiter - offsetof(SlabJournal, slabSummaryWaiter)); +} + +/** + * Get the physical block number for a given sequence number. + * + * @param journal The journal + * @param sequence The sequence number of the desired block + * + * @return the block number corresponding to the sequence number + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber getBlockNumber(SlabJournal *journal, + SequenceNumber sequence) +{ + TailBlockOffset offset = getSlabJournalBlockOffset(journal, sequence); + return (journal->slab->journalOrigin + offset); +} + +/** + * Get the lock object for a slab journal block by sequence number. + * + * @param journal Slab journal to retrieve from + * @param sequenceNumber Sequence number of the block + * + * @return the lock object for the given sequence number + **/ +__attribute__((warn_unused_result)) +static inline JournalLock *getLock(SlabJournal *journal, + SequenceNumber sequenceNumber) +{ + TailBlockOffset offset = getSlabJournalBlockOffset(journal, sequenceNumber); + return &journal->locks[offset]; +} + +/** + * Check whether the VDO is in read-only mode. + * + * @param journal The journal whose owning VDO should be checked + * + * @return true if the VDO is in read-only mode + **/ +__attribute__((warn_unused_result)) +static inline bool isVDOReadOnly(SlabJournal *journal) +{ + return isReadOnly(journal->slab->allocator->readOnlyNotifier); +} + +/** + * Check whether there are entry waiters which should delay a flush. 
+ *
+ * @param journal The journal to check
+ *
+ * @return true if the slab is not rebuilding and there are entry
+ * waiters which should delay a flush
+ **/
+__attribute__((warn_unused_result))
+static inline bool mustMakeEntriesToFlush(SlabJournal *journal)
+{
+ return (!slabIsRebuilding(journal->slab)
+ && hasWaiters(&journal->entryWaiters));
+}
+
+/**
+ * Check whether a reap is currently in progress.
+ *
+ * @param journal The journal which may be reaping
+ *
+ * @return true if the journal is reaping
+ **/
+__attribute__((warn_unused_result))
+static inline bool isReaping(SlabJournal *journal)
+{
+ return (journal->head != journal->unreapable);
+}
+
+/**********************************************************************/
+bool isSlabJournalActive(SlabJournal *journal)
+{
+ return (mustMakeEntriesToFlush(journal)
+ || isReaping(journal)
+ || journal->waitingToCommit
+ || !isRingEmpty(&journal->uncommittedBlocks)
+ || journal->updatingSlabSummary);
+}
+
+/**
+ * Initialize tail block as a new block.
+ *
+ * @param journal The journal whose tail block is being initialized
+ **/
+static void initializeTailBlock(SlabJournal *journal)
+{
+ SlabJournalBlockHeader *header = &journal->tailHeader;
+ header->sequenceNumber = journal->tail;
+ header->entryCount = 0;
+ header->hasBlockMapIncrements = false;
+}
+
+/**
+ * Set all journal fields appropriately to start journaling.
+ *
+ * @param journal The journal to be reset, based on its tail sequence number
+ **/
+static void initializeJournalState(SlabJournal *journal)
+{
+ journal->unreapable = journal->head;
+ journal->reapLock = getLock(journal, journal->unreapable);
+ journal->nextCommit = journal->tail;
+ journal->summarized = journal->lastSummarized = journal->tail;
+ initializeTailBlock(journal);
+}
+
+/**
+ * Check whether a journal block is full.
+ *
+ * @param journal The slab journal for the block
+ *
+ * @return true if the tail block is full
+ **/
+__attribute__((warn_unused_result))
+static bool blockIsFull(SlabJournal *journal)
+{
+ JournalEntryCount count = journal->tailHeader.entryCount;
+ return (journal->tailHeader.hasBlockMapIncrements
+ ?
(journal->fullEntriesPerBlock == count) + : (journal->entriesPerBlock == count)); +} + +/**********************************************************************/ +static void addEntries(SlabJournal *journal); +static void updateTailBlockLocation(SlabJournal *journal); +static void releaseJournalLocks(Waiter *waiter, void *context); + +/**********************************************************************/ +int makeSlabJournal(BlockAllocator *allocator, + Slab *slab, + RecoveryJournal *recoveryJournal, + SlabJournal **journalPtr) +{ + SlabJournal *journal; + const SlabConfig *slabConfig = getSlabConfig(allocator->depot); + int result = ALLOCATE_EXTENDED(SlabJournal, slabConfig->slabJournalBlocks, + JournalLock, __func__, &journal); + if (result != VDO_SUCCESS) { + return result; + } + + journal->slab = slab; + journal->size = slabConfig->slabJournalBlocks; + journal->flushingThreshold = slabConfig->slabJournalFlushingThreshold; + journal->blockingThreshold = slabConfig->slabJournalBlockingThreshold; + journal->scrubbingThreshold = slabConfig->slabJournalScrubbingThreshold; + journal->entriesPerBlock = SLAB_JOURNAL_ENTRIES_PER_BLOCK; + journal->fullEntriesPerBlock = SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK; + journal->events = &allocator->slabJournalStatistics; + journal->recoveryJournal = recoveryJournal; + journal->summary = getSlabSummaryZone(allocator); + journal->tail = 1; + journal->head = 1; + + journal->flushingDeadline = journal->flushingThreshold; + // Set there to be some time between the deadline and the blocking threshold, + // so that hopefully all are done before blocking. + if ((journal->blockingThreshold - journal->flushingThreshold) > 5) { + journal->flushingDeadline = journal->blockingThreshold - 5; + } + + journal->slabSummaryWaiter.callback = releaseJournalLocks; + + result = ALLOCATE(VDO_BLOCK_SIZE, char, "PackedSlabJournalBlock", + (char **) &journal->block); + if (result != VDO_SUCCESS) { + freeSlabJournal(&journal); + return result; + } + + initializeRing(&journal->dirtyNode); + initializeRing(&journal->uncommittedBlocks); + + journal->tailHeader.nonce = slab->allocator->nonce; + journal->tailHeader.metadataType = VDO_METADATA_SLAB_JOURNAL; + initializeJournalState(journal); + + *journalPtr = journal; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeSlabJournal(SlabJournal **journalPtr) +{ + SlabJournal *journal = *journalPtr; + if (journal == NULL) { + return; + } + + FREE(journal->block); + FREE(journal); + *journalPtr = NULL; +} + +/**********************************************************************/ +bool isSlabJournalBlank(const SlabJournal *journal) +{ + return ((journal != NULL) + && (journal->tail == 1) + && (journal->tailHeader.entryCount == 0)); +} + +/**********************************************************************/ +bool isSlabJournalDirty(const SlabJournal *journal) +{ + return (journal->recoveryLock != 0); +} + +/** + * Put a slab journal on the dirty ring of its allocator in the correct order. 
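+ *
+ * The ring is kept ordered by recovery journal lock, oldest at the head; the
+ * insertion below walks backward from the tail until it finds a journal
+ * whose lock is no newer than the one being added.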
+ *
+ * @param journal The journal to be marked dirty
+ * @param lock The recovery journal lock held by the slab journal
+ **/
+static void markSlabJournalDirty(SlabJournal *journal, SequenceNumber lock)
+{
+ ASSERT_LOG_ONLY(!isSlabJournalDirty(journal), "slab journal was clean");
+
+ journal->recoveryLock = lock;
+ RingNode *dirtyRing = &journal->slab->allocator->dirtySlabJournals;
+ RingNode *node = dirtyRing->prev;
+ while (node != dirtyRing) {
+ SlabJournal *dirtyJournal = slabJournalFromDirtyNode(node);
+ if (dirtyJournal->recoveryLock <= journal->recoveryLock) {
+ break;
+ }
+
+ node = node->prev;
+ }
+
+ pushRingNode(node->next, &journal->dirtyNode);
+}
+
+/**********************************************************************/
+static void markSlabJournalClean(SlabJournal *journal)
+{
+ journal->recoveryLock = 0;
+ unspliceRingNode(&journal->dirtyNode);
+}
+
+/**
+ * Implements WaiterCallback. This callback is invoked on all VIOs waiting
+ * to make slab journal entries after the VDO has gone into read-only mode.
+ **/
+static void abortWaiter(Waiter *waiter,
+ void *context __attribute__((unused)))
+{
+ continueDataVIO(waiterAsDataVIO(waiter), VDO_READ_ONLY);
+}
+
+/**********************************************************************/
+void abortSlabJournalWaiters(SlabJournal *journal)
+{
+ ASSERT_LOG_ONLY((getCallbackThreadID()
+ == journal->slab->allocator->threadID),
+ "abortSlabJournalWaiters() called on correct thread");
+ notifyAllWaiters(&journal->entryWaiters, abortWaiter, journal);
+ checkIfSlabDrained(journal->slab);
+}
+
+/**
+ * Put the journal in read-only mode. All attempts to add entries after
+ * this function is called will fail. All VIOs waiting to make entries
+ * will be awakened with an error. All flushes will complete as soon as all
+ * pending IO is done.
+ *
+ * @param journal The journal which has failed
+ * @param errorCode The error result triggering this call
+ **/
+static void enterJournalReadOnlyMode(SlabJournal *journal, int errorCode)
+{
+ enterReadOnlyMode(journal->slab->allocator->readOnlyNotifier, errorCode);
+ abortSlabJournalWaiters(journal);
+}
+
+/**
+ * Actually advance the head of the journal now that any necessary flushes
+ * are complete.
+ *
+ * @param journal The journal to be reaped
+ **/
+static void finishReaping(SlabJournal *journal)
+{
+ journal->head = journal->unreapable;
+ addEntries(journal);
+ checkIfSlabDrained(journal->slab);
+}
+
+/**********************************************************************/
+static void reapSlabJournal(SlabJournal *journal);
+
+/**
+ * Finish reaping now that we have flushed the lower layer and then try
+ * reaping again in case we deferred reaping due to an outstanding VIO.
+ *
+ * @param completion The flush VIO
+ **/
+static void completeReaping(VDOCompletion *completion)
+{
+ VIOPoolEntry *entry = completion->parent;
+ SlabJournal *journal = entry->parent;
+ returnVIO(journal->slab->allocator, entry);
+ finishReaping(journal);
+ reapSlabJournal(journal);
+}
+
+/**
+ * Handle an error flushing the lower layer.
+ *
+ * @param completion The flush VIO
+ **/
+static void handleFlushError(VDOCompletion *completion)
+{
+ SlabJournal *journal = ((VIOPoolEntry *) completion->parent)->parent;
+ enterJournalReadOnlyMode(journal, completion->result);
+ completeReaping(completion);
+}
+
+/**
+ * Waiter callback for getting a VIO with which to flush the lower layer prior
+ * to reaping.
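+ * When the flush completes, completeReaping() advances the journal head and
+ * retries the reap; a flush error puts the journal into read-only mode and
+ * then completes the reap.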
+ * + * @param waiter The journal as a flush waiter + * @param vioContext The newly acquired flush VIO + **/ +static void flushForReaping(Waiter *waiter, void *vioContext) +{ + SlabJournal *journal = slabJournalFromFlushWaiter(waiter); + VIOPoolEntry *entry = vioContext; + VIO *vio = entry->vio; + + entry->parent = journal; + vio->completion.callbackThreadID = journal->slab->allocator->threadID; + launchFlush(vio, completeReaping, handleFlushError); +} + +/** + * Conduct a reap on a slab journal to reclaim unreferenced blocks. + * + * @param journal The slab journal + **/ +static void reapSlabJournal(SlabJournal *journal) +{ + if (isReaping(journal)) { + // We already have a reap in progress so wait for it to finish. + return; + } + + if (isUnrecoveredSlab(journal->slab) || !isNormal(&journal->slab->state) + || isVDOReadOnly(journal)) { + // We must not reap in the first two cases, and there's no point in + // read-only mode. + return; + } + + /* + * Start reclaiming blocks only when the journal head has no references. Then + * stop when a block is referenced or reap reaches the most recently written + * block, referenced by the slab summary, which has the sequence number just + * before the tail. + */ + bool reaped = false; + while ((journal->unreapable < journal->tail) + && (journal->reapLock->count == 0)) { + reaped = true; + journal->unreapable++; + journal->reapLock++; + if (journal->reapLock == &journal->locks[journal->size]) { + journal->reapLock = &journal->locks[0]; + } + } + + if (!reaped) { + return; + } + + PhysicalLayer *layer = journal->slab->allocator->completion.layer; + if (layer->getWritePolicy(layer) == WRITE_POLICY_SYNC) { + finishReaping(journal); + return; + } + + /* + * In async mode, it is never safe to reap a slab journal block without first + * issuing a flush, regardless of whether a user flush has been received or + * not. In the absence of the flush, the reference block write which released + * the locks allowing the slab journal to reap may not be persisted. Although + * slab summary writes will eventually issue flushes, multiple slab journal + * block writes can be issued while previous slab summary updates have not + * yet been made. Even though those slab journal block writes will be ignored + * if the slab summary update is not persisted, they may still overwrite the + * to-be-reaped slab journal block resulting in a loss of reference count + * updates (VDO-2912). + * + * In sync mode, it is similarly unsafe. However, we cannot possibly make + * those additional slab journal block writes due to the blocking threshold + * and the recovery journal's flush policy of flushing before every block. + * We may make no more than (number of VIOs) entries in slab journals since + * the last recovery journal flush; thus, due to the size of the slab + * journal blocks, the RJ must have flushed the storage no more than one + * slab journal block ago. So we could only overwrite the to-be-reaped block + * if we wrote and flushed the last block in the journal. But the blocking + * threshold prevents that. + */ + journal->flushWaiter.callback = flushForReaping; + int result = acquireVIO(journal->slab->allocator, &journal->flushWaiter); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(journal, result); + return; + } +} + +/** + * This is the callback invoked after a slab summary update completes. It + * is registered in the constructor on behalf of updateTailBlockLocation(). + * + * Implements WaiterCallback. 
+ * + * @param waiter The slab summary waiter that has just been notified + * @param context The result code of the update + **/ +static void releaseJournalLocks(Waiter *waiter, void *context) +{ + SlabJournal *journal = slabJournalFromSlabSummaryWaiter(waiter); + int result = *((int *) context); + if (result != VDO_SUCCESS) { + if (result != VDO_READ_ONLY) { + // Don't bother logging what might be lots of errors if we are already + // in read-only mode. + logErrorWithStringError(result, "failed slab summary update %llu", + journal->summarized); + } + + journal->updatingSlabSummary = false; + enterJournalReadOnlyMode(journal, result); + return; + } + + if (journal->partialWriteInProgress + && (journal->summarized == journal->tail)) { + journal->partialWriteInProgress = false; + addEntries(journal); + } + + SequenceNumber first = journal->lastSummarized; + journal->lastSummarized = journal->summarized; + for (SequenceNumber i = journal->summarized - 1; i >= first; i--) { + // Release the lock the summarized block held on the recovery journal. + // (During replay, recoveryStart will always be 0.) + if (journal->recoveryJournal != NULL) { + ZoneCount zoneNumber = journal->slab->allocator->zoneNumber; + releaseRecoveryJournalBlockReference(journal->recoveryJournal, + getLock(journal, i)->recoveryStart, + ZONE_TYPE_PHYSICAL, + zoneNumber); + + } + + // Release our own lock against reaping for blocks that are committed. + // (This function will not change locks during replay.) + adjustSlabJournalBlockReference(journal, i, -1); + } + + journal->updatingSlabSummary = false; + + reapSlabJournal(journal); + + // Check if the slab summary needs to be updated again. + updateTailBlockLocation(journal); +} + +/** + * Update the tail block location in the slab summary, if necessary. + * + * @param journal The slab journal that is updating its tail block location + **/ +static void updateTailBlockLocation(SlabJournal *journal) +{ + if (journal->updatingSlabSummary || isVDOReadOnly(journal) + || (journal->lastSummarized >= journal->nextCommit)) { + checkIfSlabDrained(journal->slab); + return; + } + + BlockCount freeBlockCount; + if (isUnrecoveredSlab(journal->slab)) { + freeBlockCount = getSummarizedFreeBlockCount(journal->summary, + journal->slab->slabNumber); + } else { + freeBlockCount = getSlabFreeBlockCount(journal->slab); + } + + journal->summarized = journal->nextCommit; + journal->updatingSlabSummary = true; + + /* + * Update slab summary as dirty. + * Slab journal can only reap past sequence number 1 when all the refCounts + * for this slab have been written to the layer. Therefore, indicate that the + * refCounts must be loaded when the journal head has reaped past sequence + * number 1. + */ + TailBlockOffset blockOffset + = getSlabJournalBlockOffset(journal, journal->summarized); + updateSlabSummaryEntry(journal->summary, &journal->slabSummaryWaiter, + journal->slab->slabNumber, blockOffset, + (journal->head > 1), false, freeBlockCount); +} + +/**********************************************************************/ +void reopenSlabJournal(SlabJournal *journal) +{ + ASSERT_LOG_ONLY(journal->tailHeader.entryCount == 0, + "Slab journal's active block empty before reopening"); + journal->head = journal->tail; + initializeJournalState(journal); + + // Ensure no locks are spuriously held on an empty journal. 
+ for (SequenceNumber block = 1; block <= journal->size; block++) { + ASSERT_LOG_ONLY((getLock(journal, block)->count == 0), + "Scrubbed journal's block %llu is not locked", + block); + } + + addEntries(journal); +} + +/**********************************************************************/ +static SequenceNumber getCommittingSequenceNumber(const VIOPoolEntry *entry) +{ + const PackedSlabJournalBlock *block = entry->buffer; + return getUInt64LE(block->header.fields.sequenceNumber); +} + +/** + * Handle post-commit processing. This is the callback registered by + * writeSlabJournalBlock(). + * + * @param completion The write VIO as a completion + **/ +static void completeWrite(VDOCompletion *completion) +{ + int writeResult = completion->result; + VIOPoolEntry *entry = completion->parent; + SlabJournal *journal = entry->parent; + + SequenceNumber committed = getCommittingSequenceNumber(entry); + unspliceRingNode(&entry->node); + returnVIO(journal->slab->allocator, entry); + + if (writeResult != VDO_SUCCESS) { + logErrorWithStringError(writeResult, + "cannot write slab journal block %llu", + committed); + enterJournalReadOnlyMode(journal, writeResult); + return; + } + + relaxedAdd64(&journal->events->blocksWritten, 1); + + if (isRingEmpty(&journal->uncommittedBlocks)) { + // If no blocks are outstanding, then the commit point is at the tail. + journal->nextCommit = journal->tail; + } else { + // The commit point is always the beginning of the oldest incomplete block. + VIOPoolEntry *oldest = asVIOPoolEntry(journal->uncommittedBlocks.next); + journal->nextCommit = getCommittingSequenceNumber(oldest); + } + + updateTailBlockLocation(journal); +} + +/** + * Callback from acquireVIO() registered in commitSlabJournalTail(). + * + * @param waiter The VIO pool waiter which was just notified + * @param vioContext The VIO pool entry for the write + **/ +static void writeSlabJournalBlock(Waiter *waiter, void *vioContext) +{ + SlabJournal *journal = slabJournalFromResourceWaiter(waiter); + VIOPoolEntry *entry = vioContext; + SlabJournalBlockHeader *header = &journal->tailHeader; + + header->head = journal->head; + pushRingNode(&journal->uncommittedBlocks, &entry->node); + packSlabJournalBlockHeader(header, &journal->block->header); + + // Copy the tail block into the VIO. + memcpy(entry->buffer, journal->block, VDO_BLOCK_SIZE); + + int unusedEntries = journal->entriesPerBlock - header->entryCount; + ASSERT_LOG_ONLY(unusedEntries >= 0, "Slab journal block is not overfull"); + if (unusedEntries > 0) { + // Release the per-entry locks for any unused entries in the block we are + // about to write. + adjustSlabJournalBlockReference(journal, header->sequenceNumber, + -unusedEntries); + journal->partialWriteInProgress = !blockIsFull(journal); + } + + PhysicalBlockNumber blockNumber + = getBlockNumber(journal, header->sequenceNumber); + + entry->parent = journal; + entry->vio->completion.callbackThreadID = journal->slab->allocator->threadID; + /* + * This block won't be read in recovery until the slab summary is updated + * to refer to it. The slab summary update does a flush which is sufficient + * to protect us from VDO-2331. + */ + launchWriteMetadataVIO(entry->vio, blockNumber, completeWrite, + completeWrite); + + // Since the write is submitted, the tail block structure can be reused. 
+ journal->tail++; + initializeTailBlock(journal); + journal->waitingToCommit = false; + if (journal->slab->state.state == ADMIN_STATE_WAITING_FOR_RECOVERY) { + finishOperationWithResult(&journal->slab->state, + (isVDOReadOnly(journal) + ? VDO_READ_ONLY : VDO_SUCCESS)); + return; + } + + addEntries(journal); +} + +/**********************************************************************/ +void commitSlabJournalTail(SlabJournal *journal) +{ + if ((journal->tailHeader.entryCount == 0) + && mustMakeEntriesToFlush(journal)) { + // There are no entries at the moment, but there are some waiters, so defer + // initiating the flush until those entries are ready to write. + return; + } + + if (isVDOReadOnly(journal) + || journal->waitingToCommit + || (journal->tailHeader.entryCount == 0)) { + // There is nothing to do since the tail block is empty, or writing, or + // the journal is in read-only mode. + return; + } + + /* + * Since we are about to commit the tail block, this journal no longer + * needs to be on the ring of journals which the recovery journal might + * ask to commit. + */ + markSlabJournalClean(journal); + + journal->waitingToCommit = true; + + journal->resourceWaiter.callback = writeSlabJournalBlock; + int result = acquireVIO(journal->slab->allocator, &journal->resourceWaiter); + if (result != VDO_SUCCESS) { + journal->waitingToCommit = false; + enterJournalReadOnlyMode(journal, result); + return; + } +} + +/**********************************************************************/ +void encodeSlabJournalEntry(SlabJournalBlockHeader *tailHeader, + SlabJournalPayload *payload, + SlabBlockNumber sbn, + JournalOperation operation) +{ + JournalEntryCount entryNumber = tailHeader->entryCount++; + if (operation == BLOCK_MAP_INCREMENT) { + if (!tailHeader->hasBlockMapIncrements) { + memset(payload->fullEntries.entryTypes, 0, + SLAB_JOURNAL_ENTRY_TYPES_SIZE); + tailHeader->hasBlockMapIncrements = true; + } + + payload->fullEntries.entryTypes[entryNumber / 8] + |= ((byte) 1 << (entryNumber % 8)); + } + + packSlabJournalEntry(&payload->entries[entryNumber], sbn, + isIncrementOperation(operation)); +} + +/**********************************************************************/ +SlabJournalEntry decodeSlabJournalEntry(PackedSlabJournalBlock *block, + JournalEntryCount entryCount) +{ + SlabJournalEntry entry + = unpackSlabJournalEntry(&block->payload.entries[entryCount]); + if (block->header.fields.hasBlockMapIncrements + && ((block->payload.fullEntries.entryTypes[entryCount / 8] + & ((byte) 1 << (entryCount % 8))) != 0)) { + entry.operation = BLOCK_MAP_INCREMENT; + } + return entry; +} + +/** + * Actually add an entry to the slab journal, potentially firing off a write + * if a block becomes full. This function is synchronous. 
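+ * Once the entry is encoded and the tail block's recovery point has been
+ * updated, the tail block is committed if this entry filled it.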
+ * + * @param journal The slab journal to append to + * @param pbn The pbn being adjusted + * @param operation The type of entry to make + * @param recoveryPoint The recovery journal point for this entry + **/ +static void addEntry(SlabJournal *journal, + PhysicalBlockNumber pbn, + JournalOperation operation, + const JournalPoint *recoveryPoint) +{ + int result = ASSERT(beforeJournalPoint(&journal->tailHeader.recoveryPoint, + recoveryPoint), + "recovery journal point is monotonically increasing, " + "recovery point: %llu.%u, " + "block recovery point: %llu.%u", + recoveryPoint->sequenceNumber, recoveryPoint->entryCount, + journal->tailHeader.recoveryPoint.sequenceNumber, + journal->tailHeader.recoveryPoint.entryCount); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(journal, result); + return; + } + + PackedSlabJournalBlock *block = journal->block; + if (operation == BLOCK_MAP_INCREMENT) { + result = ASSERT_LOG_ONLY((journal->tailHeader.entryCount + < journal->fullEntriesPerBlock), + "block has room for full entries"); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(journal, result); + return; + } + } + + encodeSlabJournalEntry(&journal->tailHeader, &block->payload, + pbn - journal->slab->start, operation); + journal->tailHeader.recoveryPoint = *recoveryPoint; + if (blockIsFull(journal)) { + commitSlabJournalTail(journal); + } +} + +/**********************************************************************/ +bool attemptReplayIntoSlabJournal(SlabJournal *journal, + PhysicalBlockNumber pbn, + JournalOperation operation, + JournalPoint *recoveryPoint, + VDOCompletion *parent) +{ + // Only accept entries after the current recovery point. + if (!beforeJournalPoint(&journal->tailHeader.recoveryPoint, recoveryPoint)) { + return true; + } + + SlabJournalBlockHeader *header = &journal->tailHeader; + if ((header->entryCount >= journal->fullEntriesPerBlock) + && (header->hasBlockMapIncrements || + (operation == BLOCK_MAP_INCREMENT))) { + // The tail block does not have room for the entry we are attempting + // to add so commit the tail block now. + commitSlabJournalTail(journal); + } + + if (journal->waitingToCommit) { + startOperationWithWaiter(&journal->slab->state, + ADMIN_STATE_WAITING_FOR_RECOVERY, parent, NULL); + return false; + } + + if ((journal->tail - journal->head) >= journal->size) { + /* + * We must have reaped the current head before the crash, since + * the blocked threshold keeps us from having more entries than + * fit in a slab journal; hence we can just advance the head + * (and unreapable block), as needed. + */ + journal->head++; + journal->unreapable++; + } + + markSlabReplaying(journal->slab); + addEntry(journal, pbn, operation, recoveryPoint); + return true; +} + +/** + * Check whether the journal should be saving reference blocks out. + * + * @param journal The journal to check + * + * @return true if the journal should be requesting reference block writes + **/ +static bool requiresFlushing(const SlabJournal *journal) +{ + BlockCount journalLength = (journal->tail - journal->head); + return (journalLength >= journal->flushingThreshold); +} + +/** + * Check whether the journal must be reaped before adding new entries. 
+ * + * @param journal The journal to check + * + * @return true if the journal must be reaped + **/ +static bool requiresReaping(const SlabJournal *journal) +{ + BlockCount journalLength = (journal->tail - journal->head); + return (journalLength >= journal->blockingThreshold); +} + +/**********************************************************************/ +bool requiresScrubbing(const SlabJournal *journal) +{ + BlockCount journalLength = (journal->tail - journal->head); + return (journalLength >= journal->scrubbingThreshold); +} + +/** + * Implements WaiterCallback. This callback is invoked by addEntries() once + * it has determined that we are ready to make another entry in the slab + * journal. + * + * @param waiter The VIO which should make an entry now + * @param context The slab journal to make an entry in + **/ +static void addEntryFromWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + SlabJournal *journal = (SlabJournal *) context; + SlabJournalBlockHeader *header = &journal->tailHeader; + SequenceNumber recoveryBlock = dataVIO->recoveryJournalPoint.sequenceNumber; + + if (header->entryCount == 0) { + /* + * This is the first entry in the current tail block, so get a lock + * on the recovery journal which we will hold until this tail block is + * committed. + */ + getLock(journal, header->sequenceNumber)->recoveryStart = recoveryBlock; + if (journal->recoveryJournal != NULL) { + ZoneCount zoneNumber = journal->slab->allocator->zoneNumber; + acquireRecoveryJournalBlockReference(journal->recoveryJournal, + recoveryBlock, ZONE_TYPE_PHYSICAL, + zoneNumber); + } + markSlabJournalDirty(journal, recoveryBlock); + + // If the slab journal is over the first threshold, tell the refCounts to + // write some reference blocks, but proceed apace. + if (requiresFlushing(journal)) { + relaxedAdd64(&journal->events->flushCount, 1); + BlockCount journalLength = (journal->tail - journal->head); + BlockCount blocksToDeadline = 0; + if (journalLength <= journal->flushingDeadline) { + blocksToDeadline = journal->flushingDeadline - journalLength; + } + saveSeveralReferenceBlocks(journal->slab->referenceCounts, + blocksToDeadline + 1); + } + } + + JournalPoint slabJournalPoint = { + .sequenceNumber = header->sequenceNumber, + .entryCount = header->entryCount, + }; + + addEntry(journal, dataVIO->operation.pbn, dataVIO->operation.type, + &dataVIO->recoveryJournalPoint); + + // Now that an entry has been made in the slab journal, update the + // reference counts. + int result = modifySlabReferenceCount(journal->slab, &slabJournalPoint, + dataVIO->operation); + continueDataVIO(dataVIO, result); +} + +/** + * Check whether the next entry to be made is a block map increment. + * + * @param journal The journal + * + * @return true if the first entry waiter's operation is a block + * map increment + **/ +static inline bool isNextEntryABlockMapIncrement(SlabJournal *journal) +{ + DataVIO *dataVIO = waiterAsDataVIO(getFirstWaiter(&journal->entryWaiters)); + return (dataVIO->operation.type == BLOCK_MAP_INCREMENT); +} + +/** + * Add as many entries as possible from the queue of VIOs waiting to make + * entries. By processing the queue in order, we ensure that slab journal + * entries are made in the same order as recovery journal entries for the + * same increment or decrement. + * + * @param journal The journal to which entries may be added + **/ +static void addEntries(SlabJournal *journal) +{ + if (journal->addingEntries) { + // Protect against re-entrancy. 
+ return;
+ }
+
+ journal->addingEntries = true;
+ while (hasWaiters(&journal->entryWaiters)) {
+ if (journal->partialWriteInProgress || slabIsRebuilding(journal->slab)) {
+ // Don't add entries while rebuilding or while a partial write is
+ // outstanding (VDO-2399).
+ break;
+ }
+
+ SlabJournalBlockHeader *header = &journal->tailHeader;
+ if (journal->waitingToCommit) {
+ // If we are waiting for resources to write the tail block, and the
+ // tail block is full, we can't make another entry.
+ relaxedAdd64(&journal->events->tailBusyCount, 1);
+ break;
+ } else if (isNextEntryABlockMapIncrement(journal)
+ && (header->entryCount >= journal->fullEntriesPerBlock)) {
+ // The tail block does not have room for a block map increment, so
+ // commit it now.
+ commitSlabJournalTail(journal);
+ if (journal->waitingToCommit) {
+ relaxedAdd64(&journal->events->tailBusyCount, 1);
+ break;
+ }
+ }
+
+ // If the slab is over the blocking threshold, make the VIO wait.
+ if (requiresReaping(journal)) {
+ relaxedAdd64(&journal->events->blockedCount, 1);
+ saveDirtyReferenceBlocks(journal->slab->referenceCounts);
+ break;
+ }
+
+ if (header->entryCount == 0) {
+ JournalLock *lock = getLock(journal, header->sequenceNumber);
+ // Check if the on disk slab journal is full. Because of the
+ // blocking and scrubbing thresholds, this should never happen.
+ if (lock->count > 0) {
+ ASSERT_LOG_ONLY((journal->head + journal->size) == journal->tail,
+ "New block has locks, but journal is not full");
+
+ /*
+ * The blocking threshold must let the journal fill up if the new
+ * block has locks; if the blocking threshold is smaller than the
+ * journal size, the new block cannot possibly have locks already.
+ */
+ ASSERT_LOG_ONLY((journal->blockingThreshold >= journal->size),
+ "New block can have locks already iff blocking "
+ "threshold is at the end of the journal");
+
+ relaxedAdd64(&journal->events->diskFullCount, 1);
+ saveDirtyReferenceBlocks(journal->slab->referenceCounts);
+ break;
+ }
+
+ /*
+ * Don't allow the new block to be reaped until all of the reference
+ * count blocks are written and the journal block has been
+ * fully committed as well.
+ */
+ lock->count = journal->entriesPerBlock + 1;
+
+ if (header->sequenceNumber == 1) {
+ /*
+ * This is the first entry in this slab journal, ever. Dirty all of
+ * the reference count blocks. Each will acquire a lock on the
+ * tail block so that the journal won't be reaped until the
+ * reference counts are initialized. The lock acquisition must
+ * be done by the RefCounts since here we don't know how many
+ * reference blocks the RefCounts has.
+ */
+ acquireDirtyBlockLocks(journal->slab->referenceCounts);
+ }
+ }
+
+ notifyNextWaiter(&journal->entryWaiters, addEntryFromWaiter, journal);
+ }
+
+ journal->addingEntries = false;
+
+ // If there are no waiters, and we are flushing or saving, commit the
+ // tail block.
+ if (isSlabDraining(journal->slab) && !isSuspending(&journal->slab->state) + && !hasWaiters(&journal->entryWaiters)) { + commitSlabJournalTail(journal); + } +} + +/**********************************************************************/ +void addSlabJournalEntry(SlabJournal *journal, DataVIO *dataVIO) +{ + if (!isSlabOpen(journal->slab)) { + continueDataVIO(dataVIO, VDO_INVALID_ADMIN_STATE); + return; + } + + if (isVDOReadOnly(journal)) { + continueDataVIO(dataVIO, VDO_READ_ONLY); + return; + } + + int result = enqueueDataVIO(&journal->entryWaiters, dataVIO, + THIS_LOCATION("$F($j-$js)")); + if (result != VDO_SUCCESS) { + continueDataVIO(dataVIO, result); + return; + } + + if (isUnrecoveredSlab(journal->slab) && requiresReaping(journal)) { + increaseScrubbingPriority(journal->slab); + } + + addEntries(journal); +} + +/**********************************************************************/ +void adjustSlabJournalBlockReference(SlabJournal *journal, + SequenceNumber sequenceNumber, + int adjustment) +{ + if (sequenceNumber == 0) { + return; + } + + if (isReplayingSlab(journal->slab)) { + // Locks should not be used during offline replay. + return; + } + + ASSERT_LOG_ONLY((adjustment != 0), "adjustment must be non-zero"); + JournalLock *lock = getLock(journal, sequenceNumber); + if (adjustment < 0) { + ASSERT_LOG_ONLY((-adjustment <= lock->count), + "adjustment %d of lock count %u for slab journal block %" + PRIu64 " must not underflow", adjustment, lock->count, + sequenceNumber); + } + + lock->count += adjustment; + if (lock->count == 0) { + reapSlabJournal(journal); + } +} + +/**********************************************************************/ +bool releaseRecoveryJournalLock(SlabJournal *journal, + SequenceNumber recoveryLock) +{ + if (recoveryLock > journal->recoveryLock) { + ASSERT_LOG_ONLY((recoveryLock < journal->recoveryLock), + "slab journal recovery lock is not older than the recovery" + " journal head"); + return false; + } + + if ((recoveryLock < journal->recoveryLock) || isVDOReadOnly(journal)) { + return false; + } + + // All locks are held by the block which is in progress; write it. + commitSlabJournalTail(journal); + return true; +} + +/**********************************************************************/ +void drainSlabJournal(SlabJournal *journal) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + == journal->slab->allocator->threadID), + "drainSlabJournal() called on correct thread"); + if (isQuiescing(&journal->slab->state)) { + // XXX: we should revisit this assertion since it is no longer clear what + // it is for. + ASSERT_LOG_ONLY((!(slabIsRebuilding(journal->slab) + && hasWaiters(&journal->entryWaiters))), + "slab is recovered or has no waiters"); + } + + switch (journal->slab->state.state) { + case ADMIN_STATE_REBUILDING: + case ADMIN_STATE_SUSPENDING: + case ADMIN_STATE_SAVE_FOR_SCRUBBING: + break; + + default: + commitSlabJournalTail(journal); + } +} + +/** + * Finish the decode process by returning the VIO and notifying the slab that + * we're done. + * + * @param completion The VIO as a completion + **/ +static void finishDecodingJournal(VDOCompletion *completion) +{ + int result = completion->result; + VIOPoolEntry *entry = completion->parent; + SlabJournal *journal = entry->parent; + returnVIO(journal->slab->allocator, entry); + notifySlabJournalIsLoaded(journal->slab, result); +} + +/** + * Set up the in-memory journal state to the state which was written to disk. + * This is the callback registered in readSlabJournalTail(). 
+ * + * @param completion The VIO which was used to read the journal tail + **/ +static void setDecodedState(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + SlabJournal *journal = entry->parent; + PackedSlabJournalBlock *block = entry->buffer; + + SlabJournalBlockHeader header; + unpackSlabJournalBlockHeader(&block->header, &header); + + if ((header.metadataType != VDO_METADATA_SLAB_JOURNAL) + || (header.nonce != journal->slab->allocator->nonce)) { + finishDecodingJournal(completion); + return; + } + + journal->tail = header.sequenceNumber + 1; + + // If the slab is clean, this implies the slab journal is empty, so advance + // the head appropriately. + if (getSummarizedCleanliness(journal->summary, journal->slab->slabNumber)) { + journal->head = journal->tail; + } else { + journal->head = header.head; + } + + journal->tailHeader = header; + initializeJournalState(journal); + finishDecodingJournal(completion); +} + +/** + * This reads the slab journal tail block by using a VIO acquired from the VIO + * pool. This is the success callback from acquireVIOFromPool() when decoding + * the slab journal. + * + * @param waiter The VIO pool waiter which has just been notified + * @param vioContext The VIO pool entry given to the waiter + **/ +static void readSlabJournalTail(Waiter *waiter, void *vioContext) +{ + SlabJournal *journal = slabJournalFromResourceWaiter(waiter); + Slab *slab = journal->slab; + VIOPoolEntry *entry = vioContext; + TailBlockOffset lastCommitPoint + = getSummarizedTailBlockOffset(journal->summary, slab->slabNumber); + entry->parent = journal; + + + // Slab summary keeps the commit point offset, so the tail block is the + // block before that. Calculation supports small journals in unit tests. + TailBlockOffset tailBlock = ((lastCommitPoint == 0) + ? (TailBlockOffset) (journal->size - 1) + : (lastCommitPoint - 1)); + entry->vio->completion.callbackThreadID = slab->allocator->threadID; + launchReadMetadataVIO(entry->vio, slab->journalOrigin + tailBlock, + setDecodedState, finishDecodingJournal); +} + +/**********************************************************************/ +void decodeSlabJournal(SlabJournal *journal) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + == journal->slab->allocator->threadID), + "decodeSlabJournal() called on correct thread"); + Slab *slab = journal->slab; + TailBlockOffset lastCommitPoint + = getSummarizedTailBlockOffset(journal->summary, slab->slabNumber); + if ((lastCommitPoint == 0) + && !mustLoadRefCounts(journal->summary, slab->slabNumber)) { + /* + * This slab claims that it has a tail block at (journal->size - 1), but + * a head of 1. This is impossible, due to the scrubbing threshold, on + * a real system, so don't bother reading the (bogus) data off disk. 
+ */ + ASSERT_LOG_ONLY(((journal->size < 16) + || (journal->scrubbingThreshold < (journal->size - 1))), + "Scrubbing threshold protects against reads of unwritten" + "slab journal blocks"); + notifySlabJournalIsLoaded(slab, VDO_SUCCESS); + return; + } + + journal->resourceWaiter.callback = readSlabJournalTail; + int result = acquireVIO(slab->allocator, &journal->resourceWaiter); + if (result != VDO_SUCCESS) { + notifySlabJournalIsLoaded(slab, result); + } +} + +/**********************************************************************/ +void dumpSlabJournal(const SlabJournal *journal) +{ + logInfo(" slab journal: entryWaiters=%zu waitingToCommit=%s" + " updatingSlabSummary=%s head=%llu unreapable=%" PRIu64 + " tail=%llu nextCommit=%llu summarized=%" PRIu64 + " lastSummarized=%llu recoveryJournalLock=%" PRIu64 + " dirty=%s", countWaiters(&journal->entryWaiters), + boolToString(journal->waitingToCommit), + boolToString(journal->updatingSlabSummary), + journal->head, journal->unreapable, journal->tail, + journal->nextCommit, journal->summarized, journal->lastSummarized, + journal->recoveryLock, + boolToString(isSlabJournalDirty(journal))); + // Given the frequency with which the locks are just a tiny bit off, it + // might be worth dumping all the locks, but that might be too much logging. +} diff --git a/vdo/base/slabJournal.h b/vdo/base/slabJournal.h new file mode 100644 index 0000000..a411711 --- /dev/null +++ b/vdo/base/slabJournal.h @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournal.h#8 $ + */ + +#ifndef SLAB_JOURNAL_H +#define SLAB_JOURNAL_H + +#include "completion.h" +#include "journalPoint.h" +#include "ringNode.h" +#include "types.h" + +/** + * Convert a completion to a SlabJournal. + * + * @param completion The completion to convert + * + * @return The completion as a SlabJournal + **/ +SlabJournal *asSlabJournal(VDOCompletion *completion) + __attribute__((warn_unused_result)); + +/** + * Calculate the number of slab journal entries per block. + * + * @return The number of slab journal entries per block + **/ +size_t getSlabJournalEntriesPerBlock(void) + __attribute__((warn_unused_result)); + +/** + * Obtain a pointer to a SlabJournal structure from a pointer to the + * dirtyRingNode field within it. + * + * @param node The RingNode to convert + * + * @return The RingNode as a SlabJournal + **/ +SlabJournal *slabJournalFromDirtyNode(RingNode *node) + __attribute__((warn_unused_result)); + +/** + * Create a slab journal. 
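+ *
+ * A minimal usage sketch (illustrative only; it assumes an existing
+ * allocator, slab, and recoveryJournal, and abbreviates error handling):
+ *
+ * SlabJournal *journal;
+ * int result = makeSlabJournal(allocator, slab, recoveryJournal, &journal);
+ * if (result != VDO_SUCCESS) {
+ * return result;
+ * }
+ * // ... add entries, commit, and drain as needed ...
+ * freeSlabJournal(&journal);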
+ * + * @param [in] allocator The block allocator which owns this journal + * @param [in] slab The parent slab of the journal + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [out] journalPtr The pointer to hold the new slab journal + * + * @return VDO_SUCCESS or error code + **/ +int makeSlabJournal(BlockAllocator *allocator, + Slab *slab, + RecoveryJournal *recoveryJournal, + SlabJournal **journalPtr) + __attribute__((warn_unused_result)); + +/** + * Free a slab journal and null out the reference to it. + * + * @param journalPtr The reference to the slab journal to free + **/ +void freeSlabJournal(SlabJournal **journalPtr); + +/** + * Check whether a slab journal is blank, meaning it has never had any entries + * recorded in it. + * + * @param journal The journal to query + * + * @return true if the slab journal has never been modified + **/ +bool isSlabJournalBlank(const SlabJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Check whether the slab journal is on the block allocator's ring of dirty + * journals. + * + * @param journal The journal to query + * + * @return true if the journal has been added to the dirty ring + **/ +bool isSlabJournalDirty(const SlabJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Check whether a slab journal is active. + * + * @param journal The slab journal to check + * + * @return true if the journal is active + **/ +bool isSlabJournalActive(SlabJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Abort any VIOs waiting to make slab journal entries. + * + * @param journal The journal to abort + **/ +void abortSlabJournalWaiters(SlabJournal *journal); + +/** + * Reopen a slab journal by emptying it and then adding any pending entries. + * + * @param journal The journal to reopen + **/ +void reopenSlabJournal(SlabJournal *journal); + +/** + * Attempt to replay a recovery journal entry into a slab journal. + * + * @param journal The slab journal to use + * @param pbn The PBN for the entry + * @param operation The type of entry to add + * @param recoveryPoint The recovery journal point corresponding to this entry + * @param parent The completion to notify when there is space to add + * the entry if the entry could not be added immediately + * + * @return true if the entry was added immediately + **/ +bool attemptReplayIntoSlabJournal(SlabJournal *journal, + PhysicalBlockNumber pbn, + JournalOperation operation, + JournalPoint *recoveryPoint, + VDOCompletion *parent) + __attribute__((warn_unused_result)); + +/** + * Add an entry to a slab journal. + * + * @param journal The slab journal to use + * @param dataVIO The DataVIO for which to add the entry + **/ +void addSlabJournalEntry(SlabJournal *journal, DataVIO *dataVIO); + +/** + * Adjust the reference count for a slab journal block. Note that when the + * adjustment is negative, the slab journal will be reaped. + * + * @param journal The slab journal + * @param sequenceNumber The journal sequence number of the referenced block + * @param adjustment Amount to adjust the reference counter + **/ +void adjustSlabJournalBlockReference(SlabJournal *journal, + SequenceNumber sequenceNumber, + int adjustment); + +/** + * Request the slab journal to release the recovery journal lock it may hold on + * a specified recovery journal block. 
+ * + * @param journal The slab journal + * @param recoveryLock The sequence number of the recovery journal block + * whose locks should be released + * + * @return true if the journal does hold a lock on the specified + * block (which it will release) + **/ +bool releaseRecoveryJournalLock(SlabJournal *journal, + SequenceNumber recoveryLock) + __attribute__((warn_unused_result)); + +/** + * Commit the tail block of a slab journal. + * + * @param journal The journal whose tail block should be committed + **/ +void commitSlabJournalTail(SlabJournal *journal); + +/** + * Drain slab journal I/O. Depending upon the type of drain (as recorded in + * the journal's slab), any dirty journal blocks may be written out. + * + * @param journal The journal to drain + **/ +void drainSlabJournal(SlabJournal *journal); + +/** + * Decode the slab journal by reading its tail. + * + * @param journal The journal to decode + **/ +void decodeSlabJournal(SlabJournal *journal); + +/** + * Check to see if the journal should be scrubbed. + * + * @param journal The slab journal + * + * @return true if the journal requires scrubbing + **/ +bool requiresScrubbing(const SlabJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Dump the slab journal. + * + * @param journal The slab journal to dump + **/ +void dumpSlabJournal(const SlabJournal *journal); + +#endif // SLAB_JOURNAL_H diff --git a/vdo/base/slabJournalEraser.c b/vdo/base/slabJournalEraser.c new file mode 100644 index 0000000..7cd6a81 --- /dev/null +++ b/vdo/base/slabJournalEraser.c @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournalEraser.c#1 $ + */ + +#include "slabJournalEraser.h" + +#include "memoryAlloc.h" + +#include "completion.h" +#include "constants.h" +#include "extent.h" +#include "slab.h" +#include "slabDepot.h" + +typedef struct { + VDOCompletion *parent; + VDOExtent *extent; + char *zeroBuffer; + SlabIterator slabs; +} SlabJournalEraser; + +/** + * Free the eraser and finish the parent. + * + * @param eraser The eraser that is done + * @param result The result to return to the parent + **/ +static void finishErasing(SlabJournalEraser *eraser, int result) +{ + VDOCompletion *parent = eraser->parent; + freeExtent(&eraser->extent); + FREE(eraser->zeroBuffer); + FREE(eraser); + finishCompletion(parent, result); +} + +/** + * Finish erasing slab journals with an error. + * + * @param completion A completion whose parent is the eraser + **/ +static void handleErasingError(VDOCompletion *completion) +{ + SlabJournalEraser *eraser = completion->parent; + finishErasing(eraser, eraser->extent->completion.result); +} + +/** + * Erase the next slab journal. 
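+ * Each journal is erased by writing the eraser's zero-filled buffer over the
+ * slab's journal blocks, starting at that slab's journalOrigin.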
+ * + * @param extentCompletion A completion whose parent is the eraser + **/ +static void eraseNextSlabJournal(VDOCompletion *extentCompletion) +{ + SlabJournalEraser *eraser = extentCompletion->parent; + + if (!hasNextSlab(&eraser->slabs)) { + finishErasing(eraser, VDO_SUCCESS); + return; + } + + Slab *slab = nextSlab(&eraser->slabs); + writeMetadataExtent(eraser->extent, slab->journalOrigin); +} + +/**********************************************************************/ +void eraseSlabJournals(SlabDepot *depot, + SlabIterator slabs, + VDOCompletion *parent) +{ + SlabJournalEraser *eraser; + int result = ALLOCATE(1, SlabJournalEraser, __func__, &eraser); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + eraser->parent = parent; + eraser->slabs = slabs; + + BlockCount journalSize = getSlabConfig(depot)->slabJournalBlocks; + result = ALLOCATE(journalSize * VDO_BLOCK_SIZE, char, __func__, + &eraser->zeroBuffer); + if (result != VDO_SUCCESS) { + finishErasing(eraser, result); + return; + } + + result = createExtent(parent->layer, VIO_TYPE_SLAB_JOURNAL, + VIO_PRIORITY_METADATA, journalSize, eraser->zeroBuffer, + &eraser->extent); + if (result != VDO_SUCCESS) { + finishErasing(eraser, result); + return; + } + + VDOCompletion *extentCompletion = &eraser->extent->completion; + prepareCompletion(extentCompletion, eraseNextSlabJournal, + handleErasingError, getCallbackThreadID(), eraser); + eraseNextSlabJournal(extentCompletion); +} diff --git a/vdo/base/slabJournalEraser.h b/vdo/base/slabJournalEraser.h new file mode 100644 index 0000000..215d86f --- /dev/null +++ b/vdo/base/slabJournalEraser.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournalEraser.h#1 $ + */ + +#ifndef SLAB_JOURNAL_ERASER_H +#define SLAB_JOURNAL_ERASER_H + +#include "slabIterator.h" +#include "types.h" + +/** + * Begin erasing slab journals, one at a time. + * + * @param depot The depot from which to erase + * @param slabs The slabs whose journals need erasing + * @param parent The object to notify when complete + **/ +void eraseSlabJournals(SlabDepot *depot, + SlabIterator slabs, + VDOCompletion *parent); + +#endif // SLAB_JOURNAL_ERASER_H diff --git a/vdo/base/slabJournalInternals.h b/vdo/base/slabJournalInternals.h new file mode 100644 index 0000000..ce7eafb --- /dev/null +++ b/vdo/base/slabJournalInternals.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournalInternals.h#8 $ + */ + +#ifndef SLAB_JOURNAL_INTERNALS_H +#define SLAB_JOURNAL_INTERNALS_H + +#include "slabJournal.h" + +#include "numeric.h" + +#include "blockAllocatorInternals.h" +#include "blockMapEntry.h" +#include "journalPoint.h" +#include "slab.h" +#include "slabSummary.h" +#include "statistics.h" +#include "waitQueue.h" + +/** + * Slab journal blocks may have one of two formats, depending upon whether or + * not any of the entries in the block are block map increments. Since the + * steady state for a VDO is that all of the necessary block map pages will + * be allocated, most slab journal blocks will have only data entries. Such + * blocks can hold more entries, hence the two formats. + **/ + +/** A single slab journal entry */ +struct slabJournalEntry { + SlabBlockNumber sbn; + JournalOperation operation; +}; + +/** A single slab journal entry in its on-disk form */ +typedef union { + struct __attribute__((packed)) { + uint8_t offsetLow8; + uint8_t offsetMid8; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned offsetHigh7 : 7; + unsigned increment : 1; +#else + unsigned increment : 1; + unsigned offsetHigh7 : 7; +#endif + } fields; + + // A raw view of the packed encoding. + uint8_t raw[3]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining packed entries in GDB. + struct __attribute__((packed)) { + unsigned offset : 23; + unsigned increment : 1; + } littleEndian; +#endif +} __attribute__((packed)) PackedSlabJournalEntry; + +/** The unpacked representation of the header of a slab journal block */ +typedef struct { + /** Sequence number for head of journal */ + SequenceNumber head; + /** Sequence number for this block */ + SequenceNumber sequenceNumber; + /** The nonce for a given VDO instance */ + Nonce nonce; + /** Recovery journal point for last entry */ + JournalPoint recoveryPoint; + /** Metadata type */ + VDOMetadataType metadataType; + /** Whether this block contains block map increments */ + bool hasBlockMapIncrements; + /** The number of entries in the block */ + JournalEntryCount entryCount; +} SlabJournalBlockHeader; + +/** + * The packed, on-disk representation of a slab journal block header. + * All fields are kept in little-endian byte order. 
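+ *
+ * As the raw view below spells out, the packed header occupies
+ * 8 + 8 + 8 + 8 + 1 + 1 + 2 = 36 bytes on disk.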
+ **/ +typedef union __attribute__((packed)) { + struct __attribute__((packed)) { + /** 64-bit sequence number for head of journal */ + byte head[8]; + /** 64-bit sequence number for this block */ + byte sequenceNumber[8]; + /** Recovery journal point for last entry, packed into 64 bits */ + PackedJournalPoint recoveryPoint; + /** The 64-bit nonce for a given VDO instance */ + byte nonce[8]; + /** 8-bit metadata type (should always be two, for the slab journal) */ + uint8_t metadataType; + /** Whether this block contains block map increments */ + bool hasBlockMapIncrements; + /** 16-bit count of the entries encoded in the block */ + byte entryCount[2]; + } fields; + + // A raw view of the packed encoding. + uint8_t raw[8 + 8 + 8 + 8 + 1 + 1 + 2]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining packed entries in GDB. + struct __attribute__((packed)) { + SequenceNumber head; + SequenceNumber sequenceNumber; + PackedJournalPoint recoveryPoint; + Nonce nonce; + VDOMetadataType metadataType; + bool hasBlockMapIncrements; + JournalEntryCount entryCount; + } littleEndian; +#endif +} PackedSlabJournalBlockHeader; + +enum { + SLAB_JOURNAL_PAYLOAD_SIZE + = VDO_BLOCK_SIZE - sizeof(PackedSlabJournalBlockHeader), + SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK = (SLAB_JOURNAL_PAYLOAD_SIZE * 8) / 25, + SLAB_JOURNAL_ENTRY_TYPES_SIZE = ((SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK - 1) + / 8) + 1, + SLAB_JOURNAL_ENTRIES_PER_BLOCK = (SLAB_JOURNAL_PAYLOAD_SIZE + / sizeof(PackedSlabJournalEntry)), +}; + +/** The payload of a slab journal block which has block map increments */ +typedef struct { + /* The entries themselves */ + PackedSlabJournalEntry entries[SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK]; + /* The bit map indicating which entries are block map increments */ + byte entryTypes[SLAB_JOURNAL_ENTRY_TYPES_SIZE]; +} __attribute__((packed)) FullSlabJournalEntries; + +typedef union { + /* Entries which include block map increments */ + FullSlabJournalEntries fullEntries; + /* Entries which are only data updates */ + PackedSlabJournalEntry entries[SLAB_JOURNAL_ENTRIES_PER_BLOCK]; + /* Ensure the payload fills to the end of the block */ + byte space[SLAB_JOURNAL_PAYLOAD_SIZE]; +} __attribute__((packed)) SlabJournalPayload; + +typedef struct { + PackedSlabJournalBlockHeader header; + SlabJournalPayload payload; +} __attribute__((packed)) PackedSlabJournalBlock; + +typedef struct { + uint16_t count; + SequenceNumber recoveryStart; +} JournalLock; + +struct slabJournal { + /** A waiter object for getting a VIO pool entry */ + Waiter resourceWaiter; + /** A waiter object for updating the slab summary */ + Waiter slabSummaryWaiter; + /** A waiter object for getting an extent with which to flush */ + Waiter flushWaiter; + /** The queue of VIOs waiting to make an entry */ + WaitQueue entryWaiters; + /** The parent slab reference of this journal */ + Slab *slab; + + /** Whether a tail block commit is pending */ + bool waitingToCommit; + /** Whether the journal is updating the slab summary */ + bool updatingSlabSummary; + /** Whether the journal is adding entries from the entryWaiters queue */ + bool addingEntries; + /** Whether a partial write is in progress */ + bool partialWriteInProgress; + + /** The oldest block in the journal on disk */ + SequenceNumber head; + /** The oldest block in the journal which may not be reaped */ + SequenceNumber unreapable; + /** The end of the half-open interval of the active journal */ + 
SequenceNumber tail; + /** The next journal block to be committed */ + SequenceNumber nextCommit; + /** The tail sequence number that is written in the slab summary */ + SequenceNumber summarized; + /** The tail sequence number that was last summarized in slab summary */ + SequenceNumber lastSummarized; + + /** The sequence number of the recovery journal lock */ + SequenceNumber recoveryLock; + + /** + * The number of entries which fit in a single block. Can't use the constant + * because unit tests change this number. + **/ + JournalEntryCount entriesPerBlock; + /** + * The number of full entries which fit in a single block. Can't use the + * constant because unit tests change this number. + **/ + JournalEntryCount fullEntriesPerBlock; + + /** The recovery journal of the VDO (slab journal holds locks on it) */ + RecoveryJournal *recoveryJournal; + + /** The slab summary to update tail block location */ + SlabSummaryZone *summary; + /** The statistics shared by all slab journals in our physical zone */ + AtomicSlabJournalStatistics *events; + /** A ring of the VIO pool entries for outstanding journal block writes */ + RingNode uncommittedBlocks; + + /** + * The current tail block header state. This will be packed into + * the block just before it is written. + **/ + SlabJournalBlockHeader tailHeader; + /** A pointer to a block-sized buffer holding the packed block data */ + PackedSlabJournalBlock *block; + + /** The number of blocks in the on-disk journal */ + BlockCount size; + /** The number of blocks at which to start pushing reference blocks */ + BlockCount flushingThreshold; + /** The number of blocks at which all reference blocks should be writing */ + BlockCount flushingDeadline; + /** The number of blocks at which to wait for reference blocks to write */ + BlockCount blockingThreshold; + /** The number of blocks at which to scrub the slab before coming online */ + BlockCount scrubbingThreshold; + + /** This node is for BlockAllocator to keep a queue of dirty journals */ + RingNode dirtyNode; + + /** The lock for the oldest unreaped block of the journal */ + JournalLock *reapLock; + /** The locks for each on disk block */ + JournalLock locks[]; +}; + +/** + * Get the slab journal block offset of the given sequence number. + * + * @param journal The slab journal + * @param sequence The sequence number + * + * @return the offset corresponding to the sequence number + **/ +__attribute__((warn_unused_result)) +static inline TailBlockOffset +getSlabJournalBlockOffset(SlabJournal *journal, SequenceNumber sequence) +{ + return (sequence % journal->size); +} + +/** + * Encode a slab journal entry (exposed for unit tests). + * + * @param tailHeader The unpacked header for the block + * @param payload The journal block payload to hold the entry + * @param sbn The slab block number of the entry to encode + * @param operation The type of the entry + **/ +void encodeSlabJournalEntry(SlabJournalBlockHeader *tailHeader, + SlabJournalPayload *payload, + SlabBlockNumber sbn, + JournalOperation operation); + +/** + * Decode a slab journal entry. + * + * @param block The journal block holding the entry + * @param entryCount The number of the entry + * + * @return The decoded entry + **/ +SlabJournalEntry decodeSlabJournalEntry(PackedSlabJournalBlock *block, + JournalEntryCount entryCount) + __attribute__((warn_unused_result)); + +/** + * Generate the packed encoding of a slab journal entry. 
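+ *
+ * For example (values chosen purely for illustration): packing sbn 0x000301
+ * with isIncrement set stores offsetLow8 = 0x01 and offsetMid8 = 0x03, and
+ * packs the seven high offset bits (zero here) with the increment flag into
+ * the last byte, so the three raw bytes are { 0x01, 0x03, 0x80 }.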
+ * + * @param packed The entry into which to pack the values + * @param sbn The slab block number of the entry to encode + * @param isIncrement The increment flag + **/ +static inline void packSlabJournalEntry(PackedSlabJournalEntry *packed, + SlabBlockNumber sbn, + bool isIncrement) +{ + packed->fields.offsetLow8 = (sbn & 0x0000FF); + packed->fields.offsetMid8 = (sbn & 0x00FF00) >> 8; + packed->fields.offsetHigh7 = (sbn & 0x7F0000) >> 16; + packed->fields.increment = isIncrement ? 1 : 0; +} + +/** + * Decode the packed representation of a slab journal entry. + * + * @param packed The packed entry to decode + * + * @return The decoded slab journal entry + **/ +__attribute__((warn_unused_result)) +static inline +SlabJournalEntry unpackSlabJournalEntry(const PackedSlabJournalEntry *packed) +{ + SlabJournalEntry entry; + entry.sbn = packed->fields.offsetHigh7; + entry.sbn <<= 8; + entry.sbn |= packed->fields.offsetMid8; + entry.sbn <<= 8; + entry.sbn |= packed->fields.offsetLow8; + entry.operation + = (packed->fields.increment ? DATA_INCREMENT : DATA_DECREMENT); + return entry; +} + +/** + * Generate the packed representation of a slab block header. + * + * @param header The header containing the values to encode + * @param packed The header into which to pack the values + **/ +static inline +void packSlabJournalBlockHeader(const SlabJournalBlockHeader *header, + PackedSlabJournalBlockHeader *packed) +{ + storeUInt64LE(packed->fields.head, header->head); + storeUInt64LE(packed->fields.sequenceNumber, header->sequenceNumber); + storeUInt64LE(packed->fields.nonce, header->nonce); + storeUInt16LE(packed->fields.entryCount, header->entryCount); + + packed->fields.metadataType = header->metadataType; + packed->fields.hasBlockMapIncrements = header->hasBlockMapIncrements; + + packJournalPoint(&header->recoveryPoint, &packed->fields.recoveryPoint); +} + +/** + * Decode the packed representation of a slab block header. + * + * @param packed The packed header to decode + * @param header The header into which to unpack the values + **/ +static inline +void unpackSlabJournalBlockHeader(const PackedSlabJournalBlockHeader *packed, + SlabJournalBlockHeader *header) +{ + *header = (SlabJournalBlockHeader) { + .head = getUInt64LE(packed->fields.head), + .sequenceNumber = getUInt64LE(packed->fields.sequenceNumber), + .nonce = getUInt64LE(packed->fields.nonce), + .entryCount = getUInt16LE(packed->fields.entryCount), + .metadataType = packed->fields.metadataType, + .hasBlockMapIncrements = packed->fields.hasBlockMapIncrements, + }; + unpackJournalPoint(&packed->fields.recoveryPoint, &header->recoveryPoint); +} + +#endif // SLAB_JOURNAL_INTERNALS_H diff --git a/vdo/base/slabScrubber.c b/vdo/base/slabScrubber.c new file mode 100644 index 0000000..e37e9c8 --- /dev/null +++ b/vdo/base/slabScrubber.c @@ -0,0 +1,516 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabScrubber.c#6 $ + */ + +#include "slabScrubberInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminState.h" +#include "blockAllocator.h" +#include "constants.h" +#include "readOnlyNotifier.h" +#include "recoveryJournal.h" +#include "refCounts.h" +#include "refCountsInternals.h" +#include "slab.h" +#include "slabJournalInternals.h" + +/** + * Allocate the buffer and extent used for reading the slab journal when + * scrubbing a slab. + * + * @param scrubber The slab scrubber for which to allocate + * @param layer The physical layer on which the scrubber resides + * @param slabJournalSize The size of a slab journal + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int allocateExtentAndBuffer(SlabScrubber *scrubber, + PhysicalLayer *layer, + BlockCount slabJournalSize) +{ + size_t bufferSize = VDO_BLOCK_SIZE * slabJournalSize; + int result = ALLOCATE(bufferSize, char, __func__, &scrubber->journalData); + if (result != VDO_SUCCESS) { + return result; + } + + return createExtent(layer, VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, + slabJournalSize, scrubber->journalData, + &scrubber->extent); +} + +/**********************************************************************/ +int makeSlabScrubber(PhysicalLayer *layer, + BlockCount slabJournalSize, + ReadOnlyNotifier *readOnlyNotifier, + SlabScrubber **scrubberPtr) +{ + SlabScrubber *scrubber; + int result = ALLOCATE(1, SlabScrubber, __func__, &scrubber); + if (result != VDO_SUCCESS) { + return result; + } + + result = allocateExtentAndBuffer(scrubber, layer, slabJournalSize); + if (result != VDO_SUCCESS) { + freeSlabScrubber(&scrubber); + return result; + } + + initializeCompletion(&scrubber->completion, SLAB_SCRUBBER_COMPLETION, layer); + initializeRing(&scrubber->highPrioritySlabs); + initializeRing(&scrubber->slabs); + scrubber->readOnlyNotifier = readOnlyNotifier; + scrubber->adminState.state = ADMIN_STATE_SUSPENDED; + *scrubberPtr = scrubber; + return VDO_SUCCESS; +} + +/** + * Free the extent and buffer used for reading slab journals. + * + * @param scrubber The scrubber + **/ +static void freeExtentAndBuffer(SlabScrubber *scrubber) +{ + freeExtent(&scrubber->extent); + if (scrubber->journalData != NULL) { + FREE(scrubber->journalData); + scrubber->journalData = NULL; + } +} + +/**********************************************************************/ +void freeSlabScrubber(SlabScrubber **scrubberPtr) +{ + if (*scrubberPtr == NULL) { + return; + } + + SlabScrubber *scrubber = *scrubberPtr; + freeExtentAndBuffer(scrubber); + FREE(scrubber); + *scrubberPtr = NULL; +} + +/** + * Get the next slab to scrub. 
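+ * Slabs on the high-priority queue are returned before those on the + * ordinary queue.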
+ * + * @param scrubber The slab scrubber + * + * @return The next slab to scrub or NULL if there are none + **/ +static Slab *getNextSlab(SlabScrubber *scrubber) +{ + if (!isRingEmpty(&scrubber->highPrioritySlabs)) { + return slabFromRingNode(scrubber->highPrioritySlabs.next); + } + + if (!isRingEmpty(&scrubber->slabs)) { + return slabFromRingNode(scrubber->slabs.next); + } + + return NULL; +} + +/**********************************************************************/ +bool hasSlabsToScrub(SlabScrubber *scrubber) +{ + return (getNextSlab(scrubber) != NULL); +} + +/**********************************************************************/ +SlabCount getScrubberSlabCount(const SlabScrubber *scrubber) +{ + return relaxedLoad64(&scrubber->slabCount); +} + +/**********************************************************************/ +void registerSlabForScrubbing(SlabScrubber *scrubber, + Slab *slab, + bool highPriority) +{ + ASSERT_LOG_ONLY((slab->status != SLAB_REBUILT), + "slab to be scrubbed is unrecovered"); + + if (slab->status != SLAB_REQUIRES_SCRUBBING) { + return; + } + + unspliceRingNode(&slab->ringNode); + if (!slab->wasQueuedForScrubbing) { + relaxedAdd64(&scrubber->slabCount, 1); + slab->wasQueuedForScrubbing = true; + } + + if (highPriority) { + slab->status = SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING; + pushRingNode(&scrubber->highPrioritySlabs, &slab->ringNode); + return; + } + + pushRingNode(&scrubber->slabs, &slab->ringNode); +} + +/** + * Stop scrubbing, either because there are no more slabs to scrub or because + * there's been an error. + * + * @param scrubber The scrubber + **/ +static void finishScrubbing(SlabScrubber *scrubber) +{ + if (!hasSlabsToScrub(scrubber)) { + freeExtentAndBuffer(scrubber); + } + + // Inform whoever is waiting that scrubbing has completed. + completeCompletion(&scrubber->completion); + + bool notify = hasWaiters(&scrubber->waiters); + + // Note that the scrubber has stopped, and inform anyone who might be waiting + // for that to happen. + if (!finishDraining(&scrubber->adminState)) { + scrubber->adminState.state = ADMIN_STATE_SUSPENDED; + } + + /* + * We can't notify waiters until after we've finished draining or they'll + * just requeue. Fortunately if there were waiters, we can't have been freed + * yet. + */ + if (notify) { + notifyAllWaiters(&scrubber->waiters, NULL, NULL); + } +} + +/**********************************************************************/ +static void scrubNextSlab(SlabScrubber *scrubber); + +/** + * Notify the scrubber that a slab has been scrubbed. This callback is + * registered in applyJournalEntries(). + * + * @param completion The slab rebuild completion + **/ +static void slabScrubbed(VDOCompletion *completion) +{ + SlabScrubber *scrubber = completion->parent; + finishScrubbingSlab(scrubber->slab); + relaxedAdd64(&scrubber->slabCount, -1); + scrubNextSlab(scrubber); +} + +/** + * Abort scrubbing due to an error. + * + * @param scrubber The slab scrubber + * @param result The error + **/ +static void abortScrubbing(SlabScrubber *scrubber, int result) +{ + enterReadOnlyMode(scrubber->readOnlyNotifier, result); + setCompletionResult(&scrubber->completion, result); + scrubNextSlab(scrubber); +} + +/** + * Handle errors while rebuilding a slab. + * + * @param completion The slab rebuild completion + **/ +static void handleScrubberError(VDOCompletion *completion) +{ + abortScrubbing(completion->parent, completion->result); +} + +/** + * Apply all the entries in a block to the reference counts. 
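+ * Replay stops at the first entry which is out of bounds or cannot be + * applied.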
+ * + * @param block A block with entries to apply + * @param entryCount The number of entries to apply + * @param blockNumber The sequence number of the block + * @param slab The slab to apply the entries to + * + * @return VDO_SUCCESS or an error code + **/ +static int applyBlockEntries(PackedSlabJournalBlock *block, + JournalEntryCount entryCount, + SequenceNumber blockNumber, + Slab *slab) +{ + JournalPoint entryPoint = { + .sequenceNumber = blockNumber, + .entryCount = 0, + }; + + SlabBlockNumber maxSBN = slab->end - slab->start; + while (entryPoint.entryCount < entryCount) { + SlabJournalEntry entry = decodeSlabJournalEntry(block, + entryPoint.entryCount); + if (entry.sbn > maxSBN) { + // This entry is out of bounds. + return logErrorWithStringError(VDO_CORRUPT_JOURNAL, "Slab journal entry" + " (%llu, %u) had invalid offset" + " %u in slab (size %u blocks)", + blockNumber, entryPoint.entryCount, + entry.sbn, maxSBN); + } + + int result = replayReferenceCountChange(slab->referenceCounts, &entryPoint, + entry); + if (result != VDO_SUCCESS) { + logErrorWithStringError(result, "Slab journal entry (%llu, %u)" + " (%s of offset %" PRIu32 ") could not be" + " applied in slab %u", + blockNumber, entryPoint.entryCount, + getJournalOperationName(entry.operation), + entry.sbn, slab->slabNumber); + return result; + } + entryPoint.entryCount++; + } + + return VDO_SUCCESS; +} + +/** + * Find the relevant extent of the slab journal and apply all valid entries. + * This is a callback registered in startScrubbing(). + * + * @param completion The metadata read extent completion + **/ +static void applyJournalEntries(VDOCompletion *completion) +{ + SlabScrubber *scrubber = completion->parent; + Slab *slab = scrubber->slab; + SlabJournal *journal = slab->journal; + RefCounts *referenceCounts = slab->referenceCounts; + + // Find the boundaries of the useful part of the journal. + SequenceNumber tail = journal->tail; + TailBlockOffset endIndex = getSlabJournalBlockOffset(journal, tail - 1); + char *endData = scrubber->journalData + (endIndex * VDO_BLOCK_SIZE); + PackedSlabJournalBlock *endBlock = (PackedSlabJournalBlock *) endData; + + SequenceNumber head = getUInt64LE(endBlock->header.fields.head); + TailBlockOffset headIndex = getSlabJournalBlockOffset(journal, head); + BlockCount index = headIndex; + + JournalPoint refCountsPoint = referenceCounts->slabJournalPoint; + JournalPoint lastEntryApplied = refCountsPoint; + for (SequenceNumber sequence = head; sequence < tail; sequence++) { + char *blockData = scrubber->journalData + (index * VDO_BLOCK_SIZE); + PackedSlabJournalBlock *block = (PackedSlabJournalBlock *) blockData; + SlabJournalBlockHeader header; + unpackSlabJournalBlockHeader(&block->header, &header); + + if ((header.nonce != slab->allocator->nonce) + || (header.metadataType != VDO_METADATA_SLAB_JOURNAL) + || (header.sequenceNumber != sequence) + || (header.entryCount > journal->entriesPerBlock) + || (header.hasBlockMapIncrements + && (header.entryCount > journal->fullEntriesPerBlock))) { + // The block is not what we expect it to be. 
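+ // Its nonce, metadata type, or sequence number is wrong, or it claims + // more entries than a block with its entry types can hold.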
+ logError("Slab journal block for slab %u was invalid", + slab->slabNumber); + abortScrubbing(scrubber, VDO_CORRUPT_JOURNAL); + return; + } + + int result = applyBlockEntries(block, header.entryCount, sequence, slab); + if (result != VDO_SUCCESS) { + abortScrubbing(scrubber, result); + return; + } + + lastEntryApplied.sequenceNumber = sequence; + lastEntryApplied.entryCount = header.entryCount - 1; + index++; + if (index == journal->size) { + index = 0; + } + } + + // At the end of rebuild, the refCounts should be accurate to the end + // of the journal we just applied. + int result = ASSERT(!beforeJournalPoint(&lastEntryApplied, &refCountsPoint), + "Refcounts are not more accurate than the slab journal"); + if (result != VDO_SUCCESS) { + abortScrubbing(scrubber, result); + return; + } + + // Save out the rebuilt reference blocks. + prepareCompletion(completion, slabScrubbed, handleScrubberError, + completion->callbackThreadID, scrubber); + startSlabAction(slab, ADMIN_STATE_SAVE_FOR_SCRUBBING, completion); +} + +/** + * Read the current slab's journal from disk now that it has been flushed. + * This callback is registered in scrubNextSlab(). + * + * @param completion The scrubber's extent completion + **/ +static void startScrubbing(VDOCompletion *completion) +{ + SlabScrubber *scrubber = completion->parent; + Slab *slab = scrubber->slab; + if (getSummarizedCleanliness(slab->allocator->summary, slab->slabNumber)) { + slabScrubbed(completion); + return; + } + + prepareCompletion(&scrubber->extent->completion, applyJournalEntries, + handleScrubberError, completion->callbackThreadID, + completion->parent); + readMetadataExtent(scrubber->extent, slab->journalOrigin); +} + +/** + * Scrub the next slab if there is one. + * + * @param scrubber The scrubber + **/ +static void scrubNextSlab(SlabScrubber *scrubber) +{ + // Note: this notify call is always safe only because scrubbing can only + // be started when the VDO is quiescent. 
+ notifyAllWaiters(&scrubber->waiters, NULL, NULL); + if (isReadOnly(scrubber->readOnlyNotifier)) { + setCompletionResult(&scrubber->completion, VDO_READ_ONLY); + finishScrubbing(scrubber); + return; + } + + Slab *slab = getNextSlab(scrubber); + if ((slab == NULL) + || (scrubber->highPriorityOnly + && isRingEmpty(&scrubber->highPrioritySlabs))) { + scrubber->highPriorityOnly = false; + finishScrubbing(scrubber); + return; + } + + if (finishDraining(&scrubber->adminState)) { + return; + } + + unspliceRingNode(&slab->ringNode); + scrubber->slab = slab; + VDOCompletion *completion = extentAsCompletion(scrubber->extent); + prepareCompletion(completion, startScrubbing, + handleScrubberError, scrubber->completion.callbackThreadID, + scrubber); + startSlabAction(slab, ADMIN_STATE_SCRUBBING, completion); +} + +/**********************************************************************/ +void scrubSlabs(SlabScrubber *scrubber, + void *parent, + VDOAction *callback, + VDOAction *errorHandler) +{ + resumeIfQuiescent(&scrubber->adminState); + ThreadID threadID = getCallbackThreadID(); + prepareCompletion(&scrubber->completion, callback, errorHandler, threadID, + parent); + if (!hasSlabsToScrub(scrubber)) { + finishScrubbing(scrubber); + return; + } + + scrubNextSlab(scrubber); +} + +/**********************************************************************/ +void scrubHighPrioritySlabs(SlabScrubber *scrubber, + bool scrubAtLeastOne, + VDOCompletion *parent, + VDOAction *callback, + VDOAction *errorHandler) +{ + if (scrubAtLeastOne && isRingEmpty(&scrubber->highPrioritySlabs)) { + Slab *slab = getNextSlab(scrubber); + if (slab != NULL) { + registerSlabForScrubbing(scrubber, slab, true); + } + } + scrubber->highPriorityOnly = true; + scrubSlabs(scrubber, parent, callback, errorHandler); +} + +/**********************************************************************/ +void stopScrubbing(SlabScrubber *scrubber, VDOCompletion *parent) +{ + if (isQuiescent(&scrubber->adminState)) { + completeCompletion(parent); + } else { + startDraining(&scrubber->adminState, ADMIN_STATE_SUSPENDING, parent, NULL); + } +} + +/**********************************************************************/ +void resumeScrubbing(SlabScrubber *scrubber, VDOCompletion *parent) +{ + if (!hasSlabsToScrub(scrubber)) { + completeCompletion(parent); + return; + } + + int result = resumeIfQuiescent(&scrubber->adminState); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + scrubNextSlab(scrubber); + completeCompletion(parent); +} + +/**********************************************************************/ +int enqueueCleanSlabWaiter(SlabScrubber *scrubber, Waiter *waiter) +{ + if (isReadOnly(scrubber->readOnlyNotifier)) { + return VDO_READ_ONLY; + } + + if (isQuiescent(&scrubber->adminState)) { + return VDO_NO_SPACE; + } + + return enqueueWaiter(&scrubber->waiters, waiter); +} + +/**********************************************************************/ +void dumpSlabScrubber(const SlabScrubber *scrubber) +{ + logInfo("slabScrubber slabCount %u waiters %zu %s%s", + getScrubberSlabCount(scrubber), + countWaiters(&scrubber->waiters), + getAdminStateName(&scrubber->adminState), + scrubber->highPriorityOnly ? ", highPriorityOnly " : ""); +} diff --git a/vdo/base/slabScrubber.h b/vdo/base/slabScrubber.h new file mode 100644 index 0000000..ca13e63 --- /dev/null +++ b/vdo/base/slabScrubber.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabScrubber.h#4 $ + */ + +#ifndef SLAB_SCRUBBER_H +#define SLAB_SCRUBBER_H + +#include "completion.h" +#include "types.h" +#include "waitQueue.h" + +/** + * Create a slab scrubber + * + * @param layer The physical layer of the VDO + * @param slabJournalSize The size of a slab journal in blocks + * @param readOnlyNotifier The context for entering read-only mode + * @param scrubberPtr A pointer to hold the scrubber + * + * @return VDO_SUCCESS or an error + **/ +int makeSlabScrubber(PhysicalLayer *layer, + BlockCount slabJournalSize, + ReadOnlyNotifier *readOnlyNotifier, + SlabScrubber **scrubberPtr) + __attribute__((warn_unused_result)); + +/** + * Free a slab scrubber and null out the reference to it. + * + * @param scrubberPtr A pointer to the scrubber to destroy + **/ +void freeSlabScrubber(SlabScrubber **scrubberPtr); + +/** + * Check whether a scrubber has slabs to scrub. + * + * @param scrubber The scrubber to check + * + * @return true if the scrubber has slabs to scrub + **/ +bool hasSlabsToScrub(SlabScrubber *scrubber) + __attribute__((warn_unused_result)); + +/** + * Register a slab with a scrubber. + * + * @param scrubber The scrubber + * @param slab The slab to scrub + * @param highPriority true if the slab should be put on the + * high-priority queue + **/ +void registerSlabForScrubbing(SlabScrubber *scrubber, + Slab *slab, + bool highPriority); + +/** + * Scrub all the slabs which have been registered with a slab scrubber. + * + * @param scrubber The scrubber + * @param parent The object to notify when scrubbing is complete + * @param callback The function to run when scrubbing is complete + * @param errorHandler The handler for scrubbing errors + **/ +void scrubSlabs(SlabScrubber *scrubber, + void *parent, + VDOAction *callback, + VDOAction *errorHandler); + +/** + * Scrub any slabs which have been registered at high priority with a slab + * scrubber. + * + * @param scrubber The scrubber + * @param scrubAtLeastOne true if one slab should always be + * scrubbed, even if there are no high-priority slabs + * (and there is at least one low priority slab) + * @param parent The completion to notify when scrubbing is complete + * @param callback The function to run when scrubbing is complete + * @param errorHandler The handler for scrubbing errors + **/ +void scrubHighPrioritySlabs(SlabScrubber *scrubber, + bool scrubAtLeastOne, + VDOCompletion *parent, + VDOAction *callback, + VDOAction *errorHandler); + +/** + * Tell the scrubber to stop scrubbing after it finishes the slab it is + * currently working on. 
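+ * Scrubbing may be restarted later with resumeScrubbing().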
+ * + * @param scrubber The scrubber to stop + * @param parent The completion to notify when scrubbing has stopped + **/ +void stopScrubbing(SlabScrubber *scrubber, VDOCompletion *parent); + +/** + * Tell the scrubber to resume scrubbing if it has been stopped. + * + * @param scrubber The scrubber to resume + * @param parent The object to notify once scrubbing has resumed + **/ +void resumeScrubbing(SlabScrubber *scrubber, VDOCompletion *parent); + +/** + * Wait for a clean slab. + * + * @param scrubber The scrubber on which to wait + * @param waiter The waiter + * + * @return VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no + * slabs to scrub, and some other error otherwise + **/ +int enqueueCleanSlabWaiter(SlabScrubber *scrubber, Waiter *waiter); + +/** + * Get the number of slabs that are unrecovered or being scrubbed. + * + * @param scrubber The scrubber to query + * + * @return the number of slabs that are unrecovered or being scrubbed + **/ +SlabCount getScrubberSlabCount(const SlabScrubber *scrubber) + __attribute__((warn_unused_result)); + +/** + * Dump information about a slab scrubber to the log for debugging. + * + * @param scrubber The scrubber to dump + **/ +void dumpSlabScrubber(const SlabScrubber *scrubber); + +#endif /* SLAB_SCRUBBER_H */ diff --git a/vdo/base/slabScrubberInternals.h b/vdo/base/slabScrubberInternals.h new file mode 100644 index 0000000..3d3e8cd --- /dev/null +++ b/vdo/base/slabScrubberInternals.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabScrubberInternals.h#5 $ + */ + +#ifndef SLAB_SCRUBBER_INTERNALS_H +#define SLAB_SCRUBBER_INTERNALS_H + +#include "slabScrubber.h" + +#include "adminState.h" +#include "atomic.h" +#include "extent.h" +#include "ringNode.h" + +struct slabScrubber { + VDOCompletion completion; + /** The queue of slabs to scrub first */ + RingNode highPrioritySlabs; + /** The queue of slabs to scrub once there are no highPrioritySlabs */ + RingNode slabs; + /** The queue of VIOs waiting for a slab to be scrubbed */ + WaitQueue waiters; + + // The number of slabs that are unrecovered or being scrubbed. This field is + // modified by the physical zone thread, but is queried by other threads. 
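+ // It is therefore stored as an Atomic64 and accessed with relaxed + // atomic operations.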
+ Atomic64 slabCount; + + /** The administrative state of the scrubber */ + AdminState adminState; + /** Whether to only scrub high-priority slabs */ + bool highPriorityOnly; + /** The context for entering read-only mode */ + ReadOnlyNotifier *readOnlyNotifier; + /** The slab currently being scrubbed */ + Slab *slab; + /** The extent for loading slab journal blocks */ + VDOExtent *extent; + /** A buffer to store the slab journal blocks */ + char *journalData; +}; + +#endif // SLAB_SCRUBBER_INTERNALS_H diff --git a/vdo/base/slabSummary.c b/vdo/base/slabSummary.c new file mode 100644 index 0000000..7021c67 --- /dev/null +++ b/vdo/base/slabSummary.c @@ -0,0 +1,651 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabSummary.c#7 $ + */ + +#include "slabSummary.h" + +#include "memoryAlloc.h" + +#include "adminState.h" +#include "constants.h" +#include "extent.h" +#include "readOnlyNotifier.h" +#include "slabSummaryInternals.h" +#include "threadConfig.h" +#include "types.h" + +// SIZING + +/**********************************************************************/ +static BlockCount getSlabSummaryZoneSize(BlockSize blockSize) +{ + SlabCount entriesPerBlock = blockSize / sizeof(SlabSummaryEntry); + BlockCount blocksNeeded = MAX_SLABS / entriesPerBlock; + return blocksNeeded; +} + +/**********************************************************************/ +BlockCount getSlabSummarySize(BlockSize blockSize) +{ + return getSlabSummaryZoneSize(blockSize) * MAX_PHYSICAL_ZONES; +} + +// FULLNESS HINT COMPUTATION + +/** + * Translate a slab's free block count into a 'fullness hint' that can be + * stored in a SlabSummaryEntry's 7 bits that are dedicated to its free count. + * + * Note: the number of free blocks must be strictly less than 2^23 blocks, + * even though theoretically slabs could contain precisely 2^23 blocks; there + * is an assumption that at least one block is used by metadata. This + * assumption is necessary; otherwise, the fullness hint might overflow. + * The fullness hint formula is roughly (fullness >> 16) & 0x7f, but + * ((1 << 23) >> 16) & 0x7f is the same as (0 >> 16) & 0x7f, namely 0, which + * is clearly a bad hint if it could indicate both 2^23 free blocks or 0 free + * blocks. + * + * @param summary The summary which is being updated + * @param freeBlocks The number of free blocks + * + * @return A fullness hint, which can be stored in 7 bits. + **/ +__attribute__((warn_unused_result)) +static uint8_t computeFullnessHint(SlabSummary *summary, BlockCount freeBlocks) +{ + ASSERT_LOG_ONLY((freeBlocks < (1 << 23)), + "free blocks must be less than 2^23"); + + if (freeBlocks == 0) { + return 0; + } + + BlockCount hint = freeBlocks >> summary->hintShift; + return ((hint == 0) ? 
1 : hint); +} + +/** + * Translate a slab's free block hint into an approximate count, such that + * computeFullnessHint() is the inverse function of getApproximateFreeBlocks() + * (i.e. computeFullnessHint(getApproximateFreeBlocks(x)) == x). + * + * @param summary The summary from which the hint was obtained + * @param freeBlockHint The hint read from the summary + * + * @return An approximation to the free block count + **/ +__attribute__((warn_unused_result)) +static BlockCount getApproximateFreeBlocks(SlabSummary *summary, + uint8_t freeBlockHint) +{ + return ((BlockCount) freeBlockHint) << summary->hintShift; +} + +// MAKE/FREE FUNCTIONS + +/**********************************************************************/ +static void launchWrite(SlabSummaryBlock *summaryBlock); + +/** + * Initialize a SlabSummaryBlock. + * + * @param layer The backing layer + * @param summaryZone The parent SlabSummaryZone + * @param threadID The ID of the thread for the physical zone of this block + * @param entries The entries this block manages + * @param index The index of this block in its zone's summary + * @param slabSummaryBlock The block to initialize + * + * @return VDO_SUCCESS or an error + **/ +static int initializeSlabSummaryBlock(PhysicalLayer *layer, + SlabSummaryZone *summaryZone, + ThreadID threadID, + SlabSummaryEntry *entries, + BlockCount index, + SlabSummaryBlock *slabSummaryBlock) +{ + int result = ALLOCATE(VDO_BLOCK_SIZE, char, __func__, + &slabSummaryBlock->outgoingEntries); + if (result != VDO_SUCCESS) { + return result; + } + + result = createVIO(layer, VIO_TYPE_SLAB_SUMMARY, VIO_PRIORITY_METADATA, + slabSummaryBlock, slabSummaryBlock->outgoingEntries, + &slabSummaryBlock->vio); + if (result != VDO_SUCCESS) { + return result; + } + + slabSummaryBlock->vio->completion.callbackThreadID = threadID; + slabSummaryBlock->zone = summaryZone; + slabSummaryBlock->entries = entries; + slabSummaryBlock->index = index; + return VDO_SUCCESS; +} + +/** + * Create a new, empty SlabSummaryZone object. + * + * @param summary The summary to which the new zone will belong + * @param layer The layer + * @param zoneNumber The number of this zone + * @param threadID The ID of the thread for this zone + * @param entries The buffer to hold the entries in this zone + * + * @return VDO_SUCCESS or an error + **/ +static int makeSlabSummaryZone(SlabSummary *summary, + PhysicalLayer *layer, + ZoneCount zoneNumber, + ThreadID threadID, + SlabSummaryEntry *entries) +{ + int result = ALLOCATE_EXTENDED(SlabSummaryZone, summary->blocksPerZone, + SlabSummaryBlock, __func__, + &summary->zones[zoneNumber]); + if (result != VDO_SUCCESS) { + return result; + } + + SlabSummaryZone *summaryZone = summary->zones[zoneNumber]; + summaryZone->summary = summary; + summaryZone->zoneNumber = zoneNumber; + summaryZone->entries = entries; + + if (layer->createMetadataVIO == NULL) { + // Blocks are only used for writing, and without a createVIO() call, + // we'll never be writing anything. + return VDO_SUCCESS; + } + + // Initialize each block.
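+ // Each block manages the next entriesPerBlock entries of the zone's + // array; the entries pointer is advanced past them once the block has + // been initialized.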
+ for (BlockCount i = 0; i < summary->blocksPerZone; i++) { + result = initializeSlabSummaryBlock(layer, summaryZone, threadID, entries, + i, &summaryZone->summaryBlocks[i]); + if (result != VDO_SUCCESS) { + return result; + } + entries += summary->entriesPerBlock; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeSlabSummary(PhysicalLayer *layer, + Partition *partition, + const ThreadConfig *threadConfig, + unsigned int slabSizeShift, + BlockCount maximumFreeBlocksPerSlab, + ReadOnlyNotifier *readOnlyNotifier, + SlabSummary **slabSummaryPtr) +{ + BlockCount blocksPerZone = getSlabSummaryZoneSize(VDO_BLOCK_SIZE); + SlabCount entriesPerBlock = MAX_SLABS / blocksPerZone; + int result = ASSERT((entriesPerBlock * blocksPerZone) == MAX_SLABS, + "block size must be a multiple of entry size"); + if (result != VDO_SUCCESS) { + return result; + } + + if (partition == NULL) { + // Don't make a slab summary for the formatter since it doesn't need it. + return VDO_SUCCESS; + } + + SlabSummary *summary; + result = ALLOCATE_EXTENDED(SlabSummary, threadConfig->physicalZoneCount, + SlabSummaryZone *, __func__, &summary); + if (result != VDO_SUCCESS) { + return result; + } + + summary->zoneCount = threadConfig->physicalZoneCount; + summary->readOnlyNotifier = readOnlyNotifier; + summary->hintShift = (slabSizeShift > 6) ? (slabSizeShift - 6) : 0; + summary->blocksPerZone = blocksPerZone; + summary->entriesPerBlock = entriesPerBlock; + + size_t totalEntries = MAX_SLABS * MAX_PHYSICAL_ZONES; + size_t entryBytes = totalEntries * sizeof(SlabSummaryEntry); + result = layer->allocateIOBuffer(layer, entryBytes, "summary entries", + (char **) &summary->entries); + if (result != VDO_SUCCESS) { + freeSlabSummary(&summary); + return result; + } + + // Initialize all the entries. + uint8_t hint = computeFullnessHint(summary, maximumFreeBlocksPerSlab); + for (size_t i = 0; i < totalEntries; i++) { + // This default tail block offset must be reflected in + // slabJournal.c::readSlabJournalTail(). + summary->entries[i] = (SlabSummaryEntry) { + .tailBlockOffset = 0, + .fullnessHint = hint, + .loadRefCounts = false, + .isDirty = false, + }; + } + + setSlabSummaryOrigin(summary, partition); + for (ZoneCount zone = 0; zone < summary->zoneCount; zone++) { + result = makeSlabSummaryZone(summary, layer, zone, + getPhysicalZoneThread(threadConfig, zone), + summary->entries + (MAX_SLABS * zone)); + if (result != VDO_SUCCESS) { + freeSlabSummary(&summary); + return result; + } + } + + *slabSummaryPtr = summary; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeSlabSummary(SlabSummary **slabSummaryPtr) +{ + if (*slabSummaryPtr == NULL) { + return; + } + + SlabSummary *summary = *slabSummaryPtr; + for (ZoneCount zone = 0; zone < summary->zoneCount; zone++) { + SlabSummaryZone *summaryZone = summary->zones[zone]; + if (summaryZone != NULL) { + for (BlockCount i = 0; i < summary->blocksPerZone; i++) { + freeVIO(&summaryZone->summaryBlocks[i].vio); + FREE(summaryZone->summaryBlocks[i].outgoingEntries); + } + FREE(summaryZone); + } + } + FREE(summary->entries); + FREE(summary); + *slabSummaryPtr = NULL; +} + +/**********************************************************************/ +SlabSummaryZone *getSummaryForZone(SlabSummary *summary, ZoneCount zone) +{ + return summary->zones[zone]; +} + +// WRITING FUNCTIONALITY + +/** + * Check whether a summary zone has finished draining. 
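+ * The drain cannot finish while any summary block writes are still + * outstanding.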
+ * + * @param summaryZone The zone to check + **/ +static void checkForDrainComplete(SlabSummaryZone *summaryZone) +{ + if (!isDraining(&summaryZone->state) || (summaryZone->writeCount > 0)) { + return; + } + + finishOperationWithResult(&summaryZone->state, + (isReadOnly(summaryZone->summary->readOnlyNotifier) + ? VDO_READ_ONLY : VDO_SUCCESS)); +} + +/** + * Wake all the waiters in a given queue. If the VDO is in read-only mode they + * will be given a VDO_READ_ONLY error code as their context, otherwise they + * will be given VDO_SUCCESS. + * + * @param summaryZone The slab summary which owns the queue + * @param queue The queue to notify + **/ +static void notifyWaiters(SlabSummaryZone *summaryZone, WaitQueue *queue) +{ + int result = (isReadOnly(summaryZone->summary->readOnlyNotifier) + ? VDO_READ_ONLY : VDO_SUCCESS); + notifyAllWaiters(queue, NULL, &result); +} + +/** + * Finish processing a block which attempted to write, whether or not the + * attempt succeeded. + * + * @param block The block + **/ +static void finishUpdatingSlabSummaryBlock(SlabSummaryBlock *block) +{ + notifyWaiters(block->zone, &block->currentUpdateWaiters); + block->writing = false; + block->zone->writeCount--; + if (hasWaiters(&block->nextUpdateWaiters)) { + launchWrite(block); + } else { + checkForDrainComplete(block->zone); + } +} + +/** + * This is the callback for a successful block write. + * + * @param completion The write VIO + **/ +static void finishUpdate(VDOCompletion *completion) +{ + SlabSummaryBlock *block = completion->parent; + atomicAdd64(&block->zone->summary->statistics.blocksWritten, 1); + finishUpdatingSlabSummaryBlock(block); +} + +/** + * Handle an error writing a slab summary block. + * + * @param completion The write VIO + **/ +static void handleWriteError(VDOCompletion *completion) +{ + SlabSummaryBlock *block = completion->parent; + enterReadOnlyMode(block->zone->summary->readOnlyNotifier, + completion->result); + finishUpdatingSlabSummaryBlock(block); +} + +/** + * Write a slab summary block unless it is currently out for writing. + * + * @param [in] block The block that needs to be committed + **/ +static void launchWrite(SlabSummaryBlock *block) +{ + if (block->writing) { + return; + } + + SlabSummaryZone *zone = block->zone; + zone->writeCount++; + transferAllWaiters(&block->nextUpdateWaiters, &block->currentUpdateWaiters); + block->writing = true; + + SlabSummary *summary = zone->summary; + if (isReadOnly(summary->readOnlyNotifier)) { + finishUpdatingSlabSummaryBlock(block); + return; + } + + memcpy(block->outgoingEntries, block->entries, + sizeof(SlabSummaryEntry) * summary->entriesPerBlock); + + // Flush before writing to ensure that the slab journal tail blocks and + // reference updates covered by this summary update are stable (VDO-2332). + PhysicalBlockNumber pbn = (summary->origin + + (summary->blocksPerZone * zone->zoneNumber) + + block->index); + launchWriteMetadataVIOWithFlush(block->vio, pbn, finishUpdate, + handleWriteError, true, false); +} + +/** + * Initiate a drain. + * + * Implements AdminInitiator. 
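+ * + * @param state The AdminState of the summary zone being drained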
+ **/ +static void initiateDrain(AdminState *state) +{ + checkForDrainComplete(container_of(state, SlabSummaryZone, state)); +} + +/**********************************************************************/ +void drainSlabSummaryZone(SlabSummaryZone *summaryZone, + AdminStateCode operation, + VDOCompletion *parent) +{ + startDraining(&summaryZone->state, operation, parent, initiateDrain); +} + +/**********************************************************************/ +void resumeSlabSummaryZone(SlabSummaryZone *summaryZone, VDOCompletion *parent) +{ + finishCompletion(parent, resumeIfQuiescent(&summaryZone->state)); +} + +// READ/UPDATE FUNCTIONS + +/** + * Get the summary block, and offset into it, for storing the summary for a + * slab. + * + * @param summaryZone The SlabSummaryZone being queried + * @param slabNumber The slab whose summary location is sought + * + * @return A pointer to the SlabSummaryEntryBlock containing this + * SlabSummaryEntry + **/ +static SlabSummaryBlock *getSummaryBlockForSlab(SlabSummaryZone *summaryZone, + SlabCount slabNumber) +{ + SlabCount entriesPerBlock = summaryZone->summary->entriesPerBlock; + return &summaryZone->summaryBlocks[slabNumber / entriesPerBlock]; +} + +/**********************************************************************/ +void updateSlabSummaryEntry(SlabSummaryZone *summaryZone, + Waiter *waiter, + SlabCount slabNumber, + TailBlockOffset tailBlockOffset, + bool loadRefCounts, + bool isClean, + BlockCount freeBlocks) +{ + SlabSummaryBlock *block = getSummaryBlockForSlab(summaryZone, slabNumber); + int result; + if (isReadOnly(summaryZone->summary->readOnlyNotifier)) { + result = VDO_READ_ONLY; + } else if (isDraining(&summaryZone->state) + || isQuiescent(&summaryZone->state)) { + result = VDO_INVALID_ADMIN_STATE; + } else { + uint8_t hint = computeFullnessHint(summaryZone->summary, freeBlocks); + SlabSummaryEntry *entry = &summaryZone->entries[slabNumber]; + *entry = (SlabSummaryEntry) { + .tailBlockOffset = tailBlockOffset, + .loadRefCounts = (entry->loadRefCounts || loadRefCounts), + .isDirty = !isClean, + .fullnessHint = hint, + }; + result = enqueueWaiter(&block->nextUpdateWaiters, waiter); + } + + if (result != VDO_SUCCESS) { + waiter->callback(waiter, &result); + return; + } + + launchWrite(block); +} + +/**********************************************************************/ +TailBlockOffset getSummarizedTailBlockOffset(SlabSummaryZone *summaryZone, + SlabCount slabNumber) +{ + return summaryZone->entries[slabNumber].tailBlockOffset; +} + +/**********************************************************************/ +bool mustLoadRefCounts(SlabSummaryZone *summaryZone, SlabCount slabNumber) +{ + return summaryZone->entries[slabNumber].loadRefCounts; +} + +/**********************************************************************/ +bool getSummarizedCleanliness(SlabSummaryZone *summaryZone, + SlabCount slabNumber) +{ + return !summaryZone->entries[slabNumber].isDirty; +} + +/**********************************************************************/ +BlockCount getSummarizedFreeBlockCount(SlabSummaryZone *summaryZone, + SlabCount slabNumber) +{ + SlabSummaryEntry *entry = &summaryZone->entries[slabNumber]; + return getApproximateFreeBlocks(summaryZone->summary, entry->fullnessHint); +} + +/**********************************************************************/ +void getSummarizedRefCountsState(SlabSummaryZone *summaryZone, + SlabCount slabNumber, + size_t *freeBlockHint, + bool *isClean) +{ + SlabSummaryEntry *entry = 
&summaryZone->entries[slabNumber]; + *freeBlockHint = entry->fullnessHint; + *isClean = !entry->isDirty; +} + +/**********************************************************************/ +void getSummarizedSlabStatuses(SlabSummaryZone *summaryZone, + SlabCount slabCount, + SlabStatus *statuses) +{ + for (SlabCount i = 0; i < slabCount; i++) { + statuses[i] = (SlabStatus) { + .slabNumber = i, + .isClean = !summaryZone->entries[i].isDirty, + .emptiness = summaryZone->entries[i].fullnessHint + }; + } +} + +// RESIZE FUNCTIONS + +/**********************************************************************/ +void setSlabSummaryOrigin(SlabSummary *summary, Partition *partition) +{ + summary->origin = getFixedLayoutPartitionOffset(partition); +} + +// COMBINING FUNCTIONS (LOAD) + +/** + * Clean up after saving out the combined slab summary. This callback is + * registered in finishLoadingSummary() and loadSlabSummary(). + * + * @param completion The extent which was used to write the summary data + **/ +static void finishCombiningZones(VDOCompletion *completion) +{ + SlabSummary *summary = completion->parent; + int result = completion->result; + VDOExtent *extent = asVDOExtent(completion); + freeExtent(&extent); + finishLoadingWithResult(&summary->zones[0]->state, result); +} + +/**********************************************************************/ +void combineZones(SlabSummary *summary) +{ + // Combine all the old summary data into the portion of the buffer + // corresponding to the first zone. + ZoneCount zone = 0; + if (summary->zonesToCombine > 1) { + for (SlabCount entryNumber = 0; entryNumber < MAX_SLABS; entryNumber++) { + if (zone != 0) { + memcpy(summary->entries + entryNumber, + summary->entries + (zone * MAX_SLABS) + entryNumber, + sizeof(SlabSummaryEntry)); + } + zone++; + if (zone == summary->zonesToCombine) { + zone = 0; + } + } + } + + // Copy the combined data to each zone's region of the buffer. + for (zone = 1; zone < MAX_PHYSICAL_ZONES; zone++) { + memcpy(summary->entries + (zone * MAX_SLABS), summary->entries, + MAX_SLABS * sizeof(SlabSummaryEntry)); + } +} + +/** + * Combine the slab summary data from all the previously written zones + * and copy the combined summary to each zone's region of the partition. Then + * write the combined summary back out to disk. This callback is registered in + * loadSlabSummary(). + * + * @param completion The extent which was used to read the summary data + **/ +static void finishLoadingSummary(VDOCompletion *completion) +{ + SlabSummary *summary = completion->parent; + VDOExtent *extent = asVDOExtent(completion); + + // Combine the zones so each zone is correct for all slabs. + combineZones(summary); + + // Write the combined summary back out.
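+ // The read extent is reused for the write; its completion is redirected + // to finishCombiningZones(), which frees the extent and finishes the load.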
+ extent->completion.callback = finishCombiningZones; + writeMetadataExtent(extent, summary->origin); +} + +/**********************************************************************/ +void loadSlabSummary(SlabSummary *summary, + AdminStateCode operation, + ZoneCount zonesToCombine, + VDOCompletion *parent) +{ + SlabSummaryZone *zone = summary->zones[0]; + if (!startLoading(&zone->state, operation, parent, NULL)) { + return; + } + + VDOExtent *extent; + BlockCount blocks = summary->blocksPerZone * MAX_PHYSICAL_ZONES; + int result = createExtent(parent->layer, VIO_TYPE_SLAB_SUMMARY, + VIO_PRIORITY_METADATA, blocks, + (char *) summary->entries, &extent); + if (result != VDO_SUCCESS) { + finishLoadingWithResult(&zone->state, result); + return; + } + + if ((operation == ADMIN_STATE_FORMATTING) + || (operation == ADMIN_STATE_LOADING_FOR_REBUILD)) { + prepareCompletion(&extent->completion, finishCombiningZones, + finishCombiningZones, 0, summary); + writeMetadataExtent(extent, summary->origin); + return; + } + + summary->zonesToCombine = zonesToCombine; + prepareCompletion(&extent->completion, finishLoadingSummary, + finishCombiningZones, 0, summary); + readMetadataExtent(extent, summary->origin); +} + +/**********************************************************************/ +SlabSummaryStatistics getSlabSummaryStatistics(const SlabSummary *summary) +{ + const AtomicSlabSummaryStatistics *atoms = &summary->statistics; + return (SlabSummaryStatistics) { + .blocksWritten = atomicLoad64(&atoms->blocksWritten), + }; +} diff --git a/vdo/base/slabSummary.h b/vdo/base/slabSummary.h new file mode 100644 index 0000000..4ce32cb --- /dev/null +++ b/vdo/base/slabSummary.h @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabSummary.h#5 $ + */ + +#ifndef SLAB_SUMMARY_H +#define SLAB_SUMMARY_H + +#include "completion.h" +#include "fixedLayout.h" +#include "slab.h" +#include "statistics.h" +#include "types.h" +#include "waitQueue.h" + +/** + * The SlabSummary provides hints during load and recovery about the state + * of the slabs in order to avoid the need to read the slab journals in their + * entirety before a VDO can come online. + * + * The information in the summary for each slab includes the rough number of + * free blocks (which is used to prioritize scrubbing), the cleanliness of a + * slab (so that clean slabs containing free space will be used on restart), + * and the location of the tail block of the slab's journal. + * + * The SlabSummary has its own partition at the end of the volume which is + * sized to allow for a complete copy of the summary for each of up to 16 + * physical zones. 
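+ * (For example, with a 4KB block size, each summary block holds 2048 of + * the two-byte entries, so one zone's copy of the summary occupies + * MAX_SLABS / 2048 blocks.)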
+ * + * During resize, the SlabSummary moves its backing partition and is saved once + * moved; the SlabSummary is not permitted to overwrite the previous recovery + * journal space. + * + * The SlabSummary does not have its own version information, but relies on the + * master version number. + **/ + +/** + * The offset of a slab journal tail block. + **/ +typedef uint8_t TailBlockOffset; + +/** + * A slab status is a very small structure for use in determining the ordering + * of slabs in the scrubbing process. + **/ +typedef struct slabStatus { + SlabCount slabNumber; + bool isClean; + uint8_t emptiness; +} SlabStatus; + +/** + * Returns the size on disk of the SlabSummary structure. + * + * @param blockSize The block size of the physical layer + * + * @return the blocks required to store the SlabSummary on disk + **/ +BlockCount getSlabSummarySize(BlockSize blockSize) +__attribute__((warn_unused_result)); + +/** + * Create a slab summary. + * + * @param [in] layer The layer + * @param [in] partition The partition to hold the summary + * @param [in] threadConfig The thread config of the VDO + * @param [in] slabSizeShift The number of bits in the slab size + * @param [in] maximumFreeBlocksPerSlab The maximum number of free blocks a + * slab can have + * @param [in] readOnlyNotifier The context for entering read-only + * mode + * @param [out] slabSummaryPtr A pointer to hold the summary + * + * @return VDO_SUCCESS or an error + **/ +int makeSlabSummary(PhysicalLayer *layer, + Partition *partition, + const ThreadConfig *threadConfig, + unsigned int slabSizeShift, + BlockCount maximumFreeBlocksPerSlab, + ReadOnlyNotifier *readOnlyNotifier, + SlabSummary **slabSummaryPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a SlabSummary and NULL out the reference to it. + * + * @param [in,out] slabSummaryPtr A pointer to the SlabSummary to free + **/ +void freeSlabSummary(SlabSummary **slabSummaryPtr); + +/** + * Get the portion of the slab summary for a specified zone. + * + * @param summary The slab summary + * @param zone The zone + * + * @return The portion of the slab summary for the specified zone + **/ +SlabSummaryZone *getSummaryForZone(SlabSummary *summary, ZoneCount zone) + __attribute__((warn_unused_result)); + +/** + * Drain a zone of the slab summary. + * + * @param summaryZone The zone to drain + * @param operation The type of drain to perform + * @param parent The object to notify when the suspend is complete + **/ +void drainSlabSummaryZone(SlabSummaryZone *summaryZone, + AdminStateCode operation, + VDOCompletion *parent); + +/** + * Resume a zone of the slab summary. + * + * @param summaryZone The zone to resume + * @param parent The object to notify when the zone is resumed + **/ +void resumeSlabSummaryZone(SlabSummaryZone *summaryZone, + VDOCompletion *parent); + +/** + * Update the entry for a slab. 
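+ * The waiter is queued on the summary block containing the entry, and a + * write of that block is launched if one is not already in progress.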
+ * + * @param summaryZone The SlabSummaryZone for the zone of the slab + * @param waiter The waiter that is updating the summary + * @param slabNumber The slab number to update + * @param tailBlockOffset The offset of slab journal's tail block + * @param loadRefCounts Whether the refCounts must be loaded from the layer + * on the next load + * @param isClean Whether the slab is clean + * @param freeBlocks The number of free blocks + **/ +void updateSlabSummaryEntry(SlabSummaryZone *summaryZone, + Waiter *waiter, + SlabCount slabNumber, + TailBlockOffset tailBlockOffset, + bool loadRefCounts, + bool isClean, + BlockCount freeBlocks); + +/** + * Get the stored tail block offset for a slab. + * + * @param summaryZone The SlabSummaryZone to use + * @param slabNumber The slab number to get the offset for + * + * @return The tail block offset for the slab + **/ +TailBlockOffset getSummarizedTailBlockOffset(SlabSummaryZone *summaryZone, + SlabCount slabNumber) + __attribute__((warn_unused_result)); + +/** + * Whether refCounts must be loaded from the layer. + * + * @param summaryZone The SlabSummaryZone to use + * @param slabNumber The slab number to get information for + * + * @return Whether refCounts must be loaded + **/ +bool mustLoadRefCounts(SlabSummaryZone *summaryZone, SlabCount slabNumber) + __attribute__((warn_unused_result)); + +/** + * Get the stored cleanliness information for a single slab. + * + * @param summaryZone The SlabSummaryZone to use + * @param slabNumber The slab number to get information for + * + * @return Whether the slab is clean + **/ +bool getSummarizedCleanliness(SlabSummaryZone *summaryZone, + SlabCount slabNumber) + __attribute__((warn_unused_result)); + +/** + * Get the stored emptiness information for a single slab. + * + * @param summaryZone The SlabSummaryZone to use + * @param slabNumber The slab number to get information for + * + * @return An approximation to the free blocks in the slab + **/ +BlockCount getSummarizedFreeBlockCount(SlabSummaryZone *summaryZone, + SlabCount slabNumber) + __attribute__((warn_unused_result)); + +/** + * Get the stored RefCounts state information for a single slab. Used + * in testing only. + * + * @param [in] summaryZone The SlabSummaryZone to use + * @param [in] slabNumber The slab number to get information for + * @param [out] freeBlockHint The approximate number of free blocks + * @param [out] isClean Whether the slab is clean + **/ +void getSummarizedRefCountsState(SlabSummaryZone *summaryZone, + SlabCount slabNumber, + size_t *freeBlockHint, + bool *isClean); + +/** + * Get the stored slab statuses for all slabs in a zone. + * + * @param [in] summaryZone The SlabSummaryZone to use + * @param [in] slabCount The number of slabs to fetch + * @param [in,out] statuses An array of SlabStatuses to populate + **/ +void getSummarizedSlabStatuses(SlabSummaryZone *summaryZone, + SlabCount slabCount, + SlabStatus *statuses); + +/** + * Set the origin of the slab summary relative to the physical layer. + * + * @param summary The SlabSummary to update + * @param partition The slab summary partition + **/ +void setSlabSummaryOrigin(SlabSummary *summary, Partition *partition); + +/** + * Read in all the slab summary data from the slab summary partition, + * combine all the previously used zones into a single zone, and then + * write the combined summary back out to each possible zones' summary + * region. 
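+ * When formatting or loading for rebuild, the in-memory summary is written + * out as-is rather than being read from disk first.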
+ * + * @param summary The summary to load + * @param operation The type of load to perform + * @param zonesToCombine The number of zones to be combined; if set to 0, + * all of the summary will be initialized as new. + * @param parent The parent of this operation + **/ +void loadSlabSummary(SlabSummary *summary, + AdminStateCode operation, + ZoneCount zonesToCombine, + VDOCompletion *parent); + +/** + * Fetch the cumulative statistics for all slab summary zones in a summary. + * + * @param summary The summary in question + * + * @return the cumulative slab summary statistics for the summary + **/ +SlabSummaryStatistics getSlabSummaryStatistics(const SlabSummary *summary) + __attribute__((warn_unused_result)); + +#endif // SLAB_SUMMARY_H diff --git a/vdo/base/slabSummaryInternals.h b/vdo/base/slabSummaryInternals.h new file mode 100644 index 0000000..8ac071c --- /dev/null +++ b/vdo/base/slabSummaryInternals.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabSummaryInternals.h#7 $ + */ + +#ifndef SLAB_SUMMARY_INTERNALS_H +#define SLAB_SUMMARY_INTERNALS_H + +#include "slabSummary.h" + +#include "adminState.h" +#include "atomic.h" + +typedef struct slabSummaryEntry { + /** Bits 7..0: The offset of the tail block within the slab journal */ + TailBlockOffset tailBlockOffset; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + /** Bits 13..8: A hint about the fullness of the slab */ + unsigned int fullnessHint : 6; + /** Bit 14: Whether the refCounts must be loaded from the layer */ + unsigned int loadRefCounts : 1; + /** Bit 15: The believed cleanliness of this slab */ + unsigned int isDirty : 1; +#else + /** Bit 15: The believed cleanliness of this slab */ + unsigned int isDirty : 1; + /** Bit 14: Whether the refCounts must be loaded from the layer */ + unsigned int loadRefCounts : 1; + /** Bits 13..8: A hint about the fullness of the slab */ + unsigned int fullnessHint : 6; +#endif +} __attribute__((packed)) SlabSummaryEntry; + +typedef struct slabSummaryBlock { + /** The zone to which this block belongs */ + SlabSummaryZone *zone; + /** The index of this block in its zone's summary */ + BlockCount index; + /** Whether this block has a write outstanding */ + bool writing; + /** Ring of updates waiting on the outstanding write */ + WaitQueue currentUpdateWaiters; + /** Ring of updates waiting on the next write */ + WaitQueue nextUpdateWaiters; + /** The active SlabSummaryEntry array for this block */ + SlabSummaryEntry *entries; + /** The VIO used to write this block */ + VIO *vio; + /** The packed entries, one block long, backing the VIO */ + char *outgoingEntries; +} SlabSummaryBlock; + +/** + * The statistics for all the slab summary zones owned by this slab summary. 
+ * These fields are all mutated only by their physical zone threads, but are + * read by other threads when gathering statistics for the entire depot. + **/ +typedef struct atomicSlabSummaryStatistics { + /** Number of blocks written */ + Atomic64 blocksWritten; +} AtomicSlabSummaryStatistics; + +struct slabSummaryZone { + /** The summary of which this is a zone */ + SlabSummary *summary; + /** The number of this zone */ + ZoneCount zoneNumber; + /** Count of the number of blocks currently out for writing */ + BlockCount writeCount; + /** The state of this zone */ + AdminState state; + /** The array (owned by the blocks) of all entries */ + SlabSummaryEntry *entries; + /** The array of SlabSummaryEntryBlocks */ + SlabSummaryBlock summaryBlocks[]; +}; + +struct slabSummary { + /** The context for entering read-only mode */ + ReadOnlyNotifier *readOnlyNotifier; + /** The statistics for this slab summary */ + AtomicSlabSummaryStatistics statistics; + /** The start of the slab summary partition relative to the layer */ + PhysicalBlockNumber origin; + /** The number of bits to shift to get a 7-bit fullness hint */ + unsigned int hintShift; + /** The number of blocks (calculated based on MAX_SLABS) */ + BlockCount blocksPerZone; + /** The number of slabs per block (calculated from block size) */ + SlabCount entriesPerBlock; + /** The entries for all of the zones the partition can hold */ + SlabSummaryEntry *entries; + /** The number of zones which were active at the time of the last update */ + ZoneCount zonesToCombine; + /** The current number of active zones */ + ZoneCount zoneCount; + /** The currently active zones */ + SlabSummaryZone *zones[]; +}; + +/** + * Treating the current entries buffer as the on-disk value of all zones, + * update every zone to the correct values for every slab. + * + * @param summary The summary whose entries should be combined + **/ +void combineZones(SlabSummary *summary); + +#endif // SLAB_SUMMARY_INTERNALS_H diff --git a/vdo/base/statistics.h b/vdo/base/statistics.h new file mode 100644 index 0000000..2511076 --- /dev/null +++ b/vdo/base/statistics.h @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef STATISTICS_H +#define STATISTICS_H + +#include "header.h" +#include "types.h" + +enum { + STATISTICS_VERSION = 31, +}; + +typedef struct { + /** The total number of slabs from which blocks may be allocated */ + uint64_t slabCount; + /** The total number of slabs from which blocks have ever been allocated */ + uint64_t slabsOpened; + /** The number of times since loading that a slab has been re-opened */ + uint64_t slabsReopened; +} BlockAllocatorStatistics; + +/** + * Counters for tracking the number of items written (blocks, requests, etc.) + * that keep track of totals at steps in the write pipeline. 
Three counters + * allow the number of buffered, in-memory items and the number of in-flight, + * unacknowledged writes to be derived, while still tracking totals for + * reporting purposes + **/ +typedef struct { + /** The total number of items on which processing has started */ + uint64_t started; + /** The total number of items for which a write operation has been issued */ + uint64_t written; + /** The total number of items for which a write operation has completed */ + uint64_t committed; +} CommitStatistics; + +/** Counters for events in the recovery journal */ +typedef struct { + /** Number of times the on-disk journal was full */ + uint64_t diskFull; + /** Number of times the recovery journal requested slab journal commits. */ + uint64_t slabJournalCommitsRequested; + /** Write/Commit totals for individual journal entries */ + CommitStatistics entries; + /** Write/Commit totals for journal blocks */ + CommitStatistics blocks; +} RecoveryJournalStatistics; + +/** The statistics for the compressed block packer. */ +typedef struct { + /** Number of compressed data items written since startup */ + uint64_t compressedFragmentsWritten; + /** Number of blocks containing compressed items written since startup */ + uint64_t compressedBlocksWritten; + /** Number of VIOs that are pending in the packer */ + uint64_t compressedFragmentsInPacker; +} PackerStatistics; + +/** The statistics for the slab journals. */ +typedef struct { + /** Number of times the on-disk journal was full */ + uint64_t diskFullCount; + /** Number of times an entry was added over the flush threshold */ + uint64_t flushCount; + /** Number of times an entry was added over the block threshold */ + uint64_t blockedCount; + /** Number of times a tail block was written */ + uint64_t blocksWritten; + /** Number of times we had to wait for the tail to write */ + uint64_t tailBusyCount; +} SlabJournalStatistics; + +/** The statistics for the slab summary. */ +typedef struct { + /** Number of blocks written */ + uint64_t blocksWritten; +} SlabSummaryStatistics; + +/** The statistics for the reference counts. */ +typedef struct { + /** Number of reference blocks written */ + uint64_t blocksWritten; +} RefCountsStatistics; + +/** The statistics for the block map. 
 */
+typedef struct {
+  /** number of dirty (resident) pages */
+  uint32_t dirtyPages;
+  /** number of clean (resident) pages */
+  uint32_t cleanPages;
+  /** number of free pages */
+  uint32_t freePages;
+  /** number of pages in failed state */
+  uint32_t failedPages;
+  /** number of pages incoming */
+  uint32_t incomingPages;
+  /** number of pages outgoing */
+  uint32_t outgoingPages;
+  /** how many times free page not avail */
+  uint32_t cachePressure;
+  /** number of getVDOPageAsync() for read */
+  uint64_t readCount;
+  /** number of getVDOPageAsync() for write */
+  uint64_t writeCount;
+  /** number of times pages failed to read */
+  uint64_t failedReads;
+  /** number of times pages failed to write */
+  uint64_t failedWrites;
+  /** number of gets that are reclaimed */
+  uint64_t reclaimed;
+  /** number of gets for outgoing pages */
+  uint64_t readOutgoing;
+  /** number of gets that were already there */
+  uint64_t foundInCache;
+  /** number of gets requiring discard */
+  uint64_t discardRequired;
+  /** number of gets enqueued for their page */
+  uint64_t waitForPage;
+  /** number of gets that have to fetch */
+  uint64_t fetchRequired;
+  /** number of page fetches */
+  uint64_t pagesLoaded;
+  /** number of page saves */
+  uint64_t pagesSaved;
+  /** the number of flushes issued */
+  uint64_t flushCount;
+} BlockMapStatistics;
+
+/** The dedupe statistics from hash locks */
+typedef struct {
+  /** Number of times the UDS advice proved correct */
+  uint64_t dedupeAdviceValid;
+  /** Number of times the UDS advice proved incorrect */
+  uint64_t dedupeAdviceStale;
+  /** Number of writes with the same data as another in-flight write */
+  uint64_t concurrentDataMatches;
+  /** Number of writes whose hash collided with an in-flight write */
+  uint64_t concurrentHashCollisions;
+} HashLockStatistics;
+
+/** Counts of error conditions in VDO. */
+typedef struct {
+  /** number of times VDO got an invalid dedupe advice PBN from UDS */
+  uint64_t invalidAdvicePBNCount;
+  /** number of times a VIO completed with a VDO_NO_SPACE error */
+  uint64_t noSpaceErrorCount;
+  /** number of times a VIO completed with a VDO_READ_ONLY error */
+  uint64_t readOnlyErrorCount;
+} ErrorStatistics;
+
+/** The statistics of the vdo service.
*/ +struct vdoStatistics { + uint32_t version; + uint32_t releaseVersion; + /** Number of blocks used for data */ + uint64_t dataBlocksUsed; + /** Number of blocks used for VDO metadata */ + uint64_t overheadBlocksUsed; + /** Number of logical blocks that are currently mapped to physical blocks */ + uint64_t logicalBlocksUsed; + /** number of physical blocks */ + BlockCount physicalBlocks; + /** number of logical blocks */ + BlockCount logicalBlocks; + /** Size of the block map page cache, in bytes */ + uint64_t blockMapCacheSize; + /** String describing the active write policy of the VDO */ + char writePolicy[15]; + /** The physical block size */ + uint64_t blockSize; + /** Number of times the VDO has successfully recovered */ + uint64_t completeRecoveries; + /** Number of times the VDO has recovered from read-only mode */ + uint64_t readOnlyRecoveries; + /** String describing the operating mode of the VDO */ + char mode[15]; + /** Whether the VDO is in recovery mode */ + bool inRecoveryMode; + /** What percentage of recovery mode work has been completed */ + uint8_t recoveryPercentage; + /** The statistics for the compressed block packer */ + PackerStatistics packer; + /** Counters for events in the block allocator */ + BlockAllocatorStatistics allocator; + /** Counters for events in the recovery journal */ + RecoveryJournalStatistics journal; + /** The statistics for the slab journals */ + SlabJournalStatistics slabJournal; + /** The statistics for the slab summary */ + SlabSummaryStatistics slabSummary; + /** The statistics for the reference counts */ + RefCountsStatistics refCounts; + /** The statistics for the block map */ + BlockMapStatistics blockMap; + /** The dedupe statistics from hash locks */ + HashLockStatistics hashLock; + /** Counts of error conditions */ + ErrorStatistics errors; +}; + +/** + * Get the proc file path for reading VDOStatistics. + * + * @return The proc file path + **/ +static inline const char *getVDOStatisticsProcFile(void) { + return "dedupe_stats"; +} + +#endif /* not STATISTICS_H */ diff --git a/vdo/base/statusCodes.c b/vdo/base/statusCodes.c new file mode 100644 index 0000000..40be3fd --- /dev/null +++ b/vdo/base/statusCodes.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/statusCodes.c#3 $ + */ + +#include "statusCodes.h" + +#include "errors.h" +#include "permassert.h" +#include "threadOnce.h" + +const struct errorInfo vdoStatusList[] = { + { "VDO_NOT_IMPLEMENTED", "Not implemented" }, + { "VDO_OUT_OF_RANGE", "Out of range" }, + { "VDO_REF_COUNT_INVALID", "Reference count would become invalid" }, + { "VDO_NO_SPACE", "Out of space" }, + { "VDO_UNEXPECTED_EOF", "Unexpected EOF on block read" }, + { "VDO_BAD_CONFIGURATION", "Bad configuration option" }, + { "VDO_SOCKET_ERROR", "Socket error" }, + { "VDO_BAD_ALIGNMENT", "Mis-aligned block reference" }, + { "VDO_COMPONENT_BUSY", "Prior operation still in progress" }, + { "VDO_BAD_PAGE", "Corrupt or incorrect page" }, + { "VDO_UNSUPPORTED_VERSION", "Unsupported component version" }, + { "VDO_INCORRECT_COMPONENT", "Component id mismatch in decoder" }, + { "VDO_PARAMETER_MISMATCH", "Parameters have conflicting values" }, + { "VDO_BLOCK_SIZE_TOO_SMALL", "The block size is too small" }, + { "VDO_UNKNOWN_PARTITION", "No partition exists with a given id" }, + { "VDO_PARTITION_EXISTS", "A partition already exists with a given id"}, + { "VDO_NOT_READ_ONLY", "The device is not in read-only mode" }, + { "VDO_INCREMENT_TOO_SMALL", "Physical block growth of too few blocks" }, + { "VDO_CHECKSUM_MISMATCH", "Incorrect checksum" }, + { "VDO_RECOVERY_JOURNAL_FULL", "The recovery journal is full" }, + { "VDO_LOCK_ERROR", "A lock is held incorrectly" }, + { "VDO_READ_ONLY", "The device is in read-only mode" }, + { "VDO_SHUTTING_DOWN", "The device is shutting down" }, + { "VDO_CORRUPT_JOURNAL", "Recovery journal entries corrupted" }, + { "VDO_TOO_MANY_SLABS", "Exceeds maximum number of slabs supported" }, + { "VDO_INVALID_FRAGMENT", "Compressed block fragment is invalid" }, + { "VDO_RETRY_AFTER_REBUILD", "Retry operation after rebuilding finishes" }, + { "VDO_UNKNOWN_COMMAND", "The extended command is not known" }, + { "VDO_COMMAND_ERROR", "Bad extended command parameters" }, + { "VDO_CANNOT_DETERMINE_SIZE", "Cannot determine config sizes to fit" }, + { "VDO_BAD_MAPPING", "Invalid page mapping" }, + { "VDO_READ_CACHE_BUSY", "Read cache has no free slots" }, + { "VDO_BIO_CREATION_FAILED", "Bio creation failed" }, + { "VDO_BAD_MAGIC", "Bad magic number" }, + { "VDO_BAD_NONCE", "Bad nonce" }, + { "VDO_JOURNAL_OVERFLOW", "Journal sequence number overflow" }, + { "VDO_INVALID_ADMIN_STATE", "Invalid operation for current state" }, +}; + +#ifndef __KERNEL__ +static OnceState vdoStatusCodesRegistered = ONCE_STATE_INITIALIZER; +static int statusCodeRegistrationResult; + +/**********************************************************************/ +static void doStatusCodeRegistration(void) +{ + STATIC_ASSERT((VDO_STATUS_CODE_LAST - VDO_STATUS_CODE_BASE) + == COUNT_OF(vdoStatusList)); + + int result = registerErrorBlock("VDO Status", + VDO_STATUS_CODE_BASE, + VDO_STATUS_CODE_BLOCK_END, + vdoStatusList, + sizeof(vdoStatusList)); + /* + * The following test handles cases where libvdo is statically linked + * against both the test modules and the test driver (because multiple + * instances of this module call their own copy of this function + * once each, resulting in multiple calls to registerErrorBlock which + * is shared in libuds). + */ + if (result == UDS_DUPLICATE_NAME) { + result = UDS_SUCCESS; + } + + statusCodeRegistrationResult + = (result == UDS_SUCCESS) ? 
VDO_SUCCESS : result; +} +#endif + +/**********************************************************************/ +int registerStatusCodes(void) +{ +#ifdef __KERNEL__ + return VDO_SUCCESS; +#else + performOnce(&vdoStatusCodesRegistered, doStatusCodeRegistration); + return statusCodeRegistrationResult; +#endif +} diff --git a/vdo/base/statusCodes.h b/vdo/base/statusCodes.h new file mode 100644 index 0000000..dd3a3ff --- /dev/null +++ b/vdo/base/statusCodes.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/statusCodes.h#2 $ + */ + +#ifndef STATUS_CODES_H +#define STATUS_CODES_H + +#include "errors.h" + +enum { + UDS_BLOCK_SIZE = UDS_ERROR_CODE_BLOCK_END - UDS_ERROR_CODE_BASE, + VDO_BLOCK_START = UDS_ERROR_CODE_BLOCK_END, + VDO_BLOCK_END = VDO_BLOCK_START + UDS_BLOCK_SIZE, + PRP_BLOCK_START = VDO_BLOCK_END, + PRP_BLOCK_END = PRP_BLOCK_START + UDS_BLOCK_SIZE, +}; + +/** + * VDO-specific status codes. + **/ +enum vdoStatusCodes { + /** successful result */ + VDO_SUCCESS = 0, + /** base of all VDO errors */ + VDO_STATUS_CODE_BASE = VDO_BLOCK_START, + /** we haven't written this yet */ + VDO_NOT_IMPLEMENTED = VDO_STATUS_CODE_BASE, + /** input out of range */ + VDO_OUT_OF_RANGE, + /** an invalid reference count would result */ + VDO_REF_COUNT_INVALID, + /** a free block could not be allocated */ + VDO_NO_SPACE, + /** unexpected EOF on block read */ + VDO_UNEXPECTED_EOF, + /** improper or missing configuration option */ + VDO_BAD_CONFIGURATION, + /** socket opening or binding problem */ + VDO_SOCKET_ERROR, + /** read or write on non-aligned offset */ + VDO_BAD_ALIGNMENT, + /** prior operation still in progress */ + VDO_COMPONENT_BUSY, + /** page contents incorrect or corrupt data */ + VDO_BAD_PAGE, + /** unsupported version of some component */ + VDO_UNSUPPORTED_VERSION, + /** component id mismatch in decoder */ + VDO_INCORRECT_COMPONENT, + /** parameters have conflicting values */ + VDO_PARAMETER_MISMATCH, + /** the block size is too small */ + VDO_BLOCK_SIZE_TOO_SMALL, + /** no partition exists with a given id */ + VDO_UNKNOWN_PARTITION, + /** a partition already exists with a given id */ + VDO_PARTITION_EXISTS, + /** the VDO is not in read-only mode */ + VDO_NOT_READ_ONLY, + /** physical block growth of too few blocks */ + VDO_INCREMENT_TOO_SMALL, + /** incorrect checksum */ + VDO_CHECKSUM_MISMATCH, + /** the recovery journal is full */ + VDO_RECOVERY_JOURNAL_FULL, + /** a lock is held incorrectly */ + VDO_LOCK_ERROR, + /** the VDO is in read-only mode */ + VDO_READ_ONLY, + /** the VDO is shutting down */ + VDO_SHUTTING_DOWN, + /** the recovery journal has corrupt entries */ + VDO_CORRUPT_JOURNAL, + /** exceeds maximum number of slabs supported */ + VDO_TOO_MANY_SLABS, + /** a compressed block fragment is 
invalid */ + VDO_INVALID_FRAGMENT, + /** action is unsupported while rebuilding */ + VDO_RETRY_AFTER_REBUILD, + /** the extended command is not known */ + VDO_UNKNOWN_COMMAND, + /** bad extended command parameters */ + VDO_COMMAND_ERROR, + /** cannot determine sizes to fit */ + VDO_CANNOT_DETERMINE_SIZE, + /** a block map entry is invalid */ + VDO_BAD_MAPPING, + /** read cache has no free slots */ + VDO_READ_CACHE_BUSY, + /** bio_add_page failed */ + VDO_BIO_CREATION_FAILED, + /** bad magic number */ + VDO_BAD_MAGIC, + /** bad nonce */ + VDO_BAD_NONCE, + /** sequence number overflow */ + VDO_JOURNAL_OVERFLOW, + /** the VDO is not in a state to perform an admin operation */ + VDO_INVALID_ADMIN_STATE, + /** one more than last error code */ + VDO_STATUS_CODE_LAST, + VDO_STATUS_CODE_BLOCK_END = VDO_BLOCK_END +}; + +extern const struct errorInfo vdoStatusList[]; + +/** + * Register the VDO status codes if needed. + * + * @return a success or error code + **/ +int registerStatusCodes(void); + +#endif // STATUS_CODES_H diff --git a/vdo/base/superBlock.c b/vdo/base/superBlock.c new file mode 100644 index 0000000..a7376e9 --- /dev/null +++ b/vdo/base/superBlock.c @@ -0,0 +1,441 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/superBlock.c#5 $ + */ + +#include "superBlock.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "completion.h" +#include "constants.h" +#include "header.h" +#include "releaseVersions.h" +#include "statusCodes.h" +#include "types.h" +#include "vio.h" + +struct superBlock { + /** The parent for asynchronous load and save operations */ + VDOCompletion *parent; + /** The VIO for reading and writing the super block to disk */ + VIO *vio; + /** The buffer for encoding and decoding component data */ + Buffer *componentBuffer; + /** + * A sector-sized buffer wrapping the first sector of encodedSuperBlock, for + * encoding and decoding the entire super block. + **/ + Buffer *blockBuffer; + /** A 1-block buffer holding the encoded on-disk super block */ + byte *encodedSuperBlock; + /** The release version number loaded from the volume */ + ReleaseVersionNumber loadedReleaseVersion; + /** Whether this super block may not be written */ + bool unwriteable; +}; + +enum { + SUPER_BLOCK_FIXED_SIZE + = ENCODED_HEADER_SIZE + sizeof(ReleaseVersionNumber) + CHECKSUM_SIZE, + MAX_COMPONENT_DATA_SIZE = VDO_SECTOR_SIZE - SUPER_BLOCK_FIXED_SIZE, +}; + +static const Header SUPER_BLOCK_HEADER_12_0 = { + .id = SUPER_BLOCK, + .version = { + .majorVersion = 12, + .minorVersion = 0, + }, + + // This is the minimum size, if the super block contains no components. + .size = SUPER_BLOCK_FIXED_SIZE - ENCODED_HEADER_SIZE, +}; + +/** + * Allocate a super block. 
Callers must free the allocated super block even + * on error. + * + * @param layer The physical layer which holds the super block on disk + * @param superBlockPtr A pointer to hold the new super block + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int allocateSuperBlock(PhysicalLayer *layer, SuperBlock **superBlockPtr) +{ + int result = ALLOCATE(1, SuperBlock, __func__, superBlockPtr); + if (result != UDS_SUCCESS) { + return result; + } + + SuperBlock *superBlock = *superBlockPtr; + result = makeBuffer(MAX_COMPONENT_DATA_SIZE, &superBlock->componentBuffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = layer->allocateIOBuffer(layer, VDO_BLOCK_SIZE, + "encoded super block", + (char **) &superBlock->encodedSuperBlock); + if (result != UDS_SUCCESS) { + return result; + } + + // Even though the buffer is a full block, to avoid the potential corruption + // from a torn write, the entire encoding must fit in the first sector. + result = wrapBuffer(superBlock->encodedSuperBlock, VDO_SECTOR_SIZE, 0, + &superBlock->blockBuffer); + if (result != UDS_SUCCESS) { + return result; + } + + if (layer->createMetadataVIO == NULL) { + return VDO_SUCCESS; + } + + return createVIO(layer, VIO_TYPE_SUPER_BLOCK, VIO_PRIORITY_METADATA, + superBlock, (char *) superBlock->encodedSuperBlock, + &superBlock->vio); +} + +/**********************************************************************/ +int makeSuperBlock(PhysicalLayer *layer, SuperBlock **superBlockPtr) +{ + SuperBlock *superBlock; + int result = allocateSuperBlock(layer, &superBlock); + if (result != VDO_SUCCESS) { + freeSuperBlock(&superBlock); + return result; + } + + // For a new super block, use the current release. + superBlock->loadedReleaseVersion = CURRENT_RELEASE_VERSION_NUMBER; + *superBlockPtr = superBlock; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeSuperBlock(SuperBlock **superBlockPtr) +{ + if (*superBlockPtr == NULL) { + return; + } + + SuperBlock *superBlock = *superBlockPtr; + freeBuffer(&superBlock->blockBuffer); + freeBuffer(&superBlock->componentBuffer); + freeVIO(&superBlock->vio); + FREE(superBlock->encodedSuperBlock); + FREE(superBlock); + *superBlockPtr = NULL; +} + +/** + * Encode a super block into its on-disk representation. + * + * @param layer The physical layer which implements the checksum + * @param superBlock The super block to encode + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int encodeSuperBlock(PhysicalLayer *layer, SuperBlock *superBlock) +{ + Buffer *buffer = superBlock->blockBuffer; + int result = resetBufferEnd(buffer, 0); + if (result != VDO_SUCCESS) { + return result; + } + + size_t componentDataSize = contentLength(superBlock->componentBuffer); + + // Encode the header. + Header header = SUPER_BLOCK_HEADER_12_0; + header.size += componentDataSize; + result = encodeHeader(&header, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + // Encode the loaded release version. + result = putUInt32LEIntoBuffer(buffer, superBlock->loadedReleaseVersion); + if (result != UDS_SUCCESS) { + return result; + } + + // Copy the already-encoded component data. + result = putBytes(buffer, componentDataSize, + getBufferContents(superBlock->componentBuffer)); + if (result != UDS_SUCCESS) { + return result; + } + + // Compute and encode the checksum. 
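+  // Layout note (derived from the encoding above, not a separate format
+  // spec): the first sector holds the encoded header, the 32-bit loaded
+  // release version, the component data, and finally a 32-bit CRC over
+  // everything that precedes it; SUPER_BLOCK_FIXED_SIZE accounts for all
+  // of this except the component data.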
+ CRC32Checksum checksum = layer->updateCRC32(INITIAL_CHECKSUM, + superBlock->encodedSuperBlock, + contentLength(buffer)); + result = putUInt32LEIntoBuffer(buffer, checksum); + if (result != UDS_SUCCESS) { + return result; + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int saveSuperBlock(PhysicalLayer *layer, + SuperBlock *superBlock, + PhysicalBlockNumber superBlockOffset) +{ + int result = encodeSuperBlock(layer, superBlock); + if (result != VDO_SUCCESS) { + return result; + } + + return layer->writer(layer, superBlockOffset, 1, + (char *) superBlock->encodedSuperBlock, NULL); +} + +/** + * Finish the parent of a super block load or save operation. This + * callback is registered in saveSuperBlockAsync() and loadSuperBlockAsync. + * + * @param completion The super block VIO + **/ +static void finishSuperBlockParent(VDOCompletion *completion) +{ + SuperBlock *superBlock = completion->parent; + VDOCompletion *parent = superBlock->parent; + superBlock->parent = NULL; + finishCompletion(parent, completion->result); +} + +/** + * Log a super block save error. This error handler is registered in + * saveSuperBlockAsync(). + * + * @param completion The super block VIO + **/ +static void handleSaveError(VDOCompletion *completion) +{ + logErrorWithStringError(completion->result, "super block save failed"); + /* + * Mark the super block as unwritable so that we won't attempt to write it + * again. This avoids the case where a growth attempt fails writing the + * super block with the new size, but the subsequent attempt to write out + * the read-only state succeeds. In this case, writes which happened just + * before the suspend would not be visible if the VDO is restarted without + * rebuilding, but, after a read-only rebuild, the effects of those writes + * would reappear. + */ + ((SuperBlock *) completion->parent)->unwriteable = true; + completion->callback(completion); +} + +/**********************************************************************/ +void saveSuperBlockAsync(SuperBlock *superBlock, + PhysicalBlockNumber superBlockOffset, + VDOCompletion *parent) +{ + if (superBlock->unwriteable) { + finishCompletion(parent, VDO_READ_ONLY); + return; + } + + if (superBlock->parent != NULL) { + finishCompletion(parent, VDO_COMPONENT_BUSY); + return; + } + + PhysicalLayer *layer = parent->layer; + int result = encodeSuperBlock(layer, superBlock); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + superBlock->parent = parent; + superBlock->vio->completion.callbackThreadID = parent->callbackThreadID; + launchWriteMetadataVIOWithFlush(superBlock->vio, superBlockOffset, + finishSuperBlockParent, handleSaveError, + true, true); +} + +/** + * Decode a super block from its on-disk representation. + * + * @param layer The physical layer which implements the checksum + * @param superBlock The super block to decode + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int decodeSuperBlock(PhysicalLayer *layer, SuperBlock *superBlock) +{ + // Reset the block buffer to start decoding the entire first sector. + Buffer *buffer = superBlock->blockBuffer; + clearBuffer(buffer); + + // Decode and validate the header. 
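+  // Decoding mirrors encodeSuperBlock(): the header, then the 32-bit
+  // release version, then the component data, and finally the saved
+  // checksum, which is verified against a CRC of every preceding byte
+  // in the sector.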
+ Header header; + int result = decodeHeader(buffer, &header); + if (result != VDO_SUCCESS) { + return result; + } + + result = validateHeader(&SUPER_BLOCK_HEADER_12_0, &header, false, __func__); + if (result != VDO_SUCCESS) { + return result; + } + + if (header.size > contentLength(buffer)) { + // We can't check release version or checksum until we know the content + // size, so we have to assume a version mismatch on unexpected values. + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "super block contents too large: %zu", + header.size); + } + + // Restrict the buffer to the actual payload bytes that remain. + result = resetBufferEnd(buffer, uncompactedAmount(buffer) + header.size); + if (result != VDO_SUCCESS) { + return result; + } + + // Decode and store the release version number. It will be checked when the + // VDO master version is decoded and validated. + result = getUInt32LEFromBuffer(buffer, &superBlock->loadedReleaseVersion); + if (result != VDO_SUCCESS) { + return result; + } + + // The component data is all the rest, except for the checksum. + size_t componentDataSize = contentLength(buffer) - sizeof(CRC32Checksum); + result = putBuffer(superBlock->componentBuffer, buffer, componentDataSize); + if (result != VDO_SUCCESS) { + return result; + } + + // Checksum everything up to but not including the saved checksum itself. + CRC32Checksum checksum = layer->updateCRC32(INITIAL_CHECKSUM, + superBlock->encodedSuperBlock, + uncompactedAmount(buffer)); + + // Decode and verify the saved checksum. + CRC32Checksum savedChecksum; + result = getUInt32LEFromBuffer(buffer, &savedChecksum); + if (result != VDO_SUCCESS) { + return result; + } + + result = ASSERT(contentLength(buffer) == 0, + "must have decoded entire superblock payload"); + if (result != VDO_SUCCESS) { + return result; + } + + return ((checksum != savedChecksum) ? VDO_CHECKSUM_MISMATCH : VDO_SUCCESS); +} + +/**********************************************************************/ +int loadSuperBlock(PhysicalLayer *layer, + PhysicalBlockNumber superBlockOffset, + SuperBlock **superBlockPtr) +{ + SuperBlock *superBlock = NULL; + int result = allocateSuperBlock(layer, &superBlock); + if (result != VDO_SUCCESS) { + freeSuperBlock(&superBlock); + return result; + } + + result = layer->reader(layer, superBlockOffset, 1, + (char *) superBlock->encodedSuperBlock, NULL); + if (result != VDO_SUCCESS) { + freeSuperBlock(&superBlock); + return result; + } + + result = decodeSuperBlock(layer, superBlock); + if (result != VDO_SUCCESS) { + freeSuperBlock(&superBlock); + return result; + } + + *superBlockPtr = superBlock; + return result; +} + +/** + * Continue after loading the super block. This callback is registered + * in loadSuperBlockAsync(). 
+ * + * @param completion The super block VIO + **/ +static void finishReadingSuperBlock(VDOCompletion *completion) +{ + SuperBlock *superBlock = completion->parent; + VDOCompletion *parent = superBlock->parent; + superBlock->parent = NULL; + finishCompletion(parent, decodeSuperBlock(completion->layer, superBlock)); +} + +/**********************************************************************/ +void loadSuperBlockAsync(VDOCompletion *parent, + PhysicalBlockNumber superBlockOffset, + SuperBlock **superBlockPtr) +{ + PhysicalLayer *layer = parent->layer; + SuperBlock *superBlock = NULL; + int result = allocateSuperBlock(layer, &superBlock); + if (result != VDO_SUCCESS) { + freeSuperBlock(&superBlock); + finishCompletion(parent, result); + return; + } + + *superBlockPtr = superBlock; + + superBlock->parent = parent; + superBlock->vio->completion.callbackThreadID = parent->callbackThreadID; + launchReadMetadataVIO(superBlock->vio, superBlockOffset, + finishReadingSuperBlock, finishSuperBlockParent); +} + +/**********************************************************************/ +Buffer *getComponentBuffer(SuperBlock *superBlock) +{ + return superBlock->componentBuffer; +} + +/**********************************************************************/ +ReleaseVersionNumber getLoadedReleaseVersion(const SuperBlock *superBlock) +{ + return superBlock->loadedReleaseVersion; +} + +/**********************************************************************/ +size_t getFixedSuperBlockSize(void) +{ + return SUPER_BLOCK_FIXED_SIZE; +} diff --git a/vdo/base/superBlock.h b/vdo/base/superBlock.h new file mode 100644 index 0000000..bfed7c6 --- /dev/null +++ b/vdo/base/superBlock.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/superBlock.h#2 $ + */ + +#ifndef SUPER_BLOCK_H +#define SUPER_BLOCK_H + +#include "buffer.h" + +#include "completion.h" +#include "types.h" + +typedef struct superBlock SuperBlock; + +/** + * Make a new super block. + * + * @param [in] layer The layer on which to write this super block + * @param [out] superBlockPtr A pointer to hold the new super block + * + * @return VDO_SUCCESS or an error + **/ +int makeSuperBlock(PhysicalLayer *layer, SuperBlock **superBlockPtr) + __attribute__((warn_unused_result)); + +/** + * Free a super block and null out the reference to it. + * + * @param superBlockPtr the reference to the super block to free + **/ +void freeSuperBlock(SuperBlock **superBlockPtr); + +/** + * Save a super block. 
+ * + * @param layer The physical layer on which to save the super block + * @param superBlock The super block to save + * @param superBlockOffset The location of the super block + * + * @return VDO_SUCCESS or an error + **/ +int saveSuperBlock(PhysicalLayer *layer, + SuperBlock *superBlock, + PhysicalBlockNumber superBlockOffset) + __attribute__((warn_unused_result)); + +/** + * Save a super block asynchronously. + * + * @param superBlock The super block to save + * @param superBlockOffset The location at which to write the super block + * @param parent The object to notify when the save is complete + **/ +void saveSuperBlockAsync(SuperBlock *superBlock, + PhysicalBlockNumber superBlockOffset, + VDOCompletion *parent); + +/** + * Allocate a super block and read its contents from storage. + * + * @param [in] layer The layer from which to load the super block + * @param [in] superBlockOffset The location from which to read the super + * block + * @param [out] superBlockPtr A pointer to hold the loaded super block + * + * @return VDO_SUCCESS or an error + **/ +int loadSuperBlock(PhysicalLayer *layer, + PhysicalBlockNumber superBlockOffset, + SuperBlock **superBlockPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate a super block and read its contents from storage asynchronously. If + * a load error occurs before the super block's own completion can be allocated, + * the parent will be finished with the error. + * + * @param [in] parent The completion to finish after loading the + * super block + * @param [in] superBlockOffset The location from which to read the super + * block + * @param [out] superBlockPtr A pointer to hold the super block + **/ +void loadSuperBlockAsync(VDOCompletion *parent, + PhysicalBlockNumber superBlockOffset, + SuperBlock **superBlockPtr); + +/** + * Get a buffer which contains the component data from a super block. + * + * @param superBlock The super block from which to get the component data + * + * @return the component data in a buffer + **/ +Buffer *getComponentBuffer(SuperBlock *superBlock) + __attribute__((warn_unused_result)); + +/** + * Get the release version number that was loaded from the volume when the + * SuperBlock was decoded. + * + * @param superBlock The super block to query + * + * @return the release version number that was decoded from the volume + **/ +ReleaseVersionNumber getLoadedReleaseVersion(const SuperBlock *superBlock) + __attribute__((warn_unused_result)); + +/** + * Get the encoded size of the fixed (non-component data) portion of a super + * block (this is for unit testing). + * + * @return The encoded size of the fixed portion of the super block + **/ +size_t getFixedSuperBlockSize(void) + __attribute__((warn_unused_result)); + +#endif /* SUPER_BLOCK_H */ diff --git a/vdo/base/threadConfig.c b/vdo/base/threadConfig.c new file mode 100644 index 0000000..b671b73 --- /dev/null +++ b/vdo/base/threadConfig.c @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/threadConfig.c#2 $ + */ + +#include "threadConfig.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "constants.h" +#include "types.h" + +/**********************************************************************/ +static int allocateThreadConfig(ZoneCount logicalZoneCount, + ZoneCount physicalZoneCount, + ZoneCount hashZoneCount, + ZoneCount baseThreadCount, + ThreadConfig **configPtr) +{ + ThreadConfig *config; + int result = ALLOCATE(1, ThreadConfig, "thread config", &config); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(logicalZoneCount, ThreadID, "logical thread array", + &config->logicalThreads); + if (result != VDO_SUCCESS) { + freeThreadConfig(&config); + return result; + } + + result = ALLOCATE(physicalZoneCount, ThreadID, "physical thread array", + &config->physicalThreads); + if (result != VDO_SUCCESS) { + freeThreadConfig(&config); + return result; + } + + result = ALLOCATE(hashZoneCount, ThreadID, "hash thread array", + &config->hashZoneThreads); + if (result != VDO_SUCCESS) { + freeThreadConfig(&config); + return result; + } + + config->logicalZoneCount = logicalZoneCount; + config->physicalZoneCount = physicalZoneCount; + config->hashZoneCount = hashZoneCount; + config->baseThreadCount = baseThreadCount; + + *configPtr = config; + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void assignThreadIDs(ThreadID threadIDs[], + ZoneCount count, + ThreadID *idPtr) +{ + for (ZoneCount zone = 0; zone < count; zone++) { + threadIDs[zone] = (*idPtr)++; + } +} + +/**********************************************************************/ +int makeThreadConfig(ZoneCount logicalZoneCount, + ZoneCount physicalZoneCount, + ZoneCount hashZoneCount, + ThreadConfig **configPtr) +{ + if ((logicalZoneCount == 0) + && (physicalZoneCount == 0) + && (hashZoneCount == 0)) { + return makeOneThreadConfig(configPtr); + } + + if (physicalZoneCount > MAX_PHYSICAL_ZONES) { + return logErrorWithStringError(VDO_BAD_CONFIGURATION, + "Physical zone count %u exceeds maximum " + "(%u)", + physicalZoneCount, MAX_PHYSICAL_ZONES); + } + + if (logicalZoneCount > MAX_LOGICAL_ZONES) { + return logErrorWithStringError(VDO_BAD_CONFIGURATION, + "Logical zone count %u exceeds maximum " + "(%u)", + logicalZoneCount, MAX_LOGICAL_ZONES); + } + + ThreadConfig *config; + ThreadCount total = logicalZoneCount + physicalZoneCount + hashZoneCount + 2; + int result = allocateThreadConfig(logicalZoneCount, physicalZoneCount, + hashZoneCount, total, &config); + if (result != VDO_SUCCESS) { + return result; + } + + ThreadID id = 0; + config->adminThread = id; + config->journalThread = id++; + config->packerThread = id++; + assignThreadIDs(config->logicalThreads, logicalZoneCount, &id); + assignThreadIDs(config->physicalThreads, physicalZoneCount, &id); + assignThreadIDs(config->hashZoneThreads, hashZoneCount, &id); + + ASSERT_LOG_ONLY(id == total, "correct number of thread IDs assigned"); + + *configPtr = config; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeZeroThreadConfig(ThreadConfig **configPtr) +{ + ThreadConfig *config; + int result = ALLOCATE(1, ThreadConfig, __func__, &config); + if (result 
!= VDO_SUCCESS) { + return result; + } + + config->logicalZoneCount = 0; + config->physicalZoneCount = 0; + config->hashZoneCount = 0; + config->baseThreadCount = 0; + *configPtr = config; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeOneThreadConfig(ThreadConfig **configPtr) +{ + ThreadConfig *config; + int result = allocateThreadConfig(1, 1, 1, 1, &config); + if (result != VDO_SUCCESS) { + return result; + } + + config->logicalThreads[0] = 0; + config->physicalThreads[0] = 0; + config->hashZoneThreads[0] = 0; + *configPtr = config; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int copyThreadConfig(const ThreadConfig *oldConfig, ThreadConfig **configPtr) +{ + ThreadConfig *config; + int result = allocateThreadConfig(oldConfig->logicalZoneCount, + oldConfig->physicalZoneCount, + oldConfig->hashZoneCount, + oldConfig->baseThreadCount, + &config); + if (result != VDO_SUCCESS) { + return result; + } + + config->adminThread = oldConfig->adminThread; + config->journalThread = oldConfig->journalThread; + config->packerThread = oldConfig->packerThread; + for (ZoneCount i = 0; i < config->logicalZoneCount; i++) { + config->logicalThreads[i] = oldConfig->logicalThreads[i]; + } + for (ZoneCount i = 0; i < config->physicalZoneCount; i++) { + config->physicalThreads[i] = oldConfig->physicalThreads[i]; + } + for (ZoneCount i = 0; i < config->hashZoneCount; i++) { + config->hashZoneThreads[i] = oldConfig->hashZoneThreads[i]; + } + + *configPtr = config; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeThreadConfig(ThreadConfig **configPtr) +{ + if (*configPtr == NULL) { + return; + } + + ThreadConfig *config = *configPtr; + *configPtr = NULL; + + FREE(config->logicalThreads); + FREE(config->physicalThreads); + FREE(config->hashZoneThreads); + FREE(config); +} + +/**********************************************************************/ +static bool getZoneThreadName(const ThreadID threadIDs[], + ZoneCount count, + ThreadID id, + const char *prefix, + char *buffer, + size_t bufferLength) +{ + if (id >= threadIDs[0]) { + ThreadID index = id - threadIDs[0]; + if (index < count) { + snprintf(buffer, bufferLength, "%s%d", prefix, index); + return true; + } + } + return false; +} + +/**********************************************************************/ +void getVDOThreadName(const ThreadConfig *threadConfig, + ThreadID threadID, + char *buffer, + size_t bufferLength) +{ + if (threadConfig->baseThreadCount == 1) { + // Historically this was the "request queue" thread. + snprintf(buffer, bufferLength, "reqQ"); + return; + } + if (threadID == threadConfig->journalThread) { + snprintf(buffer, bufferLength, "journalQ"); + return; + } else if (threadID == threadConfig->adminThread) { + // Theoretically this could be different from the journal thread. 
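+    // With makeThreadConfig() as written above, adminThread equals
+    // journalThread, so the journalQ case wins when they coincide; this
+    // branch only applies to a configuration with a distinct admin thread.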
+ snprintf(buffer, bufferLength, "adminQ"); + return; + } else if (threadID == threadConfig->packerThread) { + snprintf(buffer, bufferLength, "packerQ"); + return; + } + if (getZoneThreadName(threadConfig->logicalThreads, + threadConfig->logicalZoneCount, + threadID, "logQ", buffer, bufferLength)) { + return; + } + if (getZoneThreadName(threadConfig->physicalThreads, + threadConfig->physicalZoneCount, + threadID, "physQ", buffer, bufferLength)) { + return; + } + if (getZoneThreadName(threadConfig->hashZoneThreads, + threadConfig->hashZoneCount, + threadID, "hashQ", buffer, bufferLength)) { + return; + } + + // Some sort of misconfiguration? + snprintf(buffer, bufferLength, "reqQ%d", threadID); +} diff --git a/vdo/base/threadConfig.h b/vdo/base/threadConfig.h new file mode 100644 index 0000000..6401651 --- /dev/null +++ b/vdo/base/threadConfig.h @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/threadConfig.h#1 $ + */ + +#ifndef THREAD_CONFIG_H +#define THREAD_CONFIG_H + +#include "permassert.h" + +#include "types.h" + +struct threadConfig { + ZoneCount logicalZoneCount; + ZoneCount physicalZoneCount; + ZoneCount hashZoneCount; + ThreadCount baseThreadCount; + ThreadID adminThread; + ThreadID journalThread; + ThreadID packerThread; + ThreadID *logicalThreads; + ThreadID *physicalThreads; + ThreadID *hashZoneThreads; +}; + +/** + * Make a thread configuration. If both the logical zone count and the + * physical zone count are set to 0, a one thread configuration will be + * made. + * + * @param [in] logicalZoneCount The number of logical zones + * @param [in] physicalZoneCount The number of physical zones + * @param [in] hashZoneCount The number of hash zones + * @param [out] configPtr A pointer to hold the new thread + * configuration + * + * @return VDO_SUCCESS or an error + **/ +int makeThreadConfig(ZoneCount logicalZoneCount, + ZoneCount physicalZoneCount, + ZoneCount hashZoneCount, + ThreadConfig **configPtr) + __attribute__((warn_unused_result)); + +/** + * Make a thread configuration that uses no threads. This is the configuration + * for VDOs which are constructed from user mode that have only a synchronous + * layer. + * + * @param [out] configPtr A pointer to hold the new thread configuration + * + * @return VDO_SUCCESS or an error + **/ +int makeZeroThreadConfig(ThreadConfig **configPtr); + +/** + * Make a thread configuration that uses only one thread. + * + * @param [out] configPtr A pointer to hold the new thread configuration + * + * @return VDO_SUCCESS or an error + **/ +int makeOneThreadConfig(ThreadConfig **configPtr) + __attribute__((warn_unused_result)); + +/** + * Make a new thread config which is a copy of an existing one. 
+ * + * @param [in] oldConfig The thread configuration to copy + * @param [out] configPtr A pointer to hold the new thread configuration + * + * @return VDO_SUCCESS or an error + **/ +int copyThreadConfig(const ThreadConfig *oldConfig, ThreadConfig **configPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a thread configuration and null out the reference to it. + * + * @param configPtr The reference to the thread configuration to destroy + **/ +void freeThreadConfig(ThreadConfig **configPtr); + +/** + * Get the thread id for a given logical zone. + * + * @param threadConfig the thread config + * @param logicalZone the number of the logical zone + * + * @return the thread id for the given zone + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getLogicalZoneThread(const ThreadConfig *threadConfig, + ZoneCount logicalZone) +{ + ASSERT_LOG_ONLY((logicalZone <= threadConfig->logicalZoneCount), + "logical zone valid"); + return threadConfig->logicalThreads[logicalZone]; +} + +/** + * Get the thread id for a given physical zone. + * + * @param threadConfig the thread config + * @param physicalZone the number of the physical zone + * + * @return the thread id for the given zone + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getPhysicalZoneThread(const ThreadConfig *threadConfig, + ZoneCount physicalZone) +{ + ASSERT_LOG_ONLY((physicalZone <= threadConfig->physicalZoneCount), + "physical zone valid"); + return threadConfig->physicalThreads[physicalZone]; +} + +/** + * Get the thread id for a given hash zone. + * + * @param threadConfig the thread config + * @param hashZone the number of the hash zone + * + * @return the thread id for the given zone + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getHashZoneThread(const ThreadConfig *threadConfig, + ZoneCount hashZone) +{ + ASSERT_LOG_ONLY((hashZone <= threadConfig->hashZoneCount), + "hash zone valid"); + return threadConfig->hashZoneThreads[hashZone]; +} + +/** + * Get the thread id for the journal zone. + * + * @param threadConfig the thread config + * + * @return the thread id for the journal zone + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getJournalZoneThread(const ThreadConfig *threadConfig) +{ + return threadConfig->journalThread; +} + +/** + * Get the thread id for the packer zone. + * + * @param threadConfig the thread config + * + * @return the thread id for the packer zone + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getPackerZoneThread(const ThreadConfig *threadConfig) +{ + return threadConfig->packerThread; +} + +/** + * Get the thread ID for admin requests. + * + * @param threadConfig The thread config + * + * @return the thread id to use for admin requests + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getAdminThread(const ThreadConfig *threadConfig) +{ + return threadConfig->adminThread; +} + +/** + * Format the name of the worker thread desired to support a given + * work queue. The physical layer may add a prefix identifying the + * product; the output from this function should just identify the + * thread. 
+ * + * @param threadConfig The thread configuration + * @param threadID The thread id + * @param buffer Where to put the formatted name + * @param bufferLength Size of the output buffer + **/ +void getVDOThreadName(const ThreadConfig *threadConfig, + ThreadID threadID, + char *buffer, + size_t bufferLength); + +#endif /* THREAD_CONFIG_H */ diff --git a/vdo/base/trace.c b/vdo/base/trace.c new file mode 100644 index 0000000..7b4e33f --- /dev/null +++ b/vdo/base/trace.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/trace.c#1 $ + */ + +#include "trace.h" + +#include "logger.h" +#include "stringUtils.h" +#include "timeUtils.h" + +TRACE_LOCATION_SECTION TraceLocationRecord baseTraceLocation[] = { + { + .function = "", + .line = 0, + }, +}; + +/**********************************************************************/ +void addTraceRecord(Trace *trace, TraceLocation location) +{ + if (trace->used < NUM_TRACE_RECORDS) { + TraceRecord *record = &trace->records[trace->used]; + trace->used++; + + record->when = nowUsec(); + record->tid = getThreadId(); + record->location = location - baseTraceLocation; + } +} + +/* + * The record display format used is a comma-separated list, each item + * containing: optional function name; "@" + timestamp with seconds + * and microseconds for the first record; if not the first record, "+" + * and offset in microseconds from previous timestamp. + * + * If the buffer's too small, it'll end with an ellipsis. + */ +void formatTrace(Trace *trace, + char *buffer, + size_t bufferLength, + size_t *msgLen) +{ + if (trace == NULL) { + return; + } + memset(buffer, 0, bufferLength); + char *buf = buffer; + char *bufferEnd = buffer + bufferLength - 1; + if (trace->used > 0) { + TraceRecord *record = &trace->records[0]; + TraceLocationRecord *location = baseTraceLocation + record->location; + snprintf(buf, bufferEnd - buf, "Trace[%s@%llu.%06llu", + location->function, record->when / 1000000, + record->when % 1000000); + buf += strlen(buf); + + for (unsigned int i = 1; i < trace->used; i++) { + TraceRecord *prev = record; + record++; + + snprintf(buf, bufferEnd - buf, ","); + buf += strlen(buf); + + location = baseTraceLocation + record->location; + unsigned long timeDiff = record->when - prev->when; + snprintf(buf, bufferEnd - buf, "%s+%lu", + location->function, timeDiff); + buf += strlen(buf); + } + if (bufferLength > 7) { + if (buffer[bufferLength-5] != '\0') { + // too long + strcpy(buffer+bufferLength-5, "...]"); + } else { + strcpy(buf, "]"); + } + } + } + *msgLen = (buf - buffer); +} diff --git a/vdo/base/trace.h b/vdo/base/trace.h new file mode 100644 index 0000000..59dabf9 --- /dev/null +++ b/vdo/base/trace.h @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/trace.h#1 $ + */ + +#ifndef TRACE_H +#define TRACE_H + +#ifndef __KERNEL__ +#include "cpu.h" +#endif + +#include "threads.h" + +/* + * We need these records to be glued together with no intervening + * bytes. That makes it rather sensitive to how the compiler, + * assembler, and linker may add padding. Force extra alignment to + * make it more reliable. + * + * Trace point descriptor language: + * + * The descriptor string provided at a trace point can have one or + * more components, separated by ";". The first (or only) component is + * a string to be formatted and shown in the flowchart graph. The + * remaining components must be of the form "var=string", and assign + * string values to "variables" that last through the processing of + * the remainder of the current trace being read. + * + * The string displayed has variable substitutions done for any + * occurrences of "$var" in the string. + * + * So, the descriptor sequence: + * kvdoWriteVIO;io=writeData;j=normal + * submitBio($io) + * writeJournalBlock($j) + * would cause the graph generator to show the strings: + * kvdoWriteVIO + * submitBio(writeData) + * writeJournalBlock(normal) + * + * Substitutions are done in the variable assignment strings when + * they're processed, so "foo=x($bar)" sets "foo" using the current + * value of "bar"; it doesn't cause "bar" to be looked up when "$foo" + * is seen later. + * + * The variable named "F" is automatically updated with the name of + * the function associated with the descriptor, so you don't have to + * explicitly repeat the name of the function if you just want to + * augment it with more information. This may be desirable if a trace + * point is expected to be reached more than once at different stages + * of processing, or in a static function with a generic-sounding name + * that needs disambiguation for graphing. + * + * If no descriptor string is provided, the + * function:lineNumber:threadName string reported via systemtap will + * be used in the graph. + * + * Current variable names used: + * cb=(various) random info to log when enqueueing VIO callback + * dup=post,update deduplication operation + * io=(various) kind of I/O and data it's being done on + * j=normal,dedupe kind of journal update being done + * js=mapWrite,writeZero,unmap which step of journaling we're doing + */ +typedef const struct __attribute__((aligned(16))) traceLocationRecord { + const char *function; + int line; + const char *description; +} TraceLocationRecord; + +/* + * With well under 100 locations defined at the moment, even with no + * idea where &baseTraceLocation will fall relative to the others, we + * only need to support a range of -100..+100. 
+ */ +typedef int32_t TraceLocationNumber; + +/* The type to pass around */ +typedef TraceLocationRecord *TraceLocation; + +/* + * N.B.: This code uses GCC extensions to create static, initialized + * objects inline, describing the current function and line number. + * The objects are collected into a table we can index with small + * signed integers relative to &baseTraceLocation. + * + * We need baseTraceLocation because there's no standard way to get + * the address of the start of this array we're defining. And because + * we're not playing any (additional) special linker tricks to ensure + * ordering of the object files, the offsets may be signed, and we + * don't know the range beyond the fact that we don't have hundreds of + * these records lying around. + * + * By specifying a name that starts with neither .data nor .rodata, we + * leave it to the toolchain to pick a location for us, based on + * things like whether the section needs write access, which it does + * for a PIC library but not for a kernel module. + */ + +#define TRACE_LOCATION_SECTION \ + __attribute__((section(".kvdo_trace_locations"))) + +extern TRACE_LOCATION_SECTION TraceLocationRecord baseTraceLocation[]; + +#define TRACE_JOIN2(a,b) a##b +#define TRACE_JOIN(a,b) TRACE_JOIN2(a,b) +#define THIS_LOCATION(DESCRIPTION) \ + __extension__ \ + ({ \ + static TRACE_LOCATION_SECTION \ + TraceLocationRecord TRACE_JOIN(loc,__LINE__) = { \ + .function = __func__, \ + .line = __LINE__, \ + .description = DESCRIPTION, \ + }; \ + &TRACE_JOIN(loc,__LINE__); \ + }) + +typedef struct traceRecord { + uint64_t when; // counted in usec + pid_t tid; + TraceLocationNumber location; +} TraceRecord; + +enum { NUM_TRACE_RECORDS = 71 }; + +typedef struct trace { + unsigned int used; + TraceRecord records[NUM_TRACE_RECORDS]; +} Trace; + +/** + * Store a new record in the trace data. + * + * @param trace The trace data to be updated + * @param location The source-location descriptor to be recorded + **/ +void addTraceRecord(Trace *trace, TraceLocation location); + +/** + * Format trace data into a string for logging. + * + * @param [in] trace The trace data to be logged + * @param [in] buffer The buffer in which to store the string + * @param [in] bufferLength Length of the buffer + * @param [out] msgLen Length of the formatted string + **/ +void formatTrace(Trace *trace, + char *buffer, + size_t bufferLength, + size_t *msgLen); + +#endif /* TRACE_H */ diff --git a/vdo/base/types.h b/vdo/base/types.h new file mode 100644 index 0000000..d820da6 --- /dev/null +++ b/vdo/base/types.h @@ -0,0 +1,445 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/types.h#14 $ + */ + +#ifndef TYPES_H +#define TYPES_H + +#include "blockMappingState.h" +#include "common.h" +#include "statusCodes.h" + +/** + * A size type in blocks. + **/ +typedef uint64_t BlockCount; + +/** + * The size of a block. + **/ +typedef uint16_t BlockSize; + +/** + * A count of compressed fragments + **/ +typedef uint8_t CompressedFragmentCount; + +/** + * A CRC-32 checksum + **/ +typedef uint32_t CRC32Checksum; + +/** + * A height within a tree. + **/ +typedef uint8_t Height; + +/** + * The logical block number as used by the consumer. + **/ +typedef uint64_t LogicalBlockNumber; + +/** + * The type of the nonce used to identify instances of VDO. + **/ +typedef uint64_t Nonce; + +/** + * A size in pages. + **/ +typedef uint32_t PageCount; + +/** + * A page number. + **/ +typedef uint32_t PageNumber; + +/** + * The size of a page. Must be evenly divisible by block size. + **/ +typedef uint32_t PageSize; + +/** + * The physical (well, less logical) block number at which the block is found + * on the underlying device. + **/ +typedef uint64_t PhysicalBlockNumber; + +/** + * A release version number. These numbers are used to make the numbering + * space for component versions independent across release branches. + * + * Really an enum, but we have to specify the size for encoding; see + * releaseVersions.h for the enumeration values. + **/ +typedef uint32_t ReleaseVersionNumber; + +/** + * A count of tree roots. + **/ +typedef uint8_t RootCount; + +/** + * A number of sectors. + **/ +typedef uint8_t SectorCount; + +/** + * A sequence number. + **/ +typedef uint64_t SequenceNumber; + +/** + * A size type in slabs. + **/ +typedef uint16_t SlabCount; + +/** + * A slot in a bin or block map page. + **/ +typedef uint16_t SlotNumber; + +/** + * A number of VIOs. + **/ +typedef uint16_t VIOCount; + +/** + * A VDO thread configuration. + **/ +typedef struct threadConfig ThreadConfig; + +/** + * A thread counter + **/ +typedef uint8_t ThreadCount; + +/** + * A thread ID + * + * Base-code threads are numbered sequentially starting from 0. + **/ +typedef uint8_t ThreadID; + +/** + * The thread ID returned when the current base code thread ID cannot be found + * or is otherwise undefined. + **/ +static const ThreadID INVALID_THREAD_ID = (ThreadID) -1; + +/** + * A zone counter + **/ +typedef uint8_t ZoneCount; + +/** + * The type of request a VIO is performing + **/ +typedef enum __attribute__((packed)) vioOperation { + VIO_UNSPECIFIED_OPERATION = 0, + VIO_READ = 1, + VIO_WRITE = 2, + VIO_READ_MODIFY_WRITE = VIO_READ | VIO_WRITE, + VIO_READ_WRITE_MASK = VIO_READ_MODIFY_WRITE, + VIO_FLUSH_BEFORE = 4, + VIO_FLUSH_AFTER = 8, +} VIOOperation; + +/** + * VIO types for statistics and instrumentation. + **/ +typedef enum __attribute__((packed)) { + VIO_TYPE_UNINITIALIZED = 0, + VIO_TYPE_DATA, + VIO_TYPE_BLOCK_ALLOCATOR, + VIO_TYPE_BLOCK_MAP, + VIO_TYPE_BLOCK_MAP_INTERIOR, + VIO_TYPE_COMPRESSED_BLOCK, + VIO_TYPE_PARTITION_COPY, + VIO_TYPE_RECOVERY_JOURNAL, + VIO_TYPE_SLAB_JOURNAL, + VIO_TYPE_SLAB_SUMMARY, + VIO_TYPE_SUPER_BLOCK, + VIO_TYPE_TEST, +} VIOType; + +/** + * The current operation on a physical block (from the point of view of the + * recovery journal, slab journals, and reference counts. + **/ +typedef enum __attribute__((packed)) { + DATA_DECREMENT = 0, + DATA_INCREMENT = 1, + BLOCK_MAP_DECREMENT = 2, + BLOCK_MAP_INCREMENT = 3, +} JournalOperation; + +/** + * Partition IDs are encoded in the volume layout in the super block. 
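+ *
+ * Since these values end up on disk, changing an existing ID would break
+ * existing volumes; new partitions need new, previously unused values.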
+ **/ +typedef enum __attribute__((packed)) { + BLOCK_MAP_PARTITION = 0, + BLOCK_ALLOCATOR_PARTITION = 1, + RECOVERY_JOURNAL_PARTITION = 2, + SLAB_SUMMARY_PARTITION = 3, +} PartitionID; + +/** + * Check whether a VIOType is for servicing an external data request. + * + * @param vioType The VIOType to check + **/ +static inline bool isDataVIOType(VIOType vioType) +{ + return (vioType == VIO_TYPE_DATA); +} + +/** + * Check whether a VIOType is for compressed block writes + * + * @param vioType The VIOType to check + **/ +static inline bool isCompressedWriteVIOType(VIOType vioType) +{ + return (vioType == VIO_TYPE_COMPRESSED_BLOCK); +} + +/** + * Check whether a VIOType is for metadata + * + * @param vioType The VIOType to check + **/ +static inline bool isMetadataVIOType(VIOType vioType) +{ + return ((vioType != VIO_TYPE_UNINITIALIZED) + && !isDataVIOType(vioType) + && !isCompressedWriteVIOType(vioType)); +} + +/** + * Priority levels for asynchronous I/O operations performed on a VIO. + **/ +typedef enum __attribute__((packed)) vioPriority { + VIO_PRIORITY_LOW = 0, + VIO_PRIORITY_DATA = VIO_PRIORITY_LOW, + VIO_PRIORITY_COMPRESSED_DATA = VIO_PRIORITY_DATA, + VIO_PRIORITY_METADATA, + VIO_PRIORITY_HIGH, +} VIOPriority; + +/** + * Metadata types for the VDO. + **/ +typedef enum __attribute__((packed)) { + VDO_METADATA_RECOVERY_JOURNAL = 1, + VDO_METADATA_SLAB_JOURNAL, +} VDOMetadataType; + +/** + * The possible write policy values. + **/ +typedef enum { + WRITE_POLICY_SYNC, ///< All writes are synchronous, i. e., they + ///< are acknowledged only when the data is + ///< written to stable storage. + WRITE_POLICY_ASYNC, ///< Writes are acknowledged when the data is + ///< cached for writing to stable storage, subject + ///< to resiliency guarantees specified elsewhere. + ///< After a crash, the data will be either old or + ///< new value for unflushed writes, never garbage. + WRITE_POLICY_ASYNC_UNSAFE, ///< Writes are acknowledged when the data is + ///< cached for writing to stable storage, subject + ///< to resiliency guarantees specified elsewhere. + WRITE_POLICY_AUTO, ///< The appropriate policy is chosen based on the + ///< underlying device +} WritePolicy; + +typedef enum { + ZONE_TYPE_ADMIN, + ZONE_TYPE_JOURNAL, + ZONE_TYPE_LOGICAL, + ZONE_TYPE_PHYSICAL, +} ZoneType; + +/** + * A position in the block map where a block map entry is stored. + **/ +typedef struct { + PhysicalBlockNumber pbn; + SlotNumber slot; +} BlockMapSlot; + +/** + * A position in the arboreal block map at a specific level. + **/ +typedef struct { + PageNumber pageIndex; + BlockMapSlot blockMapSlot; +} BlockMapTreeSlot; + +/** + * The configuration of a single slab derived from the configured block size + * and slab size. + **/ +typedef struct slabConfig { + BlockCount slabBlocks; ///< total number of blocks in the slab + BlockCount dataBlocks; ///< number of blocks available for data + BlockCount referenceCountBlocks; ///< number of blocks for refCounts + BlockCount slabJournalBlocks; ///< number of blocks for the slab journal + /** + * Number of blocks after which the slab journal starts pushing out a + * ReferenceBlock for each new entry it receives. + **/ + BlockCount slabJournalFlushingThreshold; + /** + * Number of blocks after which the slab journal pushes out all + * ReferenceBlocks and makes all VIOs wait. + **/ + BlockCount slabJournalBlockingThreshold; + /** + * Number of blocks after which the slab must be scrubbed before coming + * online. 
+ **/ + BlockCount slabJournalScrubbingThreshold; +} __attribute__((packed)) SlabConfig; + +/** + * The configuration of the VDO service. + **/ +typedef struct vdoConfig { + BlockCount logicalBlocks; ///< number of logical blocks + BlockCount physicalBlocks; ///< number of physical blocks + BlockCount slabSize; ///< number of blocks in a slab + BlockCount recoveryJournalSize; ///< number of recovery journal blocks + BlockCount slabJournalBlocks; ///< number of slab journal blocks +} __attribute__((packed)) VDOConfig; + +/** + * The configuration parameters of the VDO service specified at load time. + **/ +typedef struct vdoLoadConfig { + /** the offset on the physical layer where the VDO begins */ + PhysicalBlockNumber firstBlockOffset; + /** the expected release version number of the VDO */ + ReleaseVersionNumber releaseVersion; + /** the expected nonce of the VDO */ + Nonce nonce; + /** the thread configuration of the VDO */ + ThreadConfig *threadConfig; + /** the page cache size, in pages */ + PageCount cacheSize; + /** whether writes are synchronous */ + WritePolicy writePolicy; + /** the maximum age of a dirty block map page in recovery journal blocks */ + BlockCount maximumAge; +} VDOLoadConfig; + +/** + * Forward declarations of abstract types + **/ +typedef struct actionManager ActionManager; +typedef struct allocatingVIO AllocatingVIO; +typedef struct allocationSelector AllocationSelector; +typedef struct blockAllocator BlockAllocator; +typedef struct blockMap BlockMap; +typedef struct blockMapTreeZone BlockMapTreeZone; +typedef struct blockMapZone BlockMapZone; +typedef struct dataVIO DataVIO; +typedef struct flusher Flusher; +typedef struct forest Forest; +typedef struct hashLock HashLock; +typedef struct hashZone HashZone; +typedef struct indexConfig IndexConfig; +typedef struct inputBin InputBin; +typedef struct lbnLock LBNLock; +typedef struct lockCounter LockCounter; +typedef struct logicalZone LogicalZone; +typedef struct logicalZones LogicalZones; +typedef struct pbnLock PBNLock; +typedef struct physicalLayer PhysicalLayer; +typedef struct physicalZone PhysicalZone; +typedef struct recoveryJournal RecoveryJournal; +typedef struct readOnlyNotifier ReadOnlyNotifier; +typedef struct refCounts RefCounts; +typedef struct vdoSlab Slab; +typedef struct slabDepot SlabDepot; +typedef struct slabJournal SlabJournal; +typedef struct slabJournalEntry SlabJournalEntry; +typedef struct slabScrubber SlabScrubber; +typedef struct slabSummary SlabSummary; +typedef struct slabSummaryZone SlabSummaryZone; +typedef struct vdo VDO; +typedef struct vdoCompletion VDOCompletion; +typedef struct vdoExtent VDOExtent; +typedef struct vdoFlush VDOFlush; +typedef struct vdoLayout VDOLayout; +typedef struct vdoStatistics VDOStatistics; +typedef struct vio VIO; +typedef struct vioPool VIOPool; + +typedef struct { + PhysicalBlockNumber pbn; + BlockMappingState state; +} DataLocation; + +typedef struct { + PhysicalBlockNumber pbn; + BlockMappingState state; + PhysicalZone *zone; +} ZonedPBN; + +/** + * Callback which will be called by the VDO when all of the VIOs in the + * extent have been processed. + * + * @param extent The extent which is complete + **/ +typedef void VDOExtentCallback(VDOExtent *extent); + +/** + * An asynchronous operation. + * + * @param vio The VIO on which to operate + **/ +typedef void AsyncOperation(VIO *vio); + +/** + * An asynchronous compressed write operation. 
+ * + * @param allocatingVIO The AllocatingVIO to write + **/ +typedef void CompressedWriter(AllocatingVIO *allocatingVIO); + +/** + * An asynchronous data operation. + * + * @param dataVIO The DataVIO on which to operate + **/ +typedef void AsyncDataOperation(DataVIO *dataVIO); + +/** + * A reference to a completion which (the reference) can be enqueued + * for completion on a specified thread. + **/ +typedef struct enqueueable { + VDOCompletion *completion; +} Enqueueable; + +#endif // TYPES_H diff --git a/vdo/base/upgrade.c b/vdo/base/upgrade.c new file mode 100644 index 0000000..4d58d6f --- /dev/null +++ b/vdo/base/upgrade.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/upgrade.c#6 $ + */ + +#include "upgrade.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "blockMap.h" +#include "readOnlyNotifier.h" +#include "recoveryJournal.h" +#include "releaseVersions.h" +#include "slabDepot.h" +#include "statusCodes.h" +#include "superBlock.h" +#include "vdoInternal.h" +#include "volumeGeometry.h" + +/* The latest supported Sodium version */ +/* Commented out because not currently used. + * static const VersionNumber SODIUM_MASTER_VERSION_67_0 = { + * .majorVersion = 67, + * .minorVersion = 0, + * }; + */ + +/* The component data version for current Sodium */ +static const VersionNumber SODIUM_COMPONENT_DATA_41_0 = { + .majorVersion = 41, + .minorVersion = 0, +}; + +/** + * Current Sodium's configuration of the VDO component. + **/ +typedef struct { + VDOState state; + uint64_t completeRecoveries; + uint64_t readOnlyRecoveries; + VDOConfig config; + Nonce nonce; +} __attribute__((packed)) SodiumComponent41_0; + +/** + * Checks whether the release version loaded in the superblock is the + * current VDO version. + * + * @param vdo The VDO to validate + * + * @return true if the release version number is the current version + **/ +static bool isCurrentReleaseVersion(VDO *vdo) +{ + ReleaseVersionNumber loadedVersion + = getLoadedReleaseVersion(vdo->superBlock); + + return (loadedVersion == CURRENT_RELEASE_VERSION_NUMBER); +} + +/** + * Loads the VDO master version into the VDO and checks that the version + * can be understood by VDO. 
+ * + * @param vdo The VDO to validate + * + * @return VDO_SUCCESS or an error if the loaded version is not supported + **/ +static int validateSodiumVersion(VDO *vdo) +{ + int result = decodeVDOVersion(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + if (isCurrentReleaseVersion(vdo)) { + return VDO_SUCCESS; + } + + ReleaseVersionNumber loadedVersion + = getLoadedReleaseVersion(vdo->superBlock); + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "Release version %d, load version %d.%d" + " cannot be upgraded", loadedVersion, + vdo->loadVersion.majorVersion, + vdo->loadVersion.minorVersion); +} + +/** + * Decode a SodiumComponent41_0. + * + * @param buffer The component data buffer + * @param component The component structure to decode into + * + * @return VDO_SUCCESS or an error code + **/ +static int decodeSodium41_0Component(Buffer *buffer, + SodiumComponent41_0 *component) +{ + return getBytesFromBuffer(buffer, sizeof(*component), component); +} + +/** + * Decode the component data for the VDO itself from the component data + * buffer in the super block. + * + * @param vdo The VDO to decode + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int decodeSodiumComponent(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + VersionNumber version; + int result = decodeVersionNumber(buffer, &version); + if (result != VDO_SUCCESS) { + return result; + } + + SodiumComponent41_0 component; + if (areSameVersion(SODIUM_COMPONENT_DATA_41_0, version)) { + result = decodeSodium41_0Component(buffer, &component); + } else { + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "VDO component data version mismatch," + " expected 41.0, got %d.%d", + version.majorVersion, + version.minorVersion); + } + if (result != VDO_SUCCESS) { + return result; + } + + // Copy the decoded component into the VDO structure. 
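+  // Note that decodeSodium41_0Component() above read the packed structure as
+  // raw bytes rather than field by field, so this path relies on the on-disk
+  // encoding matching the packed in-memory layout of SodiumComponent41_0.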
+ vdo->state = component.state; + vdo->loadState = component.state; + vdo->completeRecoveries = component.completeRecoveries; + vdo->readOnlyRecoveries = component.readOnlyRecoveries; + vdo->config = component.config; + vdo->nonce = component.nonce; + + logInfo("Converted VDO component data version %d.%d", + version.majorVersion, version.minorVersion); + return VDO_SUCCESS; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int finishSodiumDecode(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + const ThreadConfig *threadConfig = getThreadConfig(vdo); + int result = makeRecoveryJournal(vdo->nonce, vdo->layer, + getVDOPartition(vdo->layout, + RECOVERY_JOURNAL_PARTITION), + vdo->completeRecoveries, + vdo->config.recoveryJournalSize, + RECOVERY_JOURNAL_TAIL_BUFFER_SIZE, + vdo->readOnlyNotifier, threadConfig, + &vdo->recoveryJournal); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeSodiumRecoveryJournal(vdo->recoveryJournal, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeSodiumSlabDepot(buffer, threadConfig, vdo->nonce, vdo->layer, + getVDOPartition(vdo->layout, + SLAB_SUMMARY_PARTITION), + vdo->readOnlyNotifier, vdo->recoveryJournal, + &vdo->depot); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeSodiumBlockMap(buffer, vdo->config.logicalBlocks, + threadConfig, &vdo->blockMap); + if (result != VDO_SUCCESS) { + return result; + } + + ASSERT_LOG_ONLY((contentLength(buffer) == 0), + "All decoded component data was used"); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int upgradePriorVDO(PhysicalLayer *layer) +{ + VolumeGeometry geometry; + int result = loadVolumeGeometry(layer, &geometry); + if (result != VDO_SUCCESS) { + return result; + } + + VDO *vdo; + result = makeVDO(layer, &vdo); + if (result != VDO_SUCCESS) { + return result; + } + + result = loadSuperBlock(vdo->layer, getDataRegionOffset(geometry), + &vdo->superBlock); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return logErrorWithStringError(result, "Could not load VDO super block"); + } + + // Load the necessary pieces to save again. + result = validateSodiumVersion(vdo); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + if (isCurrentReleaseVersion(vdo)) { + logInfo("VDO already up-to-date"); + freeVDO(&vdo); + return VDO_SUCCESS; + } + + result = decodeSodiumComponent(vdo); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + if (requiresRebuild(vdo)) { + // Do not attempt to upgrade a dirty prior version. + freeVDO(&vdo); + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "Cannot upgrade a dirty VDO."); + } + + result = decodeVDOLayout(getComponentBuffer(vdo->superBlock), &vdo->layout); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + result = makeReadOnlyNotifier(inReadOnlyMode(vdo), threadConfig, vdo->layer, + &vdo->readOnlyNotifier); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + result = finishSodiumDecode(vdo); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + // Saving will automatically change the release version to current. 
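+  // (saveVDOComponents() re-encodes everything via encodeVDO() in vdo.c,
+  // which stamps the current master version, and the super block save
+  // records the current release version.)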
+ result = saveVDOComponents(vdo); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + logInfo("Successfully saved upgraded VDO"); + freeVDO(&vdo); + + return result; +} diff --git a/vdo/base/upgrade.h b/vdo/base/upgrade.h new file mode 100644 index 0000000..be2bd05 --- /dev/null +++ b/vdo/base/upgrade.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/upgrade.h#1 $ + */ + +#ifndef UPGRADE_H +#define UPGRADE_H + +#include "types.h" + +/** + * Reconfigure the superblock of a prior VDO, preparing it for upgrading. + * + * @param layer The layer with a VDO to prepare + * + * @return VDO_SUCCESS or an error + **/ +int upgradePriorVDO(PhysicalLayer *layer) + __attribute__((warn_unused_result)); + +#endif /* UPGRADE_H */ diff --git a/vdo/base/vdo.c b/vdo/base/vdo.c new file mode 100644 index 0000000..b4b9a41 --- /dev/null +++ b/vdo/base/vdo.c @@ -0,0 +1,1154 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdo.c#21 $ + */ + +/* + * This file contains the main entry points for normal operations on a VDO as + * well as functions for constructing and destroying VDO instances (in memory). + */ + +#include "vdoInternal.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminCompletion.h" +#include "blockMap.h" +#include "extent.h" +#include "hashZone.h" +#include "header.h" +#include "logicalZone.h" +#include "numUtils.h" +#include "packer.h" +#include "physicalZone.h" +#include "readOnlyNotifier.h" +#include "recoveryJournal.h" +#include "releaseVersions.h" +#include "slabDepot.h" +#include "slabSummary.h" +#include "statistics.h" +#include "statusCodes.h" +#include "threadConfig.h" +#include "vdoLayout.h" +#include "vioWrite.h" +#include "volumeGeometry.h" + +/** + * The master version of the on-disk format of a VDO. This should be + * incremented any time the on-disk representation of any VDO structure + * changes. 
Changes which require only online upgrade steps should increment + * the minor version. Changes which require an offline upgrade or which can not + * be upgraded to at all should increment the major version and set the minor + * version to 0. + **/ +static const VersionNumber VDO_MASTER_VERSION_67_0 = { + .majorVersion = 67, + .minorVersion = 0, +}; + +/** + * The current version for the data encoded in the super block. This must + * be changed any time there is a change to encoding of the component data + * of any VDO component. + **/ +static const VersionNumber VDO_COMPONENT_DATA_41_0 = { + .majorVersion = 41, + .minorVersion = 0, +}; + +/** + * This is the structure that captures the VDO fields saved as a SuperBlock + * component. + **/ +typedef struct { + VDOState state; + uint64_t completeRecoveries; + uint64_t readOnlyRecoveries; + VDOConfig config; + Nonce nonce; +} __attribute__((packed)) VDOComponent41_0; + +/**********************************************************************/ +int allocateVDO(PhysicalLayer *layer, VDO **vdoPtr) +{ + int result = registerStatusCodes(); + if (result != VDO_SUCCESS) { + return result; + } + + VDO *vdo; + result = ALLOCATE(1, VDO, __func__, &vdo); + if (result != UDS_SUCCESS) { + return result; + } + + vdo->layer = layer; + if (layer->createEnqueueable != NULL) { + result = initializeAdminCompletion(vdo, &vdo->adminCompletion); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + } + + *vdoPtr = vdo; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeVDO(PhysicalLayer *layer, VDO **vdoPtr) +{ + VDO *vdo; + int result = allocateVDO(layer, &vdo); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeZeroThreadConfig(&vdo->loadConfig.threadConfig); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + *vdoPtr = vdo; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void destroyVDO(VDO *vdo) +{ + freeFlusher(&vdo->flusher); + freePacker(&vdo->packer); + freeRecoveryJournal(&vdo->recoveryJournal); + freeSlabDepot(&vdo->depot); + freeVDOLayout(&vdo->layout); + freeSuperBlock(&vdo->superBlock); + freeBlockMap(&vdo->blockMap); + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + if (vdo->hashZones != NULL) { + for (ZoneCount zone = 0; zone < threadConfig->hashZoneCount; zone++) { + freeHashZone(&vdo->hashZones[zone]); + } + } + FREE(vdo->hashZones); + vdo->hashZones = NULL; + + freeLogicalZones(&vdo->logicalZones); + + if (vdo->physicalZones != NULL) { + for (ZoneCount zone = 0; zone < threadConfig->physicalZoneCount; zone++) { + freePhysicalZone(&vdo->physicalZones[zone]); + } + } + FREE(vdo->physicalZones); + vdo->physicalZones = NULL; + + uninitializeAdminCompletion(&vdo->adminCompletion); + freeReadOnlyNotifier(&vdo->readOnlyNotifier); + freeThreadConfig(&vdo->loadConfig.threadConfig); +} + +/**********************************************************************/ +void freeVDO(VDO **vdoPtr) +{ + if (*vdoPtr == NULL) { + return; + } + + destroyVDO(*vdoPtr); + FREE(*vdoPtr); + *vdoPtr = NULL; +} + +/**********************************************************************/ +size_t getComponentDataSize(VDO *vdo) +{ + return (sizeof(VersionNumber) + + sizeof(VersionNumber) + + sizeof(VDOComponent41_0) + + getVDOLayoutEncodedSize(vdo->layout) + + getRecoveryJournalEncodedSize() + + getSlabDepotEncodedSize() + + getBlockMapEncodedSize()); +} + +/** + * Encode the VDO master version. 
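+ *
+ * This always writes the current VDO_MASTER_VERSION_67_0, regardless of the
+ * version that was loaded.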
+ * + * @param buffer The buffer in which to encode the version + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int encodeMasterVersion(Buffer *buffer) +{ + return encodeVersionNumber(VDO_MASTER_VERSION_67_0, buffer); +} + +/** + * Encode a VDOConfig structure into a buffer. + * + * @param config The config structure to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int encodeVDOConfig(const VDOConfig *config, Buffer *buffer) +{ + int result = putUInt64LEIntoBuffer(buffer, config->logicalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->physicalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->slabSize); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->recoveryJournalSize); + if (result != VDO_SUCCESS) { + return result; + } + + return putUInt64LEIntoBuffer(buffer, config->slabJournalBlocks); +} + +/** + * Encode the component data for the VDO itself. + * + * @param vdo The vdo to encode + * @param buffer The buffer in which to encode the VDO + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int encodeVDOComponent(const VDO *vdo, Buffer *buffer) +{ + int result = encodeVersionNumber(VDO_COMPONENT_DATA_41_0, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + size_t initialLength = contentLength(buffer); + + result = putUInt32LEIntoBuffer(buffer, vdo->state); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, vdo->completeRecoveries); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, vdo->readOnlyRecoveries); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeVDOConfig(&vdo->config, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, vdo->nonce); + if (result != VDO_SUCCESS) { + return result; + } + + size_t encodedSize = contentLength(buffer) - initialLength; + return ASSERT(encodedSize == sizeof(VDOComponent41_0), + "encoded VDO component size must match structure size"); +} + +/**********************************************************************/ +static int encodeVDO(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + int result = resetBufferEnd(buffer, 0); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeMasterVersion(buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeVDOComponent(vdo, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeVDOLayout(vdo->layout, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeRecoveryJournal(vdo->recoveryJournal, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeSlabDepot(vdo->depot, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeBlockMap(vdo->blockMap, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + ASSERT_LOG_ONLY((contentLength(buffer) == getComponentDataSize(vdo)), + "All super block component data was encoded"); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int saveVDOComponents(VDO *vdo) +{ + int result = encodeVDO(vdo); + if (result != 
VDO_SUCCESS) { + return result; + } + + return saveSuperBlock(vdo->layer, vdo->superBlock, getFirstBlockOffset(vdo)); +} + +/**********************************************************************/ +void saveVDOComponentsAsync(VDO *vdo, VDOCompletion *parent) +{ + int result = encodeVDO(vdo); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + saveSuperBlockAsync(vdo->superBlock, getFirstBlockOffset(vdo), parent); +} + +/**********************************************************************/ +int saveReconfiguredVDO(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + size_t componentsSize = contentLength(buffer); + + byte *components; + int result = copyBytes(buffer, componentsSize, &components); + if (result != VDO_SUCCESS) { + return result; + } + + result = resetBufferEnd(buffer, 0); + if (result != VDO_SUCCESS) { + FREE(components); + return result; + } + + result = encodeMasterVersion(buffer); + if (result != VDO_SUCCESS) { + FREE(components); + return result; + } + + result = encodeVDOComponent(vdo, buffer); + if (result != VDO_SUCCESS) { + FREE(components); + return result; + } + + result = putBytes(buffer, componentsSize, components); + FREE(components); + if (result != VDO_SUCCESS) { + return result; + } + + return saveSuperBlock(vdo->layer, vdo->superBlock, getFirstBlockOffset(vdo)); +} + +/**********************************************************************/ +int decodeVDOVersion(VDO *vdo) +{ + return decodeVersionNumber(getComponentBuffer(vdo->superBlock), + &vdo->loadVersion); +} + +/**********************************************************************/ +int validateVDOVersion(VDO *vdo) +{ + int result = decodeVDOVersion(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + ReleaseVersionNumber loadedReleaseVersion + = getLoadedReleaseVersion(vdo->superBlock); + if (vdo->loadConfig.releaseVersion != loadedReleaseVersion) { + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "Geometry release version %" PRIu32 " does " + "not match super block release version %" + PRIu32, + vdo->loadConfig.releaseVersion, + loadedReleaseVersion); + } + + return validateVersion(VDO_MASTER_VERSION_67_0, vdo->loadVersion, "master"); +} + +/** + * Decode a VDOConfig structure from a buffer. 
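+ *
+ * The decode order must match the field order written by encodeVDOConfig()
+ * above.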
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param config The config structure to receive the decoded values + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int decodeVDOConfig(Buffer *buffer, VDOConfig *config) +{ + BlockCount logicalBlocks; + int result = getUInt64LEFromBuffer(buffer, &logicalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount physicalBlocks; + result = getUInt64LEFromBuffer(buffer, &physicalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount slabSize; + result = getUInt64LEFromBuffer(buffer, &slabSize); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount recoveryJournalSize; + result = getUInt64LEFromBuffer(buffer, &recoveryJournalSize); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount slabJournalBlocks; + result = getUInt64LEFromBuffer(buffer, &slabJournalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + + *config = (VDOConfig) { + .logicalBlocks = logicalBlocks, + .physicalBlocks = physicalBlocks, + .slabSize = slabSize, + .recoveryJournalSize = recoveryJournalSize, + .slabJournalBlocks = slabJournalBlocks, + }; + return VDO_SUCCESS; +} + +/** + * Decode the version 41.0 component state for the VDO itself from a buffer. + * + * @param buffer A buffer positioned at the start of the encoding + * @param state The state structure to receive the decoded values + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) + static int decodeVDOComponent_41_0(Buffer *buffer, VDOComponent41_0 *state) +{ + size_t initialLength = contentLength(buffer); + + VDOState vdoState; + int result = getUInt32LEFromBuffer(buffer, &vdoState); + if (result != VDO_SUCCESS) { + return result; + } + + uint64_t completeRecoveries; + result = getUInt64LEFromBuffer(buffer, &completeRecoveries); + if (result != VDO_SUCCESS) { + return result; + } + + uint64_t readOnlyRecoveries; + result = getUInt64LEFromBuffer(buffer, &readOnlyRecoveries); + if (result != VDO_SUCCESS) { + return result; + } + + VDOConfig config; + result = decodeVDOConfig(buffer, &config); + if (result != VDO_SUCCESS) { + return result; + } + + Nonce nonce; + result = getUInt64LEFromBuffer(buffer, &nonce); + if (result != VDO_SUCCESS) { + return result; + } + + *state = (VDOComponent41_0) { + .state = vdoState, + .completeRecoveries = completeRecoveries, + .readOnlyRecoveries = readOnlyRecoveries, + .config = config, + .nonce = nonce, + }; + + size_t decodedSize = initialLength - contentLength(buffer); + return ASSERT(decodedSize == sizeof(VDOComponent41_0), + "decoded VDO component size must match structure size"); +} + +/**********************************************************************/ +int decodeVDOComponent(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + + VersionNumber version; + int result = decodeVersionNumber(buffer, &version); + if (result != VDO_SUCCESS) { + return result; + } + + result = validateVersion(version, VDO_COMPONENT_DATA_41_0, + "VDO component data"); + if (result != VDO_SUCCESS) { + return result; + } + + VDOComponent41_0 component; + result = decodeVDOComponent_41_0(buffer, &component); + if (result != VDO_SUCCESS) { + return result; + } + + // Copy the decoded component into the VDO structure. 
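+  // loadState keeps the state as it was read from disk so that later checks
+  // such as wasClean() and requiresReadOnlyRebuild() still see the loaded
+  // value even after vdo->state changes.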
+ vdo->state = component.state; + vdo->loadState = component.state; + vdo->completeRecoveries = component.completeRecoveries; + vdo->readOnlyRecoveries = component.readOnlyRecoveries; + vdo->config = component.config; + vdo->nonce = component.nonce; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int validateVDOConfig(const VDOConfig *config, + BlockCount blockCount, + bool requireLogical) +{ + int result = ASSERT(config->slabSize > 0, "slab size unspecified"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(isPowerOfTwo(config->slabSize), + "slab size must be a power of two"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->slabSize <= (1 << MAX_SLAB_BITS), + "slab size must be less than or equal to 2^%d", + MAX_SLAB_BITS); + if (result != VDO_SUCCESS) { + return result; + } + + result = ASSERT(config->slabJournalBlocks >= MINIMUM_SLAB_JOURNAL_BLOCKS, + "slab journal size meets minimum size"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->slabJournalBlocks <= config->slabSize, + "slab journal size is within expected bound"); + if (result != UDS_SUCCESS) { + return result; + } + + SlabConfig slabConfig; + result = configureSlab(config->slabSize, config->slabJournalBlocks, + &slabConfig); + if (result != VDO_SUCCESS) { + return result; + } + + result = ASSERT((slabConfig.dataBlocks >= 1), + "slab must be able to hold at least one block"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->physicalBlocks > 0, "physical blocks unspecified"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->physicalBlocks <= MAXIMUM_PHYSICAL_BLOCKS, + "physical block count %llu exceeds maximum %llu", + config->physicalBlocks, MAXIMUM_PHYSICAL_BLOCKS); + if (result != UDS_SUCCESS) { + return VDO_OUT_OF_RANGE; + } + + // This can't check equality because FileLayer et al can only known about + // the storage size, which may not match the super block size. + if (blockCount < config->physicalBlocks) { + logError("A physical size of %llu blocks was specified," + " but that is smaller than the %llu blocks" + " configured in the VDO super block", + blockCount, config->physicalBlocks); + return VDO_PARAMETER_MISMATCH; + } + + result = ASSERT(!requireLogical || (config->logicalBlocks > 0), + "logical blocks unspecified"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->logicalBlocks <= MAXIMUM_LOGICAL_BLOCKS, + "logical blocks too large"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->recoveryJournalSize > 0, + "recovery journal size unspecified"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(isPowerOfTwo(config->recoveryJournalSize), + "recovery journal size must be a power of two"); + if (result != UDS_SUCCESS) { + return result; + } + + return result; +} + +/** + * Notify a VDO that it is going read-only. This will save the read-only state + * to the super block. + * + *
Implements ReadOnlyNotification. + * + * @param listener The VDO + * @param parent The completion to notify in order to acknowledge the + * notification + **/ +static void notifyVDOOfReadOnlyMode(void *listener, VDOCompletion *parent) +{ + VDO *vdo = listener; + if (inReadOnlyMode(vdo)) { + completeCompletion(parent); + } + + vdo->state = VDO_READ_ONLY_MODE; + saveVDOComponentsAsync(vdo, parent); +} + +/**********************************************************************/ +int enableReadOnlyEntry(VDO *vdo) +{ + return registerReadOnlyListener(vdo->readOnlyNotifier, vdo, + notifyVDOOfReadOnlyMode, + getAdminThread(getThreadConfig(vdo))); +} + +/**********************************************************************/ +bool inReadOnlyMode(const VDO *vdo) +{ + return (vdo->state == VDO_READ_ONLY_MODE); +} + +/**********************************************************************/ +bool isClean(const VDO *vdo) +{ + return ((vdo->state == VDO_CLEAN) || (vdo->state == VDO_NEW)); +} + +/**********************************************************************/ +bool wasClean(const VDO *vdo) +{ + return ((vdo->loadState == VDO_CLEAN) || (vdo->loadState == VDO_NEW)); +} + +/**********************************************************************/ +bool wasNew(const VDO *vdo) +{ + return (vdo->loadState == VDO_NEW); +} + +/**********************************************************************/ +bool requiresReadOnlyRebuild(const VDO *vdo) +{ + return ((vdo->loadState == VDO_FORCE_REBUILD) + || (vdo->loadState == VDO_REBUILD_FOR_UPGRADE)); +} + +/**********************************************************************/ +bool requiresRebuild(const VDO *vdo) +{ + return ((vdo->state == VDO_DIRTY) + || (vdo->state == VDO_FORCE_REBUILD) + || (vdo->state == VDO_REPLAYING) + || (vdo->state == VDO_REBUILD_FOR_UPGRADE)); +} + +/**********************************************************************/ +bool requiresRecovery(const VDO *vdo) +{ + return ((vdo->loadState == VDO_DIRTY) || (vdo->loadState == VDO_REPLAYING) + || (vdo->loadState == VDO_RECOVERING)); +} + +/**********************************************************************/ +bool isReplaying(const VDO *vdo) +{ + return (vdo->state == VDO_REPLAYING); +} + +/**********************************************************************/ +bool inRecoveryMode(const VDO *vdo) +{ + return (vdo->state == VDO_RECOVERING); +} + +/**********************************************************************/ +void enterRecoveryMode(VDO *vdo) +{ + assertOnAdminThread(vdo, __func__); + + if (inReadOnlyMode(vdo)) { + return; + } + + logInfo("Entering recovery mode"); + vdo->state = VDO_RECOVERING; +} + +/**********************************************************************/ +void leaveRecoveryMode(VDO *vdo) +{ + assertOnAdminThread(vdo, __func__); + + /* + * Since scrubbing can be stopped by vdoClose during recovery mode, + * do not change the VDO state if there are outstanding unrecovered slabs. 
+ */ + if (inReadOnlyMode(vdo)) { + return; + } + + ASSERT_LOG_ONLY(inRecoveryMode(vdo), "VDO is in recovery mode"); + logInfo("Exiting recovery mode"); + vdo->state = VDO_DIRTY; +} + +/**********************************************************************/ +void makeVDOReadOnly(VDO *vdo, int errorCode) +{ + enterReadOnlyMode(vdo->readOnlyNotifier, errorCode); +} + +/**********************************************************************/ +bool setVDOCompressing(VDO *vdo, bool enableCompression) +{ + bool stateChanged = compareAndSwapBool(&vdo->compressing, !enableCompression, + enableCompression); + if (stateChanged && !enableCompression) { + // Flushing the packer is asynchronous, but we don't care when it + // finishes. + flushPacker(vdo->packer); + } + + logInfo("compression is %s", (enableCompression ? "enabled" : "disabled")); + return (stateChanged ? !enableCompression : enableCompression); +} + +/**********************************************************************/ +bool getVDOCompressing(VDO *vdo) +{ + return atomicLoadBool(&vdo->compressing); +} + +/**********************************************************************/ +static size_t getBlockMapCacheSize(const VDO *vdo) +{ + return ((size_t) vdo->loadConfig.cacheSize) * VDO_BLOCK_SIZE; +} + +/** + * Tally the hash lock statistics from all the hash zones. + * + * @param vdo The vdo to query + * + * @return The sum of the hash lock statistics from all hash zones + **/ +static HashLockStatistics getHashLockStatistics(const VDO *vdo) +{ + HashLockStatistics totals; + memset(&totals, 0, sizeof(totals)); + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + for (ZoneCount zone = 0; zone < threadConfig->hashZoneCount; zone++) { + HashLockStatistics stats = getHashZoneStatistics(vdo->hashZones[zone]); + totals.dedupeAdviceValid += stats.dedupeAdviceValid; + totals.dedupeAdviceStale += stats.dedupeAdviceStale; + totals.concurrentDataMatches += stats.concurrentDataMatches; + totals.concurrentHashCollisions += stats.concurrentHashCollisions; + } + + return totals; +} + +/** + * Get the current error statistics from VDO. + * + * @param vdo The vdo to query + * + * @return a copy of the current VDO error counters + **/ +static ErrorStatistics getVDOErrorStatistics(const VDO *vdo) +{ + /* + * The error counts can be incremented from arbitrary threads and so must be + * incremented atomically, but they are just statistics with no semantics + * that could rely on memory order, so unfenced reads are sufficient. + */ + const AtomicErrorStatistics *atoms = &vdo->errorStats; + return (ErrorStatistics) { + .invalidAdvicePBNCount = relaxedLoad64(&atoms->invalidAdvicePBNCount), + .noSpaceErrorCount = relaxedLoad64(&atoms->noSpaceErrorCount), + .readOnlyErrorCount = relaxedLoad64(&atoms->readOnlyErrorCount), + }; +} + +/**********************************************************************/ +static const char *describeWritePolicy(WritePolicy policy) +{ + switch (policy) { + case WRITE_POLICY_ASYNC: + return "async"; + case WRITE_POLICY_ASYNC_UNSAFE: + return "async-unsafe"; + case WRITE_POLICY_SYNC: + return "sync"; + default: + return "unknown"; + } +} + +/**********************************************************************/ +void getVDOStatistics(const VDO *vdo, VDOStatistics *stats) +{ + // These are immutable properties of the VDO object, so it is safe to + // query them from any thread. 
+ RecoveryJournal *journal = vdo->recoveryJournal; + SlabDepot *depot = vdo->depot; + // XXX config.physicalBlocks is actually mutated during resize and is in a + // packed structure, but resize runs on the admin thread so we're usually OK. + stats->version = STATISTICS_VERSION; + stats->releaseVersion = CURRENT_RELEASE_VERSION_NUMBER; + stats->logicalBlocks = vdo->config.logicalBlocks; + stats->physicalBlocks = vdo->config.physicalBlocks; + stats->blockSize = VDO_BLOCK_SIZE; + stats->completeRecoveries = vdo->completeRecoveries; + stats->readOnlyRecoveries = vdo->readOnlyRecoveries; + stats->blockMapCacheSize = getBlockMapCacheSize(vdo); + snprintf(stats->writePolicy, sizeof(stats->writePolicy), "%s", + describeWritePolicy(getWritePolicy(vdo))); + + // The callees are responsible for thread-safety. + stats->dataBlocksUsed = getPhysicalBlocksAllocated(vdo); + stats->overheadBlocksUsed = getPhysicalBlocksOverhead(vdo); + stats->logicalBlocksUsed = getJournalLogicalBlocksUsed(journal); + stats->allocator = getDepotBlockAllocatorStatistics(depot); + stats->journal = getRecoveryJournalStatistics(journal); + stats->packer = getPackerStatistics(vdo->packer); + stats->slabJournal = getDepotSlabJournalStatistics(depot); + stats->slabSummary = getSlabSummaryStatistics(getSlabSummary(depot)); + stats->refCounts = getDepotRefCountsStatistics(depot); + stats->blockMap = getBlockMapStatistics(vdo->blockMap); + stats->hashLock = getHashLockStatistics(vdo); + stats->errors = getVDOErrorStatistics(vdo); + SlabCount slabTotal = getDepotSlabCount(depot); + stats->recoveryPercentage + = (slabTotal - getDepotUnrecoveredSlabCount(depot)) * 100 / slabTotal; + + // The "state" field is mutable, but we just need a unfenced atomic read. + VDOState state = *((const volatile VDOState *) &vdo->state); + stats->inRecoveryMode = (state == VDO_RECOVERING); + snprintf(stats->mode, sizeof(stats->mode), "%s", describeVDOState(state)); +} + +/**********************************************************************/ +BlockCount getPhysicalBlocksAllocated(const VDO *vdo) +{ + return (getDepotAllocatedBlocks(vdo->depot) + - getJournalBlockMapDataBlocksUsed(vdo->recoveryJournal)); +} + +/**********************************************************************/ +BlockCount getPhysicalBlocksFree(const VDO *vdo) +{ + return getDepotFreeBlocks(vdo->depot); +} + +/**********************************************************************/ +BlockCount getPhysicalBlocksOverhead(const VDO *vdo) +{ + // XXX config.physicalBlocks is actually mutated during resize and is in a + // packed structure, but resize runs on admin thread so we're usually OK. 
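+  // Overhead is therefore everything not usable for user data: the metadata
+  // partitions (total physical space minus the depot's data blocks) plus the
+  // block map pages that were allocated out of the data region.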
+ return (vdo->config.physicalBlocks + - getDepotDataBlocks(vdo->depot) + + getJournalBlockMapDataBlocksUsed(vdo->recoveryJournal)); +} + +/**********************************************************************/ +BlockCount getTotalBlockMapBlocks(const VDO *vdo) +{ + return (getNumberOfFixedBlockMapPages(vdo->blockMap) + + getJournalBlockMapDataBlocksUsed(vdo->recoveryJournal)); +} + +/**********************************************************************/ +WritePolicy getWritePolicy(const VDO *vdo) +{ + return vdo->loadConfig.writePolicy; +} + +/**********************************************************************/ +void setWritePolicy(VDO *vdo, WritePolicy new) +{ + vdo->loadConfig.writePolicy = new; +} + +/**********************************************************************/ +const VDOLoadConfig *getVDOLoadConfig(const VDO *vdo) +{ + return &vdo->loadConfig; +} + +/**********************************************************************/ +const ThreadConfig *getThreadConfig(const VDO *vdo) +{ + return vdo->loadConfig.threadConfig; +} + +/**********************************************************************/ +BlockCount getConfiguredBlockMapMaximumAge(const VDO *vdo) +{ + return vdo->loadConfig.maximumAge; +} + +/**********************************************************************/ +PageCount getConfiguredCacheSize(const VDO *vdo) +{ + return vdo->loadConfig.cacheSize; +} + +/**********************************************************************/ +PhysicalBlockNumber getFirstBlockOffset(const VDO *vdo) +{ + return vdo->loadConfig.firstBlockOffset; +} + +/**********************************************************************/ +BlockMap *getBlockMap(const VDO *vdo) +{ + return vdo->blockMap; +} + +/**********************************************************************/ +SlabDepot *getSlabDepot(VDO *vdo) +{ + return vdo->depot; +} + +/**********************************************************************/ +RecoveryJournal *getRecoveryJournal(VDO *vdo) +{ + return vdo->recoveryJournal; +} + +/**********************************************************************/ +void dumpVDOStatus(const VDO *vdo) +{ + dumpFlusher(vdo->flusher); + dumpRecoveryJournalStatistics(vdo->recoveryJournal); + dumpPacker(vdo->packer); + dumpSlabDepot(vdo->depot); + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + for (ZoneCount zone = 0; zone < threadConfig->logicalZoneCount; zone++) { + dumpLogicalZone(getLogicalZone(vdo->logicalZones, zone)); + } + + for (ZoneCount zone = 0; zone < threadConfig->physicalZoneCount; zone++) { + dumpPhysicalZone(vdo->physicalZones[zone]); + } + + for (ZoneCount zone = 0; zone < threadConfig->hashZoneCount; zone++) { + dumpHashZone(vdo->hashZones[zone]); + } +} + +/**********************************************************************/ +void setVDOTracingFlags(VDO *vdo, bool vioTracing) +{ + vdo->vioTraceRecording = vioTracing; +} + +/**********************************************************************/ +bool vdoVIOTracingEnabled(const VDO *vdo) +{ + return ((vdo != NULL) && vdo->vioTraceRecording); +} + +/**********************************************************************/ +void assertOnAdminThread(VDO *vdo, const char *name) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + == getAdminThread(getThreadConfig(vdo))), + "%s called on admin thread", name); +} + +/**********************************************************************/ +void assertOnLogicalZoneThread(const VDO *vdo, + ZoneCount logicalZone, + const char *name) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + 
== getLogicalZoneThread(getThreadConfig(vdo), logicalZone)), + "%s called on logical thread", name); +} + +/**********************************************************************/ +void assertOnPhysicalZoneThread(const VDO *vdo, + ZoneCount physicalZone, + const char *name) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + == getPhysicalZoneThread(getThreadConfig(vdo), + physicalZone)), + "%s called on physical thread", name); +} + +/**********************************************************************/ +HashZone *selectHashZone(const VDO *vdo, const UdsChunkName *name) +{ + /* + * Use a fragment of the chunk name as a hash code. To ensure uniform + * distributions, it must not overlap with fragments used elsewhere. Eight + * bits of hash should suffice since the number of hash zones is small. + */ + // XXX Make a central repository for these offsets ala hashUtils. + // XXX Verify that the first byte is independent enough. + uint32_t hash = name->name[0]; + + /* + * Scale the 8-bit hash fragment to a zone index by treating it as a binary + * fraction and multiplying that by the zone count. If the hash is uniformly + * distributed over [0 .. 2^8-1], then (hash * count / 2^8) should be + * uniformly distributed over [0 .. count-1]. The multiply and shift is much + * faster than a divide (modulus) on X86 CPUs. + */ + return vdo->hashZones[(hash * getThreadConfig(vdo)->hashZoneCount) >> 8]; +} + +/**********************************************************************/ +int getPhysicalZone(const VDO *vdo, + PhysicalBlockNumber pbn, + PhysicalZone **zonePtr) +{ + if (pbn == ZERO_BLOCK) { + *zonePtr = NULL; + return VDO_SUCCESS; + } + + // Used because it does a more restrictive bounds check than getSlab(), and + // done first because it won't trigger read-only mode on an invalid PBN. + if (!isPhysicalDataBlock(vdo->depot, pbn)) { + return VDO_OUT_OF_RANGE; + } + + // With the PBN already checked, we should always succeed in finding a slab. + Slab *slab = getSlab(vdo->depot, pbn); + int result = ASSERT(slab != NULL, "getSlab must succeed on all valid PBNs"); + if (result != VDO_SUCCESS) { + return result; + } + + *zonePtr = vdo->physicalZones[getSlabZoneNumber(slab)]; + return VDO_SUCCESS; +} + +/**********************************************************************/ +ZonedPBN validateDedupeAdvice(VDO *vdo, + const DataLocation *advice, + LogicalBlockNumber lbn) +{ + ZonedPBN noAdvice = { .pbn = ZERO_BLOCK }; + if (advice == NULL) { + return noAdvice; + } + + // Don't use advice that's clearly meaningless. + if ((advice->state == MAPPING_STATE_UNMAPPED) + || (advice->pbn == ZERO_BLOCK)) { + logDebug("Invalid advice from deduplication server: pbn %llu, " + "state %u. Giving up on deduplication of logical block %llu", + advice->pbn, advice->state, lbn); + atomicAdd64(&vdo->errorStats.invalidAdvicePBNCount, 1); + return noAdvice; + } + + PhysicalZone *zone; + int result = getPhysicalZone(vdo, advice->pbn, &zone); + if ((result != VDO_SUCCESS) || (zone == NULL)) { + logDebug("Invalid physical block number from deduplication server: %" + PRIu64 ", giving up on deduplication of logical block %llu", + advice->pbn, lbn); + atomicAdd64(&vdo->errorStats.invalidAdvicePBNCount, 1); + return noAdvice; + } + + return (ZonedPBN) { + .pbn = advice->pbn, + .state = advice->state, + .zone = zone, + }; +} diff --git a/vdo/base/vdo.h b/vdo/base/vdo.h new file mode 100644 index 0000000..5741112 --- /dev/null +++ b/vdo/base/vdo.h @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdo.h#3 $ + */ + +#ifndef VDO_H +#define VDO_H + +#include "types.h" + +/** + * Allocate a VDO and associate it with its physical layer. + * + * @param [in] layer The physical layer the VDO sits on + * @param [out] vdoPtr A pointer to hold the allocated VDO + * + * @return VDO_SUCCESS or an error + **/ +int allocateVDO(PhysicalLayer *layer, VDO **vdoPtr) + __attribute__((warn_unused_result)); + +/** + * Construct a VDO for use in user space with a synchronous layer. + * + * @param [in] layer The physical layer the VDO sits on + * @param [out] vdoPtr A pointer to hold the allocated VDO + * + * @return VDO_SUCCESS or an error + **/ +int makeVDO(PhysicalLayer *layer, VDO **vdoPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a VDO instance. + * + * @param vdo The VDO to destroy + **/ +void destroyVDO(VDO *vdo); + +/** + * Destroy a VDO instance, free it, and null out the reference to it. + * + * @param vdoPtr A reference to the VDO to free + **/ +void freeVDO(VDO **vdoPtr); + +/** + * Put a VDO into read-only mode and save the read-only state in the super + * block. + * + * @param vdo The VDO to put into read-only mode + * @param errorCode The error which caused the VDO to enter read-only + * mode + **/ +void makeVDOReadOnly(VDO *vdo, int errorCode); + +/** + * Set whether compression is enabled in VDO. + * + * @param vdo The VDO + * @param enableCompression Whether to enable compression in VDO + * + * @return State of compression before new value is set + **/ +bool setVDOCompressing(VDO *vdo, bool enableCompression); + +/** + * Get whether compression is enabled in VDO. + * + * @param vdo The VDO + * + * @return State of compression + **/ +bool getVDOCompressing(VDO *vdo); + +/** + * Get the VDO statistics. + * + * @param [in] vdo The VDO + * @param [out] stats The VDO statistics are returned here + **/ +void getVDOStatistics(const VDO *vdo, VDOStatistics *stats); + +/** + * Get the number of physical blocks in use by user data. + * + * @param vdo The VDO + * + * @return The number of blocks allocated for user data + **/ +BlockCount getPhysicalBlocksAllocated(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the number of unallocated physical blocks. + * + * @param vdo The VDO + * + * @return The number of free blocks + **/ +BlockCount getPhysicalBlocksFree(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the number of physical blocks used by VDO metadata. + * + * @param vdo The VDO + * + * @return The number of overhead blocks + **/ +BlockCount getPhysicalBlocksOverhead(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the total number of blocks used for the block map. 
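+ *
+ * This includes both the fixed block map pages and the tree pages that were
+ * allocated from the data region (as tracked by the recovery journal).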
+ * + * @param vdo The VDO + * + * @return The number of block map blocks + **/ +BlockCount getTotalBlockMapBlocks(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the VDO write policy. + * + * @param vdo The VDO + * + * @return The write policy + **/ +WritePolicy getWritePolicy(const VDO *vdo); + +/** + * Set the VDO write policy. + * + * @param vdo The VDO + * @param new The new write policy + **/ +void setWritePolicy(VDO *vdo, WritePolicy new); + +/** + * Get a copy of the load-time configuration of the VDO. + * + * @param vdo The VDO + * + * @return The load-time configuration of the VDO + **/ +const VDOLoadConfig *getVDOLoadConfig(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the thread config of the VDO. + * + * @param vdo The VDO + * + * @return The thread config + **/ +const ThreadConfig *getThreadConfig(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the configured maximum age of a dirty block map page. + * + * @param vdo The VDO + * + * @return The block map era length + **/ +BlockCount getConfiguredBlockMapMaximumAge(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the configured page cache size of the VDO. + * + * @param vdo The VDO + * + * @return The number of pages for the page cache + **/ +PageCount getConfiguredCacheSize(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the location of the first block of the VDO. + * + * @param vdo The VDO + * + * @return The location of the first block managed by the VDO + **/ +PhysicalBlockNumber getFirstBlockOffset(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO was new when it was loaded. + * + * @param vdo The VDO to query + * + * @return true if the VDO was new + **/ +bool wasNew(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether a DataLocation containing potential dedupe advice is + * well-formed and addresses a data block in one of the configured physical + * zones of the VDO. If it is, return the location and zone as a ZonedPBN; + * otherwise increment statistics tracking invalid advice and return an + * unmapped ZonedPBN. + * + * @param vdo The VDO + * @param advice The advice to validate (NULL indicates no advice) + * @param lbn The logical block number of the write that requested advice, + * which is only used for debug-level logging of invalid advice + * + * @return The ZonedPBN representing the advice, if valid, otherwise an + * unmapped ZonedPBN if the advice was invalid or NULL + **/ +ZonedPBN validateDedupeAdvice(VDO *vdo, + const DataLocation *advice, + LogicalBlockNumber lbn) + __attribute__((warn_unused_result)); + +// TEST SUPPORT ONLY BEYOND THIS POINT + +/** + * Dump status information about VDO to the log for debugging. + * + * @param vdo The vdo to dump + **/ +void dumpVDOStatus(const VDO *vdo); + +/** + * Set the VIO tracing flag. + * + * @param vdo The VDO + * @param vioTracing Whether VIO tracing is enabled for this device + **/ +void setVDOTracingFlags(VDO *vdo, bool vioTracing); + +/** + * Indicate whether VIO tracing is enabled. + * + * @param vdo The VDO + * + * @return Whether VIO tracing is enabled + **/ +bool vdoVIOTracingEnabled(const VDO *vdo); + +/** + * Indicate whether extent tracing is enabled. 
+ * + * @param vdo The VDO + * + * @return Whether extent tracing is enabled + **/ +bool vdoExtentTracingEnabled(const VDO *vdo); + +#endif /* VDO_H */ diff --git a/vdo/base/vdoDebug.c b/vdo/base/vdoDebug.c new file mode 100644 index 0000000..6c03ece --- /dev/null +++ b/vdo/base/vdoDebug.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoDebug.c#1 $ + */ + +#include "vdoDebug.h" + +#include "logger.h" +#include "stringUtils.h" +#include "vdoInternal.h" + +static const char xLogDebugMessage[] = "x-log-debug-message"; + +/**********************************************************************/ +int initializeVDOCommandCompletion(VDOCommandCompletion *command, + VDO *vdo, + int argc, + char **argv) +{ + *command = (VDOCommandCompletion) { + .vdo = vdo, + .argc = argc, + .argv = argv, + }; + initializeCompletion(&command->completion, VDO_COMMAND_COMPLETION, + vdo->layer); + return initializeEnqueueableCompletion(&command->subCompletion, + VDO_COMMAND_SUB_COMPLETION, + vdo->layer); +} + +/**********************************************************************/ +int destroyVDOCommandCompletion(VDOCommandCompletion *command) +{ + if (command == NULL) { + return VDO_SUCCESS; + } + + destroyEnqueueable(&command->subCompletion); + return command->completion.result; +} + +/**********************************************************************/ +static inline VDOCommandCompletion * +asVDOCommandCompletion(VDOCompletion *completion) +{ + if (completion->type == VDO_COMMAND_COMPLETION) { + return (VDOCommandCompletion *) + ((uintptr_t) completion - offsetof(VDOCommandCompletion, completion)); + } else if (completion->type == VDO_COMMAND_SUB_COMPLETION) { + return (VDOCommandCompletion *) + ((uintptr_t) completion - offsetof(VDOCommandCompletion, subCompletion)); + } else { + ASSERT_LOG_ONLY(((completion->type == VDO_COMMAND_COMPLETION) || + (completion->type == VDO_COMMAND_SUB_COMPLETION)), + "completion type is %s instead of " + "VDO_COMMAND_COMPLETION or VDO_COMMAND_SUB_COMPLETION", + getCompletionTypeName(completion->type)); + return NULL; + } +} + +/**********************************************************************/ +static void logDebugMessage(VDOCommandCompletion *cmd) +{ + static char buffer[256]; + + char *buf = buffer; + char *end = buffer + sizeof(buffer); + + for (int i = 1; i < cmd->argc; ++i) { + buf = appendToBuffer(buf, end, " %s", cmd->argv[i]); + } + if (buf == end) { + strcpy(buf - 4, "..."); + } + logInfo("debug message:%s", buffer); + finishCompletion(&cmd->completion, VDO_SUCCESS); +} + +/**********************************************************************/ +void executeVDOExtendedCommand(VDOCompletion *completion) +{ + VDOCommandCompletion *cmd = asVDOCommandCompletion(completion); + 
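+  // Reject malformed commands (no VDO or an empty argument vector), then
+  // dispatch on argv[0]; only "x-log-debug-message" is currently recognized,
+  // and it logs the remaining arguments at info level.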
+ if ((cmd->vdo == NULL) || (cmd->argc == 0)) { + finishCompletion(&cmd->completion, VDO_COMMAND_ERROR); + return; + } + if (strcmp(cmd->argv[0], xLogDebugMessage) == 0) { + logDebugMessage(cmd); + } else { + finishCompletion(&cmd->completion, VDO_UNKNOWN_COMMAND); + } +} diff --git a/vdo/base/vdoDebug.h b/vdo/base/vdoDebug.h new file mode 100644 index 0000000..c626533 --- /dev/null +++ b/vdo/base/vdoDebug.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoDebug.h#1 $ + */ + +#ifndef VDO_DEBUG_H +#define VDO_DEBUG_H + +#include "completion.h" +#include "vdo.h" + +/** + * A completion used to pass information to a potentially asynchronous + * (because it must run in a different zone) extended command. + * + * These commands are dispatched according to argv[0], which is of the form + * "x-some-command-name", and intentionally open ended for debugging. + * + * The command "x-log-debug-message" is currently defined to echo the + * remainder of the arguments into the kernel log via the vdo logger at + * info level. + **/ +typedef struct vdoCommandCompletion { + VDOCompletion completion; + VDOCompletion subCompletion; + VDO *vdo; + int argc; + char **argv; +} VDOCommandCompletion; + +/** + * Initialize a VDO command completion. + * + * @param command The command completion to initialize. + * @param vdo The VDO. + * @param argc An argument count. + * @param argv An argument vector of length argc. + * + * @return VDO_SUCCESS or an error code + **/ +int initializeVDOCommandCompletion(VDOCommandCompletion *command, + VDO *vdo, + int argc, + char **argv); + +/** + * Destroy a VDO command completion. + * + * @param command The command completion. + * + * @return the completion result + **/ +int destroyVDOCommandCompletion(VDOCommandCompletion *command); + +/** + * Perform an asynchronous extended command (usually debugging related). + * + * @param completion The completion embedded in VDOCommandCompletion. + **/ +void executeVDOExtendedCommand(VDOCompletion *completion); + +#endif // VDO_DEBUG_H diff --git a/vdo/base/vdoInternal.h b/vdo/base/vdoInternal.h new file mode 100644 index 0000000..1337e73 --- /dev/null +++ b/vdo/base/vdoInternal.h @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoInternal.h#11 $ + */ + +#ifndef VDO_INTERNAL_H +#define VDO_INTERNAL_H + +#include "vdo.h" + +#include "adminCompletion.h" +#include "adminState.h" +#include "atomic.h" +#include "header.h" +#include "packer.h" +#include "statistics.h" +#include "superBlock.h" +#include "readOnlyNotifier.h" +#include "types.h" +#include "uds.h" +#include "vdoLayout.h" +#include "vdoState.h" + +/** + * Error counters are atomic since updates can arrive concurrently from + * arbitrary threads. + **/ +typedef struct atomicErrorStatistics { + // Dedupe path error stats + Atomic64 invalidAdvicePBNCount; + Atomic64 noSpaceErrorCount; + Atomic64 readOnlyErrorCount; +} AtomicErrorStatistics; + +struct vdo { + /* The state of this VDO */ + VDOState state; + /* The read-only notifier */ + ReadOnlyNotifier *readOnlyNotifier; + /* The number of times this VDO has recovered from a dirty state */ + uint64_t completeRecoveries; + /* The number of times this VDO has recovered from a read-only state */ + uint64_t readOnlyRecoveries; + /* The format-time configuration of this VDO */ + VDOConfig config; + /* The load-time configuration of this VDO */ + VDOLoadConfig loadConfig; + /* The nonce for this VDO */ + Nonce nonce; + + /* The super block */ + SuperBlock *superBlock; + + /* The physical storage below us */ + PhysicalLayer *layer; + + /* Our partitioning of the physical layer's storage */ + VDOLayout *layout; + + /* The block map */ + BlockMap *blockMap; + + /* The journal for block map recovery */ + RecoveryJournal *recoveryJournal; + + /* The slab depot */ + SlabDepot *depot; + + /* The compressed-block packer */ + Packer *packer; + /* Whether incoming data should be compressed */ + AtomicBool compressing; + + /* The handler for flush requests */ + Flusher *flusher; + + /* The master version of the VDO when loaded (for upgrading) */ + VersionNumber loadVersion; + /* The state the VDO was in when loaded (primarily for unit tests) */ + VDOState loadState; + /* Whether VIO tracing is enabled */ + bool vioTraceRecording; + + /* The logical zones of this VDO */ + LogicalZones *logicalZones; + + /* The physical zones of this VDO */ + PhysicalZone **physicalZones; + + /* The hash lock zones of this VDO */ + HashZone **hashZones; + + /* The completion for administrative operations */ + AdminCompletion adminCompletion; + + /* The administrative state of the VDO */ + AdminState adminState; + + /* Whether a close is required */ + bool closeRequired; + + /* Atomic global counts of error events */ + AtomicErrorStatistics errorStats; +}; + +/** + * Get the component data size of a VDO. + * + * @param vdo The VDO whose component data size is desired + * + * @return the component data size of the VDO + **/ +size_t getComponentDataSize(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Encode the VDO and save the super block synchronously. + * + * @param vdo The VDO whose state is being saved + * + * @return VDO_SUCCESS or an error + **/ +int saveVDOComponents(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Encode the VDO and save the super block asynchronously. All non-user mode + * super block savers should use this bottle neck instead of calling + * saveSuperBlockAsync() directly. 
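+ *
+ * For example, the load path saves the dirty super block this way (see
+ * makeDirty() in vdoLoad.c): the caller prepares the next admin sub-task and
+ * then hands the sub-task completion to this function:
+ *
+ *   prepareAdminSubTask(vdo, prepareToComeOnline, continueLoadReadOnly);
+ *   saveVDOComponentsAsync(vdo, completion);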
+ * + * @param vdo The VDO whose state is being saved + * @param parent The completion to notify when the save is complete + **/ +void saveVDOComponentsAsync(VDO *vdo, VDOCompletion *parent); + +/** + * Re-encode the VDO component after a reconfiguration and save the super + * block synchronously. This function avoids the need to decode and re-encode + * the other components by simply copying their previous encoding. + * + * @param vdo The VDO which was reconfigured + * + * @return VDO_SUCCESS or an error code + **/ +int saveReconfiguredVDO(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Decode the VDO master version from the component data buffer in the super + * block and store it in the VDO's loadVersion field. + **/ +int decodeVDOVersion(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Loads the VDO master version into the VDO and checks that the version + * can be understood by VDO. + * + * @param vdo The VDO to validate + * + * @return VDO_SUCCESS or an error if the loaded version is not supported + **/ +int validateVDOVersion(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Decode the component data for the VDO itself from the component data buffer + * in the super block. + * + * @param vdo The VDO to decode + * + * @return VDO_SUCCESS or an error + **/ +int decodeVDOComponent(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Validate constraints on VDO config. + * + * @param config The VDO config + * @param blockCount The block count of the VDO + * @param requireLogical Set to true if the number logical blocks + * must be configured (otherwise, it may be zero) + * + * @return a success or error code + **/ +int validateVDOConfig(const VDOConfig *config, + BlockCount blockCount, + bool requireLogical) + __attribute__((warn_unused_result)); + +/** + * Enable a VDO to enter read-only mode on errors. + * + * @param vdo The VDO to enable + * + * @return VDO_SUCCESS or an error + **/ +int enableReadOnlyEntry(VDO *vdo); + +/** + * Get the block map. + * + * @param vdo The VDO whose block map is desired + * + * @return the block map from the VDO + **/ +BlockMap *getBlockMap(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the slab depot from a VDO. + * + * @param vdo The VDO whose slab depot is desired + * + * @return the slab depot from the VDO + **/ +SlabDepot *getSlabDepot(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the recovery journal from a VDO. + * + * @param vdo The VDO whose recovery journal is desired + * + * @return the recovery journal from the VDO + **/ +RecoveryJournal *getRecoveryJournal(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether a VDO is in read-only mode. + * + * @param vdo The VDO to query + * + * @return true if the VDO is in read-only mode + **/ +bool inReadOnlyMode(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO is in a clean state. + * + * @param vdo The VDO to query + * + * @return true if the VDO is clean + **/ +bool isClean(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO was in a clean state when it was loaded. + * + * @param vdo The VDO to query + * + * @return true if the VDO was clean + **/ +bool wasClean(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO requires a read-only mode rebuild. 
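+ *
+ * The load path consults this check first when deciding how to bring the
+ * VDO online; see loadCallback() and prepareToComeOnline() in vdoLoad.c,
+ * which otherwise fall back to requiresRebuild() and requiresRecovery().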
+ * + * @param vdo The VDO to query + * + * @return true if the VDO requires a read-only rebuild + **/ +bool requiresReadOnlyRebuild(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether a VDO requires rebuilding. + * + * @param vdo The VDO to query + * + * @return true if the VDO must be rebuilt + **/ +bool requiresRebuild(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether a VDO should enter recovery mode. + * + * @param vdo The VDO to query + * + * @return true if the VDO requires recovery + **/ +bool requiresRecovery(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether a VDO was replaying the recovery journal into the block map + * when it crashed. + * + * @param vdo The VDO to query + * + * @return true if the VDO crashed while reconstructing the + * block map + **/ +bool isReplaying(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO is in recovery mode. + * + * @param vdo The VDO to query + * + * @return true if the VDO is in recovery mode + **/ +bool inRecoveryMode(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Put the VDO into recovery mode + * + * @param vdo The VDO + **/ +void enterRecoveryMode(VDO *vdo); + +/** + * Leave recovery mode if slab scrubbing has actually finished. + * + * @param vdo The VDO + **/ +void leaveRecoveryMode(VDO *vdo); + +/** + * Assert that we are running on the admin thread. + * + * @param vdo The VDO + * @param name The name of the function which should be running on the admin + * thread (for logging). + **/ +void assertOnAdminThread(VDO *vdo, const char *name); + +/** + * Assert that this function was called on the specified logical zone thread. + * + * @param vdo The VDO + * @param logicalZone The number of the logical zone + * @param name The name of the calling function + **/ +void assertOnLogicalZoneThread(const VDO *vdo, + ZoneCount logicalZone, + const char *name); + +/** + * Assert that this function was called on the specified physical zone thread. + * + * @param vdo The VDO + * @param physicalZone The number of the physical zone + * @param name The name of the calling function + **/ +void assertOnPhysicalZoneThread(const VDO *vdo, + ZoneCount physicalZone, + const char *name); + +/** + * Select the hash zone responsible for locking a given chunk name. + * + * @param vdo The VDO containing the hash zones + * @param name The chunk name + * + * @return The hash zone responsible for the chunk name + **/ +HashZone *selectHashZone(const VDO *vdo, const UdsChunkName *name) + __attribute__((warn_unused_result)); + +/** + * Get the physical zone responsible for a given physical block number of a + * data block in this VDO instance, or of the zero block (for which a NULL + * zone is returned). For any other block number that is not in the range of + * valid data block numbers in any slab, an error will be returned. This + * function is safe to call on invalid block numbers; it will not put the VDO + * into read-only mode. 
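+ *
+ * Callers must therefore handle an error return, a NULL zone (for the zero
+ * block), and a valid zone; validateDedupeAdvice() in vdo.c, for instance,
+ * treats the first two cases alike (logging and statistics omitted here):
+ *
+ *   PhysicalZone *zone;
+ *   int result = getPhysicalZone(vdo, advice->pbn, &zone);
+ *   if ((result != VDO_SUCCESS) || (zone == NULL)) {
+ *     return noAdvice;
+ *   }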
+ * + * @param [in] vdo The VDO containing the physical zones + * @param [in] pbn The PBN of the data block + * @param [out] zonePtr A pointer to return the physical zone + * + * @return VDO_SUCCESS or VDO_OUT_OF_RANGE if the block number is invalid + * or an error code for any other failure + **/ +int getPhysicalZone(const VDO *vdo, + PhysicalBlockNumber pbn, + PhysicalZone **zonePtr) + __attribute__((warn_unused_result)); + +/**********************************************************************/ +// Asynchronous callback to share a duplicate block. This is only public so +// test code may compare it against the current callback in the completion. +void shareBlock(VDOCompletion *completion); + +#endif /* VDO_INTERNAL_H */ diff --git a/vdo/base/vdoLayout.c b/vdo/base/vdoLayout.c new file mode 100644 index 0000000..3dfce96 --- /dev/null +++ b/vdo/base/vdoLayout.c @@ -0,0 +1,423 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLayout.c#2 $ + */ + +#include "vdoLayout.h" +#include "vdoLayoutInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMap.h" +#include "partitionCopy.h" +#include "slab.h" +#include "slabSummary.h" +#include "types.h" +#include "vdoInternal.h" + +#include "statusCodes.h" + +static const PartitionID REQUIRED_PARTITIONS[] = { + BLOCK_MAP_PARTITION, + BLOCK_ALLOCATOR_PARTITION, + RECOVERY_JOURNAL_PARTITION, + SLAB_SUMMARY_PARTITION, +}; + +static const uint8_t REQUIRED_PARTITION_COUNT = 4; + +/** + * Make a fixed layout for a VDO. 
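+ *
+ * The code below places the block map at the front of the layout, the slab
+ * summary at the very end, the recovery journal just before the summary,
+ * and gives the block allocator all of the remaining space; roughly:
+ *
+ *   [ block map | block allocator | recovery journal | slab summary ]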
+ * + * @param [in] physicalBlocks The number of physical blocks in the VDO + * @param [in] startingOffset The starting offset of the layout + * @param [in] blockMapBlocks The size of the block map partition + * @param [in] journalBlocks The size of the journal partition + * @param [in] summaryBlocks The size of the slab summary partition + * @param [out] layoutPtr A pointer to hold the new FixedLayout + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int makeVDOFixedLayout(BlockCount physicalBlocks, + PhysicalBlockNumber startingOffset, + BlockCount blockMapBlocks, + BlockCount journalBlocks, + BlockCount summaryBlocks, + FixedLayout **layoutPtr) +{ + BlockCount necessarySize + = (startingOffset + blockMapBlocks + journalBlocks + summaryBlocks); + if (necessarySize > physicalBlocks) { + return logErrorWithStringError(VDO_NO_SPACE, "Not enough space to" + " make a VDO"); + } + + FixedLayout *layout; + int result = makeFixedLayout(physicalBlocks - startingOffset, + startingOffset, &layout); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeFixedLayoutPartition(layout, BLOCK_MAP_PARTITION, + blockMapBlocks, FROM_BEGINNING, 0); + if (result != VDO_SUCCESS) { + freeFixedLayout(&layout); + return result; + } + + result = makeFixedLayoutPartition(layout, SLAB_SUMMARY_PARTITION, + summaryBlocks, FROM_END, 0); + if (result != VDO_SUCCESS) { + freeFixedLayout(&layout); + return result; + } + + result = makeFixedLayoutPartition(layout, RECOVERY_JOURNAL_PARTITION, + journalBlocks, FROM_END, 0); + if (result != VDO_SUCCESS) { + freeFixedLayout(&layout); + return result; + } + + /* + * The block allocator no longer traffics in relative PBNs so the offset + * doesn't matter. We need to keep this partition around both for upgraded + * systems, and because we decided that all of the usable space in the + * volume, other than the super block, should be part of some partition. + */ + result = makeFixedLayoutPartition(layout, BLOCK_ALLOCATOR_PARTITION, + ALL_FREE_BLOCKS, FROM_BEGINNING, + blockMapBlocks); + if (result != VDO_SUCCESS) { + freeFixedLayout(&layout); + return result; + } + + *layoutPtr = layout; + return VDO_SUCCESS; +} + +/** + * Get the offset of a given partition. 
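+ *
+ * Note that decodeVDOLayout() below uses this helper to recover the layout's
+ * starting offset from the offset of the block map partition, which this
+ * layout always places first.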
+ * + * @param layout The layout containing the partition + * @param partitionID The ID of the partition whose offset is desired + * + * @return The offset of the partition (in blocks) + **/ +__attribute__((warn_unused_result)) +static BlockCount getPartitionOffset(VDOLayout *layout, + PartitionID partitionID) +{ + return getFixedLayoutPartitionOffset(getVDOPartition(layout, partitionID)); +} + +/**********************************************************************/ +int makeVDOLayout(BlockCount physicalBlocks, + PhysicalBlockNumber startingOffset, + BlockCount blockMapBlocks, + BlockCount journalBlocks, + BlockCount summaryBlocks, + VDOLayout **vdoLayoutPtr) +{ + VDOLayout *vdoLayout; + int result = ALLOCATE(1, VDOLayout, __func__, &vdoLayout); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeVDOFixedLayout(physicalBlocks, startingOffset, blockMapBlocks, + journalBlocks, summaryBlocks, &vdoLayout->layout); + if (result != VDO_SUCCESS) { + freeVDOLayout(&vdoLayout); + return result; + } + + vdoLayout->startingOffset = startingOffset; + + *vdoLayoutPtr = vdoLayout; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int decodeVDOLayout(Buffer *buffer, VDOLayout **vdoLayoutPtr) +{ + VDOLayout *vdoLayout; + int result = ALLOCATE(1, VDOLayout, __func__, &vdoLayout); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeFixedLayout(buffer, &vdoLayout->layout); + if (result != VDO_SUCCESS) { + freeVDOLayout(&vdoLayout); + return result; + } + + // Check that all the expected partitions exist + Partition *partition; + for (uint8_t i = 0; i < REQUIRED_PARTITION_COUNT; i++) { + result = getPartition(vdoLayout->layout, REQUIRED_PARTITIONS[i], + &partition); + if (result != VDO_SUCCESS) { + freeVDOLayout(&vdoLayout); + return logErrorWithStringError(result, + "VDO layout is missing required partition" + " %u", REQUIRED_PARTITIONS[i]); + } + } + + // XXX Assert this is the same as where we loaded the super block. + vdoLayout->startingOffset + = getPartitionOffset(vdoLayout, BLOCK_MAP_PARTITION); + + *vdoLayoutPtr = vdoLayout; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeVDOLayout(VDOLayout **vdoLayoutPtr) +{ + VDOLayout *vdoLayout = *vdoLayoutPtr; + if (vdoLayout == NULL) { + return; + } + + freeCopyCompletion(&vdoLayout->copyCompletion); + freeFixedLayout(&vdoLayout->nextLayout); + freeFixedLayout(&vdoLayout->layout); + freeFixedLayout(&vdoLayout->previousLayout); + FREE(vdoLayout); + *vdoLayoutPtr = NULL; +} + +/** + * Get a partition from a FixedLayout in conditions where we expect that it can + * not fail. + * + * @param layout The FixedLayout from which to get the partition + * @param id The ID of the partition to retrieve + * + * @return The desired partition + **/ +__attribute__((warn_unused_result)) +static Partition *retrievePartition(FixedLayout *layout, PartitionID id) +{ + Partition *partition; + int result = getPartition(layout, id, &partition); + ASSERT_LOG_ONLY(result == VDO_SUCCESS, "VDOLayout has expected partition"); + return partition; +} + +/**********************************************************************/ +Partition *getVDOPartition(VDOLayout *vdoLayout, PartitionID id) +{ + return retrievePartition(vdoLayout->layout, id); +} + +/** + * Get a partition from a VDOLayout's next FixedLayout. This method should + * only be called when the VDOLayout is prepared to grow. 
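+ *
+ * "Prepared to grow" means a prior successful call to
+ * prepareToGrowVDOLayout() has populated nextLayout; the assertion below
+ * enforces this.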
+ * + * @param vdoLayout The VDOLayout from which to get the partition + * @param id The ID of the desired partition + * + * @return The requested partition + **/ +__attribute__((warn_unused_result)) +static Partition *getPartitionFromNextLayout(VDOLayout *vdoLayout, + PartitionID id) +{ + ASSERT_LOG_ONLY(vdoLayout->nextLayout != NULL, + "VDOLayout is prepared to grow"); + return retrievePartition(vdoLayout->nextLayout, id); +} + +/** + * Get the size of a given partition. + * + * @param layout The layout containing the partition + * @param partitionID The partition ID whose size to find + * + * @return The size of the partition (in blocks) + **/ +__attribute__((warn_unused_result)) +static BlockCount getPartitionSize(VDOLayout *layout, PartitionID partitionID) +{ + return getFixedLayoutPartitionSize(getVDOPartition(layout, partitionID)); +} + +/**********************************************************************/ +int prepareToGrowVDOLayout(VDOLayout *vdoLayout, + BlockCount oldPhysicalBlocks, + BlockCount newPhysicalBlocks, + PhysicalLayer *layer) +{ + if (getNextVDOLayoutSize(vdoLayout) == newPhysicalBlocks) { + // We are already prepared to grow to the new size, so we're done. + return VDO_SUCCESS; + } + + // Make a copy completion if there isn't one + if (vdoLayout->copyCompletion == NULL) { + int result = makeCopyCompletion(layer, &vdoLayout->copyCompletion); + if (result != VDO_SUCCESS) { + return result; + } + } + + // Free any unused preparation. + freeFixedLayout(&vdoLayout->nextLayout); + + // Make a new layout with the existing partition sizes for everything but the + // block allocator partition. + int result = makeVDOFixedLayout(newPhysicalBlocks, + vdoLayout->startingOffset, + getPartitionSize(vdoLayout, + BLOCK_MAP_PARTITION), + getPartitionSize(vdoLayout, + RECOVERY_JOURNAL_PARTITION), + getPartitionSize(vdoLayout, + SLAB_SUMMARY_PARTITION), + &vdoLayout->nextLayout); + if (result != VDO_SUCCESS) { + freeCopyCompletion(&vdoLayout->copyCompletion); + return result; + } + + // Ensure the new journal and summary are entirely within the added blocks. + Partition *slabSummaryPartition + = getPartitionFromNextLayout(vdoLayout, SLAB_SUMMARY_PARTITION); + Partition *recoveryJournalPartition + = getPartitionFromNextLayout(vdoLayout, RECOVERY_JOURNAL_PARTITION); + BlockCount minNewSize + = (oldPhysicalBlocks + + getFixedLayoutPartitionSize(slabSummaryPartition) + + getFixedLayoutPartitionSize(recoveryJournalPartition)); + if (minNewSize > newPhysicalBlocks) { + // Copying the journal and summary would destroy some old metadata. + freeFixedLayout(&vdoLayout->nextLayout); + freeCopyCompletion(&vdoLayout->copyCompletion); + return VDO_INCREMENT_TOO_SMALL; + } + + return VDO_SUCCESS; +} + +/** + * Get the size of a VDO from the specified FixedLayout and the + * starting offset thereof. + * + * @param layout The fixed layout whose size to use + * @param startingOffset The starting offset of the layout + * + * @return The total size of a VDO (in blocks) with the given layout + **/ +__attribute__((warn_unused_result)) +static BlockCount getVDOSize(FixedLayout *layout, BlockCount startingOffset) +{ + // The FixedLayout does not include the super block or any earlier + // metadata; all that is captured in the VDOLayout's starting offset + return getTotalFixedLayoutSize(layout) + startingOffset; +} + +/**********************************************************************/ +BlockCount getNextVDOLayoutSize(VDOLayout *vdoLayout) +{ + return ((vdoLayout->nextLayout == NULL) + ? 
0 : getVDOSize(vdoLayout->nextLayout, vdoLayout->startingOffset)); +} + +/**********************************************************************/ +BlockCount getNextBlockAllocatorPartitionSize(VDOLayout *vdoLayout) +{ + if (vdoLayout->nextLayout == NULL) { + return 0; + } + + Partition *partition = getPartitionFromNextLayout(vdoLayout, + BLOCK_ALLOCATOR_PARTITION); + return getFixedLayoutPartitionSize(partition); +} + +/**********************************************************************/ +BlockCount growVDOLayout(VDOLayout *vdoLayout) +{ + ASSERT_LOG_ONLY(vdoLayout->nextLayout != NULL, + "VDO prepared to grow physical"); + vdoLayout->previousLayout = vdoLayout->layout; + vdoLayout->layout = vdoLayout->nextLayout; + vdoLayout->nextLayout = NULL; + + return getVDOSize(vdoLayout->layout, vdoLayout->startingOffset); +} + +/**********************************************************************/ +BlockCount revertVDOLayout(VDOLayout *vdoLayout) +{ + if ((vdoLayout->previousLayout != NULL) + && (vdoLayout->previousLayout != vdoLayout->layout)) { + // Only revert if there's something to revert to. + freeFixedLayout(&vdoLayout->layout); + vdoLayout->layout = vdoLayout->previousLayout; + vdoLayout->previousLayout = NULL; + } + + return getVDOSize(vdoLayout->layout, vdoLayout->startingOffset); +} + +/**********************************************************************/ +void finishVDOLayoutGrowth(VDOLayout *vdoLayout) +{ + if (vdoLayout->layout != vdoLayout->previousLayout) { + freeFixedLayout(&vdoLayout->previousLayout); + } + + if (vdoLayout->layout != vdoLayout->nextLayout) { + freeFixedLayout(&vdoLayout->nextLayout); + } + + freeCopyCompletion(&vdoLayout->copyCompletion); +} + +/**********************************************************************/ +void copyPartition(VDOLayout *layout, + PartitionID partitionID, + VDOCompletion *parent) +{ + copyPartitionAsync(layout->copyCompletion, + getVDOPartition(layout, partitionID), + getPartitionFromNextLayout(layout, partitionID), parent); +} + +/**********************************************************************/ +size_t getVDOLayoutEncodedSize(const VDOLayout *vdoLayout) +{ + return getFixedLayoutEncodedSize(vdoLayout->layout); +} + +/**********************************************************************/ +int encodeVDOLayout(const VDOLayout *vdoLayout, Buffer *buffer) +{ + return encodeFixedLayout(vdoLayout->layout, buffer); +} + diff --git a/vdo/base/vdoLayout.h b/vdo/base/vdoLayout.h new file mode 100644 index 0000000..3de24ae --- /dev/null +++ b/vdo/base/vdoLayout.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLayout.h#2 $ + */ + +/** + * VDOLayout is an object which manages the layout of a VDO. 
It wraps + * FixedLayout, but includes the knowledge of exactly which partitions a VDO is + * expected to have. Because of this knowledge, the VDOLayout validates the + * FixedLayout encoded in the super block at load time, obviating the need for + * subsequent error checking when other modules need to get partitions from the + * layout. + * + * The VDOLayout also manages the preparation and growth of the layout for grow + * physical operations. + **/ + +#ifndef VDO_LAYOUT_H +#define VDO_LAYOUT_H + +#include "fixedLayout.h" +#include "types.h" + +/** + * Make a VDO layout with the specified parameters. + * + * @param [in] physicalBlocks The number of physical blocks in the VDO + * @param [in] startingOffset The starting offset of the layout + * @param [in] blockMapBlocks The size of the block map partition + * @param [in] journalBlocks The size of the journal partition + * @param [in] summaryBlocks The size of the slab summary partition + * @param [out] vdoLayoutPtr A pointer to hold the new VDOLayout + * + * @return VDO_SUCCESS or an error + **/ +int makeVDOLayout(BlockCount physicalBlocks, + PhysicalBlockNumber startingOffset, + BlockCount blockMapBlocks, + BlockCount journalBlocks, + BlockCount summaryBlocks, + VDOLayout **vdoLayoutPtr) + __attribute__((warn_unused_result)); + +/** + * Decode a VDOLayout from a buffer. + * + * @param [in] buffer The buffer from which to decode + * @param [out] vdoLayoutPtr A pointer to hold the VDOLayout + * + * @return VDO_SUCCESS or an error + **/ +int decodeVDOLayout(Buffer *buffer, VDOLayout **vdoLayoutPtr) + __attribute__((warn_unused_result)); + +/** + * Free a VDOLayout and NULL out the reference to it. + * + * @param vdoLayoutPtr The pointer to a VDOLayout to free + **/ +void freeVDOLayout(VDOLayout **vdoLayoutPtr); + +/** + * Get a partition from a VDOLayout. Because the layout's FixedLayout has + * already been validated, this can not fail. + * + * @param vdoLayout The VDOLayout from which to get the partition + * @param id The ID of the desired partition + * + * @return The requested partition + **/ +Partition *getVDOPartition(VDOLayout *vdoLayout, PartitionID id) + __attribute__((warn_unused_result)); + +/** + * Prepare the layout to be grown. + * + * @param vdoLayout The layout to grow + * @param oldPhysicalBlocks The current size of the VDO + * @param newPhysicalBlocks The size to which the VDO will be grown + * @param layer The layer being grown + * + * @return VDO_SUCCESS or an error code + **/ +int prepareToGrowVDOLayout(VDOLayout *vdoLayout, + BlockCount oldPhysicalBlocks, + BlockCount newPhysicalBlocks, + PhysicalLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Get the size of the next layout. + * + * @param vdoLayout The layout to check + * + * @return The size which was specified when the layout was prepared for growth + * or 0 if the layout is not prepared to grow + **/ +BlockCount getNextVDOLayoutSize(VDOLayout *vdoLayout) + __attribute__((warn_unused_result)); + +/** + * Get the size of the next block allocator partition. + * + * @param vdoLayout The VDOLayout which has been prepared to grow + * + * @return The size of the block allocator partition in the next layout or 0 + * if the layout is not prepared to grow + **/ +BlockCount getNextBlockAllocatorPartitionSize(VDOLayout *vdoLayout) + __attribute__((warn_unused_result)); + +/** + * Grow the layout by swapping in the prepared layout. 
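+ *
+ * A grow-physical operation is expected to drive these functions roughly in
+ * the order sketched below (illustrative only; oldBlocks, newBlocks, layer,
+ * and parent stand in for the caller's values, and each copyPartition() step
+ * completes asynchronously through its parent completion before the next
+ * step runs):
+ *
+ *   prepareToGrowVDOLayout(vdoLayout, oldBlocks, newBlocks, layer);
+ *   copyPartition(vdoLayout, RECOVERY_JOURNAL_PARTITION, parent);
+ *   copyPartition(vdoLayout, SLAB_SUMMARY_PARTITION, parent);
+ *   growVDOLayout(vdoLayout);          // swap in the prepared layout
+ *   finishVDOLayoutGrowth(vdoLayout);  // or revertVDOLayout() on failure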
+ * + * @param vdoLayout The layout to grow + * + * @return The new size of the VDO + **/ +BlockCount growVDOLayout(VDOLayout *vdoLayout) + __attribute__((warn_unused_result)); + +/** + * Revert the last growth attempt. + * + * @param vdoLayout The layout to revert + * + * @return The reverted size (in blocks) of the VDO + **/ +BlockCount revertVDOLayout(VDOLayout *vdoLayout) + __attribute__((warn_unused_result)); + +/** + * Clean up any unused resources once an attempt to grow has completed. + * + * @param vdoLayout The layout + **/ +void finishVDOLayoutGrowth(VDOLayout *vdoLayout); + +/** + * Copy a partition from the location specified in the current layout to that in + * the next layout. + * + * @param layout The VDOLayout which is prepared to grow + * @param partitionID The ID of the partition to copy + * @param parent The completion to notify when the copy is complete + **/ +void copyPartition(VDOLayout *layout, + PartitionID partitionID, + VDOCompletion *parent); + +/** + * Get the size of an encoded VDOLayout. + * + * @param vdoLayout The VDOLayout + * + * @return The encoded size of the VDOLayout + **/ +size_t getVDOLayoutEncodedSize(const VDOLayout *vdoLayout) + __attribute__((warn_unused_result)); + +/** + * Encode a VDOLayout into a buffer. + * + * @param vdoLayout The VDOLayout to encode + * @param buffer The buffer to encode into + * + * @return UDS_SUCCESS or an error + **/ +int encodeVDOLayout(const VDOLayout *vdoLayout, Buffer *buffer) + __attribute__((warn_unused_result)); + +#endif // VDO_LAYOUT_H diff --git a/vdo/base/vdoLayoutInternals.h b/vdo/base/vdoLayoutInternals.h new file mode 100644 index 0000000..5f038fe --- /dev/null +++ b/vdo/base/vdoLayoutInternals.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLayoutInternals.h#2 $ + */ + +#ifndef VDO_LAYOUT_INTERNALS_H +#define VDO_LAYOUT_INTERNALS_H + +#include "fixedLayout.h" +#include "types.h" + +struct vdoLayout { + // The current layout of the VDO + FixedLayout *layout; + // The next layout of the VDO + FixedLayout *nextLayout; + // The previous layout of the VDO + FixedLayout *previousLayout; + // The first block in the layouts + PhysicalBlockNumber startingOffset; + // A pointer to the copy completion (if there is one) + VDOCompletion *copyCompletion; +}; + +#endif // VDO_LAYOUT_INTERNALS_H diff --git a/vdo/base/vdoLoad.c b/vdo/base/vdoLoad.c new file mode 100644 index 0000000..c72f39e --- /dev/null +++ b/vdo/base/vdoLoad.c @@ -0,0 +1,560 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLoad.c#17 $ + */ + +#include "vdoLoad.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminCompletion.h" +#include "blockMap.h" +#include "completion.h" +#include "constants.h" +#include "hashZone.h" +#include "header.h" +#include "logicalZone.h" +#include "physicalZone.h" +#include "readOnlyRebuild.h" +#include "recoveryJournal.h" +#include "releaseVersions.h" +#include "slabDepot.h" +#include "slabSummary.h" +#include "threadConfig.h" +#include "types.h" +#include "vdoInternal.h" +#include "vdoRecovery.h" +#include "volumeGeometry.h" + +/** + * Extract the VDO from an AdminCompletion, checking that the current operation + * is a load. + * + * @param completion The AdminCompletion's sub-task completion + * + * @return The VDO + **/ +static inline VDO *vdoFromLoadSubTask(VDOCompletion *completion) +{ + return vdoFromAdminSubTask(completion, ADMIN_OPERATION_LOAD); +} + +/** + * Finish aborting a load now that any entry to read-only mode is complete. + * This callback is registered in abortLoad(). + * + * @param completion The sub-task completion + **/ +static void finishAborting(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + vdo->closeRequired = false; + finishParentCallback(completion); +} + +/** + * Make sure the recovery journal is closed when aborting a load. + * + * @param completion The sub-task completion + **/ +static void closeRecoveryJournalForAbort(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + prepareAdminSubTask(vdo, finishAborting, finishAborting); + drainRecoveryJournal(vdo->recoveryJournal, ADMIN_STATE_SAVING, completion); +} + +/** + * Clean up after an error loading a VDO. This error handler is set in + * loadCallback() and loadVDOComponents(). + * + * @param completion The sub-task completion + **/ +static void abortLoad(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + logErrorWithStringError(completion->result, "aborting load"); + if (vdo->readOnlyNotifier == NULL) { + // There are no threads, so we're done + finishParentCallback(completion); + return; + } + + // Preserve the error. + setCompletionResult(completion->parent, completion->result); + if (vdo->recoveryJournal == NULL) { + prepareAdminSubTask(vdo, finishAborting, finishAborting); + } else { + prepareAdminSubTaskOnThread(vdo, closeRecoveryJournalForAbort, + closeRecoveryJournalForAbort, + getJournalZoneThread(getThreadConfig(vdo))); + } + + waitUntilNotEnteringReadOnlyMode(vdo->readOnlyNotifier, completion); +} + +/** + * Wait for the VDO to be in read-only mode. 
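+ *
+ * This runs after enterReadOnlyMode() so that the load does not complete
+ * (with VDO_READ_ONLY) until the notifier has finished entering read-only
+ * mode; see continueLoadReadOnly() and handleScrubbingError() below.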
+ * + * @param completion The sub-task completion + **/ +static void waitForReadOnlyMode(VDOCompletion *completion) +{ + prepareToFinishParent(completion, completion->parent); + setCompletionResult(completion, VDO_READ_ONLY); + VDO *vdo = vdoFromLoadSubTask(completion); + waitUntilNotEnteringReadOnlyMode(vdo->readOnlyNotifier, completion); +} + +/** + * Finish loading the VDO after an error, but leave it in read-only + * mode. This error handler is set in makeDirty(), scrubSlabs(), and + * loadVDOComponents(). + * + * @param completion The sub-task completion + **/ +static void continueLoadReadOnly(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + logErrorWithStringError(completion->result, + "Entering read-only mode due to load error"); + enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); + waitForReadOnlyMode(completion); +} + +/** + * Exit recovery mode if necessary now that online slab scrubbing or loading + * is complete. This callback is registrered in scrubSlabs(). + * + * @param completion The slab scrubber completion + **/ +static void finishScrubbingSlabs(VDOCompletion *completion) +{ + VDO *vdo = completion->parent; + assertOnAdminThread(vdo, __func__); + if (inRecoveryMode(vdo)) { + leaveRecoveryMode(vdo); + } else { + logInfo("VDO commencing normal operation"); + } +} + +/** + * Handle an error scrubbing or loading all slabs after the VDO has come + * online. This error handler is registered in scrubSlabs(). + * + * @param completion The slab scrubber completion + **/ +static void handleScrubAllError(VDOCompletion *completion) +{ + VDO *vdo = completion->parent; + enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); +} + +/** + * Initiate slab scrubbing if necessary. This callback is registered in + * prepareToComeOnline(). + * + * @param completion The sub-task completion + **/ +static void scrubSlabs(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + if (!hasUnrecoveredSlabs(vdo->depot)) { + finishParentCallback(completion); + return; + } + + if (requiresRecovery(vdo)) { + enterRecoveryMode(vdo); + } + + prepareAdminSubTask(vdo, finishParentCallback, continueLoadReadOnly); + scrubAllUnrecoveredSlabs(vdo->depot, vdo, finishScrubbingSlabs, + handleScrubAllError, 0, completion); +} + +/** + * This is the error handler for slab scrubbing. It is registered in + * prepareToComeOnline(). + * + * @param completion The sub-task completion + **/ +static void handleScrubbingError(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); + waitForReadOnlyMode(completion); +} + +/** + * This is the callback after the super block is written. It prepares the block + * allocator to come online and start allocating. It is registered in + * makeDirty(). + * + * @param completion The sub-task completion + **/ +static void prepareToComeOnline(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + SlabDepotLoadType loadType = NORMAL_LOAD; + if (requiresReadOnlyRebuild(vdo)) { + loadType = REBUILD_LOAD; + } else if (requiresRecovery(vdo)) { + loadType = RECOVERY_LOAD; + } + + initializeBlockMapFromJournal(vdo->blockMap, vdo->recoveryJournal); + + prepareAdminSubTask(vdo, scrubSlabs, handleScrubbingError); + prepareToAllocate(vdo->depot, loadType, completion); +} + +/** + * Mark the super block as dirty now that everything has been loaded or + * rebuilt. 
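+ *
+ * This is the pivot of the load sequence; roughly:
+ *
+ *   loadCallback() -> [rebuild | recovery | loadSlabDepot()] -> makeDirty()
+ *     -> saveVDOComponentsAsync() -> prepareToComeOnline() -> scrubSlabs()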
+ * + * @param completion The sub-task completion + **/ +static void makeDirty(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + if (isReadOnly(vdo->readOnlyNotifier)) { + finishCompletion(completion->parent, VDO_READ_ONLY); + return; + } + + vdo->state = VDO_DIRTY; + prepareAdminSubTask(vdo, prepareToComeOnline, continueLoadReadOnly); + saveVDOComponentsAsync(vdo, completion); +} + +/** + * Callback to do the destructive parts of a load now that the new VDO device + * is being resumed. + * + * @param completion The sub-task completion + **/ +static void loadCallback(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + assertOnAdminThread(vdo, __func__); + + // Prepare the recovery journal for new entries. + openRecoveryJournal(vdo->recoveryJournal, vdo->depot, vdo->blockMap); + vdo->closeRequired = true; + if (isReadOnly(vdo->readOnlyNotifier)) { + // In read-only mode we don't use the allocator and it may not + // even be readable, so use the default structure. + finishCompletion(completion->parent, VDO_READ_ONLY); + return; + } + + if (requiresReadOnlyRebuild(vdo)) { + prepareAdminSubTask(vdo, makeDirty, abortLoad); + launchRebuild(vdo, completion); + return; + } + + if (requiresRebuild(vdo)) { + prepareAdminSubTask(vdo, makeDirty, continueLoadReadOnly); + launchRecovery(vdo, completion); + return; + } + + prepareAdminSubTask(vdo, makeDirty, continueLoadReadOnly); + loadSlabDepot(vdo->depot, + (wasNew(vdo) ? ADMIN_STATE_FORMATTING : ADMIN_STATE_LOADING), + completion, NULL); +} + +/**********************************************************************/ +int performVDOLoad(VDO *vdo) +{ + return performAdminOperation(vdo, ADMIN_OPERATION_LOAD, NULL, loadCallback, + loadCallback); +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int startVDODecode(VDO *vdo, bool validateConfig) +{ + int result = validateVDOVersion(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeVDOComponent(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + if (!validateConfig) { + return VDO_SUCCESS; + } + + if (vdo->loadConfig.nonce != vdo->nonce) { + return logErrorWithStringError(VDO_BAD_NONCE, "Geometry nonce %" PRIu64 + " does not match superblock nonce %llu", + vdo->loadConfig.nonce, vdo->nonce); + } + + BlockCount blockCount = vdo->layer->getBlockCount(vdo->layer); + return validateVDOConfig(&vdo->config, blockCount, true); +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int finishVDODecode(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + const ThreadConfig *threadConfig = getThreadConfig(vdo); + int result = makeRecoveryJournal(vdo->nonce, vdo->layer, + getVDOPartition(vdo->layout, + RECOVERY_JOURNAL_PARTITION), + vdo->completeRecoveries, + vdo->config.recoveryJournalSize, + RECOVERY_JOURNAL_TAIL_BUFFER_SIZE, + vdo->readOnlyNotifier, threadConfig, + &vdo->recoveryJournal); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeRecoveryJournal(vdo->recoveryJournal, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeSlabDepot(buffer, threadConfig, vdo->nonce, vdo->layer, + getVDOPartition(vdo->layout, + SLAB_SUMMARY_PARTITION), + vdo->readOnlyNotifier, vdo->recoveryJournal, + &vdo->depot); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeBlockMap(buffer, vdo->config.logicalBlocks, 
threadConfig, + &vdo->blockMap); + if (result != VDO_SUCCESS) { + return result; + } + + ASSERT_LOG_ONLY((contentLength(buffer) == 0), + "All decoded component data was used"); + return VDO_SUCCESS; +} + +/** + * Decode the component data portion of a super block and fill in the + * corresponding portions of the VDO being loaded. This will also allocate the + * recovery journal and slab depot. If this method is called with an + * asynchronous layer (i.e. a thread config which specifies at least one base + * thread), the block map and packer will be constructed as well. + * + * @param vdo The VDO being loaded + * @param validateConfig Whether to validate the config + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int decodeVDO(VDO *vdo, bool validateConfig) +{ + int result = startVDODecode(vdo, validateConfig); + if (result != VDO_SUCCESS) { + return result; + } + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + result = makeReadOnlyNotifier(inReadOnlyMode(vdo), threadConfig, vdo->layer, + &vdo->readOnlyNotifier); + if (result != VDO_SUCCESS) { + return result; + } + + result = enableReadOnlyEntry(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeVDOLayout(getComponentBuffer(vdo->superBlock), &vdo->layout); + if (result != VDO_SUCCESS) { + return result; + } + + result = finishVDODecode(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeFlusher(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount maximumAge = getConfiguredBlockMapMaximumAge(vdo); + BlockCount journalLength + = getRecoveryJournalLength(vdo->config.recoveryJournalSize); + if ((maximumAge > (journalLength / 2)) || (maximumAge < 1)) { + return VDO_BAD_CONFIGURATION; + } + result = makeBlockMapCaches(vdo->blockMap, vdo->layer, + vdo->readOnlyNotifier, vdo->recoveryJournal, + vdo->nonce, getConfiguredCacheSize(vdo), + maximumAge); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(threadConfig->hashZoneCount, HashZone *, __func__, + &vdo->hashZones); + if (result != VDO_SUCCESS) { + return result; + } + + for (ZoneCount zone = 0; zone < threadConfig->hashZoneCount; zone++) { + result = makeHashZone(vdo, zone, &vdo->hashZones[zone]); + if (result != VDO_SUCCESS) { + return result; + } + } + + result = makeLogicalZones(vdo, &vdo->logicalZones); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(threadConfig->physicalZoneCount, PhysicalZone *, __func__, + &vdo->physicalZones); + if (result != VDO_SUCCESS) { + return result; + } + + for (ZoneCount zone = 0; zone < threadConfig->physicalZoneCount; zone++) { + result = makePhysicalZone(vdo, zone, &vdo->physicalZones[zone]); + if (result != VDO_SUCCESS) { + return result; + } + } + + return makePacker(vdo->layer, DEFAULT_PACKER_INPUT_BINS, + DEFAULT_PACKER_OUTPUT_BINS, threadConfig, &vdo->packer); +} + +/** + * Load the components of a VDO. This is the super block load callback + * set by loadCallback(). + * + * @param completion The sub-task completion + **/ +static void loadVDOComponents(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + + prepareCompletion(completion, finishParentCallback, abortLoad, + completion->callbackThreadID, completion->parent); + finishCompletion(completion, decodeVDO(vdo, true)); +} + +/** + * Callback to initiate a pre-load, registered in prepareToLoadVDO(). 
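+ *
+ * The pre-load reads the super block off disk and then decodes it via
+ * loadVDOComponents(); any failure is routed through abortLoad(). Nothing
+ * on disk is modified at this stage.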
+ * + * @param completion The sub-task completion + **/ +static void preLoadCallback(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + assertOnAdminThread(vdo, __func__); + prepareAdminSubTask(vdo, loadVDOComponents, abortLoad); + loadSuperBlockAsync(completion, getFirstBlockOffset(vdo), &vdo->superBlock); +} + +/**********************************************************************/ +int prepareToLoadVDO(VDO *vdo, const VDOLoadConfig *loadConfig) +{ + vdo->loadConfig = *loadConfig; + return performAdminOperation(vdo, ADMIN_OPERATION_LOAD, NULL, + preLoadCallback, preLoadCallback); +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeSynchronousVDO(VDO *vdo, bool validateConfig) +{ + int result = startVDODecode(vdo, validateConfig); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeVDOLayout(getComponentBuffer(vdo->superBlock), &vdo->layout); + if (result != VDO_SUCCESS) { + return result; + } + + return finishVDODecode(vdo); +} + +/**********************************************************************/ +int loadVDOSuperblock(PhysicalLayer *layer, + VolumeGeometry *geometry, + bool validateConfig, + VDODecoder *decoder, + VDO **vdoPtr) +{ + VDO *vdo; + int result = makeVDO(layer, &vdo); + if (result != VDO_SUCCESS) { + return result; + } + + setLoadConfigFromGeometry(geometry, &vdo->loadConfig); + result = loadSuperBlock(layer, getFirstBlockOffset(vdo), &vdo->superBlock); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + result = ((decoder == NULL) + ? decodeSynchronousVDO(vdo, validateConfig) + : decoder(vdo, validateConfig)); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + *vdoPtr = vdo; + return VDO_SUCCESS; + +} +/**********************************************************************/ +int loadVDO(PhysicalLayer *layer, + bool validateConfig, + VDODecoder *decoder, + VDO **vdoPtr) +{ + VolumeGeometry geometry; + int result = loadVolumeGeometry(layer, &geometry); + if (result != VDO_SUCCESS) { + return result; + } + + return loadVDOSuperblock(layer, &geometry, validateConfig, decoder, vdoPtr); +} diff --git a/vdo/base/vdoLoad.h b/vdo/base/vdoLoad.h new file mode 100644 index 0000000..893d6e4 --- /dev/null +++ b/vdo/base/vdoLoad.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLoad.h#3 $ + */ + +#ifndef VDO_LOAD_H +#define VDO_LOAD_H + +#include "volumeGeometry.h" +#include "types.h" + +/** + * A function which decodes a VDO from a super block. 
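+ *
+ * User-space tools normally pass NULL for the decoder, which selects the
+ * default synchronous decoder; a minimal (illustrative) call, with layer
+ * being the tool's PhysicalLayer, looks like:
+ *
+ *   VDO *vdo;
+ *   int result = loadVDO(layer, true, NULL, &vdo);
+ *   if (result != VDO_SUCCESS) {
+ *     return result;
+ *   }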
+ * + * @param vdo The VDO to be decoded (its super block must already + * be loaded) + * @param validateConfig If true, the VDO's configuration will + * be validated before the decode is attempted + * + * @return VDO_SUCCESS or an error + **/ +typedef int VDODecoder(VDO *vdo, bool validateConfig); + +/** + * Load a VDO for normal operation. This method must not be called from a base + * thread. + * + * @param vdo The VDO to load + * + * @return VDO_SUCCESS or an error + **/ +int performVDOLoad(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Perpare a VDO for loading by reading structures off disk. This method does + * not alter the on-disk state. It should be called from the VDO constructor, + * whereas performVDOLoad() will be called during pre-resume if the VDO has + * not been resumed before. + **/ +int prepareToLoadVDO(VDO *vdo, const VDOLoadConfig *loadConfig) + __attribute__((warn_unused_result)); + +/** + * Synchronously load a VDO from a specified super block location for use by + * user-space tools. + * + * @param [in] layer The physical layer the VDO sits on + * @param [in] geometry A pointer to the geometry for the volume + * @param [in] validateConfig Whether to validate the VDO against the layer + * @param [in] decoder The VDO decoder to use, if NULL, the default + * decoder will be used + * @param [out] vdoPtr A pointer to hold the decoded VDO + * + * @return VDO_SUCCESS or an error + **/ +int loadVDOSuperblock(PhysicalLayer *layer, + VolumeGeometry *geometry, + bool validateConfig, + VDODecoder *decoder, + VDO **vdoPtr) + __attribute__((warn_unused_result)); + +/** + * Synchronously load a VDO volume for use by user-space tools. + * + * @param [in] layer The physical layer the VDO sits on + * @param [in] validateConfig Whether to validate the VDO against the layer + * @param [in] decoder The VDO decoder to use, if NULL, the default + * decoder will be used + * @param [out] vdoPtr A pointer to hold the decoded VDO + * + * @return VDO_SUCCESS or an error + **/ +int loadVDO(PhysicalLayer *layer, + bool validateConfig, + VDODecoder *decoder, + VDO **vdoPtr) + __attribute__((warn_unused_result)); + +#endif /* VDO_LOAD_H */ diff --git a/vdo/base/vdoPageCache.c b/vdo/base/vdoPageCache.c new file mode 100644 index 0000000..c8f4585 --- /dev/null +++ b/vdo/base/vdoPageCache.c @@ -0,0 +1,1369 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoPageCache.c#11 $ + */ + +#include "vdoPageCacheInternals.h" + +#if __KERNEL__ +#include +#endif + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "adminState.h" +#include "constants.h" +#include "numUtils.h" +#include "readOnlyNotifier.h" +#include "statusCodes.h" +#include "types.h" +#include "vio.h" + +enum { + LOG_INTERVAL = 4000, + DISPLAY_INTERVAL = 100000, +}; + +/**********************************************************************/ +static char *getPageBuffer(PageInfo *info) +{ + VDOPageCache *cache = info->cache; + return &cache->pages[(info - cache->infos) * VDO_BLOCK_SIZE]; +} + +/** + * Allocate components of the cache which require their own allocation. The + * caller is responsible for all clean up on errors. + * + * @param cache The cache being constructed + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int allocateCacheComponents(VDOPageCache *cache) +{ + int result = ALLOCATE(cache->pageCount, PageInfo, "page infos", + &cache->infos); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t size = cache->pageCount * (uint64_t) VDO_BLOCK_SIZE; + result = allocateMemory(size, VDO_BLOCK_SIZE, "cache pages", &cache->pages); + if (result != UDS_SUCCESS) { + return result; + } + + return makeIntMap(cache->pageCount, 0, &cache->pageMap); +} + +/** + * Initialize all page info structures and put them on the free list. + * + * @param cache The cache to initialize + * + * @return VDO_SUCCESS or an error + **/ +static int initializeInfo(VDOPageCache *cache) +{ + initializeRing(&cache->freeList); + PageInfo *info; + for (info = cache->infos; info < cache->infos + cache->pageCount; ++info) { + info->cache = cache; + info->state = PS_FREE; + info->pbn = NO_PAGE; + + if (cache->layer->createMetadataVIO != NULL) { + int result = createVIO(cache->layer, VIO_TYPE_BLOCK_MAP, + VIO_PRIORITY_METADATA, info, getPageBuffer(info), + &info->vio); + if (result != VDO_SUCCESS) { + return result; + } + + // The thread ID should never change. 
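+      // All cache operations are required to run on the owning block map
+      // zone's thread (see assertOnCacheThread()), so each metadata VIO's
+      // callback thread can be pinned once here.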
+ info->vio->completion.callbackThreadID = cache->zone->threadID; + } + + initializeRing(&info->listNode); + pushRingNode(&cache->freeList, &info->listNode); + initializeRing(&info->lruNode); + } + + relaxedStore64(&cache->stats.counts.freePages, cache->pageCount); + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void writeDirtyPagesCallback(RingNode *node, void *context); + +/**********************************************************************/ +int makeVDOPageCache(PhysicalLayer *layer, + PageCount pageCount, + VDOPageReadFunction *readHook, + VDOPageWriteFunction *writeHook, + size_t pageContextSize, + BlockCount maximumAge, + BlockMapZone *zone, + VDOPageCache **cachePtr) +{ + int result = ASSERT(pageContextSize <= MAX_PAGE_CONTEXT_SIZE, + "page context size %zu cannot exceed %u bytes", + pageContextSize, MAX_PAGE_CONTEXT_SIZE); + if (result != VDO_SUCCESS) { + return result; + } + + VDOPageCache *cache; + result = ALLOCATE(1, VDOPageCache, "page cache", &cache); + if (result != UDS_SUCCESS) { + return result; + } + + cache->layer = layer; + cache->pageCount = pageCount; + cache->readHook = readHook; + cache->writeHook = writeHook; + cache->zone = zone; + + result = allocateCacheComponents(cache); + if (result != VDO_SUCCESS) { + freeVDOPageCache(&cache); + return result; + } + + result = initializeInfo(cache); + if (result != VDO_SUCCESS) { + freeVDOPageCache(&cache); + return result; + } + + result = makeDirtyLists(maximumAge, writeDirtyPagesCallback, cache, + &cache->dirtyLists); + if (result != VDO_SUCCESS) { + freeVDOPageCache(&cache); + return result; + } + + // initialize empty circular queues + initializeRing(&cache->lruList); + initializeRing(&cache->outgoingList); + + *cachePtr = cache; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeVDOPageCache(VDOPageCache **cachePtr) +{ + VDOPageCache *cache = *cachePtr; + if (cache == NULL) { + return; + } + + if (cache->infos != NULL) { + PageInfo *info; + for (info = cache->infos; info < cache->infos + cache->pageCount; ++info) { + freeVIO(&info->vio); + } + } + + freeDirtyLists(&cache->dirtyLists); + freeIntMap(&cache->pageMap); + FREE(cache->infos); + FREE(cache->pages); + FREE(cache); + *cachePtr = NULL; +} + +/**********************************************************************/ +void setVDOPageCacheInitialPeriod(VDOPageCache *cache, SequenceNumber period) +{ + setCurrentPeriod(cache->dirtyLists, period); +} + +/**********************************************************************/ +void setVDOPageCacheRebuildMode(VDOPageCache *cache, bool rebuilding) +{ + cache->rebuilding = rebuilding; +} + +/** + * Assert that a function has been called on the VDO page cache's thread. + * + * @param cache the page cache + * @param functionName the name of the function + **/ +static inline void assertOnCacheThread(VDOPageCache *cache, + const char *functionName) +{ + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((threadID == cache->zone->threadID), + "%s() must only be called on cache thread %d, not thread %d", + functionName, cache->zone->threadID, threadID); +} + +/** + * Assert that a page cache may issue I/O. + * + * @param cache the page cache + **/ +static inline void assertIOAllowed(VDOPageCache *cache) +{ + ASSERT_LOG_ONLY(!isQuiescent(&cache->zone->state), + "VDO page cache may issue I/O"); +} + +/** + * Log and, if enabled, report cache pressure. 
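+ *
+ * The pressure counter is always incremented, but a message is only logged
+ * while there are more waiters than cache pages, and then only once per
+ * LOG_INTERVAL reports, so an undersized cache cannot flood the log.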
+ * + * @param cache the page cache + **/ +static void reportCachePressure(VDOPageCache *cache) +{ + relaxedAdd64(&cache->stats.cachePressure, 1); + if (cache->waiterCount > cache->pageCount) { + if ((cache->pressureReport % LOG_INTERVAL) == 0) { + logInfo("page cache pressure %llu", + relaxedLoad64(&cache->stats.cachePressure)); + } + + if (++cache->pressureReport >= DISPLAY_INTERVAL) { + cache->pressureReport = 0; + } + } +} + +/**********************************************************************/ +const char *vpcPageStateName(PageState state) +{ + static const char *stateNames[] = { + "FREE", + "INCOMING", + "FAILED", + "RESIDENT", + "DIRTY", + "OUTGOING" + }; + STATIC_ASSERT(COUNT_OF(stateNames) == PAGE_STATE_COUNT); + + int result = ASSERT(state < COUNT_OF(stateNames), + "Unknown PageState value %d", state); + if (result != UDS_SUCCESS) { + return "[UNKNOWN PAGE STATE]"; + } + + return stateNames[state]; +} + +/** + * Update the counter associated with a given state. + * + * @param info the page info to count + * @param delta the delta to apply to the counter + **/ +static void updateCounter(PageInfo *info, int32_t delta) +{ + VDOPageCache *cache = info->cache; + switch (info->state) { + case PS_FREE: + relaxedAdd64(&cache->stats.counts.freePages, delta); + return; + + case PS_INCOMING: + relaxedAdd64(&cache->stats.counts.incomingPages, delta); + return; + + case PS_OUTGOING: + relaxedAdd64(&cache->stats.counts.outgoingPages, delta); + return; + + case PS_FAILED: + relaxedAdd64(&cache->stats.counts.failedPages, delta); + return; + + case PS_RESIDENT: + relaxedAdd64(&cache->stats.counts.cleanPages, delta); + return; + + case PS_DIRTY: + relaxedAdd64(&cache->stats.counts.dirtyPages, delta); + return; + + default: + return; + } +} + +/** + * Update the lru information for an active page. + **/ +static void updateLru(PageInfo *info) +{ + VDOPageCache *cache = info->cache; + + if (cache->lruList.prev != &info->lruNode) { + pushRingNode(&cache->lruList, &info->lruNode); + } +} + +/** + * Set the state of a PageInfo and put it on the right list, adjusting + * counters. + * + * @param info the PageInfo to modify + * @param newState the new state for the PageInfo + **/ +static void setInfoState(PageInfo *info, PageState newState) +{ + if (newState == info->state) { + return; + } + + updateCounter(info, -1); + info->state = newState; + updateCounter(info, 1); + + switch (info->state) { + case PS_FREE: + case PS_FAILED: + pushRingNode(&info->cache->freeList, &info->listNode); + return; + + case PS_OUTGOING: + pushRingNode(&info->cache->outgoingList, &info->listNode); + return; + + case PS_DIRTY: + return; + + default: + unspliceRingNode(&info->listNode); + } +} + +/** + * Set the pbn for an info, updating the map as needed. + * + * @param info The page info + * @param pbn The physical block number to set + **/ +__attribute__((warn_unused_result)) +static int setInfoPBN(PageInfo *info, PhysicalBlockNumber pbn) +{ + VDOPageCache *cache = info->cache; + + // Either the new or the old page number must be NO_PAGE. + int result = ASSERT((pbn == NO_PAGE) || (info->pbn == NO_PAGE), + "Must free a page before reusing it."); + if (result != VDO_SUCCESS) { + return result; + } + + if (info->pbn != NO_PAGE) { + intMapRemove(cache->pageMap, info->pbn); + } + + info->pbn = pbn; + + if (pbn != NO_PAGE) { + result = intMapPut(cache->pageMap, pbn, info, true, NULL); + if (result != UDS_SUCCESS) { + return result; + } + } + return VDO_SUCCESS; +} + +/** + * Reset page info to represent an unallocated page. 
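+ *
+ * The reset fails (via ASSERT) if the page is still busy or still has
+ * waiters; allocateFreePage() treats such a failure as a persistent cache
+ * error.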
+ **/ +static int resetPageInfo(PageInfo *info) +{ + int result = ASSERT(info->busy == 0, "VDO Page must not be busy"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(!hasWaiters(&info->waiting), + "VDO Page must not have waiters"); + if (result != UDS_SUCCESS) { + return result; + } + + result = setInfoPBN(info, NO_PAGE); + setInfoState(info, PS_FREE); + unspliceRingNode(&info->lruNode); + return result; +} + +/** + * Find a free page. + * + * @param cache the page cache + * + * @return a pointer to the page info structure (if found), NULL otherwise + **/ +__attribute__((warn_unused_result)) +static PageInfo *findFreePage(VDOPageCache *cache) +{ + if (cache->freeList.next == &cache->freeList) { + return NULL; + } + PageInfo *info = pageInfoFromListNode(cache->freeList.next); + unspliceRingNode(&info->listNode); + return info; +} + +/**********************************************************************/ +PageInfo *vpcFindPage(VDOPageCache *cache, PhysicalBlockNumber pbn) +{ + if ((cache->lastFound != NULL) + && (cache->lastFound->pbn == pbn)) { + return cache->lastFound; + } + cache->lastFound = intMapGet(cache->pageMap, pbn); + return cache->lastFound; +} + +/** + * Determine which page is least recently used. + * + * @param cache the page cache structure + * + * @return a pointer to the info structure for a relevant page, + * or NULL if no such page can be found. The page can be + * dirty or resident. + * + * @note Picks the least recently used from among the non-busy entries + * at the front of each of the lru ring. + * Since whenever we mark a page busy we also put it to the end + * of the ring it is unlikely that the entries at the front + * are busy unless the queue is very short, but not impossible. + **/ +__attribute__((warn_unused_result)) +static PageInfo *selectLRUPage(VDOPageCache *cache) +{ + PageInfoNode *lru; + for (lru = cache->lruList.next; + lru != &cache->lruList; + lru = lru->next) { + PageInfo *info = pageInfoFromLRUNode(lru); + if ((info->busy == 0) && !isInFlight(info)) { + return info; + } + } + + return NULL; +} + +/**********************************************************************/ +AtomicPageCacheStatistics *getVDOPageCacheStatistics(VDOPageCache *cache) +{ + return &cache->stats; +} + +// ASYNCHRONOUS INTERFACE BEYOND THIS POINT + +/** + * Helper to complete the VDO Page Completion request successfully. + * + * @param info the page info representing the result page + * @param vdoPageComp the VDO page completion to complete + **/ +static void completeWithPage(PageInfo *info, VDOPageCompletion *vdoPageComp) +{ + bool available = vdoPageComp->writable ? isPresent(info) : isValid(info); + if (!available) { + logErrorWithStringError(VDO_BAD_PAGE, + "Requested cache page %llu in state %s is" + " not %s", + info->pbn, vpcPageStateName(info->state), + vdoPageComp->writable ? "present" : "valid"); + finishCompletion(&vdoPageComp->completion, VDO_BAD_PAGE); + return; + } + + vdoPageComp->info = info; + vdoPageComp->ready = true; + finishCompletion(&vdoPageComp->completion, VDO_SUCCESS); +} + +/** + * Complete a page completion with an error code. Implements WaiterCallback. + * + * @param waiter The page completion, as a waiter + * @param resultPtr A pointer to the error code. 
+ **/ +static void completeWaiterWithError(Waiter *waiter, void *resultPtr) +{ + int *result = resultPtr; + VDOPageCompletion *completion = pageCompletionFromWaiter(waiter); + finishCompletion(&completion->completion, *result); +} + +/** + * Complete a queue of VDOPageCompletions with an error code. + * + * @param [in] result the error result + * @param [in, out] queue a pointer to the queue + * + * @note upon completion the queue will be empty + **/ +static void distributeErrorOverQueue(int result, WaitQueue *queue) +{ + notifyAllWaiters(queue, completeWaiterWithError, &result); +} + +/** + * Complete a page completion with a page. Implements WaiterCallback. + * + * @param waiter The page completion, as a waiter + * @param pageInfo The page info to complete with + **/ +static void completeWaiterWithPage(Waiter *waiter, void *pageInfo) +{ + PageInfo *info = pageInfo; + VDOPageCompletion *completion = pageCompletionFromWaiter(waiter); + completeWithPage(info, completion); +} + +/** + * Complete a queue of VDOPageCompletions with a page result. + * + * @param [in] info the page info describing the page + * @param [in, out] queue a pointer to a queue of waiters + * + * @return the number of pages distributed + * + * @note upon completion the queue will be empty + * + **/ +static unsigned int distributePageOverQueue(PageInfo *info, WaitQueue *queue) +{ + updateLru(info); + + size_t pages = countWaiters(queue); + + /* + * Increment the busy count once for each pending completion so that + * this page does not stop being busy until all completions have + * been processed (VDO-83). + */ + info->busy += pages; + + notifyAllWaiters(queue, completeWaiterWithPage, info); + return pages; +} + +/** + * Set a persistent error which all requests will receive in the future. + * + * @param cache the page cache + * @param context a string describing what triggered the error + * @param result the error result + * + * Once triggered, all enqueued completions will get this error. + * Any future requests will result in this error as well. + **/ +static void setPersistentError(VDOPageCache *cache, + const char *context, + int result) +{ + // If we're already read-only, there's no need to log. 
+ ReadOnlyNotifier *notifier = cache->zone->readOnlyNotifier; + if ((result != VDO_READ_ONLY) && !isReadOnly(notifier)) { + logErrorWithStringError(result, "VDO Page Cache persistent error: %s", + context); + enterReadOnlyMode(notifier, result); + } + + assertOnCacheThread(cache, __func__); + + distributeErrorOverQueue(result, &cache->freeWaiters); + cache->waiterCount = 0; + + PageInfo *info; + for (info = cache->infos; info < cache->infos + cache->pageCount; ++info) { + distributeErrorOverQueue(result, &info->waiting); + } +} + +/**********************************************************************/ +void initVDOPageCompletion(VDOPageCompletion *pageCompletion, + VDOPageCache *cache, + PhysicalBlockNumber pbn, + bool writable, + void *parent, + VDOAction *callback, + VDOAction *errorHandler) +{ + ASSERT_LOG_ONLY((pageCompletion->waiter.nextWaiter == NULL), + "New page completion was not already on a wait queue"); + + *pageCompletion = (VDOPageCompletion) { + .pbn = pbn, + .writable = writable, + .cache = cache, + }; + + VDOCompletion *completion = &pageCompletion->completion; + initializeCompletion(completion, VDO_PAGE_COMPLETION, cache->layer); + prepareCompletion(completion, callback, errorHandler, cache->zone->threadID, + parent); +} + +/** + * Helper function to check that a completion represents a successfully + * completed VDO Page Completion referring to a valid page. + * + * @param completion a VDO completion + * @param writable whether a writable page is required + * + * @return the embedding completion if valid, NULL if not + **/ +__attribute__((warn_unused_result)) +static VDOPageCompletion *validateCompletedPage(VDOCompletion *completion, + bool writable) +{ + VDOPageCompletion *vpc = asVDOPageCompletion(completion); + + int result = ASSERT(vpc->ready, "VDO Page completion not ready"); + if (result != UDS_SUCCESS) { + return NULL; + } + + result = ASSERT(vpc->info != NULL, "VDO Page Completion must be complete"); + if (result != UDS_SUCCESS) { + return NULL; + } + + result = ASSERT(vpc->info->pbn == vpc->pbn, + "VDO Page Completion pbn must be consistent"); + if (result != UDS_SUCCESS) { + return NULL; + } + + result = ASSERT(isValid(vpc->info), + "VDO Page Completion page must be valid"); + if (result != UDS_SUCCESS) { + return NULL; + } + + if (writable) { + result = ASSERT(vpc->writable, "VDO Page Completion is writable"); + if (result != UDS_SUCCESS) { + return NULL; + } + } + + return vpc; +} + +/**********************************************************************/ +bool isPageCacheActive(VDOPageCache *cache) +{ + return ((cache->outstandingReads != 0) || (cache->outstandingWrites != 0)); +} + +/** + * VIO callback used when a page has been loaded. + * + * @param completion A completion for the VIO, the parent of which is a + * PageInfo. + **/ +static void pageIsLoaded(VDOCompletion *completion) +{ + PageInfo *info = completion->parent; + VDOPageCache *cache = info->cache; + assertOnCacheThread(cache, __func__); + + setInfoState(info, PS_RESIDENT); + distributePageOverQueue(info, &info->waiting); + + /* + * Don't decrement until right before calling checkForDrainComplete() to + * ensure that the above work can't cause the page cache to be freed out from + * under us. + */ + cache->outstandingReads--; + checkForDrainComplete(cache->zone); +} + +/** + * Handle page load errors. 
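+ *
+ * A failed load puts the VDO into read-only mode, fails every completion
+ * waiting on this page, and returns the page buffer to the free list. During
+ * a read-only rebuild, handleRebuildReadError() is used instead and treats
+ * the failure as a successful read of a zeroed page.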
+ * + * @param completion The page read VIO + **/ +static void handleLoadError(VDOCompletion *completion) +{ + int result = completion->result; + PageInfo *info = completion->parent; + VDOPageCache *cache = info->cache; + assertOnCacheThread(cache, __func__); + + enterReadOnlyMode(cache->zone->readOnlyNotifier, result); + relaxedAdd64(&cache->stats.failedReads, 1); + setInfoState(info, PS_FAILED); + distributeErrorOverQueue(result, &info->waiting); + resetPageInfo(info); + + /* + * Don't decrement until right before calling checkForDrainComplete() to + * ensure that the above work can't cause the page cache to be freed out from + * under us. + */ + cache->outstandingReads--; + checkForDrainComplete(cache->zone); +} + +/** + * Run the read hook after a page is loaded. This callback is registered in + * launchPageLoad() when there is a read hook. + * + * @param completion The page load completion + **/ +static void runReadHook(VDOCompletion *completion) +{ + PageInfo *info = completion->parent; + completion->callback = pageIsLoaded; + resetCompletion(completion); + int result = info->cache->readHook(getPageBuffer(info), info->pbn, + info->cache->zone, info->context); + continueCompletion(completion, result); +} + +/** + * Handle a read error during a read-only rebuild. + * + * @param completion The page load completion + **/ +static void handleRebuildReadError(VDOCompletion *completion) +{ + PageInfo *info = completion->parent; + VDOPageCache *cache = info->cache; + assertOnCacheThread(cache, __func__); + + // We are doing a read-only rebuild, so treat this as a successful read + // of an uninitialized page. + relaxedAdd64(&cache->stats.failedReads, 1); + memset(getPageBuffer(info), 0, VDO_BLOCK_SIZE); + resetCompletion(completion); + if (cache->readHook != NULL) { + runReadHook(completion); + } else { + pageIsLoaded(completion); + } +} + +/** + * Begin the process of loading a page. + * + * @param info the page info representing where to load the page + * @param pbn the absolute pbn of the desired page + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int launchPageLoad(PageInfo *info, PhysicalBlockNumber pbn) +{ + VDOPageCache *cache = info->cache; + assertIOAllowed(cache); + + int result = setInfoPBN(info, pbn); + if (result != VDO_SUCCESS) { + return result; + } + + result = ASSERT((info->busy == 0), "Page is not busy before loading."); + if (result != VDO_SUCCESS) { + return result; + } + + setInfoState(info, PS_INCOMING); + cache->outstandingReads++; + relaxedAdd64(&cache->stats.pagesLoaded, 1); + launchReadMetadataVIO(info->vio, pbn, + (cache->readHook != NULL) ? runReadHook : pageIsLoaded, + (cache->rebuilding + ? handleRebuildReadError : handleLoadError)); + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void writePages(VDOCompletion *completion); + +/** + * Handle errors flushing the layer. + * + * @param completion The flush VIO + **/ +static void handleFlushError(VDOCompletion *completion) +{ + VDOPageCache *cache = ((PageInfo *) completion->parent)->cache; + setPersistentError(cache, "flush failed", completion->result); + writePages(completion); +} + +/** + * Attempt to save the outgoing pages by first flushing the layer. 
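+ *
+ * The flush ensures that the recovery journal entries which dirtied these
+ * pages reach stable storage before the pages themselves; in sync mode the
+ * journal blocks are already written with FUA, so the flush is skipped and
+ * the batch is written directly.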
+ * + * @param cache The cache + **/ +static void savePages(VDOPageCache *cache) +{ + if ((cache->pagesInFlush > 0) || (cache->pagesToFlush == 0)) { + return; + } + + assertIOAllowed(cache); + + PageInfo *info = pageInfoFromListNode(cache->outgoingList.next); + cache->pagesInFlush = cache->pagesToFlush; + cache->pagesToFlush = 0; + relaxedAdd64(&cache->stats.flushCount, 1); + + VIO *vio = info->vio; + PhysicalLayer *layer = vio->completion.layer; + + /* + * We must make sure that the recovery journal entries that changed these + * pages were successfully persisted, and thus must issue a flush before + * each batch of pages is written to ensure this. However, in sync mode, + * every journal block is written with FUA, thus guaranteeing the journal + * persisted already. + */ + if (layer->getWritePolicy(layer) != WRITE_POLICY_SYNC) { + launchFlush(vio, writePages, handleFlushError); + return; + } + + writePages(&vio->completion); +} + +/** + * Add a page to the outgoing list of pages waiting to be saved. Once in the + * list, a page may not be used until it has been written out. + * + * @param info The page to save + **/ +static void schedulePageSave(PageInfo *info) +{ + if (info->busy > 0) { + info->writeStatus = WRITE_STATUS_DEFERRED; + return; + } + + info->cache->pagesToFlush++; + info->cache->outstandingWrites++; + setInfoState(info, PS_OUTGOING); +} + +/**********************************************************************/ +static void writeDirtyPagesCallback(RingNode *expired, void *context) +{ + while (!isRingEmpty(expired)) { + schedulePageSave(pageInfoFromListNode(chopRingNode(expired))); + } + + savePages((VDOPageCache *) context); +} + +/** + * Add a page to outgoing pages waiting to be saved, and then start saving + * pages if another save is not in progress. + * + * @param info The page to save + **/ +static void launchPageSave(PageInfo *info) +{ + schedulePageSave(info); + savePages(info->cache); +} + +/** + * Determine whether a given VDOPageCompletion (as a waiter) is requesting a + * given page number. Implements WaiterMatch. + * + * @param waiter The page completion in question + * @param context A pointer to the pbn of the desired page + * + * @return true if the page completion is for the desired page number + **/ +static bool completionNeedsPage(Waiter *waiter, void *context) +{ + PhysicalBlockNumber *pbn = context; + return (pageCompletionFromWaiter(waiter)->pbn == *pbn); +} + +/** + * Allocate a free page to the first completion in the waiting queue, + * and any other completions that match it in page number. + **/ +static void allocateFreePage(PageInfo *info) +{ + VDOPageCache *cache = info->cache; + assertOnCacheThread(cache, __func__); + + if (!hasWaiters(&cache->freeWaiters)) { + if (relaxedLoad64(&cache->stats.cachePressure) > 0) { + logInfo("page cache pressure relieved"); + relaxedStore64(&cache->stats.cachePressure, 0); + } + return; + } + + int result = resetPageInfo(info); + if (result != VDO_SUCCESS) { + setPersistentError(cache, "cannot reset page info", result); + return; + } + + Waiter *oldestWaiter = getFirstWaiter(&cache->freeWaiters); + PhysicalBlockNumber pbn = pageCompletionFromWaiter(oldestWaiter)->pbn; + + // Remove all entries which match the page number in question + // and push them onto the page info's wait queue. 
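+  // Every queued completion that wants this same PBN then shares the single
+  // page load launched below instead of each consuming its own free page.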
+ dequeueMatchingWaiters(&cache->freeWaiters, completionNeedsPage, + &pbn, &info->waiting); + cache->waiterCount -= countWaiters(&info->waiting); + + result = launchPageLoad(info, pbn); + if (result != VDO_SUCCESS) { + distributeErrorOverQueue(result, &info->waiting); + } +} + +/** + * Begin the process of discarding a page. + * + * @param cache the page cache + * + * @note If no page is discardable, increments a count of deferred frees so + * that the next release of a page which is no longer busy will kick + * off another discard cycle. This is an indication that the cache is + * not big enough. + * + * @note If the selected page is not dirty, immediately allocates the page + * to the oldest completion waiting for a free page. + **/ +static void discardAPage(VDOPageCache *cache) +{ + PageInfo *info = selectLRUPage(cache); + if (info == NULL) { + reportCachePressure(cache); + return; + } + + if (!isDirty(info)) { + allocateFreePage(info); + return; + } + + ASSERT_LOG_ONLY(!isInFlight(info), + "page selected for discard is not in flight"); + + ++cache->discardCount; + info->writeStatus = WRITE_STATUS_DISCARD; + launchPageSave(info); +} + +/** + * Helper used to trigger a discard so that the completion can get a different + * page. + * + * @param vdoPageComp the VDO Page completion + **/ +static void discardPageForCompletion(VDOPageCompletion *vdoPageComp) +{ + VDOPageCache *cache = vdoPageComp->cache; + + ++cache->waiterCount; + + int result = enqueueWaiter(&cache->freeWaiters, &vdoPageComp->waiter); + if (result != VDO_SUCCESS) { + setPersistentError(cache, "cannot enqueue waiter", result); + } + + discardAPage(cache); +} + +/** + * Helper used to trigger a discard if the cache needs another free page. + * + * @param cache the page cache + **/ +static void discardPageIfNeeded(VDOPageCache *cache) +{ + if (cache->waiterCount > cache->discardCount) { + discardAPage(cache); + } +} + +/**********************************************************************/ +void advanceVDOPageCachePeriod(VDOPageCache *cache, SequenceNumber period) +{ + assertOnCacheThread(cache, __func__); + advancePeriod(cache->dirtyLists, period); +} + +/** + * Inform the cache that a write has finished (possibly with an error). + * + * @param info The info structure for the page whose write just completed + * + * @return true if the page write was a discard + **/ +static bool writeHasFinished(PageInfo *info) +{ + assertOnCacheThread(info->cache, __func__); + info->cache->outstandingWrites--; + + bool wasDiscard = (info->writeStatus == WRITE_STATUS_DISCARD); + info->writeStatus = WRITE_STATUS_NORMAL; + return wasDiscard; +} + +/** + * Handler for page write errors. + * + * @param completion The page write VIO + **/ +static void handlePageWriteError(VDOCompletion *completion) +{ + int result = completion->result; + PageInfo *info = completion->parent; + VDOPageCache *cache = info->cache; + + // If we're already read-only, write failures are to be expected. 
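+  // In kernel builds the log message below is rate-limited so that a failing
+  // device cannot flood the log with one error per page write.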
+ if (result != VDO_READ_ONLY) { +#if __KERNEL__ + static DEFINE_RATELIMIT_STATE(errorLimiter, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if (__ratelimit(&errorLimiter)) { + logError("failed to write block map page %llu", info->pbn); + } +#else + logError("failed to write block map page %llu", info->pbn); +#endif + } + + setInfoState(info, PS_DIRTY); + relaxedAdd64(&cache->stats.failedWrites, 1); + setPersistentError(cache, "cannot write page", result); + + if (!writeHasFinished(info)) { + discardPageIfNeeded(cache); + } + + checkForDrainComplete(cache->zone); +} + +/** + * VIO callback used when a page has been written out. + * + * @param completion A completion for the VIO, the parent of which + * is embedded in PageInfo. + **/ +static void pageIsWrittenOut(VDOCompletion *completion) +{ + PageInfo *info = completion->parent; + VDOPageCache *cache = info->cache; + + if (cache->writeHook != NULL) { + bool rewrite = cache->writeHook(getPageBuffer(info), cache->zone, + info->context); + if (rewrite) { + launchWriteMetadataVIOWithFlush(info->vio, info->pbn, pageIsWrittenOut, + handlePageWriteError, true, false); + return; + } + } + + bool wasDiscard = writeHasFinished(info); + bool reclaimed = (!wasDiscard || (info->busy > 0) + || hasWaiters(&info->waiting)); + + setInfoState(info, PS_RESIDENT); + + uint32_t reclamations = distributePageOverQueue(info, &info->waiting); + relaxedAdd64(&cache->stats.reclaimed, reclamations); + + if (wasDiscard) { + cache->discardCount--; + } + + if (reclaimed) { + discardPageIfNeeded(cache); + } else { + allocateFreePage(info); + } + + checkForDrainComplete(cache->zone); +} + +/** + * Write the batch of pages which were covered by the layer flush which just + * completed. This callback is registered in savePages(). + * + * @param flushCompletion The flush VIO + **/ +static void writePages(VDOCompletion *flushCompletion) +{ + VDOPageCache *cache = ((PageInfo *) flushCompletion->parent)->cache; + + /* + * We need to cache these two values on the stack since in the error case + * below, it is possible for the last page info to cause the page cache to + * get freed. Hence once we launch the last page, it may be unsafe to + * dereference the cache [VDO-4724]. + */ + bool hasUnflushedPages = (cache->pagesToFlush > 0); + PageCount pagesInFlush = cache->pagesInFlush; + cache->pagesInFlush = 0; + while (pagesInFlush-- > 0) { + PageInfo *info = pageInfoFromListNode(chopRingNode(&cache->outgoingList)); + if (isReadOnly(info->cache->zone->readOnlyNotifier)) { + VDOCompletion *completion = &info->vio->completion; + resetCompletion(completion); + completion->callback = pageIsWrittenOut; + completion->errorHandler = handlePageWriteError; + finishCompletion(completion, VDO_READ_ONLY); + continue; + } + relaxedAdd64(&info->cache->stats.pagesSaved, 1); + launchWriteMetadataVIO(info->vio, info->pbn, pageIsWrittenOut, + handlePageWriteError); + } + + if (hasUnflushedPages) { + // If there are unflushed pages, the cache can't have been freed, so this + // call is safe. 
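+    // savePages() starts another flush for the pages that accumulated in
+    // pagesToFlush while this batch was in flight, so batches continue until
+    // nothing remains scheduled.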
+ savePages(cache); + } +} + +/**********************************************************************/ +void releaseVDOPageCompletion(VDOCompletion *completion) +{ + if (completion == NULL) { + return; + } + + PageInfo *discardInfo = NULL; + VDOPageCompletion *pageCompletion; + if (completion->result == VDO_SUCCESS) { + pageCompletion = validateCompletedPage(completion, false); + if (--pageCompletion->info->busy == 0) { + discardInfo = pageCompletion->info; + } + } else { + // Do not check for errors if the completion was not successful. + pageCompletion = asVDOPageCompletion(completion); + } + ASSERT_LOG_ONLY((pageCompletion->waiter.nextWaiter == NULL), + "Page being released after leaving all queues"); + + VDOPageCache *cache = pageCompletion->cache; + assertOnCacheThread(cache, __func__); + memset(pageCompletion, 0, sizeof(VDOPageCompletion)); + + if (discardInfo != NULL) { + if (discardInfo->writeStatus == WRITE_STATUS_DEFERRED) { + discardInfo->writeStatus = WRITE_STATUS_NORMAL; + launchPageSave(discardInfo); + } + // if there are excess requests for pages (that have not already started + // discards) we need to discard some page (which may be this one) + discardPageIfNeeded(cache); + } +} + +/** + * Helper function to load a page as described by a VDO Page Completion. + * + * @param info the page info representing where to load the page + * @param vdoPageComp the VDO Page Completion describing the page + **/ +static void loadPageForCompletion(PageInfo *info, + VDOPageCompletion *vdoPageComp) +{ + int result = enqueueWaiter(&info->waiting, &vdoPageComp->waiter); + if (result != VDO_SUCCESS) { + finishCompletion(&vdoPageComp->completion, result); + return; + } + + result = launchPageLoad(info, vdoPageComp->pbn); + if (result != VDO_SUCCESS) { + distributeErrorOverQueue(result, &info->waiting); + } +} + +/**********************************************************************/ +void getVDOPageAsync(VDOCompletion *completion) +{ + VDOPageCompletion *vdoPageComp = asVDOPageCompletion(completion); + VDOPageCache *cache = vdoPageComp->cache; + assertOnCacheThread(cache, __func__); + + if (vdoPageComp->writable && isReadOnly(cache->zone->readOnlyNotifier)) { + finishCompletion(completion, VDO_READ_ONLY); + return; + } + + if (vdoPageComp->writable) { + relaxedAdd64(&cache->stats.writeCount, 1); + } else { + relaxedAdd64(&cache->stats.readCount, 1); + } + + PageInfo *info = vpcFindPage(cache, vdoPageComp->pbn); + if (info != NULL) { + // The page is in the cache already. + if ((info->writeStatus == WRITE_STATUS_DEFERRED) || isIncoming(info) + || (isOutgoing(info) && vdoPageComp->writable)) { + // The page is unusable until it has finished I/O. + relaxedAdd64(&cache->stats.waitForPage, 1); + int result = enqueueWaiter(&info->waiting, &vdoPageComp->waiter); + if (result != VDO_SUCCESS) { + finishCompletion(&vdoPageComp->completion, result); + } + + return; + } + + if (isValid(info)) { + // The page is usable. + relaxedAdd64(&cache->stats.foundInCache, 1); + if (!isPresent(info)) { + relaxedAdd64(&cache->stats.readOutgoing, 1); + } + updateLru(info); + ++info->busy; + completeWithPage(info, vdoPageComp); + return; + } + // Something horrible has gone wrong. + ASSERT_LOG_ONLY(false, "Info found in a usable state."); + } + + // The page must be fetched. + info = findFreePage(cache); + if (info != NULL) { + relaxedAdd64(&cache->stats.fetchRequired, 1); + loadPageForCompletion(info, vdoPageComp); + return; + } + + // The page must wait for a page to be discarded. 
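+  // discardPageForCompletion() queues this completion on the cache's
+  // freeWaiters and nominates the least-recently-used non-busy page for
+  // discard; allocateFreePage() later hands the reclaimed slot to the oldest
+  // waiter.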
+ relaxedAdd64(&cache->stats.discardRequired, 1); + discardPageForCompletion(vdoPageComp); +} + +/**********************************************************************/ +void markCompletedVDOPageDirty(VDOCompletion *completion, + SequenceNumber oldDirtyPeriod, + SequenceNumber newDirtyPeriod) +{ + VDOPageCompletion *vdoPageComp = validateCompletedPage(completion, true); + if (vdoPageComp == NULL) { + return; + } + + PageInfo *info = vdoPageComp->info; + setInfoState(info, PS_DIRTY); + addToDirtyLists(info->cache->dirtyLists, &info->listNode, oldDirtyPeriod, + newDirtyPeriod); +} + +/**********************************************************************/ +void requestVDOPageWrite(VDOCompletion *completion) +{ + VDOPageCompletion *vdoPageComp = validateCompletedPage(completion, true); + if (vdoPageComp == NULL) { + return; + } + + PageInfo *info = vdoPageComp->info; + setInfoState(info, PS_DIRTY); + launchPageSave(info); +} + +/**********************************************************************/ +static void *dereferencePageCompletion(VDOPageCompletion *completion) +{ + return ((completion != NULL) ? getPageBuffer(completion->info) : NULL); +} + +/**********************************************************************/ +const void *dereferenceReadableVDOPage(VDOCompletion *completion) +{ + return dereferencePageCompletion(validateCompletedPage(completion, false)); +} + +/**********************************************************************/ +void *dereferenceWritableVDOPage(VDOCompletion *completion) +{ + return dereferencePageCompletion(validateCompletedPage(completion, true)); +} + +/**********************************************************************/ +void *getVDOPageCompletionContext(VDOCompletion *completion) +{ + VDOPageCompletion *pageCompletion = asVDOPageCompletion(completion); + PageInfo *info = ((pageCompletion != NULL) ? pageCompletion->info : NULL); + return (((info != NULL) && isValid(info)) ? info->context : NULL); +} + +/**********************************************************************/ +void drainVDOPageCache(VDOPageCache *cache) +{ + assertOnCacheThread(cache, __func__); + ASSERT_LOG_ONLY(isDraining(&cache->zone->state), + "drainVDOPageCache() called during block map drain"); + + if (!isSuspending(&cache->zone->state)) { + flushDirtyLists(cache->dirtyLists); + savePages(cache); + } +} + +/**********************************************************************/ +int invalidateVDOPageCache(VDOPageCache *cache) +{ + assertOnCacheThread(cache, __func__); + + // Make sure we don't throw away any dirty pages. + PageInfo *info; + for (info = cache->infos; info < cache->infos + cache->pageCount; info++) { + int result = ASSERT(!isDirty(info), "cache must have no dirty pages"); + if (result != VDO_SUCCESS) { + return result; + } + } + + // Reset the pageMap by re-allocating it. + freeIntMap(&cache->pageMap); + return makeIntMap(cache->pageCount, 0, &cache->pageMap); +} diff --git a/vdo/base/vdoPageCache.h b/vdo/base/vdoPageCache.h new file mode 100644 index 0000000..e6a944d --- /dev/null +++ b/vdo/base/vdoPageCache.h @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoPageCache.h#7 $ + */ + +#ifndef VDO_PAGE_CACHE_H +#define VDO_PAGE_CACHE_H + +#include "adminState.h" +#include "atomic.h" +#include "completion.h" +#include "types.h" +#include "waitQueue.h" + +/** + * Structure describing page meta data (defined internally). + **/ +typedef struct pageInfo PageInfo; + +/** + * Structure describing entire page cache. + * (Unfortunately the name "PageCache" is already taken by Albireo.) + **/ +typedef struct vdoPageCache VDOPageCache; + +/** + * Generation counter for page references. + **/ +typedef uint32_t VDOPageGeneration; + +/** + * Page-state count statistics sub-structure. + **/ +typedef struct { + /* free pages */ + Atomic64 freePages; + /* clean (resident) pages */ + Atomic64 cleanPages; + /* dirty pages per era */ + Atomic64 dirtyPages; + /* pages incoming */ + Atomic64 incomingPages; + /* pages outgoing */ + Atomic64 outgoingPages; + /* pages in failed state */ + Atomic64 failedPages; +} AtomicPageStateCounts; + +/** + * Statistics and debugging fields for the page cache. + */ +typedef struct { + /* counts of how many pages are in each state */ + AtomicPageStateCounts counts; + /* how many times free page not available */ + Atomic64 cachePressure; + /* number of getVDOPageAsync() for read */ + Atomic64 readCount; + /* number or getVDOPageAsync() for write */ + Atomic64 writeCount; + /* number of times pages failed to read */ + Atomic64 failedReads; + /* number of times pages failed to write */ + Atomic64 failedWrites; + /* number of gets that are reclaimed */ + Atomic64 reclaimed; + /* number of gets for outgoing pages */ + Atomic64 readOutgoing; + /* number of gets that were already there */ + Atomic64 foundInCache; + /* number of gets requiring discard */ + Atomic64 discardRequired; + /* number of gets enqueued for their page */ + Atomic64 waitForPage; + /* number of gets that have to fetch */ + Atomic64 fetchRequired; + /* number of page fetches */ + Atomic64 pagesLoaded; + /* number of page saves */ + Atomic64 pagesSaved; + /* number of flushes initiated */ + Atomic64 flushCount; +} AtomicPageCacheStatistics; + +/** + * Signature for a function to call when a page is read into the cache. + * + *
+ * If specified, this function is called when a page is fetched from disk.
+ *
+ * @param rawPage      The raw memory of the freshly-fetched page
+ * @param pbn          The absolute physical block number of the page
+ * @param zone         The block map zone to which the cache belongs
+ * @param pageContext  A pointer to client-specific data for the new page
+ *
+ * @return VDO_SUCCESS on success or VDO_BAD_PAGE if the page is incorrectly
+ *         formatted
+ **/
+typedef int VDOPageReadFunction(void *rawPage,
+                                PhysicalBlockNumber pbn,
+                                BlockMapZone *zone,
+                                void *pageContext);
+
+/**
+ * Signature for a function to call when a page is written from the cache.
+ *
If specified, this function is called when a page is written to disk. + * + * @param rawPage The raw memory of the freshly-written page + * @param zone The block map zone to which the cache belongs + * @param pageContext A pointer to client-specific data for the new page + * + * @return whether the page needs to be rewritten + **/ +typedef bool VDOPageWriteFunction(void *rawPage, + BlockMapZone *zone, + void *pageContext); + +/** + * Construct a PageCache. + * + * @param [in] layer The physical layer to read and write + * @param [in] pageCount The number of cache pages to hold + * @param [in] readHook The function to be called when a page is read + * into the cache + * @param [in] writeHook The function to be called after a page is + * written from the cache + * @param [in] pageContextSize The size of the per-page context that will be + * passed to the read and write hooks + * @param [in] maximumAge The number of journal blocks before a dirtied + * page is considered old and must be written + * out + * @param [in] zone The block map zone which owns this cache + * @param [out] cachePtr A pointer to hold the cache + * + * @return a success or error code + **/ +int makeVDOPageCache(PhysicalLayer *layer, + PageCount pageCount, + VDOPageReadFunction *readHook, + VDOPageWriteFunction *writeHook, + size_t pageContextSize, + BlockCount maximumAge, + BlockMapZone *zone, + VDOPageCache **cachePtr) + __attribute__((warn_unused_result)); + +/** + * Free the page cache structure and null out the reference to it. + * + * @param cachePtr a pointer to the cache to free + **/ +void freeVDOPageCache(VDOPageCache **cachePtr); + +/** + * Set the initial dirty period for a page cache. + * + * @param cache The cache + * @param period The initial dirty period to set + **/ +void setVDOPageCacheInitialPeriod(VDOPageCache *cache, SequenceNumber period); + +/** + * Switch the page cache into or out of read-only rebuild mode. + * + * @param cache The cache + * @param rebuilding true if the cache should be put into + * read-only rebuild mode, false otherwise + **/ +void setVDOPageCacheRebuildMode(VDOPageCache *cache, bool rebuilding); + +/** + * Check whether a page cache is active (i.e. has any active lookups, + * outstanding I/O, or pending I/O). + * + * @param cache The cache to check + * + * @return true if the cache is active + **/ +bool isPageCacheActive(VDOPageCache *cache) + __attribute__((warn_unused_result)); + +/** + * Advance the dirty period for a page cache. + * + * @param cache The cache to advance + * @param period The new dirty period + **/ +void advanceVDOPageCachePeriod(VDOPageCache *cache, SequenceNumber period); + +/** + * Write one or more batches of dirty pages. + * + * All writable pages in the ancient era and some number in the old era + * are scheduled for writing. + * + * @param cache the VDO page cache + * @param batches how many batches to write now + * @param total how many batches (including those being written now) remain + * in this era + **/ +void writeVDOPageCachePages(VDOPageCache *cache, + size_t batches, + size_t total); + +/** + * Rotate the dirty page eras. + * + * Move all pages in the old era to the ancient era and then move + * the current era bin into the old era. + * + * @param cache the VDO page cache + **/ +void rotateVDOPageCacheEras(VDOPageCache *cache); + +// ASYNC + +/** + * A completion awaiting a specific page. Also a live reference into the + * page once completed, until freed. 
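+ *
+ * A minimal usage sketch (not from the original header); cache, pbn, and
+ * parent are assumed to exist, the two callbacks are placeholders, and error
+ * handling is omitted:
+ *
+ *   VDOPageCompletion pageCompletion;
+ *   initVDOPageCompletion(&pageCompletion, cache, pbn, true, parent,
+ *                         pageReadyCallback, pageErrorHandler);
+ *   getVDOPageAsync(&pageCompletion.completion);
+ *
+ * Once pageReadyCallback runs, dereferenceWritableVDOPage() returns the page
+ * buffer; the page stays busy until releaseVDOPageCompletion() is called.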
+ **/ +typedef struct { + /** The generic completion */ + VDOCompletion completion; + /** The cache involved */ + VDOPageCache *cache; + /** The waiter for the pending list */ + Waiter waiter; + /** The absolute physical block number of the page on disk */ + PhysicalBlockNumber pbn; + /** Whether the page may be modified */ + bool writable; + /** Whether the page is available */ + bool ready; + /** The info structure for the page, only valid when ready */ + PageInfo *info; +} VDOPageCompletion; + +/** + * Initialize a VDO Page Completion, requesting a particular page from the + * cache. + * + * @param pageCompletion The VDOPageCompletion to initialize + * @param cache The VDO page cache + * @param pbn The absolute physical block of the desired page + * @param writable Whether the page can be modified + * @param parent The parent object + * @param callback The completion callback + * @param errorHandler The handler for page errors + * + * @note Once a completion has occurred for the getVDOPageAsync operation, + * the underlying page shall be busy (stuck in memory) until the + * VDOCompletion returned by this operation has been released. + **/ +void initVDOPageCompletion(VDOPageCompletion *pageCompletion, + VDOPageCache *cache, + PhysicalBlockNumber pbn, + bool writable, + void *parent, + VDOAction *callback, + VDOAction *errorHandler); + +/** + * Release a VDO Page Completion. + * + * The page referenced by this completion (if any) will no longer be + * held busy by this completion. If a page becomes discardable and + * there are completions awaiting free pages then a new round of + * page discarding is started. + * + * @param completion The completion to release + **/ +void releaseVDOPageCompletion(VDOCompletion *completion); + +/** + * Asynchronous operation to get a VDO page. + * + * May cause another page to be discarded (potentially writing a dirty page) + * and the one nominated by the completion to be loaded from disk. + * + * When the page becomes available the callback registered in the completion + * provided is triggered. Once triggered the page is marked busy until + * the completion is destroyed. + * + * @param completion the completion initialized my initVDOPageCompletion(). + **/ +void getVDOPageAsync(VDOCompletion *completion); + +/** + * Mark a VDO page referenced by a completed VDOPageCompletion as dirty. + * + * @param completion a VDO Page Completion whose callback has been called + * @param oldDirtyPeriod the period in which the page was already dirty (0 if + * it wasn't) + * @param newDirtyPeriod the period in which the page is now dirty + **/ +void markCompletedVDOPageDirty(VDOCompletion *completion, + SequenceNumber oldDirtyPeriod, + SequenceNumber newDirtyPeriod); + +/** + * Request that a VDO page be written out as soon as it is not busy. + * + * @param completion the VDOPageCompletion containing the page + **/ +void requestVDOPageWrite(VDOCompletion *completion); + +/** + * Access the raw memory for a read-only page of a completed VDOPageCompletion. + * + * @param completion a vdo page completion whose callback has been called + * + * @return a pointer to the raw memory at the beginning of the page, or + * NULL if the page is not available. + **/ +const void *dereferenceReadableVDOPage(VDOCompletion *completion); + +/** + * Access the raw memory for a writable page of a completed VDOPageCompletion. 
+ * + * @param completion a vdo page completion whose callback has been called + * + * @return a pointer to the raw memory at the beginning of the page, or + * NULL if the page is not available, or if the page is read-only + **/ +void *dereferenceWritableVDOPage(VDOCompletion *completion); + +/** + * Get the per-page client context for the page in a page completion whose + * callback has been invoked. Should only be called after dereferencing the + * page completion to validate the page. + * + * @param completion a vdo page completion whose callback has been invoked + * + * @return a pointer to the per-page client context, or NULL if + * the page is not available + **/ +void *getVDOPageCompletionContext(VDOCompletion *completion); + +/** + * Drain I/O for a page cache. + * + * @param cache The cache to drain + **/ +void drainVDOPageCache(VDOPageCache *cache); + +/** + * Invalidate all entries in the VDO page cache. There must not be any + * dirty pages in the cache. + * + * @param cache the cache to invalidate + * + * @return a success or error code + **/ +int invalidateVDOPageCache(VDOPageCache *cache) + __attribute__((warn_unused_result)); + +// STATISTICS & TESTING + +/** + * Get current cache statistics. + * + * @param cache the page cache + * + * @return the statistics + **/ +AtomicPageCacheStatistics *getVDOPageCacheStatistics(VDOPageCache *cache) + __attribute__((warn_unused_result)); + +#endif // VDO_PAGE_CACHE_H diff --git a/vdo/base/vdoPageCacheInternals.h b/vdo/base/vdoPageCacheInternals.h new file mode 100644 index 0000000..4e2c67f --- /dev/null +++ b/vdo/base/vdoPageCacheInternals.h @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoPageCacheInternals.h#8 $ + */ + +#ifndef VDO_PAGE_CACHE_INTERNALS_H +#define VDO_PAGE_CACHE_INTERNALS_H + +#include "vdoPageCache.h" + +#ifndef __KERNEL__ +# include +#endif + +#include "blockMapInternals.h" +#include "completion.h" +#include "dirtyLists.h" +#include "intMap.h" +#include "physicalLayer.h" +#include "ringNode.h" + +enum { + MAX_PAGE_CONTEXT_SIZE = 8, +}; + +static const PhysicalBlockNumber NO_PAGE = 0xFFFFFFFFFFFFFFFF; + +/** + * A PageInfoNode is a ring node. + **/ +typedef RingNode PageInfoNode; + +/** + * The VDO Page Cache abstraction. 
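+ *
+ * Each page buffer cycles through the PageState values defined below: free,
+ * incoming, resident, dirty, outgoing, or failed; only resident and dirty
+ * pages hold stable data that may be accessed.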
+ **/ +struct vdoPageCache { + /** the physical layer to page to */ + PhysicalLayer *layer; + /** number of pages in cache */ + PageCount pageCount; + /** function to call on page read */ + VDOPageReadFunction *readHook; + /** function to call on page write */ + VDOPageWriteFunction *writeHook; + /** number of pages to write in the current batch */ + PageCount pagesInBatch; + /** Whether the VDO is doing a read-only rebuild */ + bool rebuilding; + + /** array of page information entries */ + PageInfo *infos; + /** raw memory for pages */ + char *pages; + /** cache last found page info */ + PageInfo *lastFound; + /** map of page number to info */ + IntMap *pageMap; + /** master LRU list (all infos) */ + PageInfoNode lruList; + /** dirty pages by period */ + DirtyLists *dirtyLists; + /** free page list (oldest first) */ + PageInfoNode freeList; + /** outgoing page list */ + PageInfoNode outgoingList; + /** number of read I/O operations pending */ + PageCount outstandingReads; + /** number of write I/O operations pending */ + PageCount outstandingWrites; + /** number of pages covered by the current flush */ + PageCount pagesInFlush; + /** number of pages waiting to be included in the next flush */ + PageCount pagesToFlush; + /** number of discards in progress */ + unsigned int discardCount; + /** how many VPCs waiting for free page */ + unsigned int waiterCount; + /** queue of waiters who want a free page */ + WaitQueue freeWaiters; + /** statistics */ + AtomicPageCacheStatistics stats; + /** counter for pressure reports */ + uint32_t pressureReport; + /** the block map zone to which this cache belongs */ + BlockMapZone *zone; +}; + +/** + * The state of a page buffer. If the page buffer is free no particular page is + * bound to it, otherwise the page buffer is bound to particular page whose + * absolute pbn is in the pbn field. If the page is resident or dirty the page + * data is stable and may be accessed. Otherwise the page is in flight + * (incoming or outgoing) and its data should not be accessed. + * + * @note Update the static data in vpcPageStateName() and vpcPageStateFlag() + * if you change this enumeration. + **/ +typedef enum __attribute__((packed)) pageState { + /* this page buffer is not being used */ + PS_FREE, + /* this page is being read from store */ + PS_INCOMING, + /* attempt to load this page failed */ + PS_FAILED, + /* this page is valid and un-modified */ + PS_RESIDENT, + /* this page is valid and modified */ + PS_DIRTY, + /* this page is being written and should not be used */ + PS_OUTGOING, + /* not a state */ + PAGE_STATE_COUNT, +} PageState; + +/** + * The write status of page + **/ +typedef enum __attribute__((packed)) { + WRITE_STATUS_NORMAL, + WRITE_STATUS_DISCARD, + WRITE_STATUS_DEFERRED, +} WriteStatus; + +/** + * Per-page-slot information. 
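+ *
+ * The busy count is raised once for each completion currently holding the
+ * page and is dropped by releaseVDOPageCompletion(); while it is non-zero
+ * the page is neither discarded nor written out (saves are deferred).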
+ **/ +struct pageInfo { + /** Preallocated page VIO */ + VIO *vio; + /** back-link for references */ + VDOPageCache *cache; + /** the pbn of the page */ + PhysicalBlockNumber pbn; + /** page is busy (temporarily locked) */ + uint16_t busy; + /** the write status the page */ + WriteStatus writeStatus; + /** page state */ + PageState state; + /** queue of completions awaiting this item */ + WaitQueue waiting; + /** state linked list node */ + PageInfoNode listNode; + /** LRU node */ + PageInfoNode lruNode; + /** Space for per-page client data */ + byte context[MAX_PAGE_CONTEXT_SIZE]; +}; + +// PAGE INFO LIST OPERATIONS + +/**********************************************************************/ +static inline PageInfo *pageInfoFromListNode(PageInfoNode *node) +{ + if (node == NULL) { + return NULL; + } + return (PageInfo *) ((uintptr_t) node - offsetof(PageInfo, listNode)); +} + +/**********************************************************************/ +static inline PageInfo *pageInfoFromLRUNode(PageInfoNode *node) +{ + if (node == NULL) { + return NULL; + } + return (PageInfo *) ((uintptr_t) node - offsetof(PageInfo, lruNode)); +} + +// PAGE INFO STATE ACCESSOR FUNCTIONS + +/**********************************************************************/ +static inline bool isFree(const PageInfo *info) +{ + return info->state == PS_FREE; +} + +/**********************************************************************/ +static inline bool isAvailable(const PageInfo *info) +{ + return (info->state == PS_FREE) || (info->state == PS_FAILED); +} + +/**********************************************************************/ +static inline bool isPresent(const PageInfo *info) +{ + return (info->state == PS_RESIDENT) || (info->state == PS_DIRTY); +} + +/**********************************************************************/ +static inline bool isDirty(const PageInfo *info) +{ + return info->state == PS_DIRTY; +} + +/**********************************************************************/ +static inline bool isResident(const PageInfo *info) +{ + return info->state == PS_RESIDENT; +} + +/**********************************************************************/ +static inline bool isInFlight(const PageInfo *info) +{ + return (info->state == PS_INCOMING) || (info->state == PS_OUTGOING); +} + +/**********************************************************************/ +static inline bool isIncoming(const PageInfo *info) +{ + return info->state == PS_INCOMING; +} + +/**********************************************************************/ +static inline bool isOutgoing(const PageInfo *info) +{ + return info->state == PS_OUTGOING; +} + +/**********************************************************************/ +static inline bool isValid(const PageInfo *info) +{ + return isPresent(info) || isOutgoing(info); +} + +// COMPLETION CONVERSIONS + +/**********************************************************************/ +static inline VDOPageCompletion *asVDOPageCompletion(VDOCompletion *completion) +{ + assertCompletionType(completion->type, VDO_PAGE_COMPLETION); + return (VDOPageCompletion *) ((uintptr_t) completion + - offsetof(VDOPageCompletion, completion)); +} + +/**********************************************************************/ +static inline +VDOPageCompletion *pageCompletionFromWaiter(Waiter *waiter) +{ + if (waiter == NULL) { + return NULL; + } + + VDOPageCompletion *completion = (VDOPageCompletion *) + ((uintptr_t) waiter - offsetof(VDOPageCompletion, waiter)); + assertCompletionType(completion->completion.type, 
VDO_PAGE_COMPLETION); + return completion; +} + +// COMMONLY USED FUNCTIONS + +// All of these functions are prefixed "vpc" in order to prevent namespace +// issues (ordinarily they would be static). + +/** + * Find the page info (if any) associated with a given pbn. + * + * @param cache the page cache + * @param pbn the absolute physical block number of the page + * + * @return the page info for the page if available, or NULL if not + **/ +PageInfo *vpcFindPage(VDOPageCache *cache, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Return the name of a page state. + * + * @param state a page state + * + * @return a pointer to a static page state name + * + * @note If the page state is invalid a static string is returned and the + * invalid state is logged. + **/ +const char *vpcPageStateName(PageState state) + __attribute__((warn_unused_result)); + +#endif // VDO_PAGE_CACHE_INTERNALS_H diff --git a/vdo/base/vdoRecovery.c b/vdo/base/vdoRecovery.c new file mode 100644 index 0000000..97e72eb --- /dev/null +++ b/vdo/base/vdoRecovery.c @@ -0,0 +1,1257 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoRecovery.c#16 $ + */ + +#include "vdoRecoveryInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockAllocator.h" +#include "blockAllocatorInternals.h" +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "blockMapRecovery.h" +#include "completion.h" +#include "numUtils.h" +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournal.h" +#include "recoveryUtils.h" +#include "slab.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "slabJournalInternals.h" +#include "vdoInternal.h" +#include "waitQueue.h" + +enum { + // The int map needs capacity of twice the number of VIOs in the system. + INT_MAP_CAPACITY = MAXIMUM_USER_VIOS * 2, + // There can be as many missing decrefs as there are VIOs in the system. + MAXIMUM_SYNTHESIZED_DECREFS = MAXIMUM_USER_VIOS, +}; + +typedef struct missingDecref { + /** A waiter for queueing this object */ + Waiter waiter; + /** The parent of this object */ + RecoveryCompletion *recovery; + /** Whether this decref is complete */ + bool complete; + /** The slot for which the last decref was lost */ + BlockMapSlot slot; + /** The penultimate block map entry for this LBN */ + DataLocation penultimateMapping; + /** The page completion used to fetch the block map page for this LBN */ + VDOPageCompletion pageCompletion; + /** The journal point which will be used for this entry */ + JournalPoint journalPoint; + /** The slab journal to which this entry will be applied */ + SlabJournal *slabJournal; +} MissingDecref; + +/** + * Convert a Waiter to the missing decref of which it is a part. 
+ * + * @param waiter The Waiter to convert + * + * @return The MissingDecref wrapping the Waiter + **/ +__attribute__((warn_unused_result)) +static inline MissingDecref *asMissingDecref(Waiter *waiter) +{ + STATIC_ASSERT(offsetof(MissingDecref, waiter) == 0); + return (MissingDecref *) waiter; +} + +/** + * Enqueue a MissingDecref. If the enqueue fails, enter read-only mode. + * + * @param queue The queue on which to enqueue the decref + * @param decref The MissingDecref to enqueue + * + * @return VDO_SUCCESS or an error + **/ +static int enqueueMissingDecref(WaitQueue *queue, MissingDecref *decref) +{ + int result = enqueueWaiter(queue, &decref->waiter); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(decref->recovery->vdo->readOnlyNotifier, result); + setCompletionResult(&decref->recovery->completion, result); + FREE(decref); + } + + return result; +} + +/** + * Convert a BlockMapSlot into a unique uint64_t. + * + * @param slot The block map slot to convert. + * + * @return a one-to-one mappable uint64_t. + **/ +static uint64_t slotAsNumber(BlockMapSlot slot) +{ + return (((uint64_t) slot.pbn << 10) + slot.slot); +} + +/** + * Create a MissingDecref and enqueue it to wait for a determination of its + * penultimate mapping. + * + * @param [in] recovery The parent recovery completion + * @param [in] entry The recovery journal entry for the increment which is + * missing a decref + * @param [out] decrefPtr A pointer to hold the new MissingDecref + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int makeMissingDecref(RecoveryCompletion *recovery, + RecoveryJournalEntry entry, + MissingDecref **decrefPtr) +{ + MissingDecref *decref; + int result = ALLOCATE(1, MissingDecref, __func__, &decref); + if (result != VDO_SUCCESS) { + return result; + } + + decref->recovery = recovery; + result = enqueueMissingDecref(&recovery->missingDecrefs[0], decref); + if (result != VDO_SUCCESS) { + return result; + } + + /* + * Each synthsized decref needs a unique journal point. Otherwise, in the + * event of a crash, we would be unable to tell which synthesized decrefs had + * already been committed in the slab journals. Instead of using real + * recovery journal space for this, we can use fake journal points between + * the last currently valid entry in the tail block and the first journal + * entry in the next block. We can't overflow the entry count since the + * number of synthesized decrefs is bounded by the DataVIO limit. + * + * It is vital that any given missing decref always have the same fake + * journal point since a failed recovery may be retried with a different + * number of zones after having written out some slab journal blocks. Since + * the missing decrefs are always read out of the journal in the same order, + * we can assign them a journal point when they are read. Their subsequent + * use will ensure that, for any given slab journal, they are applied in + * the order dictated by these assigned journal points. + */ + decref->slot = entry.slot; + decref->journalPoint = recovery->nextSynthesizedJournalPoint; + recovery->nextSynthesizedJournalPoint.entryCount++; + recovery->missingDecrefCount++; + recovery->incompleteDecrefCount++; + + *decrefPtr = decref; + return VDO_SUCCESS; +} + +/** + * Move the given recovery point forward by one entry. 
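+ *
+ * The entry count advances within the current sector; entry sectors are
+ * numbered starting at 1, and the last sector of a block holds
+ * RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR entries rather than
+ * RECOVERY_JOURNAL_ENTRIES_PER_SECTOR. Filling a sector advances the sector
+ * count, and filling the last sector advances the sequence number and resets
+ * the point to sector 1, entry 0 of the next block.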
+ * + * @param point The recovery point to alter + **/ +static void incrementRecoveryPoint(RecoveryPoint *point) +{ + point->entryCount++; + if ((point->sectorCount == (SECTORS_PER_BLOCK - 1)) + && (point->entryCount == RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR)) { + point->sequenceNumber++; + point->sectorCount = 1; + point->entryCount = 0; + } + + if (point->entryCount == RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) { + point->sectorCount++; + point->entryCount = 0; + return; + } +} + +/** + * Move the given recovery point backwards by one entry. + * + * @param point The recovery point to alter + **/ +static void decrementRecoveryPoint(RecoveryPoint *point) +{ + STATIC_ASSERT(RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR > 0); + + if ((point->sectorCount <= 1) && (point->entryCount == 0)) { + point->sequenceNumber--; + point->sectorCount = SECTORS_PER_BLOCK - 1; + point->entryCount = RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR - 1; + return; + } + + if (point->entryCount == 0) { + point->sectorCount--; + point->entryCount = RECOVERY_JOURNAL_ENTRIES_PER_SECTOR - 1; + return; + } + + point->entryCount--; +} + +/** + * Check whether the first point precedes the second point. + * + * @param first The first recovery point + * @param second The second recovery point + * + * @return true if the first point precedes the second point + **/ +__attribute__((warn_unused_result)) +static bool beforeRecoveryPoint(const RecoveryPoint *first, + const RecoveryPoint *second) +{ + if (first->sequenceNumber < second->sequenceNumber) { + return true; + } + + if (first->sequenceNumber > second->sequenceNumber) { + return false; + } + + if (first->sectorCount < second->sectorCount) { + return true; + } + + return ((first->sectorCount == second->sectorCount) + && (first->entryCount < second->entryCount)); +} + +/** + * Prepare the sub-task completion. + * + * @param recovery The RecoveryCompletion whose sub-task completion is to + * be prepared + * @param callback The callback to register for the next sub-task + * @param errorHandler The error handler for the next sub-task + * @param zoneType The type of zone on which the callback or errorHandler + * should run + **/ +static void prepareSubTask(RecoveryCompletion *recovery, + VDOAction callback, + VDOAction errorHandler, + ZoneType zoneType) +{ + const ThreadConfig *threadConfig = getThreadConfig(recovery->vdo); + ThreadID threadID; + switch (zoneType) { + case ZONE_TYPE_LOGICAL: + // All blockmap access is done on single thread, so use logical zone 0. 
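+    // During recovery the block map is read through logical zone 0's page
+    // cache (see findSlabJournalEntries() and processFetchedPage()), so any
+    // sub-task touching the block map must run on that zone's thread.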
+ threadID = getLogicalZoneThread(threadConfig, 0); + break; + + case ZONE_TYPE_PHYSICAL: + threadID = recovery->allocator->threadID; + break; + + case ZONE_TYPE_ADMIN: + default: + threadID = getAdminThread(threadConfig); + } + + prepareCompletion(&recovery->subTaskCompletion, callback, errorHandler, + threadID, recovery); +} + +/**********************************************************************/ +int makeRecoveryCompletion(VDO *vdo, RecoveryCompletion **recoveryPtr) +{ + const ThreadConfig *threadConfig = getThreadConfig(vdo); + RecoveryCompletion *recovery; + int result = ALLOCATE_EXTENDED(RecoveryCompletion, + threadConfig->physicalZoneCount, RingNode, + __func__, &recovery); + if (result != VDO_SUCCESS) { + return result; + } + + recovery->vdo = vdo; + for (ZoneCount z = 0; z < threadConfig->physicalZoneCount; z++) { + initializeWaitQueue(&recovery->missingDecrefs[z]); + } + + result = initializeEnqueueableCompletion(&recovery->completion, + RECOVERY_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + freeRecoveryCompletion(&recovery); + return result; + } + + result = initializeEnqueueableCompletion(&recovery->subTaskCompletion, + SUB_TASK_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + freeRecoveryCompletion(&recovery); + return result; + } + + result = makeIntMap(INT_MAP_CAPACITY, 0, &recovery->slotEntryMap); + if (result != VDO_SUCCESS) { + freeRecoveryCompletion(&recovery); + return result; + } + + *recoveryPtr = recovery; + return VDO_SUCCESS; +} + +/** + * A waiter callback to free MissingDecrefs. + * + * Implements WaiterCallback. + **/ +static void freeMissingDecref(Waiter *waiter, + void *context __attribute__((unused))) +{ + FREE(asMissingDecref(waiter)); +} + +/**********************************************************************/ +void freeRecoveryCompletion(RecoveryCompletion **recoveryPtr) +{ + RecoveryCompletion *recovery = *recoveryPtr; + if (recovery == NULL) { + return; + } + + freeIntMap(&recovery->slotEntryMap); + const ThreadConfig *threadConfig = getThreadConfig(recovery->vdo); + for (ZoneCount z = 0; z < threadConfig->physicalZoneCount; z++) { + notifyAllWaiters(&recovery->missingDecrefs[z], freeMissingDecref, NULL); + } + + FREE(recovery->journalData); + FREE(recovery->entries); + destroyEnqueueable(&recovery->subTaskCompletion); + destroyEnqueueable(&recovery->completion); + FREE(recovery); + *recoveryPtr = NULL; +} + +/** + * Finish recovering, free the recovery completion and notify the parent. + * + * @param completion The recovery completion + **/ +static void finishRecovery(VDOCompletion *completion) +{ + VDOCompletion *parent = completion->parent; + RecoveryCompletion *recovery = asRecoveryCompletion(completion); + VDO *vdo = recovery->vdo; + uint64_t recoveryCount = ++vdo->completeRecoveries; + initializeRecoveryJournalPostRecovery(vdo->recoveryJournal, + recoveryCount, recovery->highestTail); + freeRecoveryCompletion(&recovery); + logInfo("Rebuild complete."); + + // Now that we've freed the recovery completion and its vast array of + // journal entries, we can allocate refcounts. + int result = allocateSlabRefCounts(vdo->depot); + finishCompletion(parent, result); +} + +/** + * Handle a recovery error. 
+ *
+ * @param completion  The recovery completion
+ **/
+static void abortRecovery(VDOCompletion *completion)
+{
+  VDOCompletion      *parent   = completion->parent;
+  int                 result   = completion->result;
+  RecoveryCompletion *recovery = asRecoveryCompletion(completion);
+  freeRecoveryCompletion(&recovery);
+  logWarning("Recovery aborted");
+  finishCompletion(parent, result);
+}
+
+/**
+ * Abort a recovery if there is an error.
+ *
+ * @param result    The result to check
+ * @param recovery  The recovery completion
+ *
+ * @return true if the result was an error
+ **/
+__attribute__((warn_unused_result))
+static bool abortRecoveryOnError(int result, RecoveryCompletion *recovery)
+{
+  if (result == VDO_SUCCESS) {
+    return false;
+  }
+
+  finishCompletion(&recovery->completion, result);
+  return true;
+}
+
+/**
+ * Unpack the recovery journal entry associated with the given recovery point.
+ *
+ * @param recovery  The recovery completion
+ * @param point     The recovery point
+ *
+ * @return The unpacked contents of the matching recovery journal entry
+ **/
+static RecoveryJournalEntry getEntry(const RecoveryCompletion *recovery,
+                                     const RecoveryPoint      *point)
+{
+  RecoveryJournal *journal = recovery->vdo->recoveryJournal;
+  PhysicalBlockNumber blockNumber
+    = getRecoveryJournalBlockNumber(journal, point->sequenceNumber);
+  off_t sectorOffset
+    = (blockNumber * VDO_BLOCK_SIZE) + (point->sectorCount * VDO_SECTOR_SIZE);
+  PackedJournalSector *sector
+    = (PackedJournalSector *) &recovery->journalData[sectorOffset];
+  return unpackRecoveryJournalEntry(&sector->entries[point->entryCount]);
+}
+
+/**
+ * Create an array of all valid journal entries, in order, and store it in the
+ * recovery completion.
+ *
+ * @param recovery  The recovery completion
+ *
+ * @return VDO_SUCCESS or an error code
+ **/
+static int extractJournalEntries(RecoveryCompletion *recovery)
+{
+  // Allocate a NumberedBlockMapping array just large enough to transcribe
+  // every increment PackedRecoveryJournalEntry from every valid journal block.
+  int result = ALLOCATE(recovery->increfCount, NumberedBlockMapping, __func__,
+                        &recovery->entries);
+  if (result != VDO_SUCCESS) {
+    return result;
+  }
+
+  RecoveryPoint recoveryPoint = {
+    .sequenceNumber = recovery->blockMapHead,
+    .sectorCount    = 1,
+    .entryCount     = 0,
+  };
+  while (beforeRecoveryPoint(&recoveryPoint, &recovery->tailRecoveryPoint)) {
+    RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint);
+    result = validateRecoveryJournalEntry(recovery->vdo, &entry);
+    if (result != VDO_SUCCESS) {
+      enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result);
+      return result;
+    }
+
+    if (isIncrementOperation(entry.operation)) {
+      recovery->entries[recovery->entryCount] = (NumberedBlockMapping) {
+        .blockMapSlot  = entry.slot,
+        .blockMapEntry = packPBN(entry.mapping.pbn, entry.mapping.state),
+        .number        = recovery->entryCount,
+      };
+      recovery->entryCount++;
+    }
+
+    incrementRecoveryPoint(&recoveryPoint);
+  }
+
+  result = ASSERT((recovery->entryCount <= recovery->increfCount),
+                  "approximate incref count is an upper bound");
+  if (result != VDO_SUCCESS) {
+    enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result);
+  }
+
+  return result;
+}
+
+/**
+ * Extract journal entries and recover the block map. This callback is
+ * registered in startSuperBlockSave().
+ * + * @param completion The sub-task completion + **/ +static void launchBlockMapRecovery(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + assertOnLogicalZoneThread(vdo, 0, __func__); + + // Extract the journal entries for the block map recovery. + int result = extractJournalEntries(recovery); + if (abortRecoveryOnError(result, recovery)) { + return; + } + + prepareToFinishParent(completion, &recovery->completion); + recoverBlockMap(vdo, recovery->entryCount, recovery->entries, completion); +} + +/** + * Finish flushing all slab journals and start a write of the super block. + * This callback is registered in addSynthesizedEntries(). + * + * @param completion The sub-task completion + **/ +static void startSuperBlockSave(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + assertOnAdminThread(vdo, __func__); + + logInfo("Saving recovery progress"); + vdo->state = VDO_REPLAYING; + + // The block map access which follows the super block save must be done + // on a logical thread. + prepareSubTask(recovery, launchBlockMapRecovery, finishParentCallback, + ZONE_TYPE_LOGICAL); + saveVDOComponentsAsync(vdo, completion); +} + +/** + * The callback from loading the slab depot. It will update the logical blocks + * and block map data blocks counts in the recovery journal and then drain the + * slab depot in order to commit the recovered slab journals. It is registered + * in applyToDepot(). + * + * @param completion The sub-task completion + **/ +static void finishRecoveringDepot(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + assertOnAdminThread(vdo, __func__); + + logInfo("Replayed %zu journal entries into slab journals", + recovery->entriesAddedToSlabJournals); + logInfo("Synthesized %zu missing journal entries", + recovery->missingDecrefCount); + vdo->recoveryJournal->logicalBlocksUsed = recovery->logicalBlocksUsed; + vdo->recoveryJournal->blockMapDataBlocks = recovery->blockMapDataBlocks; + + prepareSubTask(recovery, startSuperBlockSave, finishParentCallback, + ZONE_TYPE_ADMIN); + drainSlabDepot(vdo->depot, ADMIN_STATE_RECOVERING, completion); +} + +/** + * The error handler for recovering slab journals. It will skip any remaining + * recovery on the current zone and propagate the error. It is registered in + * addSlabJournalEntries() and addSynthesizedEntries(). + * + * @param completion The completion of the block allocator being recovered + **/ +static void handleAddSlabJournalEntryError(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + notifySlabJournalsAreRecovered(recovery->allocator, completion->result); +} + +/** + * Add synthesized entries into slab journals, waiting when necessary. 
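+ *
+ * This drains the current zone's queue of MissingDecrefs, replaying each one
+ * as a DATA_DECREMENT at its pre-assigned synthesized journal point. If a
+ * slab journal has no room in its tail block, the completion waits and this
+ * function is re-run as its own callback once space is available.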
+ * + * @param completion The allocator completion + **/ +static void addSynthesizedEntries(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + + // Get ready in case we need to enqueue again + prepareCompletion(completion, addSynthesizedEntries, + handleAddSlabJournalEntryError, + completion->callbackThreadID, recovery); + WaitQueue *missingDecrefs + = &recovery->missingDecrefs[recovery->allocator->zoneNumber]; + while (hasWaiters(missingDecrefs)) { + MissingDecref *decref = asMissingDecref(getFirstWaiter(missingDecrefs)); + if (!attemptReplayIntoSlabJournal(decref->slabJournal, + decref->penultimateMapping.pbn, + DATA_DECREMENT, &decref->journalPoint, + completion)) { + return; + } + + dequeueNextWaiter(missingDecrefs); + FREE(decref); + } + + notifySlabJournalsAreRecovered(recovery->allocator, VDO_SUCCESS); +} + +/** + * Determine the LBNs used count as of the end of the journal (but + * not including any changes to that count from entries that will be + * synthesized later). + * + * @param recovery The recovery completion + * + * @return VDO_SUCCESS or an error + **/ +static int computeUsages(RecoveryCompletion *recovery) +{ + RecoveryJournal *journal = recovery->vdo->recoveryJournal; + PackedJournalHeader *tailHeader + = getJournalBlockHeader(journal, recovery->journalData, recovery->tail); + + RecoveryBlockHeader unpacked; + unpackRecoveryBlockHeader(tailHeader, &unpacked); + recovery->logicalBlocksUsed = unpacked.logicalBlocksUsed; + recovery->blockMapDataBlocks = unpacked.blockMapDataBlocks; + + RecoveryPoint recoveryPoint = { + .sequenceNumber = recovery->tail, + .sectorCount = 1, + .entryCount = 0, + }; + while (beforeRecoveryPoint(&recoveryPoint, &recovery->tailRecoveryPoint)) { + RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint); + if (isMappedLocation(&entry.mapping)) { + switch (entry.operation) { + case DATA_INCREMENT: + recovery->logicalBlocksUsed++; + break; + + case DATA_DECREMENT: + recovery->logicalBlocksUsed--; + break; + + case BLOCK_MAP_INCREMENT: + recovery->blockMapDataBlocks++; + break; + + default: + return logErrorWithStringError(VDO_CORRUPT_JOURNAL, + "Recovery journal entry at " + "sequence number %" PRIu64 + ", sector %u, entry %u had invalid " + "operation %u", + recoveryPoint.sequenceNumber, + recoveryPoint.sectorCount, + recoveryPoint.entryCount, + entry.operation); + } + } + + incrementRecoveryPoint(&recoveryPoint); + } + + return VDO_SUCCESS; +} + +/** + * Advance the current recovery and journal points. + * + * @param recovery The RecoveryCompletion whose points are to be + * advanced + * @param entriesPerBlock The number of entries in a recovery journal block + **/ +static void advancePoints(RecoveryCompletion *recovery, + JournalEntryCount entriesPerBlock) +{ + incrementRecoveryPoint(&recovery->nextRecoveryPoint); + advanceJournalPoint(&recovery->nextJournalPoint, entriesPerBlock); +} + +/** + * Replay recovery journal entries into the slab journals of the allocator + * currently being recovered, waiting for slab journal tailblock space when + * necessary. This method is its own callback. + * + * @param completion The allocator completion + **/ +static void addSlabJournalEntries(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + RecoveryJournal *journal = vdo->recoveryJournal; + + // Get ready in case we need to enqueue again. 
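+  // If attemptReplayIntoSlabJournal() cannot add an entry, it holds this
+  // completion until tail block space is available; the retry resumes at
+  // nextRecoveryPoint, which only advances after a successful replay.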
+ prepareCompletion(completion, addSlabJournalEntries, + handleAddSlabJournalEntryError, + completion->callbackThreadID, recovery); + for (RecoveryPoint *recoveryPoint = &recovery->nextRecoveryPoint; + beforeRecoveryPoint(recoveryPoint, &recovery->tailRecoveryPoint); + advancePoints(recovery, journal->entriesPerBlock)) { + RecoveryJournalEntry entry = getEntry(recovery, recoveryPoint); + int result = validateRecoveryJournalEntry(vdo, &entry); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(journal->readOnlyNotifier, result); + finishCompletion(completion, result); + return; + } + + if (entry.mapping.pbn == ZERO_BLOCK) { + continue; + } + + Slab *slab = getSlab(vdo->depot, entry.mapping.pbn); + if (slab->allocator != recovery->allocator) { + continue; + } + + if (!attemptReplayIntoSlabJournal(slab->journal, entry.mapping.pbn, + entry.operation, + &recovery->nextJournalPoint, + completion)) { + return; + } + + recovery->entriesAddedToSlabJournals++; + } + + logInfo("Recreating missing journal entries for zone %u", + recovery->allocator->zoneNumber); + addSynthesizedEntries(completion); +} + +/**********************************************************************/ +void replayIntoSlabJournals(BlockAllocator *allocator, + VDOCompletion *completion, + void *context) +{ + RecoveryCompletion *recovery = context; + assertOnPhysicalZoneThread(recovery->vdo, allocator->zoneNumber, __func__); + if ((recovery->journalData == NULL) || isReplaying(recovery->vdo)) { + // there's nothing to replay + notifySlabJournalsAreRecovered(allocator, VDO_SUCCESS); + return; + } + + recovery->allocator = allocator; + recovery->nextRecoveryPoint = (RecoveryPoint) { + .sequenceNumber = recovery->slabJournalHead, + .sectorCount = 1, + .entryCount = 0, + }; + + recovery->nextJournalPoint = (JournalPoint) { + .sequenceNumber = recovery->slabJournalHead, + .entryCount = 0, + }; + + logInfo("Replaying entries into slab journals for zone %u", + allocator->zoneNumber); + completion->parent = recovery; + addSlabJournalEntries(completion); +} + +/** + * A waiter callback to enqueue a MissingDecref on the queue for the physical + * zone in which it will be applied. + * + * Implements WaiterCallback. + **/ +static void queueOnPhysicalZone(Waiter *waiter, void *context) +{ + MissingDecref *decref = asMissingDecref(waiter); + DataLocation mapping = decref->penultimateMapping; + if (isMappedLocation(&mapping)) { + decref->recovery->logicalBlocksUsed--; + } + + if (mapping.pbn == ZERO_BLOCK) { + // Decrefs of zero are not applied to slab journals. + FREE(decref); + return; + } + + decref->slabJournal = getSlabJournal((SlabDepot *) context, mapping.pbn); + ZoneCount zoneNumber = decref->slabJournal->slab->allocator->zoneNumber; + enqueueMissingDecref(&decref->recovery->missingDecrefs[zoneNumber], decref); +} + +/** + * Queue each missing decref on the slab journal to which it is to be applied + * then load the slab depot. This callback is registered in + * findSlabJournalEntries(). 
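+ *
+ * Up to this point every MissingDecref has lived on zone 0's queue;
+ * queueOnPhysicalZone() redistributes each one to the queue of the physical
+ * zone owning its slab, discarding decrefs of the zero block, which are never
+ * applied to slab journals.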
+ * + * @param completion The sub-task completion + **/ +static void applyToDepot(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + assertOnAdminThread(recovery->vdo, __func__); + prepareSubTask(recovery, finishRecoveringDepot, finishParentCallback, + ZONE_TYPE_ADMIN); + + SlabDepot *depot = getSlabDepot(recovery->vdo); + notifyAllWaiters(&recovery->missingDecrefs[0], queueOnPhysicalZone, depot); + if (abortRecoveryOnError(recovery->completion.result, recovery)) { + return; + } + + loadSlabDepot(depot, ADMIN_STATE_LOADING_FOR_RECOVERY, completion, recovery); +} + +/** + * Validate the location of the penultimate mapping for a MissingDecref. If it + * is valid, enqueue it for the appropriate physical zone or account for it. + * Otherwise, dispose of it and signal an error. + * + * @param decref The decref whose penultimate mapping has just been found + * @param location The penultimate mapping + * @param errorCode The error code to use if the location is invalid + **/ +static int recordMissingDecref(MissingDecref *decref, + DataLocation location, + int errorCode) +{ + RecoveryCompletion *recovery = decref->recovery; + recovery->incompleteDecrefCount--; + if (isValidLocation(&location) + && isPhysicalDataBlock(recovery->vdo->depot, location.pbn)) { + decref->penultimateMapping = location; + decref->complete = true; + return VDO_SUCCESS; + } + + // The location was invalid + enterReadOnlyMode(recovery->vdo->readOnlyNotifier, errorCode); + setCompletionResult(&recovery->completion, errorCode); + logErrorWithStringError(errorCode, + "Invalid mapping for pbn %llu with state %u", + location.pbn, location.state); + return errorCode; +} + +/** + * Find the block map slots with missing decrefs. + * + * To find the slots missing decrefs, we iterate through the journal in reverse + * so we see decrefs before increfs; if we see an incref before its paired + * decref, we instantly know this incref is missing its decref. + * + * Simultaneously, we attempt to determine the missing decref. If there is a + * missing decref, and at least two increfs for that slot, we know we should + * decref the PBN from the penultimate incref. Otherwise, there is only one + * incref for that slot: we must synthesize the decref out of the block map + * instead of the recovery journal. + * + * @param recovery The recovery completion + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int findMissingDecrefs(RecoveryCompletion *recovery) +{ + IntMap *slotEntryMap = recovery->slotEntryMap; + // This placeholder decref is used to mark lbns for which we have observed a + // decref but not the paired incref (going backwards through the journal). + MissingDecref foundDecref; + + // A buffer is allocated based on the number of incRef entries found, so use + // the earliest head. + SequenceNumber head = minSequenceNumber(recovery->blockMapHead, + recovery->slabJournalHead); + RecoveryPoint headPoint = { + .sequenceNumber = head, + .sectorCount = 1, + .entryCount = 0, + }; + + // Set up for the first fake journal point that will be used for a + // synthesized entry. 
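+  // Starting at entry entriesPerBlock of the tail block places every
+  // synthesized point after the last real entry in that block but before the
+  // first entry of the next block; makeMissingDecref() then hands out
+  // consecutive entry counts (for example, with a hypothetical entriesPerBlock
+  // of 100, the points would be (tail, 100), (tail, 101), and so on).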
+ recovery->nextSynthesizedJournalPoint = (JournalPoint) { + .sequenceNumber = recovery->tail, + .entryCount = recovery->vdo->recoveryJournal->entriesPerBlock, + }; + + RecoveryPoint recoveryPoint = recovery->tailRecoveryPoint; + while (beforeRecoveryPoint(&headPoint, &recoveryPoint)) { + decrementRecoveryPoint(&recoveryPoint); + RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint); + + if (!isIncrementOperation(entry.operation)) { + // Observe that we've seen a decref before its incref, but only if + // the IntMap does not contain an unpaired incref for this lbn. + int result = intMapPut(slotEntryMap, slotAsNumber(entry.slot), + &foundDecref, false, NULL); + if (result != VDO_SUCCESS) { + return result; + } + + continue; + } + + recovery->increfCount++; + + MissingDecref *decref + = intMapRemove(slotEntryMap, slotAsNumber(entry.slot)); + if (entry.operation == BLOCK_MAP_INCREMENT) { + if (decref != NULL) { + return logErrorWithStringError(VDO_CORRUPT_JOURNAL, + "decref found for block map block %" + PRIu64 " with state %u", + entry.mapping.pbn, entry.mapping.state); + } + + // There are no decrefs for block map pages, so they can't be missing. + continue; + } + + if (decref == &foundDecref) { + // This incref already had a decref in the intmap, so we know it is + // not missing its decref. + continue; + } + + if (decref == NULL) { + // This incref is missing a decref. Add a missing decref object. + int result = makeMissingDecref(recovery, entry, &decref); + if (result != VDO_SUCCESS) { + return result; + } + + result = intMapPut(slotEntryMap, slotAsNumber(entry.slot), decref, + false, NULL); + if (result != VDO_SUCCESS) { + return result; + } + + continue; + } + + /* + * This MissingDecref was left here by an incref without a decref. + * We now know what its penultimate mapping is, and all entries + * before here in the journal are paired, decref before incref, so + * we needn't remember it in the intmap any longer. + */ + int result = recordMissingDecref(decref, entry.mapping, + VDO_CORRUPT_JOURNAL); + if (result != VDO_SUCCESS) { + return result; + } + } + + return VDO_SUCCESS; +} + +/** + * Process a fetched block map page for a missing decref. This callback is + * registered in findSlabJournalEntries(). + * + * @param completion The page completion which has just finished loading + **/ +static void processFetchedPage(VDOCompletion *completion) +{ + MissingDecref *currentDecref = completion->parent; + RecoveryCompletion *recovery = currentDecref->recovery; + assertOnLogicalZoneThread(recovery->vdo, 0, __func__); + + const BlockMapPage *page = dereferenceReadableVDOPage(completion); + DataLocation location + = unpackBlockMapEntry(&page->entries[currentDecref->slot.slot]); + releaseVDOPageCompletion(completion); + recordMissingDecref(currentDecref, location, VDO_BAD_MAPPING); + if (recovery->incompleteDecrefCount == 0) { + completeCompletion(&recovery->subTaskCompletion); + } +} + +/** + * Handle an error fetching a block map page for a missing decref. + * This error handler is registered in findSlabJournalEntries(). 
+ * + * @param completion The page completion which has just finished loading + **/ +static void handleFetchError(VDOCompletion *completion) +{ + MissingDecref *decref = completion->parent; + RecoveryCompletion *recovery = decref->recovery; + assertOnLogicalZoneThread(recovery->vdo, 0, __func__); + + // If we got a VDO_OUT_OF_RANGE error, it is because the pbn we read from + // the journal was bad, so convert the error code + setCompletionResult(&recovery->subTaskCompletion, + ((completion->result == VDO_OUT_OF_RANGE) + ? VDO_CORRUPT_JOURNAL : completion->result)); + releaseVDOPageCompletion(completion); + if (--recovery->incompleteDecrefCount == 0) { + completeCompletion(&recovery->subTaskCompletion); + } +} + +/** + * The waiter callback to requeue a missing decref and launch its page fetch. + * + * Implements WaiterCallback. + **/ +static void launchFetch(Waiter *waiter, void *context) +{ + MissingDecref *decref = asMissingDecref(waiter); + RecoveryCompletion *recovery = decref->recovery; + if (enqueueMissingDecref(&recovery->missingDecrefs[0], decref) + != VDO_SUCCESS) { + return; + } + + if (decref->complete) { + // We've already found the mapping for this decref, no fetch needed. + return; + } + + BlockMapZone *zone = context; + initVDOPageCompletion(&decref->pageCompletion, zone->pageCache, + decref->slot.pbn, false, decref, processFetchedPage, + handleFetchError); + getVDOPageAsync(&decref->pageCompletion.completion); +} + +/** + * Find all entries which need to be replayed into the slab journals. + * + * @param completion The sub-task completion + **/ +static void findSlabJournalEntries(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + + // We need to be on logical zone 0's thread since we are going to use its + // page cache. + assertOnLogicalZoneThread(vdo, 0, __func__); + int result = findMissingDecrefs(recovery); + if (abortRecoveryOnError(result, recovery)) { + return; + } + + prepareSubTask(recovery, applyToDepot, finishParentCallback, + ZONE_TYPE_ADMIN); + + /* + * Increment the incompleteDecrefCount so that the fetch callback can't + * complete the sub-task while we are still processing the queue of missing + * decrefs. + */ + if (recovery->incompleteDecrefCount++ > 0) { + // Fetch block map pages to fill in the incomplete missing decrefs. + notifyAllWaiters(&recovery->missingDecrefs[0], launchFetch, + getBlockMapZone(getBlockMap(vdo), 0)); + } + + if (--recovery->incompleteDecrefCount == 0) { + completeCompletion(completion); + } +} + +/** + * Find the contiguous range of journal blocks. 
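+ *
+ * Blocks are scanned forward from the earlier of the two reap heads. The scan
+ * stops at the first block whose header does not match its expected position
+ * in the journal or which was torn (only partially written); everything
+ * before that point is usable for replay.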
+ * + * @param recovery The recovery completion + * + * @return true if there were valid journal blocks + **/ +static bool findContiguousRange(RecoveryCompletion *recovery) +{ + RecoveryJournal *journal = recovery->vdo->recoveryJournal; + SequenceNumber head + = minSequenceNumber(recovery->blockMapHead, recovery->slabJournalHead); + + bool foundEntries = false; + for (SequenceNumber i = head; i <= recovery->highestTail; i++) { + recovery->tail = i; + recovery->tailRecoveryPoint = (RecoveryPoint) { + .sequenceNumber = i, + .sectorCount = 0, + .entryCount = 0, + }; + + PackedJournalHeader *packedHeader + = getJournalBlockHeader(journal, recovery->journalData, i); + RecoveryBlockHeader header; + unpackRecoveryBlockHeader(packedHeader, &header); + + if (!isExactRecoveryJournalBlock(journal, &header, i) + || (header.entryCount > journal->entriesPerBlock)) { + // A bad block header was found so this must be the end of the journal. + break; + } + + JournalEntryCount blockEntries = header.entryCount; + // Examine each sector in turn to determine the last valid sector. + for (uint8_t j = 1; j < SECTORS_PER_BLOCK; j++) { + PackedJournalSector *sector = getJournalBlockSector(packedHeader, j); + + // A bad sector means that this block was torn. + if (!isValidRecoveryJournalSector(&header, sector)) { + break; + } + + JournalEntryCount sectorEntries = minBlock(sector->entryCount, + blockEntries); + if (sectorEntries > 0) { + foundEntries = true; + recovery->tailRecoveryPoint.sectorCount++; + recovery->tailRecoveryPoint.entryCount = sectorEntries; + blockEntries -= sectorEntries; + } + + // If this sector is short, the later sectors can't matter. + if ((sectorEntries < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) + || (blockEntries == 0)) { + break; + } + } + + // If this block was not filled, or if it tore, no later block can matter. + if ((header.entryCount != journal->entriesPerBlock) + || (blockEntries > 0)) { + break; + } + } + + // Set the tail to the last valid tail block, if there is one. + if (foundEntries && (recovery->tailRecoveryPoint.sectorCount == 0)) { + recovery->tail--; + } + + return foundEntries; +} + +/** + * Count the number of increment entries in the journal. + * + * @param recovery The recovery completion + **/ +static int countIncrementEntries(RecoveryCompletion *recovery) +{ + RecoveryPoint recoveryPoint = { + .sequenceNumber = recovery->blockMapHead, + .sectorCount = 1, + .entryCount = 0, + }; + while (beforeRecoveryPoint(&recoveryPoint, &recovery->tailRecoveryPoint)) { + RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint); + int result = validateRecoveryJournalEntry(recovery->vdo, &entry); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result); + return result; + } + if (isIncrementOperation(entry.operation)) { + recovery->increfCount++; + } + incrementRecoveryPoint(&recoveryPoint); + } + + return VDO_SUCCESS; +} + +/** + * Determine the limits of the valid recovery journal and prepare to replay + * into the slab journals and block map. 
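+ *
+ * findHeadAndTail() supplies the block map and slab journal reap heads and
+ * the highest tail found on disk; findContiguousRange() then trims the tail
+ * back to the last block that can actually be replayed.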
+ * + * @param completion The sub-task completion + **/ +static void prepareToApplyJournalEntries(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + RecoveryJournal *journal = vdo->recoveryJournal; + logInfo("Finished reading recovery journal"); + bool foundEntries = findHeadAndTail(journal, recovery->journalData, + &recovery->highestTail, + &recovery->blockMapHead, + &recovery->slabJournalHead); + if (foundEntries) { + foundEntries = findContiguousRange(recovery); + } + + // Both reap heads must be behind the tail. + if ((recovery->blockMapHead > recovery->tail) + || (recovery->slabJournalHead > recovery->tail)) { + int result = logErrorWithStringError(VDO_CORRUPT_JOURNAL, + "Journal tail too early. " + "block map head: %" PRIu64 + ", slab journal head: %" PRIu64 + ", tail: %llu", + recovery->blockMapHead, + recovery->slabJournalHead, + recovery->tail); + finishCompletion(&recovery->completion, result); + return; + } + + if (!foundEntries) { + // This message must be recognizable by VDOTest::RebuildBase. + logInfo("Replaying 0 recovery entries into block map"); + // We still need to load the SlabDepot. + FREE(recovery->journalData); + recovery->journalData = NULL; + prepareSubTask(recovery, finishParentCallback, finishParentCallback, + ZONE_TYPE_ADMIN); + loadSlabDepot(getSlabDepot(vdo), ADMIN_STATE_LOADING_FOR_RECOVERY, + completion, recovery); + return; + } + + logInfo("Highest-numbered recovery journal block has sequence number" + " %llu, and the highest-numbered usable block is %" + PRIu64, recovery->highestTail, recovery->tail); + + if (isReplaying(vdo)) { + // We need to know how many entries the block map rebuild completion will + // need to hold. + int result = countIncrementEntries(recovery); + if (result != VDO_SUCCESS) { + finishCompletion(&recovery->completion, result); + return; + } + + // We need to access the block map from a logical zone. + prepareSubTask(recovery, launchBlockMapRecovery, finishParentCallback, + ZONE_TYPE_LOGICAL); + loadSlabDepot(vdo->depot, ADMIN_STATE_LOADING_FOR_RECOVERY, completion, + recovery); + return; + } + + int result = computeUsages(recovery); + if (abortRecoveryOnError(result, recovery)) { + return; + } + + prepareSubTask(recovery, findSlabJournalEntries, finishParentCallback, + ZONE_TYPE_LOGICAL); + invokeCallback(completion); +} + +/**********************************************************************/ +void launchRecovery(VDO *vdo, VDOCompletion *parent) +{ + // Note: This message must be recognizable by Permabit::VDODeviceBase. + logWarning("Device was dirty, rebuilding reference counts"); + + RecoveryCompletion *recovery; + int result = makeRecoveryCompletion(vdo, &recovery); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + VDOCompletion *completion = &recovery->completion; + prepareCompletion(completion, finishRecovery, abortRecovery, + parent->callbackThreadID, parent); + prepareSubTask(recovery, prepareToApplyJournalEntries, finishParentCallback, + ZONE_TYPE_ADMIN); + loadJournalAsync(vdo->recoveryJournal, &recovery->subTaskCompletion, + &recovery->journalData); +} diff --git a/vdo/base/vdoRecovery.h b/vdo/base/vdoRecovery.h new file mode 100644 index 0000000..f817a05 --- /dev/null +++ b/vdo/base/vdoRecovery.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoRecovery.h#2 $ + */ + +#ifndef VDO_RECOVERY_H +#define VDO_RECOVERY_H + +#include "completion.h" +#include "vdo.h" + +/** + * Replay recovery journal entries in the the slab journals of slabs owned by a + * given BlockAllocator. + * + * @param allocator The allocator whose slab journals are to be recovered + * @param completion The completion to use for waiting on slab journal space + * @param context The slab depot load context supplied by a recovery when + * it loads the depot + **/ +void replayIntoSlabJournals(BlockAllocator *allocator, + VDOCompletion *completion, + void *context); + +/** + * Construct a recovery completion and launch it. Apply all valid journal block + * entries to all VDO structures. This function performs the offline portion of + * recovering a VDO from a crash. + * + * @param vdo The vdo to recover + * @param parent The completion to notify when the offline portion of the + * recovery is complete + **/ +void launchRecovery(VDO *vdo, VDOCompletion *parent); + +#endif // VDO_RECOVERY_H diff --git a/vdo/base/vdoRecoveryInternals.h b/vdo/base/vdoRecoveryInternals.h new file mode 100644 index 0000000..b0414c1 --- /dev/null +++ b/vdo/base/vdoRecoveryInternals.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoRecoveryInternals.h#2 $ + */ + +#ifndef VDO_RECOVERY_INTERNALS_H +#define VDO_RECOVERY_INTERNALS_H + +#include "vdoRecovery.h" + +#include "blockMapRecovery.h" +#include "intMap.h" +#include "journalPoint.h" +#include "ringNode.h" +#include "types.h" +#include "waitQueue.h" + +/** + * The absolute position of an entry in the recovery journal, including + * the sector number and the entry number within the sector. 
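+ *
+ * Points are ordered by sequence number, then sector count, then entry count,
+ * which is how beforeRecoveryPoint() compares them.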
+ **/ +typedef struct { + SequenceNumber sequenceNumber; // Block sequence number + uint8_t sectorCount; // Sector number + JournalEntryCount entryCount; // Entry number +} RecoveryPoint; + +typedef struct { + /** The completion header */ + VDOCompletion completion; + /** The sub-task completion */ + VDOCompletion subTaskCompletion; + /** The VDO in question */ + VDO *vdo; + /** The BlockAllocator whose journals are being recovered */ + BlockAllocator *allocator; + /** A buffer to hold the data read off disk */ + char *journalData; + /** The number of increfs */ + size_t increfCount; + + /** The entry data for the block map recovery */ + NumberedBlockMapping *entries; + /** The number of entries in the entry array */ + size_t entryCount; + /** The sequence number of the first valid block for block map recovery */ + SequenceNumber blockMapHead; + /** The sequence number of the first valid block for slab journal replay */ + SequenceNumber slabJournalHead; + /** The sequence number of the last valid block of the journal (if known) */ + SequenceNumber tail; + /** + * The highest sequence number of the journal, not the same as the tail, + * since the tail ignores blocks after the first hole. + */ + SequenceNumber highestTail; + + /** A location just beyond the last valid entry of the journal */ + RecoveryPoint tailRecoveryPoint; + /** The location of the next recovery journal entry to apply */ + RecoveryPoint nextRecoveryPoint; + /** The number of logical blocks currently known to be in use */ + BlockCount logicalBlocksUsed; + /** The number of block map data blocks known to be allocated */ + BlockCount blockMapDataBlocks; + /** The journal point to give to the next synthesized decref */ + JournalPoint nextJournalPoint; + /** The number of entries played into slab journals */ + size_t entriesAddedToSlabJournals; + + // Decref synthesis fields + + /** An intMap for use in finding which slots are missing decrefs */ + IntMap *slotEntryMap; + /** The number of synthesized decrefs */ + size_t missingDecrefCount; + /** The number of incomplete decrefs */ + size_t incompleteDecrefCount; + /** The fake journal point of the next missing decref */ + JournalPoint nextSynthesizedJournalPoint; + /** The queue of missing decrefs */ + WaitQueue missingDecrefs[]; +} RecoveryCompletion; + +/** + * Convert a generic completion to a RecoveryCompletion. + * + * @param completion The completion to convert + * + * @return The RecoveryCompletion + **/ +__attribute__((warn_unused_result)) +static inline RecoveryCompletion * +asRecoveryCompletion(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(RecoveryCompletion, completion) == 0); + assertCompletionType(completion->type, RECOVERY_COMPLETION); + return (RecoveryCompletion *) completion; +} + +/** + * Allocate and initialize a RecoveryCompletion. + * + * @param vdo The VDO in question + * @param recoveryPtr A pointer to hold the new RecoveryCompletion + * + * @return VDO_SUCCESS or a status code + **/ +int makeRecoveryCompletion(VDO *vdo, RecoveryCompletion **recoveryPtr) + __attribute__((warn_unused_result)); + +/** + * Free a RecoveryCompletion and all underlying structures. + * + * @param recoveryPtr A pointer to the recovery completion to free + **/ +void freeRecoveryCompletion(RecoveryCompletion **recoveryPtr); + +#endif // VDO_RECOVERY_INTERNALS_H diff --git a/vdo/base/vdoResize.c b/vdo/base/vdoResize.c new file mode 100644 index 0000000..ee3271d --- /dev/null +++ b/vdo/base/vdoResize.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResize.c#15 $ + */ + +#include "vdoResize.h" + +#include "logger.h" + +#include "adminCompletion.h" +#include "completion.h" +#include "recoveryJournal.h" +#include "slabDepot.h" +#include "slabSummary.h" +#include "vdoInternal.h" +#include "vdoLayout.h" + +typedef enum { + GROW_PHYSICAL_PHASE_START = 0, + GROW_PHYSICAL_PHASE_COPY_SUMMARY, + GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS, + GROW_PHYSICAL_PHASE_USE_NEW_SLABS, + GROW_PHYSICAL_PHASE_END, + GROW_PHYSICAL_PHASE_ERROR, +} GrowPhysicalPhase; + +static const char *GROW_PHYSICAL_PHASE_NAMES[] = { + "GROW_PHYSICAL_PHASE_START", + "GROW_PHYSICAL_PHASE_COPY_SUMMARY", + "GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS", + "GROW_PHYSICAL_PHASE_USE_NEW_SLABS", + "GROW_PHYSICAL_PHASE_END", + "GROW_PHYSICAL_PHASE_ERROR", +}; + +/** + * Implements ThreadIDGetterForPhase. + **/ +__attribute__((warn_unused_result)) +static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion) +{ + return getAdminThread(getThreadConfig(adminCompletion->completion.parent)); +} + +/** + * Callback to initiate a grow physical, registered in performGrowPhysical(). + * + * @param completion The sub-task completion + **/ +static void growPhysicalCallback(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + assertAdminOperationType(adminCompletion, ADMIN_OPERATION_GROW_PHYSICAL); + assertAdminPhaseThread(adminCompletion, __func__, GROW_PHYSICAL_PHASE_NAMES); + + VDO *vdo = adminCompletion->completion.parent; + switch (adminCompletion->phase++) { + case GROW_PHYSICAL_PHASE_START: + if (isReadOnly(vdo->readOnlyNotifier)) { + logErrorWithStringError(VDO_READ_ONLY, + "Can't grow physical size of a read-only VDO"); + setCompletionResult(resetAdminSubTask(completion), VDO_READ_ONLY); + break; + } + + if (startOperationWithWaiter(&vdo->adminState, + ADMIN_STATE_SUSPENDED_OPERATION, + &adminCompletion->completion, NULL)) { + // Copy the journal into the new layout. 
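+      // The recovery journal and slab summary partitions move in the grown
+      // layout, so each must be copied to its new location before the new
+      // layout takes effect; the slab summary is copied in the next phase.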
+ copyPartition(vdo->layout, RECOVERY_JOURNAL_PARTITION, + resetAdminSubTask(completion)); + } + return; + + case GROW_PHYSICAL_PHASE_COPY_SUMMARY: + copyPartition(vdo->layout, SLAB_SUMMARY_PARTITION, + resetAdminSubTask(completion)); + return; + + case GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS: + vdo->config.physicalBlocks = growVDOLayout(vdo->layout); + updateSlabDepotSize(vdo->depot); + saveVDOComponentsAsync(vdo, resetAdminSubTask(completion)); + return; + + case GROW_PHYSICAL_PHASE_USE_NEW_SLABS: + useNewSlabs(vdo->depot, resetAdminSubTask(completion)); + return; + + case GROW_PHYSICAL_PHASE_END: + setSlabSummaryOrigin(getSlabSummary(vdo->depot), + getVDOPartition(vdo->layout, SLAB_SUMMARY_PARTITION)); + setRecoveryJournalPartition(vdo->recoveryJournal, + getVDOPartition(vdo->layout, + RECOVERY_JOURNAL_PARTITION)); + break; + + case GROW_PHYSICAL_PHASE_ERROR: + enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); + break; + + default: + setCompletionResult(resetAdminSubTask(completion), UDS_BAD_STATE); + } + + finishVDOLayoutGrowth(vdo->layout); + finishOperationWithResult(&vdo->adminState, completion->result); +} + +/** + * Handle an error during the grow physical process. + * + * @param completion The sub-task completion + **/ +static void handleGrowthError(VDOCompletion *completion) +{ + adminCompletionFromSubTask(completion)->phase = GROW_PHYSICAL_PHASE_ERROR; + growPhysicalCallback(completion); +} + +/**********************************************************************/ +int performGrowPhysical(VDO *vdo, BlockCount newPhysicalBlocks) +{ + BlockCount oldPhysicalBlocks = vdo->config.physicalBlocks; + + // Skip any noop grows. + if (oldPhysicalBlocks == newPhysicalBlocks) { + return VDO_SUCCESS; + } + + if (newPhysicalBlocks != getNextVDOLayoutSize(vdo->layout)) { + /* + * Either the VDO isn't prepared to grow, or it was prepared to grow + * to a different size. Doing this check here relies on the fact that + * the call to this method is done under the dmsetup message lock. + */ + finishVDOLayoutGrowth(vdo->layout); + abandonNewSlabs(vdo->depot); + return VDO_PARAMETER_MISMATCH; + } + + // Validate that we are prepared to grow appropriately. + BlockCount newDepotSize = getNextBlockAllocatorPartitionSize(vdo->layout); + BlockCount preparedDepotSize = getNewDepotSize(vdo->depot); + if (preparedDepotSize != newDepotSize) { + return VDO_PARAMETER_MISMATCH; + } + + int result = performAdminOperation(vdo, ADMIN_OPERATION_GROW_PHYSICAL, + getThreadIDForPhase, growPhysicalCallback, + handleGrowthError); + if (result != VDO_SUCCESS) { + return result; + } + + logInfo("Physical block count was %llu, now %llu", + oldPhysicalBlocks, newPhysicalBlocks); + return VDO_SUCCESS; +} + +/** + * Callback to check that we're not in recovery mode, used in + * prepareToGrowPhysical(). + * + * @param completion The sub-task completion + **/ +static void checkMayGrowPhysical(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + assertAdminOperationType(adminCompletion, + ADMIN_OPERATION_PREPARE_GROW_PHYSICAL); + + VDO *vdo = adminCompletion->completion.parent; + assertOnAdminThread(vdo, __func__); + + resetAdminSubTask(completion); + + // This check can only be done from a base code thread. + if (isReadOnly(vdo->readOnlyNotifier)) { + finishCompletion(completion->parent, VDO_READ_ONLY); + return; + } + + // This check should only be done from a base code thread. 
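+  // Growing while the VDO is still in recovery mode is refused with
+  // VDO_RETRY_AFTER_REBUILD so that the caller can retry once the rebuild
+  // has finished.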
+ if (inRecoveryMode(vdo)) { + finishCompletion(completion->parent, VDO_RETRY_AFTER_REBUILD); + return; + } + + completeCompletion(completion->parent); +} + +/**********************************************************************/ +int prepareToGrowPhysical(VDO *vdo, BlockCount newPhysicalBlocks) +{ + BlockCount currentPhysicalBlocks = vdo->config.physicalBlocks; + if (newPhysicalBlocks < currentPhysicalBlocks) { + return logErrorWithStringError(VDO_NOT_IMPLEMENTED, + "Removing physical storage from a VDO is " + "not supported"); + } + + if (newPhysicalBlocks == currentPhysicalBlocks) { + logWarning("Requested physical block count %" PRIu64 + " not greater than %llu", + newPhysicalBlocks, currentPhysicalBlocks); + finishVDOLayoutGrowth(vdo->layout); + abandonNewSlabs(vdo->depot); + return VDO_PARAMETER_MISMATCH; + } + + int result = performAdminOperation(vdo, + ADMIN_OPERATION_PREPARE_GROW_PHYSICAL, + getThreadIDForPhase, checkMayGrowPhysical, + finishParentCallback); + if (result != VDO_SUCCESS) { + return result; + } + + result = prepareToGrowVDOLayout(vdo->layout, currentPhysicalBlocks, + newPhysicalBlocks, vdo->layer); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount newDepotSize = getNextBlockAllocatorPartitionSize(vdo->layout); + result = prepareToGrowSlabDepot(vdo->depot, newDepotSize); + if (result != VDO_SUCCESS) { + finishVDOLayoutGrowth(vdo->layout); + return result; + } + + return VDO_SUCCESS; +} diff --git a/vdo/base/vdoResize.h b/vdo/base/vdoResize.h new file mode 100644 index 0000000..76bfc1f --- /dev/null +++ b/vdo/base/vdoResize.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResize.h#1 $ + */ + +#ifndef VDO_RESIZE_H +#define VDO_RESIZE_H + +#include "types.h" + +/** + * Make the completion for an asynchronous resize. + * + * @param vdo The VDO + * @param newPhysicalBlocks The new physical size in blocks + * @param completionPtr A pointer to hold the completion + * + * @return VDO_SUCCESS or an error + **/ +int makeResizeVDOCompletion(VDO *vdo, + BlockCount newPhysicalBlocks, + VDOCompletion **completionPtr) + __attribute__((warn_unused_result)); + +/** + * Free the completion for an asynchronous resize, and NULL out the + * reference to it. + * + * @param completionPtr A reference to the completion to free + **/ +void freeResizeVDOCompletion(VDOCompletion **completionPtr); + +/** + * Grow the physical size of the VDO. This method may only be called when the + * VDO has been suspended and must not be called from a base thread. 
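+ *
+ * The VDO must already have been prepared, via prepareToGrowPhysical(), for
+ * exactly this physical size; otherwise VDO_PARAMETER_MISMATCH is returned.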
+ * + * @param vdo The VDO to resize + * @param newPhysicalBlocks The new physical size in blocks + * + * @return VDO_SUCCESS or an error + **/ +int performGrowPhysical(VDO *vdo, BlockCount newPhysicalBlocks); + +/** + * Prepare to resize the VDO, allocating memory as needed. + * + * @param vdo The VDO + * @param newPhysicalBlocks The new physical size in blocks + **/ +int prepareToGrowPhysical(VDO *vdo, BlockCount newPhysicalBlocks) + __attribute__((warn_unused_result)); + +#endif /* VDO_RESIZE_H */ diff --git a/vdo/base/vdoResizeLogical.c b/vdo/base/vdoResizeLogical.c new file mode 100644 index 0000000..97a06d1 --- /dev/null +++ b/vdo/base/vdoResizeLogical.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResizeLogical.c#6 $ + */ + +#include "vdoResizeLogical.h" + +#include "logger.h" + +#include "adminCompletion.h" +#include "blockMap.h" +#include "completion.h" +#include "vdoInternal.h" + +typedef enum { + GROW_LOGICAL_PHASE_START = 0, + GROW_LOGICAL_PHASE_GROW_BLOCK_MAP, + GROW_LOGICAL_PHASE_END, + GROW_LOGICAL_PHASE_ERROR, +} GrowLogicalPhase; + +static const char *GROW_LOGICAL_PHASE_NAMES[] = { + "GROW_LOGICAL_PHASE_START", + "GROW_LOGICAL_PHASE_GROW_BLOCK_MAP", + "GROW_LOGICAL_PHASE_END", + "GROW_LOGICAL_PHASE_ERROR", +}; + +/** + * Implements ThreadIDGetterForPhase. + **/ +__attribute__((warn_unused_result)) +static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion) +{ + return getAdminThread(getThreadConfig(adminCompletion->completion.parent)); +} + +/** + * Callback to initiate a grow logical, registered in performGrowLogical(). 
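+ *
+ * The post-increment in the switch below advances the phase as each case is
+ * dispatched, and each asynchronous step completes back into this callback
+ * through the reset sub-task, so the phases run in order:
+ *
+ *   GROW_LOGICAL_PHASE_START            record the new logical size in the
+ *                                       config and save the VDO components
+ *   GROW_LOGICAL_PHASE_GROW_BLOCK_MAP   grow the block map itself
+ *   GROW_LOGICAL_PHASE_END              finish the suspended operation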
+ * + * @param completion The sub-task completion + **/ +static void growLogicalCallback(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + assertAdminOperationType(adminCompletion, ADMIN_OPERATION_GROW_LOGICAL); + assertAdminPhaseThread(adminCompletion, __func__, GROW_LOGICAL_PHASE_NAMES); + + VDO *vdo = adminCompletion->completion.parent; + switch (adminCompletion->phase++) { + case GROW_LOGICAL_PHASE_START: + if (isReadOnly(vdo->readOnlyNotifier)) { + logErrorWithStringError(VDO_READ_ONLY, + "Can't grow logical size of a read-only VDO"); + finishCompletion(resetAdminSubTask(completion), VDO_READ_ONLY); + return; + } + + if (startOperationWithWaiter(&vdo->adminState, + ADMIN_STATE_SUSPENDED_OPERATION, + &adminCompletion->completion, NULL)) { + + vdo->config.logicalBlocks = getNewEntryCount(getBlockMap(vdo)); + saveVDOComponentsAsync(vdo, resetAdminSubTask(completion)); + } + + return; + + case GROW_LOGICAL_PHASE_GROW_BLOCK_MAP: + growBlockMap(getBlockMap(vdo), resetAdminSubTask(completion)); + return; + + case GROW_LOGICAL_PHASE_END: + break; + + case GROW_LOGICAL_PHASE_ERROR: + enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); + break; + + default: + setCompletionResult(resetAdminSubTask(completion), UDS_BAD_STATE); + } + + finishOperationWithResult(&vdo->adminState, completion->result); +} + +/** + * Handle an error during the grow physical process. + * + * @param completion The sub-task completion + **/ +static void handleGrowthError(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + if (adminCompletion->phase == GROW_LOGICAL_PHASE_GROW_BLOCK_MAP) { + // We've failed to write the new size in the super block, so set our + // in memory config back to the old size. + VDO *vdo = adminCompletion->completion.parent; + BlockMap *map = getBlockMap(vdo); + vdo->config.logicalBlocks = getNumberOfBlockMapEntries(map); + abandonBlockMapGrowth(map); + } + + adminCompletion->phase = GROW_LOGICAL_PHASE_ERROR; + growLogicalCallback(completion); +} + +/**********************************************************************/ +int performGrowLogical(VDO *vdo, BlockCount newLogicalBlocks) +{ + if (getNewEntryCount(getBlockMap(vdo)) != newLogicalBlocks) { + return VDO_PARAMETER_MISMATCH; + } + + return performAdminOperation(vdo, ADMIN_OPERATION_GROW_LOGICAL, + getThreadIDForPhase, growLogicalCallback, + handleGrowthError); +} + +/**********************************************************************/ +int prepareToGrowLogical(VDO *vdo, BlockCount newLogicalBlocks) +{ + if (newLogicalBlocks < vdo->config.logicalBlocks) { + return logErrorWithStringError(VDO_PARAMETER_MISMATCH, + "Can't shrink VDO logical size from its " + "current value of %llu", + vdo->config.logicalBlocks); + } + + if (newLogicalBlocks == vdo->config.logicalBlocks) { + return logErrorWithStringError(VDO_PARAMETER_MISMATCH, + "Can't grow VDO logical size to its " + "current value of %llu", + vdo->config.logicalBlocks); + } + + return prepareToGrowBlockMap(getBlockMap(vdo), newLogicalBlocks); +} diff --git a/vdo/base/vdoResizeLogical.h b/vdo/base/vdoResizeLogical.h new file mode 100644 index 0000000..fbea60d --- /dev/null +++ b/vdo/base/vdoResizeLogical.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResizeLogical.h#1 $ + */ + +#ifndef VDO_RESIZE_LOGICAL_H +#define VDO_RESIZE_LOGICAL_H + +#include "types.h" + +/** + * Grow the logical size of the VDO. This method may only be called when the + * VDO has been suspended and must not be called from a base thread. + * + * @param vdo The VDO to grow + * @param newLogicalBlocks The size to which the VDO should be grown + * + * @return VDO_SUCCESS or an error + **/ +int performGrowLogical(VDO *vdo, BlockCount newLogicalBlocks); + +/** + * Prepare to grow the logical size of the VDO. This method may only be called + * while the VDO is running. + * + * @param vdo The VDO to prepare for growth + * @param newLogicalBlocks The size to which the VDO should be grown + * + * @return VDO_SUCCESS or an error + **/ +int prepareToGrowLogical(VDO *vdo, BlockCount newLogicalBlocks); + +#endif /* VDO_RESIZE_LOGICAL_H */ diff --git a/vdo/base/vdoResume.c b/vdo/base/vdoResume.c new file mode 100644 index 0000000..a10c2ef --- /dev/null +++ b/vdo/base/vdoResume.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResume.c#3 $ + */ + +#include "vdoResume.h" + +#include "logger.h" + +#include "adminCompletion.h" +#include "blockMap.h" +#include "completion.h" +#include "logicalZone.h" +#include "recoveryJournal.h" +#include "slabDepot.h" +#include "slabSummary.h" +#include "threadConfig.h" +#include "vdoInternal.h" + +typedef enum { + RESUME_PHASE_START = 0, + RESUME_PHASE_ALLOW_READ_ONLY_MODE, + RESUME_PHASE_DEPOT, + RESUME_PHASE_JOURNAL, + RESUME_PHASE_BLOCK_MAP, + RESUME_PHASE_LOGICAL_ZONES, + RESUME_PHASE_PACKER, + RESUME_PHASE_END, +} ResumePhase; + +static const char *RESUME_PHASE_NAMES[] = { + "RESUME_PHASE_START", + "RESUME_PHASE_ALLOW_READ_ONLY_MODE", + "RESUME_PHASE_DEPOT", + "RESUME_PHASE_JOURNAL", + "RESUME_PHASE_BLOCK_MAP", + "RESUME_PHASE_LOGICAL_ZONES", + "RESUME_PHASE_PACKER", + "RESUME_PHASE_END", +}; + +/** + * Implements ThreadIDGetterForPhase. 
+ **/ +__attribute__((warn_unused_result)) +static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion) +{ + const ThreadConfig *threadConfig + = getThreadConfig(adminCompletion->completion.parent); + switch (adminCompletion->phase) { + case RESUME_PHASE_JOURNAL: + return getJournalZoneThread(threadConfig); + + case RESUME_PHASE_PACKER: + return getPackerZoneThread(threadConfig); + + default: + return getAdminThread(threadConfig); + } +} + +/** + * Update the VDO state and save the super block. + * + * @param vdo The VDO being resumed + * @param completion The AdminCompletion's sub-task completion + **/ +static void writeSuperBlock(VDO *vdo, VDOCompletion *completion) +{ + switch (vdo->state) { + case VDO_CLEAN: + case VDO_NEW: + vdo->state = VDO_DIRTY; + saveVDOComponentsAsync(vdo, completion); + return; + + case VDO_DIRTY: + case VDO_READ_ONLY_MODE: + case VDO_FORCE_REBUILD: + case VDO_RECOVERING: + case VDO_REBUILD_FOR_UPGRADE: + // No need to write the super block in these cases + completeCompletion(completion); + return; + + case VDO_REPLAYING: + default: + finishCompletion(completion, UDS_BAD_STATE); + } +} + +/** + * Callback to resume a VDO. + * + * @param completion The sub-task completion + **/ +static void resumeCallback(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + assertAdminOperationType(adminCompletion, ADMIN_OPERATION_RESUME); + assertAdminPhaseThread(adminCompletion, __func__, RESUME_PHASE_NAMES); + + VDO *vdo = adminCompletion->completion.parent; + switch (adminCompletion->phase++) { + case RESUME_PHASE_START: + if (startResuming(&vdo->adminState, ADMIN_STATE_RESUMING, + &adminCompletion->completion, NULL)) { + writeSuperBlock(vdo, completion); + } + return; + + case RESUME_PHASE_ALLOW_READ_ONLY_MODE: + allowReadOnlyModeEntry(vdo->readOnlyNotifier, + resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_DEPOT: + resumeSlabDepot(vdo->depot, resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_JOURNAL: + resumeRecoveryJournal(vdo->recoveryJournal, resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_BLOCK_MAP: + resumeBlockMap(vdo->blockMap, resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_LOGICAL_ZONES: + resumeLogicalZones(vdo->logicalZones,resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_PACKER: + resumePacker(vdo->packer, resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_END: + break; + + default: + setCompletionResult(resetAdminSubTask(completion), UDS_BAD_STATE); + } + + finishResumingWithResult(&vdo->adminState, completion->result); +} + +/**********************************************************************/ +int performVDOResume(VDO *vdo) +{ + return performAdminOperation(vdo, ADMIN_OPERATION_RESUME, + getThreadIDForPhase, resumeCallback, + preserveErrorAndContinue); +} diff --git a/vdo/base/vdoResume.h b/vdo/base/vdoResume.h new file mode 100644 index 0000000..1ef25b2 --- /dev/null +++ b/vdo/base/vdoResume.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResume.h#1 $ + */ + +#ifndef VDO_RESUME_H +#define VDO_RESUME_H + +#include "types.h" + +/** + * Resume a suspended VDO. + * + * @param vdo The VDO to resume + * + * @return VDO_SUCCESS or an error + **/ +int performVDOResume(VDO *vdo); + +#endif /* VDO_RESUME_H */ diff --git a/vdo/base/vdoState.c b/vdo/base/vdoState.c new file mode 100644 index 0000000..00d3986 --- /dev/null +++ b/vdo/base/vdoState.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoState.c#1 $ + */ + +#include "vdoState.h" + +#include "permassert.h" + +static const char *VDO_STATE_NAMES[] = { + [VDO_CLEAN] = "CLEAN", + [VDO_DIRTY] = "DIRTY", + [VDO_FORCE_REBUILD] = "FORCE_REBUILD", + [VDO_NEW] = "NEW", + [VDO_READ_ONLY_MODE] = "READ_ONLY_MODE", + [VDO_REBUILD_FOR_UPGRADE] = "REBUILD_FOR_UPGRADE", + [VDO_RECOVERING] = "RECOVERING", + [VDO_REPLAYING] = "REPLAYING", +}; + +/**********************************************************************/ +const char *getVDOStateName(VDOState state) +{ + // Catch if a state has been added without updating the name array. + STATIC_ASSERT(COUNT_OF(VDO_STATE_NAMES) == VDO_STATE_COUNT); + + int result = ASSERT(state < COUNT_OF(VDO_STATE_NAMES), + "VDOState value %u must have a registered name", state); + if (result != UDS_SUCCESS) { + return "INVALID VDO STATE CODE"; + } + + return VDO_STATE_NAMES[state]; +} + +/**********************************************************************/ +const char *describeVDOState(VDOState state) +{ + // These strings should all fit in the 15 chars of VDOStatistics.mode. + switch (state) { + case VDO_RECOVERING: + return "recovering"; + + case VDO_READ_ONLY_MODE: + return "read-only"; + + default: + return "normal"; + } +} diff --git a/vdo/base/vdoState.h b/vdo/base/vdoState.h new file mode 100644 index 0000000..5843565 --- /dev/null +++ b/vdo/base/vdoState.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoState.h#2 $ + */ + +#ifndef VDO_STATE_H +#define VDO_STATE_H + +/** + * The current operating mode of the VDO. These are persistent on disk + * so the values must not change. + **/ +typedef enum { + VDO_DIRTY = 0, + VDO_NEW = 1, + VDO_CLEAN = 2, + VDO_READ_ONLY_MODE = 3, + VDO_FORCE_REBUILD = 4, + VDO_RECOVERING = 5, + VDO_REPLAYING = 6, + VDO_REBUILD_FOR_UPGRADE = 7, + + // Keep VDO_STATE_COUNT at the bottom. + VDO_STATE_COUNT +} VDOState; + +/** + * Get the name of a VDO state code for logging purposes. + * + * @param state The state code + * + * @return The name of the state code + **/ +const char *getVDOStateName(VDOState state) + __attribute__((warn_unused_result)); + +/** + * Return a user-visible string describing the current VDO state. + * + * @param state The VDO state to describe + * + * @return A string constant describing the state + **/ +const char *describeVDOState(VDOState state) + __attribute__((warn_unused_result)); + +#endif // VDO_STATE_H diff --git a/vdo/base/vdoSuspend.c b/vdo/base/vdoSuspend.c new file mode 100644 index 0000000..e919f19 --- /dev/null +++ b/vdo/base/vdoSuspend.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoSuspend.c#4 $ + */ + +#include "vdoSuspend.h" + +#include "logger.h" + +#include "adminCompletion.h" +#include "blockMap.h" +#include "completion.h" +#include "logicalZone.h" +#include "recoveryJournal.h" +#include "slabDepot.h" +#include "slabSummary.h" +#include "threadConfig.h" +#include "vdoInternal.h" + +typedef enum { + SUSPEND_PHASE_START = 0, + SUSPEND_PHASE_PACKER, + SUSPEND_PHASE_LOGICAL_ZONES, + SUSPEND_PHASE_BLOCK_MAP, + SUSPEND_PHASE_JOURNAL, + SUSPEND_PHASE_DEPOT, + SUSPEND_PHASE_WRITE_SUPER_BLOCK, + SUSPEND_PHASE_END, +} SuspendPhase; + +static const char *SUSPEND_PHASE_NAMES[] = { + "SUSPEND_PHASE_START", + "SUSPEND_PHASE_PACKER", + "SUSPEND_PHASE_LOGICAL_ZONES", + "SUSPEND_PHASE_BLOCK_MAP", + "SUSPEND_PHASE_JOURNAL", + "SUSPEND_PHASE_DEPOT", + "SUSPEND_PHASE_WRITE_SUPER_BLOCK", + "SUSPEND_PHASE_END", +}; + +/** + * Implements ThreadIDGetterForPhase. 
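+ *
+ * Phases which drain a structure owned by a particular zone are dispatched to
+ * that zone's thread (the packer and journal phases below); all other phases
+ * run on the admin thread.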
+ **/ +__attribute__((warn_unused_result)) +static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion) +{ + const ThreadConfig *threadConfig + = getThreadConfig(adminCompletion->completion.parent); + switch (adminCompletion->phase) { + case SUSPEND_PHASE_PACKER: + return getPackerZoneThread(threadConfig); + + case SUSPEND_PHASE_JOURNAL: + return getJournalZoneThread(threadConfig); + + default: + return getAdminThread(threadConfig); + } +} + +/** + * Update the VDO state and save the super block. + * + * @param vdo The VDO being suspended + * @param completion The AdminCompletion's sub-task completion + **/ +static void writeSuperBlock(VDO *vdo, VDOCompletion *completion) +{ + switch (vdo->state) { + case VDO_DIRTY: + case VDO_NEW: + vdo->state = VDO_CLEAN; + break; + + case VDO_CLEAN: + case VDO_READ_ONLY_MODE: + case VDO_FORCE_REBUILD: + case VDO_RECOVERING: + case VDO_REBUILD_FOR_UPGRADE: + break; + + case VDO_REPLAYING: + default: + finishCompletion(completion, UDS_BAD_STATE); + return; + } + + saveVDOComponentsAsync(vdo, completion); +} + +/** + * Callback to initiate a suspend, registered in performVDOSuspend(). + * + * @param completion The sub-task completion + **/ +static void suspendCallback(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + ASSERT_LOG_ONLY(((adminCompletion->type == ADMIN_OPERATION_SUSPEND) + || (adminCompletion->type == ADMIN_OPERATION_SAVE)), + "unexpected admin operation type %u is neither " + "suspend nor save", adminCompletion->type); + assertAdminPhaseThread(adminCompletion, __func__, SUSPEND_PHASE_NAMES); + + VDO *vdo = adminCompletion->completion.parent; + switch (adminCompletion->phase++) { + case SUSPEND_PHASE_START: + if (!startDraining(&vdo->adminState, + ((adminCompletion->type == ADMIN_OPERATION_SUSPEND) + ? ADMIN_STATE_SUSPENDING : ADMIN_STATE_SAVING), + &adminCompletion->completion, NULL)) { + return; + } + + if (!vdo->closeRequired) { + // There's nothing to do. + break; + } + + waitUntilNotEnteringReadOnlyMode(vdo->readOnlyNotifier, + resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_PACKER: + /* + * If the VDO was already resumed from a prior suspend while read-only, + * some of the components may not have been resumed. By setting a read-only + * error here, we guarantee that the result of this suspend will be + * VDO_READ_ONLY and not VDO_INVALID_ADMIN_STATE in that case. + */ + if (inReadOnlyMode(vdo)) { + setCompletionResult(&adminCompletion->completion, VDO_READ_ONLY); + } + + drainPacker(vdo->packer, resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_LOGICAL_ZONES: + drainLogicalZones(vdo->logicalZones, vdo->adminState.state, + resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_BLOCK_MAP: + drainBlockMap(vdo->blockMap, vdo->adminState.state, + resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_JOURNAL: + drainRecoveryJournal(vdo->recoveryJournal, vdo->adminState.state, + resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_DEPOT: + drainSlabDepot(vdo->depot, vdo->adminState.state, + resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_WRITE_SUPER_BLOCK: + if (isSuspending(&vdo->adminState) + || (adminCompletion->completion.result != VDO_SUCCESS)) { + // If we didn't save the VDO or there was an error, we're done. 
+ break; + } + + writeSuperBlock(vdo, resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_END: + break; + + default: + setCompletionResult(completion, UDS_BAD_STATE); + } + + finishDrainingWithResult(&vdo->adminState, completion->result); +} + +/**********************************************************************/ +int performVDOSuspend(VDO *vdo, bool save) +{ + return performAdminOperation(vdo, (save + ? ADMIN_OPERATION_SAVE + : ADMIN_OPERATION_SUSPEND), + getThreadIDForPhase, suspendCallback, + preserveErrorAndContinue); +} diff --git a/vdo/base/vdoSuspend.h b/vdo/base/vdoSuspend.h new file mode 100644 index 0000000..39172dc --- /dev/null +++ b/vdo/base/vdoSuspend.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoSuspend.h#1 $ + */ + +#ifndef VDO_SUSPEND_H +#define VDO_SUSPEND_H + +#include "types.h" + +/** + * Ensure that the VDO has no outstanding I/O and will issue none until it is + * resumed. + * + * @param vdo The VDO to suspend + * @param save If true, all dirty metadata will be flushed as + * well + * + * @return VDO_SUCCESS or an error + **/ +int performVDOSuspend(VDO *vdo, bool save); + +#endif /* VDO_SUSPEND_H */ diff --git a/vdo/base/vio.c b/vdo/base/vio.c new file mode 100644 index 0000000..9bd678d --- /dev/null +++ b/vdo/base/vio.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vio.c#5 $ + */ + +#include "vio.h" + +#include "logger.h" + +#include "dataVIO.h" +#include "vdoInternal.h" + +#ifdef __KERNEL__ +#include +#endif + +/**********************************************************************/ +void freeVIO(VIO **vioPtr) +{ + VIO *vio = *vioPtr; + if (vio == NULL) { + return; + } + + vio->completion.layer->freeVIO(vioPtr); +} + +/**********************************************************************/ +void initializeVIO(VIO *vio, + VIOType type, + VIOPriority priority, + VDOCompletion *parent, + VDO *vdo, + PhysicalLayer *layer) +{ + vio->vdo = vdo; + vio->type = type; + vio->priority = priority; + + VDOCompletion *completion = vioAsCompletion(vio); + initializeCompletion(completion, VIO_COMPLETION, layer); + completion->parent = parent; +} + +/**********************************************************************/ +void vioDoneCallback(VDOCompletion *completion) +{ + VIO *vio = asVIO(completion); + completion->callback = vio->callback; + completion->errorHandler = vio->errorHandler; + completeCompletion(completion); +} + +/**********************************************************************/ +const char *getVIOReadWriteFlavor(const VIO *vio) +{ + if (isReadVIO(vio)) { + return "read"; + } + return (isWriteVIO(vio) ? "write" : "read-modify-write"); +} + +/**********************************************************************/ +void updateVIOErrorStats(VIO *vio, const char *format, ...) +{ + int priority; + int result = vioAsCompletion(vio)->result; + switch (result) { + case VDO_READ_ONLY: + atomicAdd64(&vio->vdo->errorStats.readOnlyErrorCount, 1); + return; + + case VDO_NO_SPACE: + atomicAdd64(&vio->vdo->errorStats.noSpaceErrorCount, 1); + priority = LOG_DEBUG; + break; + + default: + priority = LOG_ERR; + } + +#ifdef __KERNEL__ + static DEFINE_RATELIMIT_STATE(errorLimiter, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if (!__ratelimit(&errorLimiter)) { + return; + } +#endif + + va_list args; + va_start(args, format); + vLogWithStringError(priority, result, format, args); + va_end(args); +} + +/** + * Handle an error from a metadata I/O. + * + * @param completion The VIO + **/ +static void handleMetadataIOError(VDOCompletion *completion) +{ + VIO *vio = asVIO(completion); + updateVIOErrorStats(vio, + "Completing %s VIO of type %u for physical block %" + PRIu64 " with error", + getVIOReadWriteFlavor(vio), vio->type, vio->physical); + vioDoneCallback(completion); +} + +/**********************************************************************/ +void launchMetadataVIO(VIO *vio, + PhysicalBlockNumber physical, + VDOAction *callback, + VDOAction *errorHandler, + VIOOperation operation) +{ + vio->operation = operation; + vio->physical = physical; + vio->callback = callback; + vio->errorHandler = errorHandler; + + VDOCompletion *completion = vioAsCompletion(vio); + resetCompletion(completion); + completion->callback = vioDoneCallback; + completion->errorHandler = handleMetadataIOError; + + if (isReadVIO(vio)) { + completion->layer->readMetadata(vio); + } else { + completion->layer->writeMetadata(vio); + } +} + +/** + * Handle a flush error. 
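+ *
+ * After logging the failure, this restores the error handler which
+ * launchFlush() stashed in the VIO itself, so completing the completion hands
+ * the error on to the caller's original handler.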
+ * + * @param completion The flush VIO + **/ +static void handleFlushError(VDOCompletion *completion) +{ + logErrorWithStringError(completion->result, "Error flushing layer"); + completion->errorHandler = asVIO(completion)->errorHandler; + completeCompletion(completion); +} + +/**********************************************************************/ +void launchFlush(VIO *vio, VDOAction *callback, VDOAction *errorHandler) +{ + VDOCompletion *completion = vioAsCompletion(vio); + resetCompletion(completion); + completion->callback = callback; + completion->errorHandler = handleFlushError; + vio->errorHandler = errorHandler; + vio->operation = VIO_FLUSH_BEFORE; + vio->physical = ZERO_BLOCK; + + PhysicalLayer *layer = completion->layer; + if (layer->getWritePolicy(layer) == WRITE_POLICY_SYNC) { + // XXX It is dangerous to be subtly dropping flushes possibly + // needed for correctness in sync mode. + finishCompletion(completion, VDO_SUCCESS); + return; + } + + layer->flush(vio); +} diff --git a/vdo/base/vio.h b/vdo/base/vio.h new file mode 100644 index 0000000..8129cc6 --- /dev/null +++ b/vdo/base/vio.h @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vio.h#3 $ + */ + +#ifndef VIO_H +#define VIO_H + +#include + +#include "completion.h" +#include "trace.h" +#include "types.h" +#include "vdo.h" + +/** + * A representation of a single block which may be passed between the VDO base + * and the physical layer. + **/ +struct vio { + /* The completion for this VIO */ + VDOCompletion completion; + + /* The functions to call when this VIO's operation is complete */ + VDOAction *callback; + VDOAction *errorHandler; + + /* The VDO handling this VIO */ + VDO *vdo; + + /* The address on the underlying device of the block to be read/written */ + PhysicalBlockNumber physical; + + /* The type of request this VIO is servicing */ + VIOOperation operation; + + /* The queueing priority of the VIO operation */ + VIOPriority priority; + + /* The VIO type is used for statistics and instrumentation. */ + VIOType type; + + /* Used for logging and debugging */ + Trace *trace; +}; + +/** + * Convert a generic VDOCompletion to a VIO. + * + * @param completion The completion to convert + * + * @return The completion as a VIO + **/ +static inline VIO *asVIO(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(VIO, completion) == 0); + assertCompletionType(completion->type, VIO_COMPLETION); + return (VIO *) completion; +} + +/** + * Convert a VIO to a generic completion. + * + * @param vio The VIO to convert + * + * @return The VIO as a completion + **/ +static inline VDOCompletion *vioAsCompletion(VIO *vio) +{ + return &vio->completion; +} + +/** + * Create a VIO. 
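+ *
+ * A hedged usage sketch (the buffer, the physical block number, the VIO type
+ * and priority, and the callbacks are illustrative placeholders, and error
+ * handling is abbreviated):
+ *
+ *   VIO *vio;
+ *   int result = createVIO(layer, vioType, priority, parent, buffer, &vio);
+ *   if (result != VDO_SUCCESS) {
+ *     return result;
+ *   }
+ *   launchReadMetadataVIO(vio, pbn, readDoneCallback, readErrorHandler);
+ *   // ... and eventually freeVIO(&vio) once the VIO is no longer needed.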
+ * + * @param [in] layer The physical layer + * @param [in] vioType The type of VIO to create + * @param [in] priority The relative priority to assign to the VIO + * @param [in] parent The parent of the VIO + * @param [in] data The buffer + * @param [out] vioPtr A pointer to hold the new VIO + * + * @return VDO_SUCCESS or an error + **/ +static inline int createVIO(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + char *data, + VIO **vioPtr) +{ + return layer->createMetadataVIO(layer, vioType, priority, parent, data, + vioPtr); +} + +/** + * Destroy a vio. The pointer to the VIO will be nulled out. + * + * @param vioPtr A pointer to the VIO to destroy + **/ +void freeVIO(VIO **vioPtr); + +/** + * Initialize a VIO. + * + * @param vio The VIO to initialize + * @param type The VIO type + * @param priority The relative priority of the VIO + * @param parent The parent (the extent completion) to assign to the VIO + * completion + * @param vdo The VDO for this VIO + * @param layer The layer for this VIO + **/ +void initializeVIO(VIO *vio, + VIOType type, + VIOPriority priority, + VDOCompletion *parent, + VDO *vdo, + PhysicalLayer *layer); + +/** + * The very last step in processing a VIO. Set the VIO's completion's callback + * and error handler from the fields set in the VIO itself on launch and then + * actually complete the VIO's completion. + * + * @param completion The VIO + **/ +void vioDoneCallback(VDOCompletion *completion); + +/** + * Get the name of a VIO's operation. + * + * @param vio The VIO + * + * @return The name of the VIO's operation (read, write, or read-modify-write) + **/ +const char *getVIOReadWriteFlavor(const VIO *vio); + +/** + * Update per-VIO error stats and log the error. + * + * @param vio The VIO which got an error + * @param format The format of the message to log (a printf style format) + **/ +void updateVIOErrorStats(VIO *vio, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/** + * Add a trace record for the current source location. + * + * @param vio The VIO structure to be updated + * @param location The source-location descriptor to be recorded + **/ +static inline void vioAddTraceRecord(VIO *vio, TraceLocation location) +{ + if (unlikely(vio->trace != NULL)) { + addTraceRecord(vio->trace, location); + } +} + +/** + * Check whether a VIO is servicing an external data request. + * + * @param vio The VIO to check + **/ +static inline bool isDataVIO(VIO *vio) +{ + return isDataVIOType(vio->type); +} + +/** + * Check whether a VIO is for compressed block writes + * + * @param vio The VIO to check + **/ +static inline bool isCompressedWriteVIO(VIO *vio) +{ + return isCompressedWriteVIOType(vio->type); +} + +/** + * Check whether a VIO is for metadata + * + * @param vio The VIO to check + **/ +static inline bool isMetadataVIO(VIO *vio) +{ + return isMetadataVIOType(vio->type); +} + +/** + * Check whether a VIO is a read. + * + * @param vio The VIO + * + * @return true if the VIO is a read + **/ +static inline bool isReadVIO(const VIO *vio) +{ + return ((vio->operation & VIO_READ_WRITE_MASK) == VIO_READ); +} + +/** + * Check whether a VIO is a read-modify-write. + * + * @param vio The VIO + * + * @return true if the VIO is a read-modify-write + **/ +static inline bool isReadModifyWriteVIO(const VIO *vio) +{ + return ((vio->operation & VIO_READ_WRITE_MASK) == VIO_READ_MODIFY_WRITE); +} + +/** + * Check whether a VIO is a write. 
+ * + * @param vio The VIO + * + * @return true if the VIO is a write + **/ +static inline bool isWriteVIO(const VIO *vio) +{ + return ((vio->operation & VIO_READ_WRITE_MASK) == VIO_WRITE); +} + +/** + * Check whether a VIO requires a flush before doing its I/O. + * + * @param vio The VIO + * + * @return true if the VIO requires a flush before + **/ +static inline bool vioRequiresFlushBefore(const VIO *vio) +{ + return ((vio->operation & VIO_FLUSH_BEFORE) == VIO_FLUSH_BEFORE); +} + +/** + * Check whether a VIO requires a flush after doing its I/O. + * + * @param vio The VIO + * + * @return true if the VIO requires a flush after + **/ +static inline bool vioRequiresFlushAfter(const VIO *vio) +{ + return ((vio->operation & VIO_FLUSH_AFTER) == VIO_FLUSH_AFTER); +} + +/** + * Launch a metadata VIO. + * + * @param vio The VIO to launch + * @param physical The physical block number to read or write + * @param callback The function to call when the VIO completes its I/O + * @param errorHandler The handler for write errors + * @param operation The operation to perform (read or write) + **/ +void launchMetadataVIO(VIO *vio, + PhysicalBlockNumber physical, + VDOAction *callback, + VDOAction *errorHandler, + VIOOperation operation); + +/** + * Launch a metadata read VIO. + * + * @param vio The VIO to launch + * @param physical The physical block number to read + * @param callback The function to call when the VIO completes its read + * @param errorHandler The handler for write errors + **/ +static inline void launchReadMetadataVIO(VIO *vio, + PhysicalBlockNumber physical, + VDOAction *callback, + VDOAction *errorHandler) +{ + launchMetadataVIO(vio, physical, callback, errorHandler, VIO_READ); +} + +/** + * Launch a metadata write VIO. + * + * @param vio The VIO to launch + * @param physical The physical block number to write + * @param callback The function to call when the VIO completes its write + * @param errorHandler The handler for write errors + **/ +static inline void launchWriteMetadataVIO(VIO *vio, + PhysicalBlockNumber physical, + VDOAction *callback, + VDOAction *errorHandler) +{ + launchMetadataVIO(vio, physical, callback, errorHandler, VIO_WRITE); +} + +/** + * Launch a metadata write VIO optionally flushing the layer before and/or + * after the write operation. + * + * @param vio The VIO to launch + * @param physical The physical block number to write + * @param callback The function to call when the VIO completes its + * operation + * @param errorHandler The handler for flush or write errors + * @param flushBefore Whether or not to flush before writing + * @param flushAfter Whether or not to flush after writing + **/ +static inline +void launchWriteMetadataVIOWithFlush(VIO *vio, + PhysicalBlockNumber physical, + VDOAction *callback, + VDOAction *errorHandler, + bool flushBefore, + bool flushAfter) +{ + launchMetadataVIO(vio, physical, callback, errorHandler, + (VIO_WRITE + | (flushBefore ? VIO_FLUSH_BEFORE : 0) + | (flushAfter ? VIO_FLUSH_AFTER : 0))); +} + +/** + * Issue a flush to the layer. If the layer does not require flushing, this + * method will immediately finish the VIO with which it was called. Care must + * be taken to avoid introducing a stack overflow in that case. 
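+ *
+ * (The hazard exists because the no-flush-needed path finishes the completion
+ * synchronously on the caller's stack, so a callback which immediately
+ * launches another flush from that context could recurse instead of being
+ * re-enqueued.)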
+ * + * @param vio The VIO to notify when the flush is complete + * @param callback The function to call when the flush is complete + * @param errorHandler The handler for flush errors + **/ +void launchFlush(VIO *vio, VDOAction *callback, VDOAction *errorHandler); + +#endif // VIO_H diff --git a/vdo/base/vioPool.c b/vdo/base/vioPool.c new file mode 100644 index 0000000..3d5ce07 --- /dev/null +++ b/vdo/base/vioPool.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioPool.c#5 $ + */ + +#include "vioPool.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "constants.h" +#include "vio.h" +#include "types.h" + +/** + * An VIOPool is a collection of preallocated VIOs. + **/ +struct vioPool { + /** The number of objects managed by the pool */ + size_t size; + /** The list of objects which are available */ + RingNode available; + /** The queue of requestors waiting for objects from the pool */ + WaitQueue waiting; + /** The number of objects currently in use */ + size_t busyCount; + /** The list of objects which are in use */ + RingNode busy; + /** The number of requests when no object was available */ + uint64_t outageCount; + /** The ID of the thread on which this pool may be used */ + ThreadID threadID; + /** The buffer backing the pool's VIOs */ + char *buffer; + /** The pool entries */ + VIOPoolEntry entries[]; +}; + +/**********************************************************************/ +int makeVIOPool(PhysicalLayer *layer, + size_t poolSize, + ThreadID threadID, + VIOConstructor *vioConstructor, + void *context, + VIOPool **poolPtr) +{ + VIOPool *pool; + int result = ALLOCATE_EXTENDED(VIOPool, poolSize, VIOPoolEntry, __func__, + &pool); + if (result != VDO_SUCCESS) { + return result; + } + + pool->threadID = threadID; + initializeRing(&pool->available); + initializeRing(&pool->busy); + + result = ALLOCATE(poolSize * VDO_BLOCK_SIZE, char, "VIO pool buffer", + &pool->buffer); + if (result != VDO_SUCCESS) { + freeVIOPool(&pool); + return result; + } + + char *ptr = pool->buffer; + for (size_t i = 0; i < poolSize; i++) { + VIOPoolEntry *entry = &pool->entries[i]; + entry->buffer = ptr; + entry->context = context; + result = vioConstructor(layer, entry, ptr, &entry->vio); + if (result != VDO_SUCCESS) { + freeVIOPool(&pool); + return result; + } + + ptr += VDO_BLOCK_SIZE; + initializeRing(&entry->node); + pushRingNode(&pool->available, &entry->node); + pool->size++; + } + + *poolPtr = pool; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeVIOPool(VIOPool **poolPtr) +{ + if (*poolPtr == NULL) { + return; + } + + // Remove all available entries from the object pool. 
+ VIOPool *pool = *poolPtr; + ASSERT_LOG_ONLY(!hasWaiters(&pool->waiting), + "VIO pool must not have any waiters when being freed"); + ASSERT_LOG_ONLY((pool->busyCount == 0), + "VIO pool must not have %zu busy entries when being freed", + pool->busyCount); + ASSERT_LOG_ONLY(isRingEmpty(&pool->busy), + "VIO pool must not have busy entries when being freed"); + + VIOPoolEntry *entry; + while ((entry = asVIOPoolEntry(chopRingNode(&pool->available))) != NULL) { + freeVIO(&entry->vio); + } + + // Make sure every VIOPoolEntry has been removed. + for (size_t i = 0; i < pool->size; i++) { + VIOPoolEntry *entry = &pool->entries[i]; + ASSERT_LOG_ONLY(isRingEmpty(&entry->node), "VIO Pool entry still in use:" + " VIO is in use for physical block %" PRIu64 + " for operation %u", + entry->vio->physical, + entry->vio->operation); + } + + FREE(pool->buffer); + FREE(pool); + *poolPtr = NULL; +} + +/**********************************************************************/ +bool isVIOPoolBusy(VIOPool *pool) +{ + return (pool->busyCount != 0); +} + +/**********************************************************************/ +int acquireVIOFromPool(VIOPool *pool, Waiter *waiter) +{ + ASSERT_LOG_ONLY((pool->threadID == getCallbackThreadID()), + "acquire from active VIOPool called from correct thread"); + + if (isRingEmpty(&pool->available)) { + pool->outageCount++; + return enqueueWaiter(&pool->waiting, waiter); + } + + pool->busyCount++; + RingNode *entry = chopRingNode(&pool->available); + pushRingNode(&pool->busy, entry); + (*waiter->callback)(waiter, entry); + return VDO_SUCCESS; +} + +/**********************************************************************/ +void returnVIOToPool(VIOPool *pool, VIOPoolEntry *entry) +{ + ASSERT_LOG_ONLY((pool->threadID == getCallbackThreadID()), + "vio pool entry returned on same thread as it was acquired"); + entry->vio->completion.errorHandler = NULL; + if (hasWaiters(&pool->waiting)) { + notifyNextWaiter(&pool->waiting, NULL, entry); + return; + } + + pushRingNode(&pool->available, &entry->node); + --pool->busyCount; +} + +/**********************************************************************/ +uint64_t getVIOPoolOutageCount(VIOPool *pool) +{ + return pool->outageCount; +} diff --git a/vdo/base/vioPool.h b/vdo/base/vioPool.h new file mode 100644 index 0000000..bab3dbe --- /dev/null +++ b/vdo/base/vioPool.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioPool.h#4 $ + */ + +#ifndef VIO_POOL_H +#define VIO_POOL_H + +#include "permassert.h" + +#include "completion.h" +#include "types.h" +#include "waitQueue.h" + +/** + * A VIOPool is a collection of preallocated VIOs used to write arbitrary + * metadata blocks. 
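+ *
+ * A rough sketch of the acquire/return cycle (the waiter wiring is
+ * illustrative, and how the pool itself is reached from the callback is left
+ * to the caller; the waiter's callback receives the VIOPoolEntry as its
+ * context argument):
+ *
+ *   static void entryAvailable(Waiter *waiter, void *context)
+ *   {
+ *     VIOPoolEntry *entry = context;
+ *     // ... use entry->vio and entry->buffer for a metadata write,
+ *     // then, once the write completes, give the entry back:
+ *     returnVIOToPool(pool, entry);
+ *   }
+ *
+ *   waiter->callback = entryAvailable;
+ *   acquireVIOFromPool(pool, waiter);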
+ **/ + +/** + * An VIOPoolEntry is the pair of VIO and buffer whether in use or not. + **/ +typedef struct { + RingNode node; + VIO *vio; + void *buffer; + void *parent; + void *context; +} VIOPoolEntry; + +/** + * A function which constructs a VIO for a pool. + * + * @param [in] layer The physical layer in which the VIO will operate + * @param [in] parent The parent of the VIO + * @param [in] buffer The data buffer for the VIO + * @param [out] vioPtr A pointer to hold the new VIO + **/ +typedef int VIOConstructor(PhysicalLayer *layer, + void *parent, + void *buffer, + VIO **vioPtr); + +/** + * Create a new VIO pool. + * + * @param [in] layer the physical layer to write to and read from + * @param [in] poolSize the number of VIOs in the pool + * @param [in] threadID the ID of the thread using this pool + * @param [in] vioConstructor the constructor for VIOs in the pool + * @param [in] context the context that each entry will have + * @param [out] poolPtr the resulting pool + * + * @return a success or error code + **/ +int makeVIOPool(PhysicalLayer *layer, + size_t poolSize, + ThreadID threadID, + VIOConstructor *vioConstructor, + void *context, + VIOPool **poolPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a VIO pool + * + * @param poolPtr the pointer holding the pool, which will be nulled out + **/ +void freeVIOPool(VIOPool **poolPtr); + +/** + * Check whether an VIO pool has outstanding entries. + * + * @return true if the pool is busy + **/ +bool isVIOPoolBusy(VIOPool *pool) + __attribute__((warn_unused_result)); + +/** + * Acquire a VIO and buffer from the pool (asynchronous). + * + * @param pool the VIO pool + * @param waiter object that is requesting a VIO + * + * @return VDO_SUCCESS or an error + **/ +int acquireVIOFromPool(VIOPool *pool, Waiter *waiter); + +/** + * Return a VIO and its buffer to the pool. + * + * @param pool the VIO pool + * @param entry a VIO pool entry + **/ +void returnVIOToPool(VIOPool *pool, VIOPoolEntry *entry); + +/** + * Convert a RingNode to the VIOPoolEntry that contains it. + * + * @param node The RingNode to convert + * + * @return The VIOPoolEntry wrapping the RingNode + **/ +static inline VIOPoolEntry *asVIOPoolEntry(RingNode *node) +{ + STATIC_ASSERT(offsetof(VIOPoolEntry, node) == 0); + return (VIOPoolEntry *) node; +} + +/** + * Return the outage count of an VIO pool. + * + * @param pool The pool + * + * @return the number of times an acquisition request had to wait + **/ +uint64_t getVIOPoolOutageCount(VIOPool *pool) + __attribute__((warn_unused_result)); + +#endif // VIO_POOL_H diff --git a/vdo/base/vioRead.c b/vdo/base/vioRead.c new file mode 100644 index 0000000..ab73727 --- /dev/null +++ b/vdo/base/vioRead.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioRead.c#1 $ + */ + +#include "vioRead.h" + +#include "logger.h" + +#include "blockMap.h" +#include "dataVIO.h" +#include "vdoInternal.h" +#include "vioWrite.h" + +/** + * Do the modify-write part of a read-modify-write cycle. This callback is + * registered in readBlock(). + * + * @param completion The DataVIO which has just finished its read + **/ +static void modifyForPartialWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + + if (completion->result != VDO_SUCCESS) { + completeDataVIO(completion); + return; + } + + completion->layer->applyPartialWrite(dataVIO); + VIO *vio = dataVIOAsVIO(dataVIO); + vio->operation = VIO_WRITE | (vio->operation & ~VIO_READ_WRITE_MASK); + dataVIO->isPartialWrite = true; + launchWriteDataVIO(dataVIO); +} + +/** + * Read a block asynchronously. This is the callback registered in + * readBlockMapping(). + * + * @param completion The DataVIO to read + **/ +static void readBlock(VDOCompletion *completion) +{ + if (completion->result != VDO_SUCCESS) { + completeDataVIO(completion); + return; + } + + DataVIO *dataVIO = asDataVIO(completion); + VIO *vio = asVIO(completion); + completion->callback + = (isReadVIO(vio) ? completeDataVIO : modifyForPartialWrite); + + if (dataVIO->mapped.pbn == ZERO_BLOCK) { + completion->layer->zeroDataVIO(dataVIO); + invokeCallback(completion); + return; + } + + vio->physical = dataVIO->mapped.pbn; + dataVIO->lastAsyncOperation = READ_DATA; + completion->layer->readData(dataVIO); +} + +/** + * Read the DataVIO's mapping from the block map. This callback is registered + * in launchReadDataVIO(). + * + * @param completion The DataVIO to be read + **/ +static void readBlockMapping(VDOCompletion *completion) +{ + if (completion->result != VDO_SUCCESS) { + completeDataVIO(completion); + return; + } + + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + setLogicalCallback(dataVIO, readBlock, THIS_LOCATION("$F;cb=readBlock")); + dataVIO->lastAsyncOperation = GET_MAPPED_BLOCK; + getMappedBlockAsync(dataVIO); +} + +/**********************************************************************/ +void launchReadDataVIO(DataVIO *dataVIO) +{ + assertInLogicalZone(dataVIO); + dataVIO->lastAsyncOperation = FIND_BLOCK_MAP_SLOT; + // Go find the block map slot for the LBN mapping. + findBlockMapSlotAsync(dataVIO, readBlockMapping, + getLogicalZoneThreadID(dataVIO->logical.zone)); +} + +/** + * Release the logical block lock which a read DataVIO obtained now that it + * is done. + * + * @param completion The DataVIO + **/ +static void releaseLogicalLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + releaseLogicalBlockLock(dataVIO); + vioDoneCallback(completion); +} + +/** + * Clean up a DataVIO which has finished processing a read. + * + * @param dataVIO The DataVIO to clean up + **/ +void cleanupReadDataVIO(DataVIO *dataVIO) +{ + launchLogicalCallback(dataVIO, releaseLogicalLock, + THIS_LOCATION("$F;cb=releaseLL")); +} diff --git a/vdo/base/vioRead.h b/vdo/base/vioRead.h new file mode 100644 index 0000000..ae2fa37 --- /dev/null +++ b/vdo/base/vioRead.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioRead.h#1 $ + */ + +#ifndef VIO_READ_H +#define VIO_READ_H + +#include "types.h" + +/** + * Start the asynchronous processing of the DataVIO for a read or + * read-modify-write request which has acquired a lock on its logical block. + * The first step is to perform a block map lookup. + * + * @param dataVIO The DataVIO doing the read + **/ +void launchReadDataVIO(DataVIO *dataVIO); + +/** + * Clean up a DataVIO which has finished processing a read. + * + * @param dataVIO The DataVIO to clean up + **/ +void cleanupReadDataVIO(DataVIO *dataVIO); + +#endif /* VIO_READ_H */ diff --git a/vdo/base/vioWrite.c b/vdo/base/vioWrite.c new file mode 100644 index 0000000..ac2bb53 --- /dev/null +++ b/vdo/base/vioWrite.c @@ -0,0 +1,1201 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioWrite.c#9 $ + */ + +/* + * This file contains almost all of the VDO write path, which begins with + * writeExtent(). The progression through the callbacks which make up the + * write path depends upon whether or not the write policy is synchronous or + * asynchronous. The paths would proceed as outlined in the pseudo-code here + * if this were normal, synchronous code without callbacks. Complications + * involved in waiting on locks are not included. 
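+ *
+ * Two walks follow, writeExtentSynchronous() and then
+ * writeExtentAsynchronous(). The essential difference is ordering: in the
+ * synchronous flavor the data block is written and journaled before the write
+ * is acknowledged and before deduplication or compression are attempted,
+ * while in the asynchronous flavor the write is acknowledged early,
+ * deduplication and compression are attempted first, and the data block is
+ * written only if neither of them succeeds.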
+ * + * ###################################################################### + * writeExtentSynchronous(extent) + * { + * foreach (vio in extent) { + * launchWriteVIO() + * # allocateBlockForWrite() + * if (!trim and !zero-block) { + * allocate block + * if (vio is compressed) { + * completeCompressedBlockWrite() + * finishVIO() + * return + * } + * writeBlock() + * } + * finishBlockWrite() + * addJournalEntry() # Increment + * if (vio->newMapped is not ZERO_BLOCK) { + * journalIncrementForWrite() + * } + * acknowledgeWriteCallback() + * readOldBlockMapping() + * journalUnmappingForWrite() + * if (vio->mapped is not ZERO_BLOCK) { + * journalDecrementForWrite() + * } + * updateBlockMapForWrite() + * if (trim || zero-block) { + * finishVIO() + * return + * } + * + * prepareForDedupe() + * hashData() + * resolveHashZone() + * acquireHashLock() + * attemptDedupe() (query albireo) + * if (isDuplicate) { + * verifyAdvice() (read verify) + * if (isDuplicate and canAddReference) { + * shareBlock() + * addJournalEntryForDedupe() + * incrementForDedupe() + * journalUnmappingForDedupe() + * if (vio->mapped is not ZERO_BLOCK) { + * decrementForDedupe() + * } + * updateBlockMapForDedupe() + * finishVIO() + * return + * } + * } + * + * if (not canAddReference) { + * layer->updateAlbireo() + * } + * # compressData() + * if (compressing and not mooted and has no waiters) { + * layer->compressVIO() + * packCompressedData() + * if (compressed) { + * journalCompressedBlocks() + * incrementForDedupe() + * readOldBlockMappingForDedupe() + * journalUnmappingForDedupe() + * if (vio->mapped is not ZERO_BLOCK) { + * decrementForDedupe() + * } + * updateBlockMapForDedupe() + * } + * } + * + * finishVIO() + * } + * } + * + * ###################################################################### + * writeExtentAsynchronous(extent) + * { + * foreach (vio in extent) { + * launchWriteVIO() + * # allocateBlockForWrite() + * if (trim || zero-block) { + * acknowledgeWrite() + * } else { + * allocateAndLockBlock() + * if (vio is compressed) { + * writeBlock() + * completeCompressedBlockWrite() + * finishVIO() + * return + * } + * + * acknowledgeWrite() + * prepareForDedupe() + * hashData() + * resolveHashZone() + * acquireHashLock() + * attemptDedupe() (query albireo) + * if (isDuplicate) { + * verifyAdvice() (read verify) + * if (isDuplicate and canAddReference) { + * shareBlock() + * addJournalEntryForDedupe() + * incrementForDedupe() + * readOldBlockMappingForDedupe() + * journalUnmappingForDedupe() + * if (vio->mapped is not ZERO_BLOCK) { + * decrementForDedupe() + * } + * updateBlockMapForDedupe() + * finishVIO() + * return + * } + * } + * + * if (not canAddReference) { + * layer->updateAlbireo() + * } + * # compressData() + * if (compressing and not mooted and has no waiters) { + * layer->compressVIO() + * packCompressedData() + * if (compressed) { + * journalCompressedBlocks() + * journalIncrementForDedupe() + * readOldBlockMappingForDedupe() + * journalUnmappingForDedupe() + * if (vio->mapped is not ZERO_BLOCK) { + * decrementForDedupe() + * } + * updateBlockMapForDedupe() + * finishVIO() + * return + * } + * } + * + * writeBlock() + * } + * + * finishBlockWrite() + * addJournalEntry() # Increment + * if (vio->newMapped is not ZERO_BLOCK) { + * journalIncrementForWrite() + * } + * readOldBlockMappingForWrite() + * journalUnmappingForWrite() + * if (vio->mapped is not ZERO_BLOCK) { + * journalDecrementForWrite() + * } + * updateBlockMapForWrite() + * finishVIO() + * } + * } + */ + +#include "vioWrite.h" + +#include 
"logger.h" + +#include "allocatingVIO.h" +#include "atomic.h" +#include "blockMap.h" +#include "compressionState.h" +#include "dataVIO.h" +#include "hashLock.h" +#include "recoveryJournal.h" +#include "referenceOperation.h" +#include "slab.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "vdoInternal.h" +#include "vioRead.h" + +/** + * The steps taken cleaning up a VIO, in the order they are performed. + **/ +typedef enum dataVIOCleanupStage { + VIO_CLEANUP_START = 0, + VIO_RELEASE_ALLOCATED = VIO_CLEANUP_START, + VIO_RELEASE_RECOVERY_LOCKS, + VIO_RELEASE_HASH_LOCK, + VIO_RELEASE_LOGICAL, + VIO_CLEANUP_DONE +} DataVIOCleanupStage; + +/** + * Actions to take on error used by abortOnError(). + **/ +typedef enum { + NOT_READ_ONLY, + READ_ONLY_IF_ASYNC, + READ_ONLY, +} ReadOnlyAction; + +// Forward declarations required because of circular function references. +static void performCleanupStage(DataVIO *dataVIO, DataVIOCleanupStage stage); +static void writeBlock(DataVIO *dataVIO); + +/** + * Check whether we are in async mode. + * + * @param dataVIO A DataVIO containing a pointer to the VDO whose write + * policy we want to check + * + * @return true if we are in async mode + **/ +static inline bool isAsync(DataVIO *dataVIO) +{ + return (getWritePolicy(getVDOFromDataVIO(dataVIO)) != WRITE_POLICY_SYNC); +} + +/** + * Release the PBN lock and/or the reference on the allocated block at the + * end of processing a DataVIO. + * + * @param completion The DataVIO + **/ +static void releaseAllocatedLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInAllocatedZone(dataVIO); + releaseAllocationLock(dataVIOAsAllocatingVIO(dataVIO)); + performCleanupStage(dataVIO, VIO_RELEASE_RECOVERY_LOCKS); +} + +/** + * Release the logical block lock and flush generation lock at the end of + * processing a DataVIO. + * + * @param completion The DataVIO + **/ +static void releaseLogicalLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + releaseLogicalBlockLock(dataVIO); + releaseFlushGenerationLock(dataVIO); + performCleanupStage(dataVIO, VIO_CLEANUP_DONE); +} + +/** + * Release the hash lock at the end of processing a DataVIO. + * + * @param completion The DataVIO + **/ +static void cleanHashLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInHashZone(dataVIO); + releaseHashLock(dataVIO); + performCleanupStage(dataVIO, VIO_RELEASE_LOGICAL); +} + +/** + * Make some assertions about a DataVIO which has finished cleaning up + * and do its final callback. + * + * @param dataVIO The DataVIO which has finished cleaning up + **/ +static void finishCleanup(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(dataVIOAsAllocatingVIO(dataVIO)->allocationLock == NULL, + "complete DataVIO has no allocation lock"); + ASSERT_LOG_ONLY(dataVIO->hashLock == NULL, + "complete DataVIO has no hash lock"); + vioDoneCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Perform the next step in the process of cleaning up a DataVIO. 
+ * + * @param dataVIO The DataVIO to clean up + * @param stage The cleanup stage to perform + **/ +static void performCleanupStage(DataVIO *dataVIO, DataVIOCleanupStage stage) +{ + switch (stage) { + case VIO_RELEASE_ALLOCATED: + if (hasAllocation(dataVIO)) { + launchAllocatedZoneCallback(dataVIO, releaseAllocatedLock, + THIS_LOCATION("$F;cb=releaseAllocLock")); + return; + } + // fall through + + case VIO_RELEASE_RECOVERY_LOCKS: + if ((dataVIO->recoverySequenceNumber > 0) + && !isOrWillBeReadOnly(dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier) + && (dataVIOAsCompletion(dataVIO)->result != VDO_READ_ONLY)) { + logWarning("VDO not read-only when cleaning DataVIO with RJ lock"); + } + // fall through + + case VIO_RELEASE_HASH_LOCK: + if (dataVIO->hashLock != NULL) { + launchHashZoneCallback(dataVIO, cleanHashLock, + THIS_LOCATION("$F;cb=cleanHashLock")); + return; + } + // fall through + + case VIO_RELEASE_LOGICAL: + if (!isCompressedWriteDataVIO(dataVIO)) { + launchLogicalCallback(dataVIO, releaseLogicalLock, + THIS_LOCATION("$F;cb=releaseLL")); + return; + } + // fall through + + default: + finishCleanup(dataVIO); + } +} + +/** + * Return a DataVIO that encountered an error to its hash lock so it can + * update the hash lock state accordingly. This continuation is registered in + * abortOnError(), and must be called in the hash zone of the DataVIO. + * + * @param completion The completion of the DataVIO to return to its hash lock + **/ +static void finishWriteDataVIOWithError(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInHashZone(dataVIO); + continueHashLockOnError(dataVIO); +} + +/** + * Check whether a result is an error, and if so abort the DataVIO associated + * with the error. + * + * @param result The result to check + * @param dataVIO The DataVIO + * @param readOnlyAction The conditions under which the VDO should be put + * into read-only mode if the result is an error + * + * @return true if the result is an error + **/ +static bool abortOnError(int result, + DataVIO *dataVIO, + ReadOnlyAction readOnlyAction) +{ + if (result == VDO_SUCCESS) { + return false; + } + + if ((result == VDO_READ_ONLY) + || (readOnlyAction == READ_ONLY) + || ((readOnlyAction == READ_ONLY_IF_ASYNC) && isAsync(dataVIO))) { + ReadOnlyNotifier *notifier = dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier; + if (!isReadOnly(notifier)) { + if (result != VDO_READ_ONLY) { + logErrorWithStringError(result, "Preparing to enter read-only mode:" + " DataVIO for LBN %llu (becoming mapped" + " to %llu, previously mapped" + " to %llu, allocated %llu) is" + " completing with a fatal error after" + " operation %s", dataVIO->logical.lbn, + dataVIO->newMapped.pbn, dataVIO->mapped.pbn, + getDataVIOAllocation(dataVIO), + getOperationName(dataVIO)); + } + + enterReadOnlyMode(notifier, result); + } + } + + if (dataVIO->hashLock != NULL) { + launchHashZoneCallback(dataVIO, finishWriteDataVIOWithError, + THIS_LOCATION(NULL)); + } else { + finishDataVIO(dataVIO, result); + } + return true; +} + +/** + * Return a DataVIO that finished writing, compressing, or deduplicating to + * its hash lock so it can share the result with any DataVIOs waiting in the + * hash lock, or update albireo, or simply release its share of the lock. This + * continuation is registered in updateBlockMapForWrite(), + * updateBlockMapForDedupe(), and abortDeduplication(), and must be called in + * the hash zone of the DataVIO. 
+ * + * @param completion The completion of the DataVIO to return to its hash lock + **/ +static void finishWriteDataVIO(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInHashZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + continueHashLock(dataVIO); +} + +/** + * Abort the data optimization process. + * + * @param dataVIO The DataVIO which does not deduplicate or compress + **/ +static void abortDeduplication(DataVIO *dataVIO) +{ + if (!hasAllocation(dataVIO)) { + // There was no space to write this block and we failed to deduplicate + // or compress it. + finishDataVIO(dataVIO, VDO_NO_SPACE); + return; + } + + if (isAsync(dataVIO)) { + // We failed to deduplicate or compress an async DataVIO, so now we need + // to actually write the data. + writeBlock(dataVIO); + return; + } + + if (dataVIO->hashLock == NULL) { + // We failed to compress a synchronous DataVIO that is a hash collision, + // which means it can't dedpe or be used for dedupe, so it's done now. + finishDataVIO(dataVIO, VDO_SUCCESS); + return; + } + + /* + * This synchronous DataVIO failed to compress and so is finished, but must + * now return to its hash lock so other DataVIOs with the same data can + * deduplicate against the uncompressed block it wrote. + */ + launchHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL)); +} + +/** + * Update the block map now that we've added an entry in the recovery journal + * for a block we have just shared. This is the callback registered in + * decrementForDedupe(). + * + * @param completion The completion of the write in progress + **/ +static void updateBlockMapForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + if (dataVIO->hashLock != NULL) { + setHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL)); + } else { + completion->callback = completeDataVIO; + } + dataVIO->lastAsyncOperation = PUT_MAPPED_BLOCK_FOR_DEDUPE; + putMappedBlockAsync(dataVIO); +} + +/** + * Make a recovery journal increment. + * + * @param dataVIO The DataVIO + * @param lock The PBNLock on the block being incremented + **/ +static void journalIncrement(DataVIO *dataVIO, PBNLock *lock) +{ + setUpReferenceOperationWithLock(DATA_INCREMENT, dataVIO->newMapped.pbn, + dataVIO->newMapped.state, lock, + &dataVIO->operation); + addRecoveryJournalEntry(getVDOFromDataVIO(dataVIO)->recoveryJournal, + dataVIO); +} + +/** + * Make a recovery journal decrement entry. + * + * @param dataVIO The DataVIO + **/ +static void journalDecrement(DataVIO *dataVIO) +{ + setUpReferenceOperationWithZone(DATA_DECREMENT, dataVIO->mapped.pbn, + dataVIO->mapped.state, dataVIO->mapped.zone, + &dataVIO->operation); + addRecoveryJournalEntry(getVDOFromDataVIO(dataVIO)->recoveryJournal, + dataVIO); +} + +/** + * Make a reference count change. + * + * @param dataVIO The DataVIO + **/ +static void updateReferenceCount(DataVIO *dataVIO) +{ + SlabDepot *depot = getVDOFromDataVIO(dataVIO)->depot; + PhysicalBlockNumber pbn = dataVIO->operation.pbn; + int result = ASSERT(isPhysicalDataBlock(depot, pbn), + "Adding slab journal entry for impossible PBN %" PRIu64 + "for LBN %llu", pbn, dataVIO->logical.lbn); + if (abortOnError(result, dataVIO, READ_ONLY)) { + return; + } + + addSlabJournalEntry(getSlabJournal(depot, pbn), dataVIO); +} + +/** + * Do the decref after a successful dedupe or compression. 
This is the callback + * registered by journalUnmappingForDedupe(). + * + * @param completion The completion of the write in progress + **/ +static void decrementForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInMappedZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + AllocatingVIO *allocatingVIO = dataVIOAsAllocatingVIO(dataVIO); + if (allocatingVIO->allocation == dataVIO->mapped.pbn) { + /* + * If we are about to release the reference on the allocated block, + * we must release the PBN lock on it first so that the allocator will + * not allocate a write-locked block. + */ + releaseAllocationLock(allocatingVIO); + } + + setLogicalCallback(dataVIO, updateBlockMapForDedupe, + THIS_LOCATION("$F;js=dec")); + dataVIO->lastAsyncOperation = JOURNAL_DECREMENT_FOR_DEDUPE; + updateReferenceCount(dataVIO); +} + +/** + * Write the appropriate journal entry for removing the mapping of logical to + * mapped, for dedupe or compression. This is the callback registered in + * readOldBlockMappingForDedupe(). + * + * @param completion The completion of the write in progress + **/ +static void journalUnmappingForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + if (dataVIO->mapped.pbn == ZERO_BLOCK) { + setLogicalCallback(dataVIO, updateBlockMapForDedupe, + THIS_LOCATION("$F;j=dedupe;js=unmap;cb=updateBM")); + } else { + setMappedZoneCallback(dataVIO, decrementForDedupe, + THIS_LOCATION("$F;j=dedupe;js=unmap;cb=decDedupe")); + } + dataVIO->lastAsyncOperation = JOURNAL_UNMAPPING_FOR_DEDUPE; + journalDecrement(dataVIO); +} + +/** + * Get the previous PBN mapped to this LBN from the block map, so as to make + * an appropriate journal entry referencing the removal of this LBN->PBN + * mapping, for dedupe or compression. This callback is registered in + * incrementForDedupe(). + * + * @param completion The completion of the write in progress + **/ +static void readOldBlockMappingForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + dataVIO->lastAsyncOperation = GET_MAPPED_BLOCK_FOR_DEDUPE; + setJournalCallback(dataVIO, journalUnmappingForDedupe, + THIS_LOCATION("$F;cb=journalUnmapDedupe")); + getMappedBlockAsync(dataVIO); +} + +/** + * Do the incref after compression. This is the callback registered by + * addRecoveryJournalEntryForCompression(). + * + * @param completion The completion of the write in progress + **/ +static void incrementForCompression(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInNewMappedZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + ASSERT_LOG_ONLY(isCompressed(dataVIO->newMapped.state), + "Impossible attempt to update reference counts for a block " + "which was not compressed (logical block %llu)", + dataVIO->logical.lbn); + + /* + * If we are synchronous and allocated a block, we know the one we + * allocated is the block we need to decrement, so there is no need + * to look in the block map. 
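+ * (In that case dataVIO->mapped was already set to the newly allocated
+ * block by prepareForDedupe(), so journalUnmappingForDedupe() can use it
+ * directly.)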
+ */ + if (isAsync(dataVIO) || !hasAllocation(dataVIO)) { + setLogicalCallback(dataVIO, readOldBlockMappingForDedupe, + THIS_LOCATION("$F;cb=readOldBlockMappingForDedupe")); + } else { + setJournalCallback(dataVIO, journalUnmappingForDedupe, + THIS_LOCATION("$F;cb=journalUnmappingForDedupe")); + } + dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_COMPRESSION; + updateReferenceCount(dataVIO); +} + +/** + * Add a recovery journal entry for the increment resulting from compression. + * + * @param completion The DataVIO which has been compressed + **/ +static void addRecoveryJournalEntryForCompression(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + + if (!isCompressed(dataVIO->newMapped.state)) { + abortDeduplication(dataVIO); + return; + } + + setNewMappedZoneCallback(dataVIO, incrementForCompression, + THIS_LOCATION("$F($dup);js=map/$dup;" + "cb=incCompress($dup)")); + dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_COMPRESSION; + journalIncrement(dataVIO, getDuplicateLock(dataVIO)); +} + +/** + * Attempt to pack the compressed DataVIO into a block. This is the callback + * registered in compressData(). + * + * @param completion The completion of a compressed DataVIO + **/ +static void packCompressedData(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInPackerZone(dataVIO); + + // XXX this is a callback, so there should probably be an error check here + // even if we think compression can't currently return one. + + if (!mayPackDataVIO(dataVIO)) { + abortDeduplication(dataVIO); + return; + } + + setJournalCallback(dataVIO, addRecoveryJournalEntryForCompression, + THIS_LOCATION("$F;cb=update(compress)")); + dataVIO->lastAsyncOperation = PACK_COMPRESSED_BLOCK; + attemptPacking(dataVIO); +} + +/**********************************************************************/ +void compressData(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(!dataVIO->isDuplicate, + "compressing a non-duplicate block"); + if (!mayCompressDataVIO(dataVIO)) { + abortDeduplication(dataVIO); + return; + } + + dataVIO->lastAsyncOperation = COMPRESS_DATA; + setPackerCallback(dataVIO, packCompressedData, THIS_LOCATION("$F;cb=pack")); + dataVIOAsCompletion(dataVIO)->layer->compressDataVIO(dataVIO); +} + +/** + * Do the incref after deduplication. This is the callback registered by + * addRecoveryJournalEntryForDedupe(). + * + * @param completion The completion of the write in progress + **/ +static void incrementForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInNewMappedZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + ASSERT_LOG_ONLY(dataVIO->isDuplicate, + "Impossible attempt to update reference counts for a block " + "which was not a duplicate (logical block %llu)", + dataVIO->logical.lbn); + + /* + * If we are synchronous and allocated a block, we know the one we + * allocated is the block we need to decrement, so there is no need + * to look in the block map. 
+ */ + if (isAsync(dataVIO) || !hasAllocation(dataVIO)) { + setLogicalCallback(dataVIO, readOldBlockMappingForDedupe, + THIS_LOCATION("$F;cb=readOldBlockMappingForDedupe")); + } else { + setJournalCallback(dataVIO, journalUnmappingForDedupe, + THIS_LOCATION("$F;cb=journalUnmappingForDedupe")); + } + dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_DEDUPE; + updateReferenceCount(dataVIO); +} + +/** + * Add a recovery journal entry for the increment resulting from deduplication. + * This callback is registered in shareBlock(). + * + * @param completion The DataVIO which has been deduplicated + **/ +static void addRecoveryJournalEntryForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + + setNewMappedZoneCallback(dataVIO, incrementForDedupe, + THIS_LOCATION("$F($dup);js=map/$dup;" + "cb=incDedupe($dup)")); + dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_DEDUPE; + journalIncrement(dataVIO, getDuplicateLock(dataVIO)); +} + +/** + * Share a block in the block map if it is a duplicate. This is the lock + * callback registered in acquirePBNReadLock(). This is only public so + * test code can compare the function to the current callback in a completion. + * + * @param completion The completion of the write in progress + **/ +void shareBlock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInDuplicateZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + + if (!dataVIO->isDuplicate) { + compressData(dataVIO); + return; + } + + dataVIO->newMapped = dataVIO->duplicate; + launchJournalCallback(dataVIO, addRecoveryJournalEntryForDedupe, + THIS_LOCATION("$F;cb=addJournalEntryDup")); +} + +/** + * Route the DataVIO to the HashZone responsible for the chunk name to acquire + * a hash lock on that name, or join with a existing hash lock managing + * concurrent dedupe for that name. This is the callback registered in + * resolveHashZone(). + * + * @param completion The DataVIO to lock + **/ +static void lockHashInZone(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInHashZone(dataVIO); + // Shouldn't have had any errors since all we did was switch threads. + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + int result = acquireHashLock(dataVIO); + if (abortOnError(result, dataVIO, READ_ONLY)) { + return; + } + + if (dataVIO->hashLock == NULL) { + // It's extremely unlikely, but in the case of a hash collision, the + // DataVIO will not obtain a reference to the lock and cannot deduplicate. + compressData(dataVIO); + return; + } + + enterHashLock(dataVIO); +} + +/** + * Set the hash zone (and flag the chunk name as set) while still on the + * thread that just hashed the data to set the chunk name. This is the + * callback registered by prepareForDedupe(). + * + * @param completion The DataVIO whose chunk name was just generated, as a + * completion + **/ +static void resolveHashZone(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + // We don't care what thread we are on. 
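+ // Selecting the hash zone depends only on the chunk name, so it can be
+ // done on this thread before hopping to the chosen zone's thread to
+ // acquire the hash lock.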
+ if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, "zero blocks should not be hashed"); + + dataVIO->hashZone + = selectHashZone(getVDOFromDataVIO(dataVIO), &dataVIO->chunkName); + dataVIO->lastAsyncOperation = ACQUIRE_HASH_LOCK; + launchHashZoneCallback(dataVIO, lockHashInZone, THIS_LOCATION(NULL)); +} + +/** + * Prepare for the dedupe path after a synchronous write or an asynchronous + * allocation. This callback is registered in updateBlockMapForWrite() for + * sync, and continueWriteAfterAllocation() (via acknowledgeWrite()) for + * async. It is also called directly from the latter when allocation fails. + * + * @param completion The completion of the write in progress + **/ +static void prepareForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + // We don't care what thread we are on + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + if (!isAsync(dataVIO)) { + // Remember which block we wrote so we will decrement the reference to it + // if we deduplicate. This avoids having to look it up in the block map. + dataVIO->mapped = dataVIO->newMapped; + } + + ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, + "must not prepare to dedupe zero blocks"); + + // Before we can dedupe, we need to know the chunk name, so the first step + // is to hash the block data. + dataVIO->lastAsyncOperation = HASH_DATA; + // XXX this is the wrong thread to run this callback, but we don't yet have + // a mechanism for running it on the CPU thread immediately after hashing. + setAllocatedZoneCallback(dataVIO, resolveHashZone, THIS_LOCATION(NULL)); + completion->layer->hashData(dataVIO); +} + +/** + * Update the block map after a data write (or directly for a ZERO_BLOCK write + * or trim). This callback is registered in decrementForWrite() and + * journalUnmappingForWrite(). + * + * @param completion The completion of the write in progress + **/ +static void updateBlockMapForWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + if (dataVIO->isZeroBlock || isTrimDataVIO(dataVIO)) { + completion->callback = completeDataVIO; + } else if (!isAsync(dataVIO)) { + // Synchronous DataVIOs branch off to the hash/dedupe path after finishing + // the uncompressed write of their data. + completion->callback = prepareForDedupe; + } else if (dataVIO->hashLock != NULL) { + // Async writes will be finished, but must return to the hash lock to + // allow other DataVIOs with the same data to dedupe against the write. + setHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL)); + } else { + // Async writes without a hash lock (hash collisions) will be finished. + completion->callback = completeDataVIO; + } + + dataVIO->lastAsyncOperation = PUT_MAPPED_BLOCK; + putMappedBlockAsync(dataVIO); +} + +/** + * Do the decref after a successful block write. This is the callback + * by journalUnmappingForWrite() if the old mapping was not the zero block. 
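+ * (It is set as the mapped-zone callback there when dataVIO->mapped.pbn is
+ * not ZERO_BLOCK.)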
+ * + * @param completion The completion of the write in progress + **/ +static void decrementForWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInMappedZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + dataVIO->lastAsyncOperation = JOURNAL_DECREMENT_FOR_WRITE; + setLogicalCallback(dataVIO, updateBlockMapForWrite, THIS_LOCATION(NULL)); + updateReferenceCount(dataVIO); +} + +/** + * Write the appropriate journal entry for unmapping logical to mapped for a + * write. This is the callback registered in readOldBlockMappingForWrite(). + * + * @param completion The completion of the write in progress + **/ +static void journalUnmappingForWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + if (dataVIO->mapped.pbn == ZERO_BLOCK) { + setLogicalCallback(dataVIO, updateBlockMapForWrite, + THIS_LOCATION("$F;js=unmap;cb=updateBMwrite")); + } else { + setMappedZoneCallback(dataVIO, decrementForWrite, + THIS_LOCATION("$F;js=unmap;cb=decWrite")); + } + dataVIO->lastAsyncOperation = JOURNAL_UNMAPPING_FOR_WRITE; + journalDecrement(dataVIO); +} + +/** + * Get the previous PBN mapped to this LBN from the block map for a write, so + * as to make an appropriate journal entry referencing the removal of this + * LBN->PBN mapping. This callback is registered in finishBlockWrite() in the + * async path, and is registered in acknowledgeWrite() in the sync path. + * + * @param completion The completion of the write in progress + **/ +static void readOldBlockMappingForWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + setJournalCallback(dataVIO, journalUnmappingForWrite, + THIS_LOCATION("$F;cb=journalUnmapWrite")); + dataVIO->lastAsyncOperation = GET_MAPPED_BLOCK_FOR_WRITE; + getMappedBlockAsync(dataVIO); +} + +/** + * Acknowledge a write to the requestor. + * + * @param dataVIO The DataVIO being acknowledged + **/ +static void acknowledgeWrite(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(dataVIO->hasFlushGenerationLock, + "write VIO to be acknowledged has a flush generation lock"); + dataVIO->lastAsyncOperation = ACKNOWLEDGE_WRITE; + dataVIOAsCompletion(dataVIO)->layer->acknowledgeDataVIO(dataVIO); +} + +/** + * Acknowledge a write now that we have made an entry in the recovery + * journal. This is the callback registered in finishBlockWrite() in + * synchronous mode. + * + * @param completion The completion of the write in progress + **/ +static void acknowledgeWriteCallback(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + setLogicalCallback(dataVIO, readOldBlockMappingForWrite, + THIS_LOCATION(NULL)); + acknowledgeWrite(dataVIO); +} + +/**********************************************************************/ +static VDOAction *getWriteIncrementCallback(DataVIO *dataVIO) +{ + return (isAsync(dataVIO) + ? readOldBlockMappingForWrite : acknowledgeWriteCallback); +} + +/** + * Do the incref after a successful block write. This is the callback + * registered by finishBlockWrite(). 
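+ * Before the reference count is updated, the PBN write lock on the newly
+ * written block is downgraded to a read lock so that other DataVIOs may
+ * begin deduplicating against it.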
+ * + * @param completion The completion of the write in progress + **/ +static void incrementForWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInAllocatedZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + + /* + * Now that the data has been written, it's safe to deduplicate against the + * block. Downgrade the allocation lock to a read lock so it can be used + * later by the hash lock (which we don't have yet in sync mode). + */ + downgradePBNWriteLock(dataVIOAsAllocatingVIO(dataVIO)->allocationLock); + + dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_WRITE; + setLogicalCallback(dataVIO, getWriteIncrementCallback(dataVIO), + THIS_LOCATION(NULL)); + updateReferenceCount(dataVIO); +} + +/** + * Add an entry in the recovery journal after a successful block write. This is + * the callback registered by writeBlock(). It is also registered in + * allocateBlockForWrite(). + * + * @param completion The completion of the write in progress + **/ +static void finishBlockWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + + if (dataVIO->newMapped.pbn == ZERO_BLOCK) { + setLogicalCallback(dataVIO, getWriteIncrementCallback(dataVIO), + THIS_LOCATION("$F;js=writeZero")); + } else { + setAllocatedZoneCallback(dataVIO, incrementForWrite, + THIS_LOCATION("$F;js=mapWrite")); + } + dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_WRITE; + journalIncrement(dataVIO, dataVIOAsAllocatingVIO(dataVIO)->allocationLock); +} + +/** + * Write data to the underlying storage. + * + * @param dataVIO The DataVIO to write + **/ +static void writeBlock(DataVIO *dataVIO) +{ + dataVIO->lastAsyncOperation = WRITE_DATA; + setJournalCallback(dataVIO, finishBlockWrite, + THIS_LOCATION("$F(data);cb=finishWrite")); + dataVIOAsCompletion(dataVIO)->layer->writeData(dataVIO); +} + +/** + * Continue the write path for a DataVIO now that block allocation is complete + * (the DataVIO may or may not have actually received an allocation). This + * callback is registered in continueWriteWithBlockMapSlot(). + * + * @param allocatingVIO The DataVIO which has finished the allocation process + * (as an AllocatingVIO) + **/ +static void continueWriteAfterAllocation(AllocatingVIO *allocatingVIO) +{ + DataVIO *dataVIO = allocatingVIOAsDataVIO(allocatingVIO); + if (abortOnError(dataVIOAsCompletion(dataVIO)->result, dataVIO, + NOT_READ_ONLY)) { + return; + } + + if (!hasAllocation(dataVIO)) { + prepareForDedupe(dataVIOAsCompletion(dataVIO)); + return; + } + + atomicStoreBool(&dataVIO->hasAllocation, true); + dataVIO->newMapped = (ZonedPBN) { + .zone = allocatingVIO->zone, + .pbn = allocatingVIO->allocation, + .state = MAPPING_STATE_UNCOMPRESSED, + }; + + if (!isAsync(dataVIO)) { + writeBlock(dataVIO); + return; + } + + // XXX prepareForDedupe can run from any thread, so this is a place where + // running the callback on the kernel thread would save a thread switch. + setAllocatedZoneCallback(dataVIO, prepareForDedupe, THIS_LOCATION(NULL)); + if (vioRequiresFlushAfter(allocatingVIOAsVIO(allocatingVIO))) { + invokeCallback(dataVIOAsCompletion(dataVIO)); + return; + } + + acknowledgeWrite(dataVIO); +} + +/** + * Continue the write path for a VIO now that block map slot resolution is + * complete. This callback is registered in launchWriteDataVIO(). 
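+ * Roughly, the three cases handled here are:
+ *
+ *   no block map page allocated -> must be a trim; finish immediately
+ *   zero block or trim          -> journal a mapping to ZERO_BLOCK
+ *   anything else               -> allocate a data block and keep writing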
+ * + * @param completion The DataVIO to write + **/ +static void continueWriteWithBlockMapSlot(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + // We don't care what thread we're on. + if (abortOnError(completion->result, dataVIO, NOT_READ_ONLY)) { + return; + } + + if (dataVIO->treeLock.treeSlots[0].blockMapSlot.pbn == ZERO_BLOCK) { + int result = ASSERT(isTrimDataVIO(dataVIO), + "dataVIO with no block map page is a trim"); + if (abortOnError(result, dataVIO, READ_ONLY)) { + return; + } + + // This is a trim for a block on a block map page which has not been + // allocated, so there's nothing more we need to do. + finishDataVIO(dataVIO, VDO_SUCCESS); + return; + } + + if (dataVIO->isZeroBlock || isTrimDataVIO(dataVIO)) { + // We don't need to write any data, so skip allocation and just update + // the block map and reference counts (via the journal). + dataVIO->newMapped.pbn = ZERO_BLOCK; + launchJournalCallback(dataVIO, finishBlockWrite, + THIS_LOCATION("$F;cb=finishWrite")); + return; + } + + allocateDataBlock(dataVIOAsAllocatingVIO(dataVIO), + getAllocationSelector(dataVIO->logical.zone), + VIO_WRITE_LOCK, continueWriteAfterAllocation); +} + +/**********************************************************************/ +void launchWriteDataVIO(DataVIO *dataVIO) +{ + if (isReadOnly(dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier)) { + finishDataVIO(dataVIO, VDO_READ_ONLY); + return; + } + + // Write requests join the current flush generation. + int result = acquireFlushGenerationLock(dataVIO); + if (abortOnError(result, dataVIO, NOT_READ_ONLY)) { + return; + } + + // Go find the block map slot for the LBN mapping. + dataVIO->lastAsyncOperation = FIND_BLOCK_MAP_SLOT; + findBlockMapSlotAsync(dataVIO, continueWriteWithBlockMapSlot, + getLogicalZoneThreadID(dataVIO->logical.zone)); +} + +/**********************************************************************/ +void cleanupWriteDataVIO(DataVIO *dataVIO) +{ + performCleanupStage(dataVIO, VIO_CLEANUP_START); +} diff --git a/vdo/base/vioWrite.h b/vdo/base/vioWrite.h new file mode 100644 index 0000000..6effc91 --- /dev/null +++ b/vdo/base/vioWrite.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioWrite.h#1 $ + */ + +#ifndef VIO_WRITE_H +#define VIO_WRITE_H + +#include "types.h" + +/** + * Release the PBN read lock if it is held. + * + * @param dataVIO The possible lock holder + **/ +void releasePBNReadLock(DataVIO *dataVIO); + +/** + * Start the asynchronous processing of a DataVIO for a write request which has + * acquired a lock on its logical block by joining the current flush generation + * and then attempting to allocate a physical block. 
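+ * If the VDO is already in read-only mode, the DataVIO is finished
+ * immediately with VDO_READ_ONLY instead.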
+ * + * @param dataVIO The DataVIO doing the write + **/ +void launchWriteDataVIO(DataVIO *dataVIO); + +/** + * Clean up a DataVIO which has finished processing a write. + * + * @param dataVIO The DataVIO to clean up + **/ +void cleanupWriteDataVIO(DataVIO *dataVIO); + +/** + * Continue a write by attempting to compress the data. This is a re-entry + * point to vioWrite used by hash locks. + * + * @param dataVIO The DataVIO to be compressed + **/ +void compressData(DataVIO *dataVIO); + +#endif /* VIO_WRITE_H */ diff --git a/vdo/base/volumeGeometry.c b/vdo/base/volumeGeometry.c new file mode 100644 index 0000000..32b2e5f --- /dev/null +++ b/vdo/base/volumeGeometry.c @@ -0,0 +1,564 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/volumeGeometry.c#10 $ + */ + +#include "volumeGeometry.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "permassert.h" + +#include "constants.h" +#include "header.h" +#include "physicalLayer.h" +#include "releaseVersions.h" +#include "statusCodes.h" +#include "types.h" + +enum { + GEOMETRY_BLOCK_LOCATION = 0, + MAGIC_NUMBER_SIZE = 8, +}; + +typedef struct { + char magicNumber[MAGIC_NUMBER_SIZE]; + Header header; + VolumeGeometry geometry; + CRC32Checksum checksum; +} __attribute__((packed)) GeometryBlock; + +static const Header GEOMETRY_BLOCK_HEADER_4_0 = { + .id = GEOMETRY_BLOCK, + .version = { + .majorVersion = 4, + .minorVersion = 0, + }, + // Note: this size isn't just the payload size following the header, like it + // is everywhere else in VDO. + .size = sizeof(GeometryBlock), +}; + +static const byte MAGIC_NUMBER[MAGIC_NUMBER_SIZE + 1] = "dmvdo001"; + +static const ReleaseVersionNumber COMPATIBLE_RELEASE_VERSIONS[] = { + MAGNESIUM_RELEASE_VERSION_NUMBER, +}; + +/** + * Determine whether the supplied release version can be understood by + * the VDO code. + * + * @param version The release version number to check + * + * @return True if the given version can be loaded. + **/ +static inline bool isLoadableReleaseVersion(ReleaseVersionNumber version) +{ + if (version == CURRENT_RELEASE_VERSION_NUMBER) { + return true; + } + + for (unsigned int i = 0; i < COUNT_OF(COMPATIBLE_RELEASE_VERSIONS); i++) { + if (version == COMPATIBLE_RELEASE_VERSIONS[i]) { + return true; + } + } + + return false; +} + +/** + * Decode the on-disk representation of an index configuration from a buffer. 
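+ * The encoded layout, as implied by the decode calls below, is:
+ *
+ *   mem                  uint32, little-endian
+ *   checkpointFrequency  uint32, little-endian
+ *   sparse               boolean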
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param config The structure to receive the decoded fields + * + * @return UDS_SUCCESS or an error + **/ +static int decodeIndexConfig(Buffer *buffer, IndexConfig *config) +{ + uint32_t mem; + int result = getUInt32LEFromBuffer(buffer, &mem); + if (result != VDO_SUCCESS) { + return result; + } + + uint32_t checkpointFrequency; + result = getUInt32LEFromBuffer(buffer, &checkpointFrequency); + if (result != VDO_SUCCESS) { + return result; + } + + bool sparse; + result = getBoolean(buffer, &sparse); + if (result != VDO_SUCCESS) { + return result; + } + + *config = (IndexConfig) { + .mem = mem, + .checkpointFrequency = checkpointFrequency, + .sparse = sparse, + }; + return VDO_SUCCESS; +} + +/** + * Encode the on-disk representation of an index configuration into a buffer. + * + * @param config The index configuration to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error + **/ +static int encodeIndexConfig(const IndexConfig *config, Buffer *buffer) +{ + int result = putUInt32LEIntoBuffer(buffer, config->mem); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt32LEIntoBuffer(buffer, config->checkpointFrequency); + if (result != VDO_SUCCESS) { + return result; + } + + return putBoolean(buffer, config->sparse); +} + +/** + * Decode the on-disk representation of a volume region from a buffer. + * + * @param buffer A buffer positioned at the start of the encoding + * @param region The structure to receive the decoded fields + * + * @return UDS_SUCCESS or an error + **/ +static int decodeVolumeRegion(Buffer *buffer, VolumeRegion *region) +{ + VolumeRegionID id; + int result = getUInt32LEFromBuffer(buffer, &id); + if (result != VDO_SUCCESS) { + return result; + } + + PhysicalBlockNumber startBlock; + result = getUInt64LEFromBuffer(buffer, &startBlock); + if (result != VDO_SUCCESS) { + return result; + } + + *region = (VolumeRegion) { + .id = id, + .startBlock = startBlock, + }; + return VDO_SUCCESS; +} + +/** + * Encode the on-disk representation of a volume region into a buffer. + * + * @param region The region to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error + **/ +static int encodeVolumeRegion(const VolumeRegion *region, Buffer *buffer) +{ + int result = putUInt32LEIntoBuffer(buffer, region->id); + if (result != VDO_SUCCESS) { + return result; + } + + return putUInt64LEIntoBuffer(buffer, region->startBlock); +} + +/** + * Decode the on-disk representation of a volume geometry from a buffer. 
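+ * The fields are decoded in order: release version (uint32 LE), nonce
+ * (uint64 LE), UUID (16 bytes), one VolumeRegion per region ID, and
+ * finally the index configuration.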
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param geometry The structure to receive the decoded fields + * + * @return UDS_SUCCESS or an error + **/ +static int decodeVolumeGeometry(Buffer *buffer, VolumeGeometry *geometry) +{ + ReleaseVersionNumber releaseVersion; + int result = getUInt32LEFromBuffer(buffer, &releaseVersion); + if (result != VDO_SUCCESS) { + return result; + } + + Nonce nonce; + result = getUInt64LEFromBuffer(buffer, &nonce); + if (result != VDO_SUCCESS) { + return result; + } + + geometry->releaseVersion = releaseVersion; + geometry->nonce = nonce; + + result = getBytesFromBuffer(buffer, sizeof(UUID), geometry->uuid); + if (result != VDO_SUCCESS) { + return result; + } + + for (VolumeRegionID id = 0; id < VOLUME_REGION_COUNT; id++) { + result = decodeVolumeRegion(buffer, &geometry->regions[id]); + if (result != VDO_SUCCESS) { + return result; + } + } + + return decodeIndexConfig(buffer, &geometry->indexConfig); +} + +/** + * Encode the on-disk representation of a volume geometry into a buffer. + * + * @param geometry The geometry to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error + **/ +static int encodeVolumeGeometry(const VolumeGeometry *geometry, Buffer *buffer) +{ + int result = putUInt32LEIntoBuffer(buffer, geometry->releaseVersion); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, geometry->nonce); + if (result != VDO_SUCCESS) { + return result; + } + + result = putBytes(buffer, sizeof(UUID), geometry->uuid); + if (result != VDO_SUCCESS) { + return result; + } + + for (VolumeRegionID id = 0; id < VOLUME_REGION_COUNT; id++) { + result = encodeVolumeRegion(&geometry->regions[id], buffer); + if (result != VDO_SUCCESS) { + return result; + } + } + + return encodeIndexConfig(&geometry->indexConfig, buffer); +} + +/** + * Decode the on-disk representation of a geometry block, up to but not + * including the checksum, from a buffer. + * + * @param buffer A buffer positioned at the start of the block + * @param geometry The structure to receive the decoded volume geometry fields + * + * @return UDS_SUCCESS or an error + **/ +static int decodeGeometryBlock(Buffer *buffer, VolumeGeometry *geometry) +{ + if (!hasSameBytes(buffer, MAGIC_NUMBER, MAGIC_NUMBER_SIZE)) { + return VDO_BAD_MAGIC; + } + + int result = skipForward(buffer, MAGIC_NUMBER_SIZE); + if (result != VDO_SUCCESS) { + return result; + } + + Header header; + result = decodeHeader(buffer, &header); + if (result != VDO_SUCCESS) { + return result; + } + + result = validateHeader(&GEOMETRY_BLOCK_HEADER_4_0, &header, true, __func__); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeVolumeGeometry(buffer, geometry); + if (result != VDO_SUCCESS) { + return result; + } + + // Leave the CRC for the caller to decode and verify. + return ASSERT(header.size + == (uncompactedAmount(buffer) + sizeof(CRC32Checksum)), + "should have decoded up to the geometry checksum"); +} + +/** + * Encode the on-disk representation of a geometry block, up to but not + * including the checksum, into a buffer. 
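+ * The resulting layout is the magic number ("dmvdo001"), the 4.0 geometry
+ * block header, the encoded volume geometry, and then a CRC-32 checksum
+ * which the caller computes over everything preceding it.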
+ * + * @param geometry The volume geometry to encode into the block + * @param buffer A buffer positioned at the start of the block + * + * @return UDS_SUCCESS or an error + **/ +static int encodeGeometryBlock(const VolumeGeometry *geometry, Buffer *buffer) +{ + int result = putBytes(buffer, MAGIC_NUMBER_SIZE, MAGIC_NUMBER); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeHeader(&GEOMETRY_BLOCK_HEADER_4_0, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeVolumeGeometry(geometry, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + // Leave the CRC for the caller to compute and encode. + return ASSERT(GEOMETRY_BLOCK_HEADER_4_0.size + == (contentLength(buffer) + sizeof(CRC32Checksum)), + "should have decoded up to the geometry checksum"); +} + +/** + * Allocate a block-size buffer to read the geometry from the physical layer, + * read the block, and return the buffer. + * + * @param [in] layer The physical layer containing the block to read + * @param [out] blockPtr A pointer to receive the allocated buffer + * + * @return VDO_SUCCESS or an error code + **/ +static int readGeometryBlock(PhysicalLayer *layer, byte **blockPtr) +{ + int result = ASSERT(layer->reader != NULL, "Layer must have a sync reader"); + if (result != VDO_SUCCESS) { + return result; + } + + char *block; + result = layer->allocateIOBuffer(layer, VDO_BLOCK_SIZE, "geometry block", + &block); + if (result != VDO_SUCCESS) { + return result; + } + + result = layer->reader(layer, GEOMETRY_BLOCK_LOCATION, 1, block, NULL); + if (result != VDO_SUCCESS) { + FREE(block); + return result; + } + + *blockPtr = (byte *) block; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int loadVolumeGeometry(PhysicalLayer *layer, VolumeGeometry *geometry) +{ + byte *block; + int result = readGeometryBlock(layer, &block); + if (result != VDO_SUCCESS) { + return result; + } + + Buffer *buffer; + result = wrapBuffer(block, VDO_BLOCK_SIZE, VDO_BLOCK_SIZE, &buffer); + if (result != VDO_SUCCESS) { + FREE(block); + return result; + } + + result = decodeGeometryBlock(buffer, geometry); + if (result != VDO_SUCCESS) { + freeBuffer(&buffer); + FREE(block); + return result; + } + + // Checksum everything decoded so far. + CRC32Checksum checksum = layer->updateCRC32(INITIAL_CHECKSUM, block, + uncompactedAmount(buffer)); + CRC32Checksum savedChecksum; + result = getUInt32LEFromBuffer(buffer, &savedChecksum); + if (result != VDO_SUCCESS) { + freeBuffer(&buffer); + FREE(block); + return result; + } + + // Finished all decoding. Everything that follows is validation code. + freeBuffer(&buffer); + FREE(block); + + if (!isLoadableReleaseVersion(geometry->releaseVersion)) { + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "release version %d cannot be loaded", + geometry->releaseVersion); + } + + return ((checksum == savedChecksum) ? 
VDO_SUCCESS : VDO_CHECKSUM_MISMATCH); +} + +/************************************************************************/ +int computeIndexBlocks(IndexConfig *indexConfig, BlockCount *indexBlocksPtr) +{ + UdsConfiguration udsConfiguration = NULL; + int result = indexConfigToUdsConfiguration(indexConfig, &udsConfiguration); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "error creating index config"); + } + + uint64_t indexBytes; + result = udsComputeIndexSize(udsConfiguration, 0, &indexBytes); + udsFreeConfiguration(udsConfiguration); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "error computing index size"); + } + + BlockCount indexBlocks = indexBytes / VDO_BLOCK_SIZE; + if ((((uint64_t) indexBlocks) * VDO_BLOCK_SIZE) != indexBytes) { + return logErrorWithStringError(VDO_PARAMETER_MISMATCH, "index size must be" + " a multiple of block size %d", + VDO_BLOCK_SIZE); + } + + *indexBlocksPtr = indexBlocks; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int initializeVolumeGeometry(Nonce nonce, + UUID uuid, + IndexConfig *indexConfig, + VolumeGeometry *geometry) +{ + BlockCount indexSize = 0; + if (indexConfig != NULL) { + int result = computeIndexBlocks(indexConfig, &indexSize); + if (result != VDO_SUCCESS) { + return result; + } + } + + *geometry = (VolumeGeometry) { + .releaseVersion = CURRENT_RELEASE_VERSION_NUMBER, + .nonce = nonce, + .regions = { + [INDEX_REGION] = { + .id = INDEX_REGION, + .startBlock = 1, + }, + [DATA_REGION] = { + .id = DATA_REGION, + .startBlock = 1 + indexSize, + } + } + }; + memcpy(geometry->uuid, uuid, sizeof(UUID)); + if (indexSize > 0) { + memcpy(&geometry->indexConfig, indexConfig, sizeof(IndexConfig)); + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int clearVolumeGeometry(PhysicalLayer *layer) +{ + char *block; + int result = layer->allocateIOBuffer(layer, VDO_BLOCK_SIZE, "geometry block", + &block); + if (result != VDO_SUCCESS) { + return result; + } + + result = layer->writer(layer, GEOMETRY_BLOCK_LOCATION, 1, block, NULL); + FREE(block); + return result; +} + +/**********************************************************************/ +int writeVolumeGeometry(PhysicalLayer *layer, VolumeGeometry *geometry) +{ + char *block; + int result = layer->allocateIOBuffer(layer, VDO_BLOCK_SIZE, "geometry block", + &block); + if (result != VDO_SUCCESS) { + return result; + } + + Buffer *buffer; + result = wrapBuffer((byte *) block, VDO_BLOCK_SIZE, 0, &buffer); + if (result != VDO_SUCCESS) { + FREE(block); + return result; + } + + result = encodeGeometryBlock(geometry, buffer); + if (result != VDO_SUCCESS) { + freeBuffer(&buffer); + FREE(block); + return result; + } + + // Checksum everything encoded so far and then encode the checksum. + CRC32Checksum checksum = layer->updateCRC32(INITIAL_CHECKSUM, (byte *) block, + contentLength(buffer)); + result = putUInt32LEIntoBuffer(buffer, checksum); + if (result != VDO_SUCCESS) { + freeBuffer(&buffer); + FREE(block); + return result; + } + + // Write it. 
+ result = layer->writer(layer, GEOMETRY_BLOCK_LOCATION, 1, block, NULL); + freeBuffer(&buffer); + FREE(block); + return result; +} + +/************************************************************************/ +int indexConfigToUdsConfiguration(IndexConfig *indexConfig, + UdsConfiguration *udsConfigPtr) +{ + UdsConfiguration udsConfiguration; + int result = udsInitializeConfiguration(&udsConfiguration, indexConfig->mem); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "error initializing configuration"); + } + + udsConfigurationSetSparse(udsConfiguration, indexConfig->sparse); + + *udsConfigPtr = udsConfiguration; + return VDO_SUCCESS; +} + +/************************************************************************/ +void indexConfigToUdsParameters(IndexConfig *indexConfig, + struct uds_parameters *userParams) +{ + userParams->checkpoint_frequency = indexConfig->checkpointFrequency; +} diff --git a/vdo/base/volumeGeometry.h b/vdo/base/volumeGeometry.h new file mode 100644 index 0000000..c06cdde --- /dev/null +++ b/vdo/base/volumeGeometry.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/volumeGeometry.h#5 $ + */ + +#ifndef VOLUME_GEOMETRY_H +#define VOLUME_GEOMETRY_H + +#include "uds.h" + +#include "types.h" + +struct indexConfig { + uint32_t mem; + uint32_t checkpointFrequency; + bool sparse; +} __attribute__((packed)); + +typedef enum { + INDEX_REGION = 0, + DATA_REGION = 1, + VOLUME_REGION_COUNT, +} VolumeRegionID; + +typedef struct { + /** The ID of the region */ + VolumeRegionID id; + /** + * The absolute starting offset on the device. The region continues until + * the next region begins. + */ + PhysicalBlockNumber startBlock; +} __attribute__((packed)) VolumeRegion; + +/** A binary UUID is 16 bytes. */ +typedef unsigned char UUID[16]; + +typedef struct { + /** The release version number of this volume */ + ReleaseVersionNumber releaseVersion; + /** The nonce of this volume */ + Nonce nonce; + /** The UUID of this volume */ + UUID uuid; + /** The regions in ID order */ + VolumeRegion regions[VOLUME_REGION_COUNT]; + /** The index config */ + IndexConfig indexConfig; +} __attribute__((packed)) VolumeGeometry; + +/** + * Get the start of the index region from a geometry. + * + * @param geometry The geometry + * + * @return The start of the index region + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber getIndexRegionOffset(VolumeGeometry geometry) +{ + return geometry.regions[INDEX_REGION].startBlock; +} + +/** + * Get the start of the data region from a geometry. 
+ * + * @param geometry The geometry + * + * @return The start of the data region + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber getDataRegionOffset(VolumeGeometry geometry) +{ + return geometry.regions[DATA_REGION].startBlock; +} + +/** + * Get the size of the index region from a geometry. + * + * @param geometry The geometry + * + * @return the size of the index region + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber getIndexRegionSize(VolumeGeometry geometry) +{ + return getDataRegionOffset(geometry) - getIndexRegionOffset(geometry); +} + +/** + * Read the volume geometry from a layer. + * + * @param layer The layer to read and parse the geometry from + * @param geometry The geometry to be loaded + **/ +int loadVolumeGeometry(PhysicalLayer *layer, VolumeGeometry *geometry) +__attribute__((warn_unused_result)); + +/** + * Initialize a VolumeGeometry for a VDO. + * + * @param nonce The nonce for the VDO + * @param uuid The uuid for the VDO + * @param indexConfig The index config of the VDO. + * @param geometry The geometry being initialized + * + * @return VDO_SUCCESS or an error + **/ +int initializeVolumeGeometry(Nonce nonce, + UUID uuid, + IndexConfig *indexConfig, + VolumeGeometry *geometry) + __attribute__((warn_unused_result)); + +/** + * Zero out the geometry on a layer. + * + * @param layer The layer to clear + * + * @return VDO_SUCCESS or an error + **/ +int clearVolumeGeometry(PhysicalLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Write a geometry block for a VDO. + * + * @param layer The layer on which to write. + * @param geometry The VolumeGeometry to be written + * + * @return VDO_SUCCESS or an error + **/ +int writeVolumeGeometry(PhysicalLayer *layer, VolumeGeometry *geometry) +__attribute__((warn_unused_result)); + +/** + * Convert a index config to a UDS configuration, which can be used by UDS. + * + * @param [in] indexConfig The index config to convert + * @param [out] udsConfigPtr A pointer to return the UDS configuration + * + * @return VDO_SUCCESS or an error + **/ +int indexConfigToUdsConfiguration(IndexConfig *indexConfig, + UdsConfiguration *udsConfigPtr) +__attribute__((warn_unused_result)); + +/** + * Modify the uds_parameters to match the requested index config. + * + * @param indexConfig The index config to convert + * @param userParams The uds_parameters to modify + **/ +void indexConfigToUdsParameters(IndexConfig *indexConfig, + struct uds_parameters *userParams); + +/** + * Compute the index size in blocks from the IndexConfig. + * + * @param [in] indexConfig The index config + * @param [out] indexBlocksPtr A pointer to return the index size in blocks + * + * @return VDO_SUCCESS or an error + **/ +int computeIndexBlocks(IndexConfig *indexConfig, BlockCount *indexBlocksPtr) +__attribute__((warn_unused_result)); + +/** + * Set load config fields from a volume geometry. + * + * @param [in] geometry The geometry to use + * @param [out] loadConfig The load config to set + **/ +static inline void setLoadConfigFromGeometry(VolumeGeometry *geometry, + VDOLoadConfig *loadConfig) +{ + loadConfig->firstBlockOffset = getDataRegionOffset(*geometry); + loadConfig->releaseVersion = geometry->releaseVersion; + loadConfig->nonce = geometry->nonce; +} + +#endif // VOLUME_GEOMETRY_H diff --git a/vdo/base/waitQueue.c b/vdo/base/waitQueue.c new file mode 100644 index 0000000..3d7f175 --- /dev/null +++ b/vdo/base/waitQueue.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/waitQueue.c#1 $ + */ + +#include "waitQueue.h" + +#include "permassert.h" + +#include "statusCodes.h" + +/**********************************************************************/ +int enqueueWaiter(WaitQueue *queue, Waiter *waiter) +{ + int result = ASSERT((waiter->nextWaiter == NULL), + "new waiter must not already be in a waiter queue"); + if (result != VDO_SUCCESS) { + return result; + } + + if (queue->lastWaiter == NULL) { + // The queue is empty, so form the initial circular list by self-linking + // the initial waiter. + waiter->nextWaiter = waiter; + } else { + // Splice the new waiter in at the end of the queue. + waiter->nextWaiter = queue->lastWaiter->nextWaiter; + queue->lastWaiter->nextWaiter = waiter; + } + // In both cases, the waiter we added to the ring becomes the last waiter. + queue->lastWaiter = waiter; + queue->queueLength += 1; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void transferAllWaiters(WaitQueue *fromQueue, WaitQueue *toQueue) +{ + // If the source queue is empty, there's nothing to do. + if (!hasWaiters(fromQueue)) { + return; + } + + if (hasWaiters(toQueue)) { + // Both queues are non-empty. Splice the two circular lists together by + // swapping the next (head) pointers in the list tails. + Waiter *fromHead = fromQueue->lastWaiter->nextWaiter; + Waiter *toHead = toQueue->lastWaiter->nextWaiter; + toQueue->lastWaiter->nextWaiter = fromHead; + fromQueue->lastWaiter->nextWaiter = toHead; + } + + toQueue->lastWaiter = fromQueue->lastWaiter; + toQueue->queueLength += fromQueue->queueLength; + initializeWaitQueue(fromQueue); +} + +/**********************************************************************/ +void notifyAllWaiters(WaitQueue *queue, + WaiterCallback *callback, + void *context) +{ + // Copy and empty the queue first, avoiding the possibility of an infinite + // loop if entries are returned to the queue by the callback function. + WaitQueue waiters; + initializeWaitQueue(&waiters); + transferAllWaiters(queue, &waiters); + + // Drain the copied queue, invoking the callback on every entry. + while (notifyNextWaiter(&waiters, callback, context)) { + // All the work is done by the loop condition. + } +} + +/**********************************************************************/ +Waiter *getFirstWaiter(const WaitQueue *queue) +{ + Waiter *lastWaiter = queue->lastWaiter; + if (lastWaiter == NULL) { + // There are no waiters, so we're done. + return NULL; + } + + // The queue is circular, so the last entry links to the head of the queue. 
+ return lastWaiter->nextWaiter; +} + +/**********************************************************************/ +int dequeueMatchingWaiters(WaitQueue *queue, + WaiterMatch *matchMethod, + void *matchContext, + WaitQueue *matchedQueue) +{ + WaitQueue matchedWaiters; + initializeWaitQueue(&matchedWaiters); + + WaitQueue iterationQueue; + initializeWaitQueue(&iterationQueue); + transferAllWaiters(queue, &iterationQueue); + while (hasWaiters(&iterationQueue)) { + Waiter *waiter = dequeueNextWaiter(&iterationQueue); + int result = VDO_SUCCESS; + if (!matchMethod(waiter, matchContext)) { + result = enqueueWaiter(queue, waiter); + } else { + result = enqueueWaiter(&matchedWaiters, waiter); + } + if (result != VDO_SUCCESS) { + transferAllWaiters(&matchedWaiters, queue); + transferAllWaiters(&iterationQueue, queue); + return result; + } + } + + transferAllWaiters(&matchedWaiters, matchedQueue); + return VDO_SUCCESS; +} + +/**********************************************************************/ +Waiter *dequeueNextWaiter(WaitQueue *queue) +{ + Waiter *firstWaiter = getFirstWaiter(queue); + if (firstWaiter == NULL) { + return NULL; + } + + Waiter *lastWaiter = queue->lastWaiter; + if (firstWaiter == lastWaiter) { + // The queue has a single entry, so just empty it out by nulling the tail. + queue->lastWaiter = NULL; + } else { + // The queue has more than one entry, so splice the first waiter out of + // the circular queue. + lastWaiter->nextWaiter = firstWaiter->nextWaiter; + } + + // The waiter is no longer in a wait queue. + firstWaiter->nextWaiter = NULL; + queue->queueLength -= 1; + return firstWaiter; +} + +/**********************************************************************/ +bool notifyNextWaiter(WaitQueue *queue, + WaiterCallback *callback, + void *context) +{ + Waiter *waiter = dequeueNextWaiter(queue); + if (waiter == NULL) { + return false; + } + + if (callback == NULL) { + callback = waiter->callback; + } + (*callback)(waiter, context); + return true; +} + +/**********************************************************************/ +const Waiter *getNextWaiter(const WaitQueue *queue, const Waiter *waiter) +{ + Waiter *firstWaiter = getFirstWaiter(queue); + if (waiter == NULL) { + return firstWaiter; + } + return ((waiter->nextWaiter != firstWaiter) ? waiter->nextWaiter : NULL); +} diff --git a/vdo/base/waitQueue.h b/vdo/base/waitQueue.h new file mode 100644 index 0000000..5eb754e --- /dev/null +++ b/vdo/base/waitQueue.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/waitQueue.h#1 $ + */ + +#ifndef WAIT_QUEUE_H +#define WAIT_QUEUE_H + +#include "common.h" + +/** + * A wait queue is a circular list of entries waiting to be notified of a + * change in a condition. 
Keeping a circular list allows the queue structure + * to simply be a pointer to the tail (newest) entry in the queue, supporting + * constant-time enqueue and dequeue operations. A null pointer is an empty + * queue. + * + * An empty queue: + * queue0.lastWaiter -> NULL + * + * A singleton queue: + * queue1.lastWaiter -> entry1 -> entry1 -> [...] + * + * A three-element queue: + * queue2.lastWaiter -> entry3 -> entry1 -> entry2 -> entry3 -> [...] + **/ + +typedef struct waiter Waiter; + +typedef struct { + /** The tail of the queue, the last (most recently added) entry */ + Waiter *lastWaiter; + /** The number of waiters currently in the queue */ + size_t queueLength; +} WaitQueue; + +/** + * Callback type for functions which will be called to resume processing of a + * waiter after it has been removed from its wait queue. + **/ +typedef void WaiterCallback(Waiter *waiter, void *context); + +/** + * Method type for Waiter matching methods. + * + * A WaiterMatch method returns false if the waiter does not match. + **/ +typedef bool WaiterMatch(Waiter *waiter, void *context); + +/** + * The queue entry structure for entries in a WaitQueue. + **/ +struct waiter { + /** + * The next waiter in the queue. If this entry is the last waiter, then this + * is actually a pointer back to the head of the queue. + **/ + struct waiter *nextWaiter; + + /** Optional waiter-specific callback to invoke when waking this waiter. */ + WaiterCallback *callback; +}; + +/** + * Check whether a Waiter is waiting. + * + * @param waiter The waiter to check + * + * @return true if the waiter is on some WaitQueue + **/ +static inline bool isWaiting(Waiter *waiter) +{ + return (waiter->nextWaiter != NULL); +} + +/** + * Initialize a wait queue. + * + * @param queue The queue to initialize + **/ +static inline void initializeWaitQueue(WaitQueue *queue) +{ + *queue = (WaitQueue) { + .lastWaiter = NULL, + .queueLength = 0, + }; +} + +/** + * Check whether a wait queue has any entries waiting in it. + * + * @param queue The queue to query + * + * @return true if there are any waiters in the queue + **/ +__attribute__((warn_unused_result)) +static inline bool hasWaiters(const WaitQueue *queue) +{ + return (queue->lastWaiter != NULL); +} + +/** + * Add a waiter to the tail end of a wait queue. The waiter must not already + * be waiting in a queue. + * + * @param queue The queue to which to add the waiter + * @param waiter The waiter to add to the queue + * + * @return VDO_SUCCESS or an error code + **/ +int enqueueWaiter(WaitQueue *queue, Waiter *waiter) + __attribute__((warn_unused_result)); + +/** + * Notify all the entries waiting in a queue to continue execution by invoking + * a callback function on each of them in turn. The queue is copied and + * emptied before invoking any callbacks, and only the waiters that were in + * the queue at the start of the call will be notified. + * + * @param queue The wait queue containing the waiters to notify + * @param callback The function to call to notify each waiter, or NULL + * to invoke the callback field registered in each waiter + * @param context The context to pass to the callback function + **/ +void notifyAllWaiters(WaitQueue *queue, + WaiterCallback *callback, + void *context); + +/** + * Notify the next entry waiting in a queue to continue execution by invoking + * a callback function on it after removing it from the queue. 
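+ *
+ * As an illustration only (the Item type, the wakeItem callback, and the
+ * queue and context variables below are hypothetical, not part of this
+ * interface), a caller typically embeds a Waiter as the first member of its
+ * own structure, registers a callback, and then drains the queue:
+ *
+ *   typedef struct {
+ *     Waiter waiter;   // first member, so the cast in wakeItem is valid
+ *     int    payload;
+ *   } Item;
+ *
+ *   static void wakeItem(Waiter *waiter, void *context)
+ *   {
+ *     Item *item = (Item *) waiter;   // safe: waiter is the first member
+ *     // ... resume processing of item, using context as needed ...
+ *   }
+ *
+ *   // Producer: item->waiter.callback = wakeItem;
+ *   //           int result = enqueueWaiter(&queue, &item->waiter);
+ *   // Consumer: while (notifyNextWaiter(&queue, NULL, context)) { }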
+ * + * @param queue The wait queue containing the waiter to notify + * @param callback The function to call to notify the waiter, or NULL + * to invoke the callback field registered in the waiter + * @param context The context to pass to the callback function + * + * @return true if there was a waiter in the queue + **/ +bool notifyNextWaiter(WaitQueue *queue, + WaiterCallback *callback, + void *context); + +/** + * Transfer all waiters from one wait queue to a second queue, emptying the + * first queue. + * + * @param fromQueue The queue containing the waiters to move + * @param toQueue The queue that will receive the waiters from the + * the first queue + **/ +void transferAllWaiters(WaitQueue *fromQueue, WaitQueue *toQueue); + +/** + * Return the waiter that is at the head end of a wait queue. + * + * @param queue The queue from which to get the first waiter + * + * @return The first (oldest) waiter in the queue, or NULL if + * the queue is empty + **/ +Waiter *getFirstWaiter(const WaitQueue *queue); + +/** + * Remove all waiters that match based on the specified matching method and + * append them to a WaitQueue. + * + * @param queue The wait queue to process + * @param matchMethod The method to determine matching + * @param matchContext Contextual info for the match method + * @param matchedQueue A WaitQueue to store matches + * + * @return VDO_SUCCESS or an error code + **/ +int dequeueMatchingWaiters(WaitQueue *queue, + WaiterMatch *matchMethod, + void *matchContext, + WaitQueue *matchedQueue); + +/** + * Remove the first waiter from the head end of a wait queue. The caller will + * be responsible for waking the waiter by invoking the correct callback + * function to resume its execution. + * + * @param queue The wait queue from which to remove the first entry + * + * @return The first (oldest) waiter in the queue, or NULL if + * the queue is empty + **/ +Waiter *dequeueNextWaiter(WaitQueue *queue); + +/** + * Count the number of waiters in a wait queue. + * + * @param queue The wait queue to query + * + * @return the number of waiters in the queue + **/ +__attribute__((warn_unused_result)) +static inline size_t countWaiters(const WaitQueue *queue) +{ + return queue->queueLength; +} + +/** + * Get the waiter after this one, for debug iteration. + * + * @param queue The wait queue + * @param waiter A waiter + * + * @return the next waiter, or NULL + **/ +const Waiter *getNextWaiter(const WaitQueue *queue, const Waiter *waiter) + __attribute__((warn_unused_result)); + +#endif // WAIT_QUEUE_H diff --git a/vdo/kernel/batchProcessor.c b/vdo/kernel/batchProcessor.c new file mode 100644 index 0000000..5845960 --- /dev/null +++ b/vdo/kernel/batchProcessor.c @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/batchProcessor.c#2 $ + */ + +#include "batchProcessor.h" + +#include "memoryAlloc.h" + +#include "constants.h" + +#include "kernelLayer.h" + +/* + * On memory ordering: + * + * The producer thread does: enqueue item on queue (xchg, which is + * implicitly interlocked, then a store), memory barrier, then atomic + * cmpxchg of the state field. The x86 architecture spec says the + * xchg, store, lock-cmpxchg sequence cannot be reordered, but on + * architectures using load-linked and store-conditional for the + * cmpxchg, like AArch64, the LL can be reordered with the store, so + * we add a barrier. + * + * The consumer thread, when it is running out of work, does: read + * queue (find empty), set state, mfence, read queue again just to be + * sure. The set-state and read-queue cannot be reordered with respect + * to the mfence (but without the mfence, the read could be moved + * before the set). + * + * The xchg and mfence impose a total order across processors, and + * each processor sees the stores done by the other processor in the + * required order. If the xchg happens before the mfence, the + * consumer's "read queue again" operation will see the update. If the + * mfence happens first, the producer's "cmpxchg state" will see its + * updated value. + * + * These are the semantics implemented by memory set to WB (write-back + * caching) mode on x86-64. So, the simple analysis is that no wakeups + * should be missed. + * + * It's a little subtler with funnel queues, since one interrupted or + * delayed enqueue operation (see the commentary in funnelQueuePut) + * can cause another, concurrent enqueue operation to complete without + * actually making the entry visible to the consumer. In essence, one + * update makes no new work items visible to the consumer, and the + * other (when it eventually completes) makes two (or more) work items + * visible, and each one ensures that the consumer will process what + * it has made visible. + */ + +typedef enum batchProcessorState { + BATCH_PROCESSOR_IDLE, + BATCH_PROCESSOR_ENQUEUED, +} BatchProcessorState; + +struct batchProcessor { + spinlock_t consumerLock; + FunnelQueue *queue; + KvdoWorkItem workItem; + atomic_t state; + BatchProcessorCallback callback; + void *closure; + KernelLayer *layer; +}; + +static void scheduleBatchProcessing(BatchProcessor *batch); + +/** + * Apply the batch processing function to the accumulated set of + * objects. + * + * Runs in a "CPU queue". + * + * @param [in] item The work item embedded in the BatchProcessor + **/ +static void batchProcessorWork(KvdoWorkItem *item) +{ + BatchProcessor *batch = container_of(item, BatchProcessor, workItem); + spin_lock(&batch->consumerLock); + while (!isFunnelQueueEmpty(batch->queue)) { + batch->callback(batch, batch->closure); + } + atomic_set(&batch->state, BATCH_PROCESSOR_IDLE); + memoryFence(); + bool needReschedule = !isFunnelQueueEmpty(batch->queue); + spin_unlock(&batch->consumerLock); + if (needReschedule) { + scheduleBatchProcessing(batch); + } +} + +/** + * Ensure that the batch-processing function is scheduled to run. + * + * If we're the thread that switches the BatchProcessor state from + * idle to enqueued, we're the thread responsible for actually + * enqueueing it. If some other thread got there first, or it was + * already enqueued, it's not our problem. 
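+ *
+ * Schematically, the handoff looks like this (a sketch of the protocol
+ * described in the memory-ordering comment above, not additional code):
+ *
+ *   producer: funnelQueuePut(queue, item);
+ *             smp_mb();
+ *             if (atomic_cmpxchg(&state, IDLE, ENQUEUED) == IDLE) {
+ *               // enqueue the work item on the CPU queue
+ *             }
+ *
+ *   consumer: // drain the funnel queue
+ *             atomic_set(&state, IDLE);
+ *             memoryFence();
+ *             if (!isFunnelQueueEmpty(queue)) {
+ *               // reschedule batch processing
+ *             }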
+ * + * @param [in] batch The BatchProcessor control data + **/ +static void scheduleBatchProcessing(BatchProcessor *batch) +{ + /* + * We want this to be very fast in the common cases. + * + * In testing on our "mgh" class machines (HP ProLiant DL380p Gen8, + * Intel Xeon E5-2690, 2.9GHz), it appears that under some + * conditions it's a little faster to use a memory fence and then + * read the "state" field, skipping the cmpxchg if the state is + * already set to BATCH_PROCESSOR_ENQUEUED. (Sometimes slightly + * faster still if we prefetch the state field first.) Note that the + * read requires the fence, otherwise it could be executed before + * the preceding store by the FunnelQueue code to the "next" + * pointer, which can, very rarely, result in failing to issue a + * wakeup when needed. + * + * However, the gain is small, and in testing on our older "harvard" + * class machines (Intel Xeon X5680, 3.33GHz) it was a clear win to + * skip all of that and go right for the cmpxchg. + * + * Of course, the tradeoffs may be sensitive to the particular work + * going on, cache pressure, etc. + */ + smp_mb(); + BatchProcessorState oldState + = atomic_cmpxchg(&batch->state, BATCH_PROCESSOR_IDLE, + BATCH_PROCESSOR_ENQUEUED); + bool doSchedule = (oldState == BATCH_PROCESSOR_IDLE); + if (doSchedule) { + enqueueCPUWorkQueue(batch->layer, &batch->workItem); + } +} + +/**********************************************************************/ +int makeBatchProcessor(KernelLayer *layer, + BatchProcessorCallback callback, + void *closure, + BatchProcessor **batchPtr) +{ + BatchProcessor *batch; + + int result = ALLOCATE(1, BatchProcessor, "batchProcessor", &batch); + if (result != UDS_SUCCESS) { + return result; + } + result = makeFunnelQueue(&batch->queue); + if (result != UDS_SUCCESS) { + FREE(batch); + return result; + } + + spin_lock_init(&batch->consumerLock); + setupWorkItem(&batch->workItem, batchProcessorWork, + (KvdoWorkFunction) callback, CPU_Q_ACTION_COMPLETE_KVIO); + atomic_set(&batch->state, BATCH_PROCESSOR_IDLE); + batch->callback = callback; + batch->closure = closure; + batch->layer = layer; + + *batchPtr = batch; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void addToBatchProcessor(BatchProcessor *batch, KvdoWorkItem *item) +{ + funnelQueuePut(batch->queue, &item->workQueueEntryLink); + scheduleBatchProcessing(batch); +} + +/**********************************************************************/ +KvdoWorkItem *nextBatchItem(BatchProcessor *batch) +{ + FunnelQueueEntry *fqEntry = funnelQueuePoll(batch->queue); + if (fqEntry == NULL) { + return NULL; + } + + return container_of(fqEntry, KvdoWorkItem, workQueueEntryLink); +} + +/**********************************************************************/ +void condReschedBatchProcessor(BatchProcessor *batch) +{ + cond_resched_lock(&batch->consumerLock); +} + +/**********************************************************************/ +void freeBatchProcessor(BatchProcessor **batchPtr) +{ + BatchProcessor *batch = *batchPtr; + if (batch) { + memoryFence(); + BUG_ON(atomic_read(&batch->state) == BATCH_PROCESSOR_ENQUEUED); + freeFunnelQueue(batch->queue); + FREE(batch); + *batchPtr = NULL; + } +} diff --git a/vdo/kernel/batchProcessor.h b/vdo/kernel/batchProcessor.h new file mode 100644 index 0000000..5e348c6 --- /dev/null +++ b/vdo/kernel/batchProcessor.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/batchProcessor.h#2 $ + */ + +#ifndef BATCHPROCESSOR_H +#define BATCHPROCESSOR_H + +#include "kernelTypes.h" +#include "util/funnelQueue.h" + +/** + * Control data for managing collections of objects to be operated on + * by a specified function. May be used when the work function is + * lightweight enough or cache-contentious enough that it makes sense + * to try to accumulate multiple objects and operate on them all at + * once in one thread. + * + * The work function is run in one of the kernel layer's "CPU queues", + * and care is taken to ensure that only one invocation can be running + * or scheduled at any given time. It can loop calling nextBatchItem + * repeatedly until there are no more objects to operate on. It should + * also call condReschedBatchProcessor now and then, to play nicely + * with the OS scheduler. + * + * Objects to operate on are manipulated through a FunnelQueueEntry + * object which must be contained within them. + **/ +typedef struct batchProcessor BatchProcessor; + +typedef void (*BatchProcessorCallback)(BatchProcessor *batch, void *closure); + +/** + * Creates a batch-processor control structure. + * + * @param [in] layer The kernel layer data, used to enqueue work items + * @param [in] callback A function to process the accumulated objects + * @param [in] closure A private data pointer for use by the callback + * @param [out] batchPtr Where to store the pointer to the new object + * + * @return UDS_SUCCESS or an error code + **/ +int makeBatchProcessor(KernelLayer *layer, + BatchProcessorCallback callback, + void *closure, + BatchProcessor **batchPtr); + +/** + * Adds an object to the processing queue. + * + *

If the callback function is not currently running or scheduled to be run, + * it gets queued up to run. + * + * @param [in] batch The batch-processor data + * @param [in] item The handle on the new object to add + **/ +void addToBatchProcessor(BatchProcessor *batch, KvdoWorkItem *item); + +/** + * Fetches the next object in the processing queue. + * + * @param [in] batch The batch-processor data + * + * @return An object pointer or NULL + **/ +KvdoWorkItem *nextBatchItem(BatchProcessor *batch) + __attribute__((warn_unused_result)); + +/** + * Free the batch-processor data and null out the pointer. + * + * @param [in,out] batchPtr Where the BatchProcessor pointer is stored + **/ +void freeBatchProcessor(BatchProcessor **batchPtr); + +/** + * Yield control to the scheduler if the kernel has indicated that + * other work needs to run on the current processor. + * + * The data structure is needed so that the spin lock can be + * (conditionally) released and re-acquired. + * + * @param [in] batch The batch-processor data + **/ +void condReschedBatchProcessor(BatchProcessor *batch); + +#endif // BATCHPROCESSOR_H diff --git a/vdo/kernel/bio.c b/vdo/kernel/bio.c new file mode 100644 index 0000000..a8e3a5e --- /dev/null +++ b/vdo/kernel/bio.c @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bio.c#8 $ + */ + +#include "bio.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" + +#include "flush.h" +#include "recoveryJournal.h" + +#include "bioIterator.h" +#include "ioSubmitter.h" + +/** + * Gets the raw buffer from a biovec. 
+ * + * @param biovec The biovec in question + * + * @return the buffer + **/ +static char *getBufferForBiovec(struct bio_vec *biovec) +{ + return (page_address(biovec->bv_page) + biovec->bv_offset); +} + +/**********************************************************************/ +void bioCopyDataIn(BIO *bio, char *dataPtr) +{ + struct bio_vec *biovec; + for (BioIterator iter = createBioIterator(bio); + (biovec = getNextBiovec(&iter)) != NULL; + advanceBioIterator(&iter)) { + memcpy(dataPtr, getBufferForBiovec(biovec), biovec->bv_len); + dataPtr += biovec->bv_len; + } +} + +/**********************************************************************/ +void bioCopyDataOut(BIO *bio, char *dataPtr) +{ + struct bio_vec *biovec; + for (BioIterator iter = createBioIterator(bio); + (biovec = getNextBiovec(&iter)) != NULL; + advanceBioIterator(&iter)) { + memcpy(getBufferForBiovec(biovec), dataPtr, biovec->bv_len); + flush_dcache_page(biovec->bv_page); + dataPtr += biovec->bv_len; + } +} + +/**********************************************************************/ +void setBioOperation(BIO *bio, unsigned int operation) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio->bi_opf &= ~REQ_OP_MASK; + bio->bi_opf |= operation; +#else + + unsigned int OPERATION_MASK = WRITE | REQ_DISCARD | REQ_FLUSH; + + // Clear the relevant bits + bio->bi_rw &= ~OPERATION_MASK; + // Set the operation we care about + bio->bi_rw |= operation; +#endif +} + +/**********************************************************************/ +void freeBio(BIO *bio, KernelLayer *layer) +{ + bio_put(bio); +} + +/**********************************************************************/ +void countBios(AtomicBioStats *bioStats, BIO *bio) +{ + if (isWriteBio(bio)) { + atomic64_inc(&bioStats->write); + } else { + atomic64_inc(&bioStats->read); + } + if (isDiscardBio(bio)) { + atomic64_inc(&bioStats->discard); + } + if (isFlushBio(bio)) { + atomic64_inc(&bioStats->flush); + } + if (isFUABio(bio)) { + atomic64_inc(&bioStats->fua); + } +} + +/** + * The function determines whether a buffer contains all zeroes. + * + * @param buffer The buffer to check + * @param length The length of the buffer + * + * @return true is all zeroes, false otherwise + **/ +static inline bool isAllZeros(const char *buffer, unsigned int length) +{ + /* + * Handle expected common case of even the first word being nonzero, + * without getting into the more expensive (for one iteration) loop + * below. + */ + if (likely(length >= sizeof(uint64_t))) { + if (GET_UNALIGNED(uint64_t, buffer) != 0) { + return false; + } + + unsigned int wordCount = length / sizeof(uint64_t); + + // Unroll to process 64 bytes at a time + unsigned int chunkCount = wordCount / 8; + while (chunkCount-- > 0) { + uint64_t word0 = GET_UNALIGNED(uint64_t, buffer); + uint64_t word1 = GET_UNALIGNED(uint64_t, buffer + 1 * sizeof(uint64_t)); + uint64_t word2 = GET_UNALIGNED(uint64_t, buffer + 2 * sizeof(uint64_t)); + uint64_t word3 = GET_UNALIGNED(uint64_t, buffer + 3 * sizeof(uint64_t)); + uint64_t word4 = GET_UNALIGNED(uint64_t, buffer + 4 * sizeof(uint64_t)); + uint64_t word5 = GET_UNALIGNED(uint64_t, buffer + 5 * sizeof(uint64_t)); + uint64_t word6 = GET_UNALIGNED(uint64_t, buffer + 6 * sizeof(uint64_t)); + uint64_t word7 = GET_UNALIGNED(uint64_t, buffer + 7 * sizeof(uint64_t)); + uint64_t or = (word0 | word1 | word2 | word3 + | word4 | word5 | word6 | word7); + // Prevent compiler from using 8*(cmp;jne). 
+ __asm__ __volatile__ ("" : : "g" (or)); + if (or != 0) { + return false; + } + buffer += 8 * sizeof(uint64_t); + } + wordCount %= 8; + + // Unroll to process 8 bytes at a time. + // (Is this still worthwhile?) + while (wordCount-- > 0) { + if (GET_UNALIGNED(uint64_t, buffer) != 0) { + return false; + } + buffer += sizeof(uint64_t); + } + length %= sizeof(uint64_t); + // Fall through to finish up anything left over. + } + + while (length-- > 0) { + if (*buffer++ != 0) { + return false; + } + } + return true; +} + +/**********************************************************************/ +bool bioIsZeroData(BIO *bio) +{ + struct bio_vec *biovec; + for (BioIterator iter = createBioIterator(bio); + (biovec = getNextBiovec(&iter)) != NULL; + advanceBioIterator(&iter)) { + if (!isAllZeros(getBufferForBiovec(biovec), biovec->bv_len)) { + return false; + } + } + return true; +} + +/**********************************************************************/ +void bioZeroData(BIO *bio) +{ + zero_fill_bio(bio); +} + +/**********************************************************************/ +static void setBioSize(BIO *bio, BlockSize bioSize) +{ +#ifdef USE_BI_ITER + bio->bi_iter.bi_size = bioSize; +#else + bio->bi_size = bioSize; +#endif +} + +/** + * Initialize a bio. + * + * @param bio The bio to initialize + * @param layer The layer to which it belongs. + **/ +static void initializeBio(BIO *bio, KernelLayer *layer) +{ + // Save off important info so it can be set back later + unsigned short vcnt = bio->bi_vcnt; + void *pvt = bio->bi_private; + bio_reset(bio); // Memsets large portion of bio. Reset all needed fields. + bio->bi_private = pvt; + bio->bi_vcnt = vcnt; + bio->bi_end_io = completeAsyncBio; + setBioSector(bio, (sector_t) -1); // Sector will be set later on. + setBioBlockDevice(bio, getKernelLayerBdev(layer)); +} + +/**********************************************************************/ +void resetBio(BIO *bio, KernelLayer *layer) +{ + initializeBio(bio, layer); + setBioSize(bio, VDO_BLOCK_SIZE); +} + +/**********************************************************************/ +int allocateBio(KernelLayer *layer, unsigned int bvecCount, BIO **bioPtr) +{ + BIO *bio = bio_alloc_bioset(GFP_NOIO, bvecCount, layer->bioset); + if (IS_ERR(bio)) { + logError("bio allocation failure %ld", PTR_ERR(bio)); + return PTR_ERR(bio); + } + + initializeBio(bio, layer); + + *bioPtr = bio; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int createBio(KernelLayer *layer, char *data, BIO **bioPtr) +{ + BIO *bio = NULL; + if (data == NULL) { + int result = allocateBio(layer, 0, &bio); + if (result != VDO_SUCCESS) { + return result; + } + + *bioPtr = bio; + return VDO_SUCCESS; + } + + unsigned int len = VDO_BLOCK_SIZE; + unsigned long kaddr = (unsigned long) data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + const int bvecCount = end - start; + + int result = allocateBio(layer, bvecCount, &bio); + if (result != VDO_SUCCESS) { + return result; + } + + int offset = offset_in_page(kaddr); + for (unsigned int i = 0; (i < bvecCount) && (len > 0); i++) { + unsigned int bytes = PAGE_SIZE - offset; + if (bytes > len) { + bytes = len; + } + + struct page *page + = is_vmalloc_addr(data) ? 
vmalloc_to_page(data) : virt_to_page(data); + int bytesAdded = bio_add_page(bio, page, bytes, offset); + if (bytesAdded != bytes) { + freeBio(bio, layer); + return logErrorWithStringError(VDO_BIO_CREATION_FAILED, + "Could only add %i bytes to bio", + bytesAdded); + + } + + data += bytes; + len -= bytes; + offset = 0; + } + + *bioPtr = bio; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void prepareFlushBIO(BIO *bio, + void *context, + struct block_device *device, + bio_end_io_t *endIOCallback) +{ + clearBioOperationAndFlags(bio); + /* + * One would think we could use REQ_OP_FLUSH on new kernels, but some + * layers of the stack don't recognize that as a flush. So do it + * like blkdev_issue_flush() and make it a write+flush. + */ + setBioOperationWrite(bio); + setBioOperationFlagPreflush(bio); + bio->bi_end_io = endIOCallback; + bio->bi_private = context; + bio->bi_vcnt = 0; + setBioBlockDevice(bio, device); + setBioSize(bio, 0); + setBioSector(bio, 0); +} diff --git a/vdo/kernel/bio.h b/vdo/kernel/bio.h new file mode 100644 index 0000000..1ba8234 --- /dev/null +++ b/vdo/kernel/bio.h @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bio.h#6 $ + */ + +#ifndef BIO_H +#define BIO_H + +#include +#include +#include + +#include "kernelTypes.h" + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) +#define USE_BI_ITER 1 +#endif + +/** + * Copy the bio data to a char array. + * + * @param bio The bio to copy the data from + * @param dataPtr The local array to copy the data to + **/ +void bioCopyDataIn(BIO *bio, char *dataPtr); + +/** + * Copy a char array to the bio data. + * + * @param bio The bio to copy the data to + * @param dataPtr The local array to copy the data from + **/ +void bioCopyDataOut(BIO *bio, char *dataPtr); + +/** + * Set the bi_rw or equivalent field of a bio to a particular data + * operation. Intended to be called only by setBioOperationRead() etc. 
+ * + * @param bio The bio to modify + * @param operation The operation to set it to + **/ +void setBioOperation(BIO *bio, unsigned int operation); + +/**********************************************************************/ +static inline void setBioOperationRead(BIO *bio) +{ + setBioOperation(bio, READ); +} + +/**********************************************************************/ +static inline void setBioOperationWrite(BIO *bio) +{ + setBioOperation(bio, WRITE); +} + +/**********************************************************************/ +static inline void clearBioOperationAndFlags(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio->bi_opf = 0; +#else + bio->bi_rw = 0; +#endif +} + +/**********************************************************************/ +static inline void copyBioOperationAndFlags(BIO *to, BIO *from) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + to->bi_opf = from->bi_opf; +#else + to->bi_rw = from->bi_rw; +#endif +} + +/**********************************************************************/ +static inline void setBioOperationFlag(BIO *bio, unsigned int flag) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio->bi_opf |= flag; +#else + bio->bi_rw |= flag; +#endif +} + +/**********************************************************************/ +static inline void clearBioOperationFlag(BIO *bio, unsigned int flag) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio->bi_opf &= ~flag; +#else + bio->bi_rw &= ~flag; +#endif +} + +/**********************************************************************/ +static inline void setBioOperationFlagPreflush(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + setBioOperationFlag(bio, REQ_PREFLUSH); +#else + // Preflushes and empty flushes are not currently distinguished. 
+ setBioOperation(bio, WRITE_FLUSH); +#endif +} + +/**********************************************************************/ +static inline void setBioOperationFlagSync(BIO *bio) +{ + setBioOperationFlag(bio, REQ_SYNC); +} + +/**********************************************************************/ +static inline void clearBioOperationFlagSync(BIO *bio) +{ + clearBioOperationFlag(bio, REQ_SYNC); +} + +/**********************************************************************/ +static inline void setBioOperationFlagFua(BIO *bio) +{ + setBioOperationFlag(bio, REQ_FUA); +} + +/**********************************************************************/ +static inline void clearBioOperationFlagFua(BIO *bio) +{ + clearBioOperationFlag(bio, REQ_FUA); +} + +/**********************************************************************/ +static inline bool isDiscardBio(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + return (bio != NULL) && (bio_op(bio) == REQ_OP_DISCARD); +#else + return (bio != NULL) && ((bio->bi_rw & REQ_DISCARD) != 0); +#endif +} + +/**********************************************************************/ +static inline bool isFlushBio(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + return (bio_op(bio) == REQ_OP_FLUSH) || ((bio->bi_opf & REQ_PREFLUSH) != 0); +#else + return (bio->bi_rw & REQ_FLUSH) != 0; +#endif +} + +/**********************************************************************/ +static inline bool isFUABio(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + return (bio->bi_opf & REQ_FUA) != 0; +#else + return (bio->bi_rw & REQ_FUA) != 0; +#endif +} + +/**********************************************************************/ +static inline bool isReadBio(BIO *bio) +{ + return bio_data_dir(bio) == READ; +} + +/**********************************************************************/ +static inline bool isWriteBio(BIO *bio) +{ + return bio_data_dir(bio) == WRITE; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) +/** + * Get the error from the bio. + * + * @param bio The bio + * + * @return the bio's error if any + **/ +static inline int getBioResult(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,13,0) + return blk_status_to_errno(bio->bi_status); +#else + return bio->bi_error; +#endif +} +#endif // newer than 4.4 + +/** + * Set the block device for a bio. + * + * @param bio The bio to modify + * @param device The new block device for the bio + **/ +static inline void setBioBlockDevice(BIO *bio, struct block_device *device) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,14,0) + bio_set_dev(bio, device); +#else + bio->bi_bdev = device; +#endif +} + +/** + * Get a bio's size. + * + * @param bio The bio + * + * @return the bio's size + **/ +static inline unsigned int getBioSize(BIO *bio) +{ +#ifdef USE_BI_ITER + return bio->bi_iter.bi_size; +#else + return bio->bi_size; +#endif +} + +/** + * Set the bio's sector. + * + * @param bio The bio + * @param sector The sector + **/ +static inline void setBioSector(BIO *bio, sector_t sector) +{ +#ifdef USE_BI_ITER + bio->bi_iter.bi_sector = sector; +#else + bio->bi_sector = sector; +#endif +} + +/** + * Get the bio's sector. + * + * @param bio The bio + * + * @return the sector + **/ +static inline sector_t getBioSector(BIO *bio) +{ +#ifdef USE_BI_ITER + return bio->bi_iter.bi_sector; +#else + return bio->bi_sector; +#endif +} + +/** + * Tell the kernel we've completed processing of this bio. 
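+ *
+ * For example (a sketch only, assuming a 4.4-or-later kernel; the handler
+ * name is hypothetical, and the original bio is assumed to have been stashed
+ * in bi_private by whoever built the lower-level bio), an end-io handler
+ * might propagate a lower-level bio's result to the request it served:
+ *
+ *   static void exampleEndIO(BIO *bio)
+ *   {
+ *     BIO *original = bio->bi_private;
+ *     completeBio(original, getBioResult(bio));
+ *   }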
+ * + * @param bio The bio to complete + * @param error A system error code, or 0 for success + **/ +static inline void completeBio(BIO *bio, int error) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,13,0) + bio->bi_status = errno_to_blk_status(error); + bio_endio(bio); +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) + bio->bi_error = error; + bio_endio(bio); +#else + bio_endio(bio, error); +#endif +} + +/** + * Frees up a bio structure + * + * @param bio The bio to free + * @param layer The layer the bio was created in + **/ +void freeBio(BIO *bio, KernelLayer *layer); + +/** + * Count the statistics for the bios. This is used for calls into VDO and + * for calls out of VDO. + * + * @param bioStats Statistics structure to update + * @param bio The bio + **/ +void countBios(AtomicBioStats *bioStats, BIO *bio); + +/** + * Reset a bio so it can be used again. + * + * @param bio The bio to reset + * @param layer The physical layer + **/ +void resetBio(BIO *bio, KernelLayer *layer); + +/** + * Check to see whether a bio's data are all zeroes. + * + * @param bio The bio + * + * @return true if the bio's data are all zeroes + **/ +bool bioIsZeroData(BIO *bio); + +/** + * Set a bio's data to all zeroes. + * + * @param [in] bio The bio + **/ +void bioZeroData(BIO *bio); + +/** + * Create a new bio structure for kernel buffer storage. + * + * @param [in] layer The physical layer + * @param [in] data The buffer (can be NULL) + * @param [out] bioPtr A pointer to hold new bio + * + * @return VDO_SUCCESS or an error + **/ +int createBio(KernelLayer *layer, char *data, BIO **bioPtr); + +/** + * Prepare a BIO to issue a flush to the device below. + * + * @param bio The flush BIO + * @param context The context for the callback + * @param device The device to flush + * @param endIOCallback The function to call when the flush is complete + **/ +void prepareFlushBIO(BIO *bio, + void *context, + struct block_device *device, + bio_end_io_t *endIOCallback); + +/** + * Perform IO with a bio, waiting for completion and returning its result. + * The bio must already have its sector, block device, and operation set. + * + * @param bio The bio to do IO with + * + * @return The bio result + **/ +static inline int submitBioAndWait(BIO *bio) +{ + submit_bio_wait(bio); + return getBioResult(bio); +} + +#endif /* BIO_H */ diff --git a/vdo/kernel/bioIterator.h b/vdo/kernel/bioIterator.h new file mode 100644 index 0000000..7445261 --- /dev/null +++ b/vdo/kernel/bioIterator.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bioIterator.h#1 $ + */ + +#ifndef BIO_ITERATOR_H +#define BIO_ITERATOR_H + +#include + +#include "bio.h" +#include "kernelTypes.h" + +typedef struct { + BIO *bio; +#ifdef USE_BI_ITER + struct bvec_iter iter; + // Needed so we can store the return value of bio_iter_iovec. + struct bio_vec temp; +#else + int index; +#endif +} BioIterator; + +/** + * Create an iterator over a bio's data. + * + * @param bio The bio to iterate over + * + * @return An iterator over a bio + **/ +static BioIterator createBioIterator(BIO *bio) +{ + BioIterator iterator = { + .bio = bio, +#ifdef USE_BI_ITER + .iter = bio->bi_iter, +#else + .index = bio->bi_idx, +#endif + }; + return iterator; +} + +/** + * Get the next biovec from the iterator, or NULL if there are no more. + * + * @param iterator The iterator from which to get data + * + * @return The next biovec from the iterator, or NULL. + **/ +static struct bio_vec *getNextBiovec(BioIterator *iterator) +{ + BIO *bio = iterator->bio; +#ifdef USE_BI_ITER + if (iterator->iter.bi_size == 0) { + return NULL; + } + + iterator->temp = bio_iter_iovec(bio, iterator->iter); + return &iterator->temp; +#else + if (iterator->index >= bio->bi_vcnt) { + return NULL; + } + return bio_iovec_idx(bio, iterator->index); +#endif +} + +/** + * Advance the iterator to the next biovec in the bio. + * + * @param [in,out] iterator The iterator to advance + **/ +static void advanceBioIterator(BioIterator *iterator) +{ +#ifdef USE_BI_ITER + bio_advance_iter(iterator->bio, &iterator->iter, iterator->temp.bv_len); +#else + iterator->index++; +#endif +} + +#endif /* BIO_ITERATOR_H */ diff --git a/vdo/kernel/bufferPool.c b/vdo/kernel/bufferPool.c new file mode 100644 index 0000000..9c950ca --- /dev/null +++ b/vdo/kernel/bufferPool.c @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bufferPool.c#1 $ + */ + +#include "bufferPool.h" + +#include +#include + +#include "logger.h" +#include "memoryAlloc.h" + +#include "statusCodes.h" + +/* + * For list nodes on the free-object list, the data field describes + * the object available for reuse. + * + * For nodes on the "spare" list, the data field is meaningless; + * they're just nodes available for use when we need to add an object + * pointer to the freeObjectList. + * + * These are both "free lists", in a sense; don't get confused! 
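+ *
+ * A minimal caller-side sketch of the round trip (the pool variable is
+ * assumed to have been created earlier with makeBufferPool):
+ *
+ *   void *buffer;
+ *   if (allocBufferFromPool(pool, &buffer) == VDO_SUCCESS) {
+ *     // The node that carried this pointer out is now parked on
+ *     // spareListNodes; use the buffer, then hand it back. Some spare node
+ *     // (not necessarily the same one) will carry it onto freeObjectList.
+ *     freeBufferToPool(pool, buffer);
+ *   }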
+ */ +typedef struct { + struct list_head list; // links in current list + void *data; // element data, if on free list +} BufferElement; + +struct bufferPool { + const char *name; // Pool name + void *data; // Associated pool data + spinlock_t lock; // Locks this object + unsigned int size; // Total number of buffers + struct list_head freeObjectList; // List of free buffers + struct list_head spareListNodes; // Unused list nodes + unsigned int numBusy; // Number of buffers in use + unsigned int maxBusy; // Maximum value of the above + BufferAllocateFunction *alloc; // Allocate function for buffer data + BufferFreeFunction *free; // Free function for buffer data + BufferDumpFunction *dump; // Dump function for buffer data + BufferElement *bhead; // Array of BufferElement structures + void **objects; +}; + +/*************************************************************************/ +int makeBufferPool(const char *poolName, + unsigned int size, + BufferAllocateFunction *allocateFunction, + BufferFreeFunction *freeFunction, + BufferDumpFunction *dumpFunction, + void *poolData, + BufferPool **poolPtr) +{ + BufferPool *pool; + + int result = ALLOCATE(1, BufferPool, "buffer pool", &pool); + if (result != VDO_SUCCESS) { + logError("buffer pool allocation failure %d", result); + return result; + } + + result = ALLOCATE(size, BufferElement, "buffer pool elements", &pool->bhead); + if (result != VDO_SUCCESS) { + logError("buffer element array allocation failure %d", result); + freeBufferPool(&pool); + return result; + } + + result = ALLOCATE(size, void *, "object pointers", &pool->objects); + if (result != VDO_SUCCESS) { + logError("buffer object array allocation failure %d", result); + freeBufferPool(&pool); + return result; + } + + pool->name = poolName; + pool->alloc = allocateFunction; + pool->free = freeFunction; + pool->dump = dumpFunction; + pool->data = poolData; + pool->size = size; + spin_lock_init(&pool->lock); + INIT_LIST_HEAD(&pool->freeObjectList); + INIT_LIST_HEAD(&pool->spareListNodes); + BufferElement *bh = pool->bhead; + for (int i = 0; i < pool->size; i++) { + result = pool->alloc(pool->data, &bh->data); + if (result != VDO_SUCCESS) { + logError("verify buffer data allocation failure %d", result); + freeBufferPool(&pool); + return result; + } + pool->objects[i] = bh->data; + list_add(&bh->list, &pool->freeObjectList); + bh++; + } + pool->numBusy = pool->maxBusy = 0; + + *poolPtr = pool; + return VDO_SUCCESS; +} + +/*************************************************************************/ +void freeBufferPool(BufferPool **poolPtr) +{ + BufferPool *pool = *poolPtr; + if (pool == NULL) { + return; + } + + ASSERT_LOG_ONLY((pool->numBusy == 0), "freeing busy buffer pool, numBusy=%d", + pool->numBusy); + if (pool->objects != NULL) { + for (int i = 0; i < pool->size; i++) { + if (pool->objects[i] != NULL) { + pool->free(pool->data, pool->objects[i]); + } + } + FREE(pool->objects); + } + FREE(pool->bhead); + FREE(pool); + *poolPtr = NULL; +} + +/*************************************************************************/ +static bool inFreeList(BufferPool *pool, void *data) +{ + struct list_head *node; + list_for_each(node, &pool->freeObjectList) { + if (container_of(node, BufferElement, list)->data == data) { + return true; + } + } + return false; +} + +/*************************************************************************/ +void dumpBufferPool(BufferPool *pool, bool dumpElements) +{ + // In order that syslog can empty its buffer, sleep after 35 elements for + // 4ms (till the second 
clock tick). These numbers chosen in October + // 2012 running on an lfarm. + enum { ELEMENTS_PER_BATCH = 35 }; + enum { SLEEP_FOR_SYSLOG = 4 }; + + if (pool == NULL) { + return; + } + spin_lock(&pool->lock); + logInfo("%s: %u of %u busy (max %u)", pool->name, pool->numBusy, pool->size, + pool->maxBusy); + if (dumpElements && (pool->dump != NULL)) { + int dumped = 0; + for (int i = 0; i < pool->size; i++) { + if (!inFreeList(pool, pool->objects[i])) { + pool->dump(pool->data, pool->objects[i]); + if (++dumped >= ELEMENTS_PER_BATCH) { + spin_unlock(&pool->lock); + dumped = 0; + msleep(SLEEP_FOR_SYSLOG); + spin_lock(&pool->lock); + } + } + } + } + spin_unlock(&pool->lock); +} + +/*************************************************************************/ +int allocBufferFromPool(BufferPool *pool, void **dataPtr) +{ + if (pool == NULL) { + return UDS_INVALID_ARGUMENT; + } + + spin_lock(&pool->lock); + if (unlikely(list_empty(&pool->freeObjectList))) { + spin_unlock(&pool->lock); + logDebug("no free buffers"); + return -ENOMEM; + } + + BufferElement *bh = list_first_entry(&pool->freeObjectList, BufferElement, + list); + list_move(&bh->list, &pool->spareListNodes); + pool->numBusy++; + if (pool->numBusy > pool->maxBusy) { + pool->maxBusy = pool->numBusy; + } + *dataPtr = bh->data; + spin_unlock(&pool->lock); + return VDO_SUCCESS; + +} + +/*************************************************************************/ +static bool freeBufferToPoolInternal(BufferPool *pool, void *data) +{ + if (unlikely(list_empty(&pool->spareListNodes))) { + return false; + } + BufferElement *bh = list_first_entry(&pool->spareListNodes, BufferElement, + list); + list_move(&bh->list, &pool->freeObjectList); + bh->data = data; + pool->numBusy--; + return true; +} + +/*************************************************************************/ +void freeBufferToPool(BufferPool *pool, void *data) +{ + spin_lock(&pool->lock); + bool success = freeBufferToPoolInternal(pool, data); + spin_unlock(&pool->lock); + if (!success) { + logDebug("trying to add to free list when already full"); + } +} + +/*************************************************************************/ +void freeBuffersToPool(BufferPool *pool, void **data, int count) +{ + spin_lock(&pool->lock); + bool success = true; + for (int i = 0; (i < count) && success; i++) { + success = freeBufferToPoolInternal(pool, data[i]); + } + spin_unlock(&pool->lock); + if (!success) { + logDebug("trying to add to free list when already full"); + } +} diff --git a/vdo/kernel/bufferPool.h b/vdo/kernel/bufferPool.h new file mode 100644 index 0000000..9c505c9 --- /dev/null +++ b/vdo/kernel/bufferPool.h @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bufferPool.h#1 $ + */ +#ifndef BUFFERPOOL_H +#define BUFFERPOOL_H + +/* + * We need bug.h because in 3.10, kernel.h (indirectly) defines + * ARRAY_SIZE as a macro which (indirectly and conditionally) uses + * BUILD_BUG_ON_ZERO, which is defined in bug.h, which is *not* + * included. In earlier versions like 3.2 it Just Worked. + */ +#include +#include +#include + +typedef struct bufferPool BufferPool; + +typedef int BufferAllocateFunction(void *poolData, void **dataPtr); +typedef void BufferFreeFunction(void *poolData, void *data); +typedef void BufferDumpFunction(void *poolData, void *data); + +/** + * Creates a generic pool of buffer data. The elements in the pool are + * allocated up front and placed on a free list, which manages the + * reuse of the individual buffers in the pool. + * + * @param [in] poolName Name of the pool + * @param [in] size The number of elements to create for this pool + * @param [in] allocateFunction The function to call to create the actual data + * for each element + * @param [in] freeFunction The function to call to free the actual data + * for each element + * @param [in] dumpFunction The function to call to dump the actual data + * for each element into the log + * @param [in] poolData A pointer to the pool's associated data + * @param [out] poolPtr A pointer to hold the pool that was created + * + * @return a success or error code + */ +int makeBufferPool(const char *poolName, + unsigned int size, + BufferAllocateFunction *allocateFunction, + BufferFreeFunction *freeFunction, + BufferDumpFunction *dumpFunction, + void *poolData, + BufferPool **poolPtr) + __attribute__((warn_unused_result)); + +/** + * Free a buffer pool and null out the reference to it. This will free + * all the elements of the pool as well. + * + * @param [in] poolPtr The reference to the pool to free + **/ +void freeBufferPool(BufferPool **poolPtr); + +/** + * Dump a buffer pool to the log. + * + * @param [in] pool The buffer pool to allocate from + * @param [in] dumpElements True for complete output, or false for a + * one-line summary + **/ +void dumpBufferPool(BufferPool *pool, bool dumpElements); + +/** + * Acquires a free buffer from the free list of the pool and + * returns it's associated data. + * + * @param [in] pool The buffer pool to allocate from + * @param [out] dataPtr A pointer to hold the buffer data + * + * @return a success or error code + */ +int allocBufferFromPool(BufferPool *pool, void **dataPtr) + __attribute__((warn_unused_result)); + +/** + * Returns a buffer to the free list of a pool + * + * @param [in] pool The buffer pool to return the buffer to + * @param [in] data The buffer data to return + */ +void freeBufferToPool(BufferPool *pool, void *data); + +/** + * Returns a set of buffers to the free list of a pool + * + * @param [in] pool The buffer pool to return the buffer to + * @param [in] data The buffer data to return + * @param [in] count Number of entries in the data array + */ +void freeBuffersToPool(BufferPool *pool, void **data, int count); + +/** + * Control structure for freeing (releasing back to the pool) pointers + * in batches. + * + * Since the objects stored in a buffer pool are completely opaque, + * some external data structure is needed to manage a collection of + * them. This is a simple helper for doing that, since we're freeing + * batches of objects in a couple different places. 
Within the pool + * itself there's a pair of linked lists, but getting at them requires + * the locking that we're trying to minimize. + * + * We collect pointers until the array is full or until there are no + * more available, and we call freeBuffersToPool to release a batch + * all at once. + **/ +typedef struct freeBufferPointers { + BufferPool *pool; + int index; + void *pointers[30]; // size is arbitrary +} FreeBufferPointers; + +/** + * Initialize the control structure for batching buffer pointers to be + * released to their pool. + * + * @param [out] fbp The (caller-allocated) control structure + * @param [in] pool The buffer pool to return objects to. + **/ +static inline void initFreeBufferPointers(FreeBufferPointers *fbp, + BufferPool *pool) +{ + fbp->index = 0; + fbp->pool = pool; +} + +/** + * Release any buffers left in the collection. + * + * @param [in] fbp The control structure + **/ +static inline void freeBufferPointers(FreeBufferPointers *fbp) +{ + freeBuffersToPool(fbp->pool, fbp->pointers, fbp->index); + fbp->index = 0; +} + +/** + * Add another buffer pointer to the collection, and if we're full, + * release the whole batch to the pool. + * + * @param [in] fbp The control structure + * @param [in] pointer The buffer pointer to release + **/ +static inline void addFreeBufferPointer(FreeBufferPointers *fbp, + void *pointer) +{ + fbp->pointers[fbp->index] = pointer; + fbp->index++; + if (fbp->index == ARRAY_SIZE(fbp->pointers)) { + freeBufferPointers(fbp); + } +} + +#endif /* BUFFERPOOL_H */ diff --git a/vdo/kernel/dataKVIO.c b/vdo/kernel/dataKVIO.c new file mode 100644 index 0000000..ba9c8e8 --- /dev/null +++ b/vdo/kernel/dataKVIO.c @@ -0,0 +1,1192 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dataKVIO.c#18 $ + */ + +#include "dataKVIO.h" + + +#include "logger.h" +#include "memoryAlloc.h" +#include "murmur/MurmurHash3.h" + +#include "dataVIO.h" +#include "compressedBlock.h" +#include "hashLock.h" +#include "lz4.h" + +#include "bio.h" +#include "dedupeIndex.h" +#include "kvdoFlush.h" +#include "kvio.h" +#include "ioSubmitter.h" +#include "vdoCommon.h" +#include "verify.h" + +static void dumpPooledDataKVIO(void *poolData, void *data); + +enum { + WRITE_PROTECT_FREE_POOL = 0, + WP_DATA_KVIO_SIZE = (sizeof(DataKVIO) + PAGE_SIZE - 1 + - ((sizeof(DataKVIO) + PAGE_SIZE - 1) + % PAGE_SIZE)) +}; + +/** + * Alter the write-access permission to a page of memory, so that + * objects in the free pool may no longer be modified. + * + * To do: Deny read access as well. 
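+ *
+ * For instance (numbers purely illustrative): if sizeof(DataKVIO) were 5000
+ * and PAGE_SIZE were 4096, WP_DATA_KVIO_SIZE above would work out to 8192,
+ * i.e. the structure size rounded up to a whole number of pages, which
+ * satisfies the multiple-of-PAGE_SIZE requirement on byteCount below.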
+ * + * @param address The starting address to protect, which must be on a + * page boundary + * @param byteCount The number of bytes to protect, which must be a multiple + * of the page size + * @param mode The write protection mode (true means read-only) + **/ +static __always_inline void +setWriteProtect(void *address, + size_t byteCount, + bool mode __attribute__((unused))) +{ + BUG_ON((((long) address) % PAGE_SIZE) != 0); + BUG_ON((byteCount % PAGE_SIZE) != 0); + BUG(); // only works in internal code, sorry +} + +/**********************************************************************/ +static void maybeLogDataKVIOTrace(DataKVIO *dataKVIO) +{ + if (dataKVIO->kvio.layer->traceLogging) { + logKvioTrace(&dataKVIO->kvio); + } +} + +/** + * First tracing hook for VIO completion. + * + * If the SystemTap script vdotrace.stp is in use, it does stage 1 of + * its processing here. We must not call addTraceRecord between the + * two tap functions. + * + * @param dataKVIO The VIO we're finishing up + **/ +static void kvioCompletionTap1(DataKVIO *dataKVIO) +{ + /* + * Ensure that dataKVIO doesn't get optimized out, even under inline + * expansion. Also, make sure the compiler has to emit debug info + * for baseTraceLocation, which some of our SystemTap scripts will + * use here. + * + * First, make it look as though all memory could be clobbered; then + * require that a value be read into a register. That'll force at + * least one instruction to exist (so SystemTap can hook in) where + * dataKVIO is live. We use a field that the caller would've + * accessed recently anyway, so it may be cached. + */ + barrier(); + __asm__ __volatile__("" + : + : "g" (dataKVIO), "g" (baseTraceLocation), + "r" (dataKVIO->kvio.layer)); +} + +/** + * Second tracing hook for VIO completion. + * + * The SystemTap script vdotrace.stp splits its VIO-completion work + * into two stages, to reduce lock contention for script variables. + * Hence, it needs two hooks in the code. + * + * @param dataKVIO The VIO we're finishing up + **/ +static void kvioCompletionTap2(DataKVIO *dataKVIO) +{ + // Hack to ensure variable doesn't get optimized out. 
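+  // The empty asm statement below names dataKVIO and the layer pointer
+  // as inputs without emitting any instructions; that is enough to keep
+  // the values observable at this point for a SystemTap probe.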
+ barrier(); + __asm__ __volatile__("" : : "g" (dataKVIO), "r" (dataKVIO->kvio.layer)); +} + +/**********************************************************************/ +static void kvdoAcknowledgeDataKVIO(DataKVIO *dataKVIO) +{ + KernelLayer *layer = dataKVIO->kvio.layer; + ExternalIORequest *externalIORequest = &dataKVIO->externalIORequest; + BIO *bio = externalIORequest->bio; + if (bio == NULL) { + return; + } + + externalIORequest->bio = NULL; + + int error + = mapToSystemError(dataVIOAsCompletion(&dataKVIO->dataVIO)->result); + bio->bi_end_io = externalIORequest->endIO; + bio->bi_private = externalIORequest->private; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio->bi_opf = externalIORequest->rw; +#else + bio->bi_rw = externalIORequest->rw; +#endif + + countBios(&layer->biosAcknowledged, bio); + if (dataKVIO->isPartial) { + countBios(&layer->biosAcknowledgedPartial, bio); + } + + + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + completeBio(bio, error); +} + +/**********************************************************************/ +static noinline void cleanDataKVIO(DataKVIO *dataKVIO, FreeBufferPointers *fbp) +{ + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + kvdoAcknowledgeDataKVIO(dataKVIO); + + KVIO *kvio = dataKVIOAsKVIO(dataKVIO); + kvio->bio = NULL; + + if (unlikely(kvio->vio->trace != NULL)) { + maybeLogDataKVIOTrace(dataKVIO); + kvioCompletionTap1(dataKVIO); + kvioCompletionTap2(dataKVIO); + freeTraceToPool(kvio->layer, kvio->vio->trace); + } + + addFreeBufferPointer(fbp, dataKVIO); +} + +/**********************************************************************/ +void returnDataKVIOBatchToPool(BatchProcessor *batch, void *closure) +{ + KernelLayer *layer = closure; + uint32_t count = 0; + ASSERT_LOG_ONLY(batch != NULL, "batch not null"); + ASSERT_LOG_ONLY(layer != NULL, "layer not null"); + + FreeBufferPointers fbp; + initFreeBufferPointers(&fbp, layer->dataKVIOPool); + + KvdoWorkItem *item; + while ((item = nextBatchItem(batch)) != NULL) { + cleanDataKVIO(workItemAsDataKVIO(item), &fbp); + condReschedBatchProcessor(batch); + count++; + } + + if (fbp.index > 0) { + freeBufferPointers(&fbp); + } + + completeManyRequests(layer, count); +} + +/**********************************************************************/ +static void kvdoAcknowledgeThenCompleteDataKVIO(KvdoWorkItem *item) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(item); + kvdoAcknowledgeDataKVIO(dataKVIO); + addToBatchProcessor(dataKVIO->kvio.layer->dataKVIOReleaser, item); +} + +/**********************************************************************/ +void kvdoCompleteDataKVIO(VDOCompletion *completion) +{ + DataKVIO *dataKVIO = dataVIOAsDataKVIO(asDataVIO(completion)); + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + if (useBioAckQueue(layer) && USE_BIO_ACK_QUEUE_FOR_READ + && (dataKVIO->externalIORequest.bio != NULL)) { + launchDataKVIOOnBIOAckQueue(dataKVIO, kvdoAcknowledgeThenCompleteDataKVIO, + NULL, BIO_ACK_Q_ACTION_ACK); + } else { + addToBatchProcessor(layer->dataKVIOReleaser, + workItemFromDataKVIO(dataKVIO)); + } +} + +/** + * Copy the uncompressed data from a compressed block read into the user + * bio which requested the read. 
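+ *
+ * Three cases are handled: a read-modify-write copies the block into
+ * the dataBlock buffer so it is ready for the write phase, a partial
+ * read defers the copy to its completion callback, and a full-block
+ * read copies the data straight into the user bio and acknowledges it.
+ *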
+ * + * @param workItem The DataKVIO which requested the read + **/ +static void copyReadBlockData(KvdoWorkItem *workItem) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(workItem); + + // For a read-modify-write, copy the data into the dataBlock buffer so it + // will be set up for the write phase. + if (isReadModifyWriteVIO(dataKVIO->kvio.vio)) { + bioCopyDataOut(getBIOFromDataKVIO(dataKVIO), dataKVIO->readBlock.data); + kvdoEnqueueDataVIOCallback(dataKVIO); + return; + } + + // For a partial read, the callback will copy the requested data from the + // read block. + if (dataKVIO->isPartial) { + kvdoEnqueueDataVIOCallback(dataKVIO); + return; + } + + // For a full block read, copy the data to the bio and acknowledge. + bioCopyDataOut(getBIOFromDataKVIO(dataKVIO), dataKVIO->readBlock.data); + kvdoAcknowledgeDataVIO(&dataKVIO->dataVIO); +} + +/** + * Finish reading data for a compressed block. + * + * @param dataKVIO The DataKVIO which requested the read + **/ +static void readDataKVIOReadBlockCallback(DataKVIO *dataKVIO) +{ + if (dataKVIO->readBlock.status != VDO_SUCCESS) { + setCompletionResult(dataVIOAsCompletion(&dataKVIO->dataVIO), + dataKVIO->readBlock.status); + kvdoEnqueueDataVIOCallback(dataKVIO); + return; + } + + launchDataKVIOOnCPUQueue(dataKVIO, copyReadBlockData, NULL, + CPU_Q_ACTION_COMPRESS_BLOCK); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) +/** + * Complete and reset a bio that was supplied by the user and then used for a + * read (so that we can complete it with the user's callback). + * + * @param bio The bio to complete + **/ +static void resetUserBio(BIO *bio) +#else +/** + * Complete and reset a bio that was supplied by the user and then used for a + * read (so that we can complete it with the user's callback). + * + * @param bio The bio to complete + * @param error Possible error from underlying block device + **/ +static void resetUserBio(BIO *bio, int error) +#endif +{ +#if ((LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0)) \ + && (LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0))) + // This is a user bio, and the device just called bio_endio() on it, so + // we need to re-increment bi_remaining so we too can call bio_endio(). + atomic_inc(&bio->bi_remaining); +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) + completeAsyncBio(bio); +#else + completeAsyncBio(bio, error); +#endif +} + +/** + * Uncompress the data that's just been read and then call back the requesting + * DataKVIO. + * + * @param workItem The DataKVIO requesting the data + **/ +static void uncompressReadBlock(KvdoWorkItem *workItem) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(workItem); + ReadBlock *readBlock = &dataKVIO->readBlock; + BlockSize blockSize = VDO_BLOCK_SIZE; + + // The DataKVIO's scratch block will be used to contain the + // uncompressed data. 
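+  // A compressed physical block packs multiple fragments; the mapping
+  // state encodes which fragment belongs to this logical block, and
+  // getCompressedBlockFragment turns that into an (offset, size) pair
+  // within the block. Decompression must then yield exactly one full
+  // block of data, or the fragment is rejected as invalid.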
+ uint16_t fragmentOffset, fragmentSize; + char *compressedData = readBlock->data; + int result = getCompressedBlockFragment(readBlock->mappingState, + compressedData, blockSize, + &fragmentOffset, + &fragmentSize); + if (result != VDO_SUCCESS) { + logDebug("%s: frag err %d", __func__, result); + readBlock->status = result; + readBlock->callback(dataKVIO); + return; + } + + char *fragment = compressedData + fragmentOffset; + int size = LZ4_uncompress_unknownOutputSize(fragment, dataKVIO->scratchBlock, + fragmentSize, blockSize); + if (size == blockSize) { + readBlock->data = dataKVIO->scratchBlock; + } else { + logDebug("%s: lz4 error", __func__); + readBlock->status = VDO_INVALID_FRAGMENT; + } + + readBlock->callback(dataKVIO); +} + +/** + * Now that we have gotten the data from storage, uncompress the data if + * necessary and then call back the requesting DataKVIO. + * + * @param dataKVIO The DataKVIO requesting the data + * @param result The result of the read operation + **/ +static void completeRead(DataKVIO *dataKVIO, int result) +{ + ReadBlock *readBlock = &dataKVIO->readBlock; + readBlock->status = result; + + if ((result == VDO_SUCCESS) && isCompressed(readBlock->mappingState)) { + launchDataKVIOOnCPUQueue(dataKVIO, uncompressReadBlock, NULL, + CPU_Q_ACTION_COMPRESS_BLOCK); + return; + } + + readBlock->callback(dataKVIO); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) +/** + * Callback for a bio doing a read. + * + * @param bio The bio + */ +static void readBioCallback(BIO *bio) +#else +/** + * Callback for a bio doing a read. + * + * @param bio The bio + * @param result The result of the read operation + */ +static void readBioCallback(BIO *bio, int result) +#endif +{ + KVIO *kvio = (KVIO *) bio->bi_private; + DataKVIO *dataKVIO = kvioAsDataKVIO(kvio); + dataKVIO->readBlock.data = dataKVIO->readBlock.buffer; + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + countCompletedBios(bio); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) + completeRead(dataKVIO, getBioResult(bio)); +#else + completeRead(dataKVIO, result); +#endif +} + +/**********************************************************************/ +void kvdoReadBlock(DataVIO *dataVIO, + PhysicalBlockNumber location, + BlockMappingState mappingState, + BioQAction action, + DataKVIOCallback callback) +{ + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + ReadBlock *readBlock = &dataKVIO->readBlock; + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + + readBlock->callback = callback; + readBlock->status = VDO_SUCCESS; + readBlock->mappingState = mappingState; + + BUG_ON(getBIOFromDataKVIO(dataKVIO)->bi_private != &dataKVIO->kvio); + // Read the data directly from the device using the read bio. 
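+  // The read goes through the DataKVIO's dedicated readBlock bio and
+  // buffer, so the caller's original bio is left untouched until the
+  // data has been (possibly) uncompressed and copied out.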
+ BIO *bio = readBlock->bio; + resetBio(bio, layer); + setBioSector(bio, blockToSector(layer, location)); + setBioOperationRead(bio); + bio->bi_end_io = readBioCallback; + submitBio(bio, action); +} + +/**********************************************************************/ +void kvdoReadDataVIO(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(!isWriteVIO(dataVIOAsVIO(dataVIO)), + "operation set correctly for data read"); + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION("$F;io=readData")); + + if (isCompressed(dataVIO->mapped.state)) { + kvdoReadBlock(dataVIO, dataVIO->mapped.pbn, dataVIO->mapped.state, + BIO_Q_ACTION_COMPRESSED_DATA, readDataKVIOReadBlockCallback); + return; + } + + KVIO *kvio = dataVIOAsKVIO(dataVIO); + BIO *bio = kvio->bio; + bio->bi_end_io = resetUserBio; + setBioSector(bio, blockToSector(kvio->layer, dataVIO->mapped.pbn)); + submitBio(bio, BIO_Q_ACTION_DATA); +} + +/**********************************************************************/ +static void kvdoAcknowledgeDataKVIOThenContinue(KvdoWorkItem *item) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(item); + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + kvdoAcknowledgeDataKVIO(dataKVIO); + // Even if we're not using bio-ack threads, we may be in the wrong + // base-code thread. + kvdoEnqueueDataVIOCallback(dataKVIO); +} + +/**********************************************************************/ +void kvdoAcknowledgeDataVIO(DataVIO *dataVIO) +{ + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + + // If the remaining discard work is not completely processed by this VIO, + // don't acknowledge it yet. + if (isDiscardBio(dataKVIO->externalIORequest.bio) + && (dataKVIO->remainingDiscard + > (VDO_BLOCK_SIZE - dataKVIO->offset))) { + invokeCallback(dataVIOAsCompletion(dataVIO)); + return; + } + + // We've finished with the KVIO; acknowledge completion of the bio to the + // kernel. 
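+  // The acknowledgement either hops to the dedicated bio-ack work queue,
+  // if one is configured, or runs immediately on the current thread.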
+ if (useBioAckQueue(layer)) { + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + launchDataKVIOOnBIOAckQueue(dataKVIO, kvdoAcknowledgeDataKVIOThenContinue, + NULL, BIO_ACK_Q_ACTION_ACK); + } else { + kvdoAcknowledgeDataKVIOThenContinue(workItemFromDataKVIO(dataKVIO)); + } +} + +/**********************************************************************/ +void kvdoWriteDataVIO(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(isWriteVIO(dataVIOAsVIO(dataVIO)), + "kvdoWriteDataVIO() called on write DataVIO"); + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION("$F;io=writeData;j=normal")); + + KVIO *kvio = dataVIOAsKVIO(dataVIO); + BIO *bio = kvio->bio; + setBioOperationWrite(bio); + setBioSector(bio, blockToSector(kvio->layer, dataVIO->newMapped.pbn)); + submitBio(bio, BIO_Q_ACTION_DATA); +} + +/**********************************************************************/ +void kvdoModifyWriteDataVIO(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + BIO *bio = dataKVIO->externalIORequest.bio; + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + resetBio(dataKVIO->dataBlockBio, layer); + + if (!isDiscardBio(bio)) { + bioCopyDataIn(bio, dataKVIO->dataBlock + dataKVIO->offset); + } else { + memset(dataKVIO->dataBlock + dataKVIO->offset, '\0', + min(dataKVIO->remainingDiscard, + (DiscardSize) (VDO_BLOCK_SIZE - dataKVIO->offset))); + } + + dataVIO->isZeroBlock = bioIsZeroData(dataKVIO->dataBlockBio); + dataKVIO->dataBlockBio->bi_private = &dataKVIO->kvio; + copyBioOperationAndFlags(dataKVIO->dataBlockBio, bio); + // Make the bio a write, not (potentially) a discard. + setBioOperationWrite(dataKVIO->dataBlockBio); +} + +/**********************************************************************/ +void kvdoZeroDataVIO(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION("zeroDataVIO;io=readData")); + bioZeroData(dataVIOAsKVIO(dataVIO)->bio); +} + +/**********************************************************************/ +void kvdoCopyDataVIO(DataVIO *source, DataVIO *destination) +{ + dataVIOAddTraceRecord(destination, THIS_LOCATION(NULL)); + bioCopyDataOut(dataVIOAsKVIO(destination)->bio, + dataVIOAsDataKVIO(source)->dataBlock); +} + +/**********************************************************************/ +static void kvdoCompressWork(KvdoWorkItem *item) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(item); + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + + char *context = getWorkQueuePrivateData(); + if (unlikely(context == NULL)) { + uint32_t index = atomicAdd32(&layer->compressionContextIndex, 1) - 1; + BUG_ON(index >= layer->deviceConfig->threadCounts.cpuThreads); + context = layer->compressionContext[index]; + setWorkQueuePrivateData(context); + } + + int size = LZ4_compress_ctx_limitedOutput(context, dataKVIO->dataBlock, + dataKVIO->scratchBlock, + VDO_BLOCK_SIZE, + VDO_BLOCK_SIZE); + DataVIO *dataVIO = &dataKVIO->dataVIO; + if (size > 0) { + // The scratch block will be used to contain the compressed data. + dataVIO->compression.data = dataKVIO->scratchBlock; + dataVIO->compression.size = size; + } else { + // Use block size plus one as an indicator for uncompressible data. 
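+    // (A real compressed size can never exceed VDO_BLOCK_SIZE, so this
+    // sentinel cannot be mistaken for genuine compressor output.)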
+ dataVIO->compression.size = VDO_BLOCK_SIZE + 1; + } + + kvdoEnqueueDataVIOCallback(dataKVIO); +} + +/**********************************************************************/ +void kvdoCompressDataVIO(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, + THIS_LOCATION("compressDataVIO;" + "io=compress;cb=compress")); + + /* + * If the orignal bio was a discard, but we got this far because the discard + * was a partial one (r/m/w), and it is part of a larger discard, we cannot + * compress this VIO. We need to make sure the VIO completes ASAP. + */ + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + if (isDiscardBio(dataKVIO->externalIORequest.bio) + && (dataKVIO->remainingDiscard > 0)) { + dataVIO->compression.size = VDO_BLOCK_SIZE + 1; + kvdoEnqueueDataVIOCallback(dataKVIO); + return; + } + + launchDataKVIOOnCPUQueue(dataKVIO, kvdoCompressWork, NULL, + CPU_Q_ACTION_COMPRESS_BLOCK); +} + +/** + * Construct a DataKVIO. + * + * @param [in] layer The physical layer + * @param [in] bio The bio to associate with this DataKVIO + * @param [out] dataKVIOPtr A pointer to hold the new DataKVIO + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int makeDataKVIO(KernelLayer *layer, BIO *bio, DataKVIO **dataKVIOPtr) +{ + DataKVIO *dataKVIO; + int result = allocBufferFromPool(layer->dataKVIOPool, (void **) &dataKVIO); + if (result != VDO_SUCCESS) { + return logErrorWithStringError(result, "data kvio allocation failure"); + } + + if (WRITE_PROTECT_FREE_POOL) { + setWriteProtect(dataKVIO, WP_DATA_KVIO_SIZE, false); + } + + KVIO *kvio = &dataKVIO->kvio; + kvio->vio = dataVIOAsVIO(&dataKVIO->dataVIO); + memset(&kvio->enqueueable, 0, sizeof(KvdoEnqueueable)); + memset(&dataKVIO->dedupeContext.pendingList, 0, sizeof(struct list_head)); + memset(&dataKVIO->dataVIO, 0, sizeof(DataVIO)); + kvio->bioToSubmit = NULL; + bio_list_init(&kvio->biosMerged); + + // The dataBlock is only needed for writes and some partial reads. + if (isWriteBio(bio) || (getBioSize(bio) < VDO_BLOCK_SIZE)) { + resetBio(dataKVIO->dataBlockBio, layer); + } + + initializeKVIO(kvio, layer, VIO_TYPE_DATA, VIO_PRIORITY_DATA, NULL, bio); + *dataKVIOPtr = dataKVIO; + return VDO_SUCCESS; +} + +/** + * Creates a new DataVIO structure. A DataVIO represents a single logical + * block of data. It is what most VDO operations work with. This function also + * creates a wrapping DataKVIO structure that is used when we want to + * physically read or write the data associated with the DataVIO. + * + * @param [in] layer The physical layer + * @param [in] bio The BIO from the request the new DataKVIO will + * service + * @param [in] arrivalTime The arrival time of the BIO + * @param [out] dataKVIOPtr A pointer to hold the new DataKVIO + * + * @return VDO_SUCCESS or an error + **/ +static int kvdoCreateKVIOFromBio(KernelLayer *layer, + BIO *bio, + Jiffies arrivalTime, + DataKVIO **dataKVIOPtr) +{ + ExternalIORequest externalIORequest = { + .bio = bio, + .private = bio->bi_private, + .endIO = bio->bi_end_io, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + .rw = bio->bi_opf, +#else + .rw = bio->bi_rw, +#endif + }; + + // We will handle FUA at the end of the request (after we restore the + // bi_rw field from externalIORequest.rw). 
+ clearBioOperationFlagFua(bio); + + DataKVIO *dataKVIO = NULL; + int result = makeDataKVIO(layer, bio, &dataKVIO); + if (result != VDO_SUCCESS) { + return result; + } + + dataKVIO->externalIORequest = externalIORequest; + dataKVIO->offset = sectorToBlockOffset(layer, getBioSector(bio)); + dataKVIO->isPartial = ((getBioSize(bio) < VDO_BLOCK_SIZE) + || (dataKVIO->offset != 0)); + + if (dataKVIO->isPartial) { + countBios(&layer->biosInPartial, bio); + } else { + /* + * Note that we unconditionally fill in the dataBlock array for + * non-read operations. There are places like kvdoCopyVIO that may + * look at kvio->dataBlock for a zero block (and maybe for + * discards?). We could skip filling in dataBlock for such cases, + * but only once we're sure all such places are fixed to check the + * isZeroBlock flag first. + */ + if (isDiscardBio(bio)) { + /* + * This is a discard/trim operation. This is treated much like the zero + * block, but we keep different stats and distinguish it in the block + * map. + */ + memset(dataKVIO->dataBlock, 0, VDO_BLOCK_SIZE); + } else if (bio_data_dir(bio) == WRITE) { + dataKVIO->dataVIO.isZeroBlock = bioIsZeroData(bio); + // Copy the bio data to a char array so that we can continue to use + // the data after we acknowledge the bio. + bioCopyDataIn(bio, dataKVIO->dataBlock); + } + } + + if (dataKVIO->isPartial || isWriteBio(bio)) { + /* + * dataKVIO->bio will point at kvio->dataBlockBio for all writes and + * partial block I/O so the rest of the kernel code doesn't need to + * make a decision as to what to use. + */ + dataKVIO->dataBlockBio->bi_private = &dataKVIO->kvio; + if (dataKVIO->isPartial && isWriteBio(bio)) { + clearBioOperationAndFlags(dataKVIO->dataBlockBio); + setBioOperationRead(dataKVIO->dataBlockBio); + } else { + copyBioOperationAndFlags(dataKVIO->dataBlockBio, bio); + } + dataKVIOAsKVIO(dataKVIO)->bio = dataKVIO->dataBlockBio; + dataKVIO->readBlock.data = dataKVIO->dataBlock; + } + + setBioBlockDevice(bio, getKernelLayerBdev(layer)); + bio->bi_end_io = completeAsyncBio; + *dataKVIOPtr = dataKVIO; + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void launchDataKVIOWork(KvdoWorkItem *item) +{ + runCallback(vioAsCompletion(workItemAsKVIO(item)->vio)); +} + +/** + * Continue discard processing for requests that span multiple physical blocks. + * If all have been processed the KVIO is completed. If we have already seen + * an error, we skip the rest of the discard and fail immediately. + * + *

Invoked in a request-queue thread after the discard of a block has + * completed. + * + * @param completion A completion representing the discard KVIO + **/ +static void kvdoContinueDiscardKVIO(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + dataKVIO->remainingDiscard + -= min(dataKVIO->remainingDiscard, + (DiscardSize) (VDO_BLOCK_SIZE - dataKVIO->offset)); + if ((completion->result != VDO_SUCCESS) + || (dataKVIO->remainingDiscard == 0)) { + if (dataKVIO->hasDiscardPermit) { + limiterRelease(&layer->discardLimiter); + dataKVIO->hasDiscardPermit = false; + } + kvdoCompleteDataKVIO(completion); + return; + } + + BIO *bio = getBIOFromDataKVIO(dataKVIO); + resetBio(bio, layer); + dataKVIO->isPartial = (dataKVIO->remainingDiscard < VDO_BLOCK_SIZE); + dataKVIO->offset = 0; + + VIOOperation operation; + if (dataKVIO->isPartial) { + operation = VIO_READ_MODIFY_WRITE; + setBioOperationRead(bio); + } else { + operation = VIO_WRITE; + } + + if (requestorSetFUA(dataKVIO)) { + operation |= VIO_FLUSH_AFTER; + } + + prepareDataVIO(dataVIO, dataVIO->logical.lbn + 1, operation, + !dataKVIO->isPartial, kvdoContinueDiscardKVIO); + enqueueDataKVIO(dataKVIO, launchDataKVIOWork, completion->callback, + REQ_Q_ACTION_MAP_BIO); +} + +/** + * Finish a partial read. + * + * @param completion The partial read KVIO + **/ +static void kvdoCompletePartialRead(VDOCompletion *completion) +{ + DataKVIO *dataKVIO = dataVIOAsDataKVIO(asDataVIO(completion)); + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + + bioCopyDataOut(dataKVIO->externalIORequest.bio, + dataKVIO->readBlock.data + dataKVIO->offset); + kvdoCompleteDataKVIO(completion); + return; +} + +/**********************************************************************/ +int kvdoLaunchDataKVIOFromBio(KernelLayer *layer, + BIO *bio, + uint64_t arrivalTime, + bool hasDiscardPermit) +{ + + DataKVIO *dataKVIO = NULL; + int result = kvdoCreateKVIOFromBio(layer, bio, arrivalTime, &dataKVIO); + if (unlikely(result != VDO_SUCCESS)) { + logInfo("%s: KVIO allocation failure", __func__); + if (hasDiscardPermit) { + limiterRelease(&layer->discardLimiter); + } + limiterRelease(&layer->requestLimiter); + return mapToSystemError(result); + } + + /* + * Discards behave very differently than other requests when coming + * in from device-mapper. We have to be able to handle any size discards + * and with various sector offsets within a block. 
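+   * For example, with a 4 KB block size, a 12 KB discard starting 1 KB
+   * into a block is processed one block at a time: a read-modify-write
+   * that zeroes the last 3 KB of the first block, then (via
+   * kvdoContinueDiscardKVIO above) trims of the next two whole blocks,
+   * and finally a read-modify-write that zeroes the first 1 KB of the
+   * last block.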
+ */ + KVIO *kvio = &dataKVIO->kvio; + VDOAction *callback = kvdoCompleteDataKVIO; + VIOOperation operation = VIO_WRITE; + bool isTrim = false; + if (isDiscardBio(bio)) { + dataKVIO->hasDiscardPermit = hasDiscardPermit; + dataKVIO->remainingDiscard = getBioSize(bio); + callback = kvdoContinueDiscardKVIO; + if (dataKVIO->isPartial) { + operation = VIO_READ_MODIFY_WRITE; + } else { + isTrim = true; + } + } else if (dataKVIO->isPartial) { + if (bio_data_dir(bio) == READ) { + callback = kvdoCompletePartialRead; + operation = VIO_READ; + } else { + operation = VIO_READ_MODIFY_WRITE; + } + } else if (bio_data_dir(bio) == READ) { + operation = VIO_READ; + } + + if (requestorSetFUA(dataKVIO)) { + operation |= VIO_FLUSH_AFTER; + } + + LogicalBlockNumber lbn + = sectorToBlock(layer, getBioSector(bio) - layer->startingSectorOffset); + prepareDataVIO(&dataKVIO->dataVIO, lbn, operation, isTrim, callback); + enqueueKVIO(kvio, launchDataKVIOWork, vioAsCompletion(kvio->vio)->callback, + REQ_Q_ACTION_MAP_BIO); + return VDO_SUCCESS; +} + +/** + * Hash a DataKVIO and set its chunk name. + * + * @param item The DataKVIO to be hashed + **/ +static void kvdoHashDataWork(KvdoWorkItem *item) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(item); + DataVIO *dataVIO = &dataKVIO->dataVIO; + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + + MurmurHash3_x64_128(dataKVIO->dataBlock, VDO_BLOCK_SIZE, 0x62ea60be, + &dataVIO->chunkName); + dataKVIO->dedupeContext.chunkName = &dataVIO->chunkName; + + kvdoEnqueueDataVIOCallback(dataKVIO); +} + +/**********************************************************************/ +void kvdoHashDataVIO(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + launchDataKVIOOnCPUQueue(dataVIOAsDataKVIO(dataVIO), kvdoHashDataWork, NULL, + CPU_Q_ACTION_HASH_BLOCK); +} + +/**********************************************************************/ +void kvdoCheckForDuplication(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, + THIS_LOCATION("checkForDuplication;dup=post")); + ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, + "zero block not checked for duplication"); + ASSERT_LOG_ONLY(dataVIO->newMapped.state != MAPPING_STATE_UNMAPPED, + "discard not checked for duplication"); + + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + if (hasAllocation(dataVIO)) { + postDedupeAdvice(dataKVIO); + } else { + // This block has not actually been written (presumably because we are + // full), so attempt to dedupe without posting bogus advice. + queryDedupeAdvice(dataKVIO); + } +} + +/**********************************************************************/ +void kvdoUpdateDedupeAdvice(DataVIO *dataVIO) +{ + updateDedupeAdvice(dataVIOAsDataKVIO(dataVIO)); +} + +/** + * Implements BufferFreeFunction. + **/ +static void freePooledDataKVIO(void *poolData, void *data) +{ + if (data == NULL) { + return; + } + + DataKVIO *dataKVIO = (DataKVIO *) data; + KernelLayer *layer = (KernelLayer *) poolData; + if (WRITE_PROTECT_FREE_POOL) { + setWriteProtect(dataKVIO, WP_DATA_KVIO_SIZE, false); + } + + if (dataKVIO->dataBlockBio != NULL) { + freeBio(dataKVIO->dataBlockBio, layer); + } + + if (dataKVIO->readBlock.bio != NULL) { + freeBio(dataKVIO->readBlock.bio, layer); + } + + FREE(dataKVIO->readBlock.buffer); + FREE(dataKVIO->dataBlock); + FREE(dataKVIO->scratchBlock); + FREE(dataKVIO); +} + +/** + * Allocate a DataKVIO. This function is the internals of makePooledDataKVIO(). 
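+ *
+ * Besides the DataKVIO itself, this sets up the dataBlock buffer and its
+ * bio, the readBlock buffer and its bio, and the scratchBlock used for
+ * compression; if any step fails, everything allocated so far is torn
+ * down through freePooledDataKVIO.
+ *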
+ * + * @param [in] layer The layer in which the DataKVIO will operate + * @param [out] dataKVIOPtr A pointer to hold the newly allocated DataKVIO + * + * @return VDO_SUCCESS or an error + **/ +static int allocatePooledDataKVIO(KernelLayer *layer, DataKVIO **dataKVIOPtr) +{ + DataKVIO *dataKVIO; + int result; + if (WRITE_PROTECT_FREE_POOL) { + STATIC_ASSERT(WP_DATA_KVIO_SIZE >= sizeof(DataKVIO)); + result = allocateMemory(WP_DATA_KVIO_SIZE, 0, __func__, &dataKVIO); + if (result == VDO_SUCCESS) { + BUG_ON((((size_t) dataKVIO) & (PAGE_SIZE - 1)) != 0); + } + } else { + result = ALLOCATE(1, DataKVIO, __func__, &dataKVIO); + } + + if (result != VDO_SUCCESS) { + return logErrorWithStringError(result, "DataKVIO allocation failure"); + } + + STATIC_ASSERT(VDO_BLOCK_SIZE <= PAGE_SIZE); + result = allocateMemory(VDO_BLOCK_SIZE, 0, "kvio data", + &dataKVIO->dataBlock); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(layer, dataKVIO); + return logErrorWithStringError(result, "DataKVIO data allocation failure"); + } + + result = createBio(layer, dataKVIO->dataBlock, &dataKVIO->dataBlockBio); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(layer, dataKVIO); + return logErrorWithStringError(result, + "DataKVIO data bio allocation failure"); + } + + result = allocateMemory(VDO_BLOCK_SIZE, 0, "kvio read buffer", + &dataKVIO->readBlock.buffer); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(layer, dataKVIO); + return logErrorWithStringError(result, + "DataKVIO read allocation failure"); + } + + result = createBio(layer, dataKVIO->readBlock.buffer, + &dataKVIO->readBlock.bio); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(layer, dataKVIO); + return logErrorWithStringError(result, + "DataKVIO read bio allocation failure"); + } + + dataKVIO->readBlock.bio->bi_private = &dataKVIO->kvio; + + result = allocateMemory(VDO_BLOCK_SIZE, 0, "kvio scratch", + &dataKVIO->scratchBlock); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(layer, dataKVIO); + return logErrorWithStringError(result, + "DataKVIO scratch allocation failure"); + } + + *dataKVIOPtr = dataKVIO; + return VDO_SUCCESS; +} + +/** + * Implements BufferAllocateFunction. + **/ +static int makePooledDataKVIO(void *poolData, void **dataPtr) +{ + DataKVIO *dataKVIO = NULL; + int result = allocatePooledDataKVIO((KernelLayer *) poolData, &dataKVIO); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(poolData, dataKVIO); + return result; + } + + *dataPtr = dataKVIO; + return VDO_SUCCESS; +} + +/** + * Dump out the waiters on each DataVIO in the DataVIO buffer pool. + * + * @param queue The queue to check (logical or physical) + * @param waitOn The label to print for queue (logical or physical) + **/ +static void dumpVIOWaiters(WaitQueue *queue, char *waitOn) +{ + Waiter *first = getFirstWaiter(queue); + if (first == NULL) { + return; + } + + DataVIO *dataVIO = waiterAsDataVIO(first); + logInfo(" %s is locked. Waited on by: VIO %" PRIptr " pbn %" PRIu64 + " lbn %llu d-pbn %llu lastOp %s", + waitOn, dataVIO, getDataVIOAllocation(dataVIO), + dataVIO->logical.lbn, dataVIO->duplicate.pbn, + getOperationName(dataVIO)); + + Waiter *waiter; + for (waiter = first->nextWaiter; + waiter != first; + waiter = waiter->nextWaiter) { + dataVIO = waiterAsDataVIO(waiter); + logInfo(" ... 
and : VIO %" PRIptr " pbn %llu lbn %" + PRIu64 " d-pbn %llu lastOp %s", + dataVIO, getDataVIOAllocation(dataVIO), dataVIO->logical.lbn, + dataVIO->duplicate.pbn, getOperationName(dataVIO)); + } +} + +/** + * Encode various attributes of a VIO as a string of one-character flags for + * dump logging. This encoding is for logging brevity: + * + * R => VIO completion result not VDO_SUCCESS + * W => VIO is on a wait queue + * D => VIO is a duplicate + * + *
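+ * For example, a duplicate VIO that is also parked on a wait queue is
+ * rendered as " WD".
+ *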

The common case of no flags set will result in an empty, null-terminated + * buffer. If any flags are encoded, the first character in the string will be + * a space character. + * + * @param dataVIO The VIO to encode + * @param buffer The buffer to receive a null-terminated string of encoded + * flag character + **/ +static void encodeVIODumpFlags(DataVIO *dataVIO, char buffer[8]) +{ + char *pFlag = buffer; + *pFlag++ = ' '; + if (dataVIOAsCompletion(dataVIO)->result != VDO_SUCCESS) { + *pFlag++ = 'R'; + } + if (dataVIOAsAllocatingVIO(dataVIO)->waiter.nextWaiter != NULL) { + *pFlag++ = 'W'; + } + if (dataVIO->isDuplicate) { + *pFlag++ = 'D'; + } + if (pFlag == &buffer[1]) { + // No flags, so remove the blank space. + pFlag = buffer; + } + *pFlag = '\0'; +} + +/** + * Dump out info on a DataKVIO from the DataKVIO pool. + * + *

Implements BufferDumpFunction. + * + * @param poolData The pool data + * @param data The DataKVIO to dump + **/ +static void dumpPooledDataKVIO(void *poolData __attribute__((unused)), + void *data) +{ + DataKVIO *dataKVIO = (DataKVIO *) data; + DataVIO *dataVIO = &dataKVIO->dataVIO; + + /* + * This just needs to be big enough to hold a queue (thread) name + * and a function name (plus a separator character and NUL). The + * latter is limited only by taste. + * + * In making this static, we're assuming only one "dump" will run at + * a time. If more than one does run, the log output will be garbled + * anyway. + */ + static char vioWorkItemDumpBuffer[100 + MAX_QUEUE_NAME_LEN]; + /* + * We're likely to be logging a couple thousand of these lines, and + * in some circumstances syslogd may have trouble keeping up, so + * keep it BRIEF rather than user-friendly. + */ + dumpWorkItemToBuffer(&dataKVIO->kvio.enqueueable.workItem, + vioWorkItemDumpBuffer, sizeof(vioWorkItemDumpBuffer)); + // Another static buffer... + // log10(256) = 2.408+, round up: + enum { DECIMAL_DIGITS_PER_UINT64_T = (int) (1 + 2.41 * sizeof(uint64_t)) }; + static char vioBlockNumberDumpBuffer[sizeof("P L D") + + 3 * DECIMAL_DIGITS_PER_UINT64_T]; + if (dataVIO->isDuplicate) { + snprintf(vioBlockNumberDumpBuffer, sizeof(vioBlockNumberDumpBuffer), + "P%llu L%llu D%llu", + getDataVIOAllocation(dataVIO), dataVIO->logical.lbn, + dataVIO->duplicate.pbn); + } else if (hasAllocation(dataVIO)) { + snprintf(vioBlockNumberDumpBuffer, sizeof(vioBlockNumberDumpBuffer), + "P%llu L%llu", + getDataVIOAllocation(dataVIO), dataVIO->logical.lbn); + } else { + snprintf(vioBlockNumberDumpBuffer, sizeof(vioBlockNumberDumpBuffer), + "L%llu", + dataVIO->logical.lbn); + } + + static char vioFlushGenerationBuffer[sizeof(" FG") + + DECIMAL_DIGITS_PER_UINT64_T] = ""; + if (dataVIO->flushGeneration != 0) { + snprintf(vioFlushGenerationBuffer, sizeof(vioFlushGenerationBuffer), + " FG%llu", dataVIO->flushGeneration); + } + + // Encode VIO attributes as a string of one-character flags, usually empty. 
+ static char flagsDumpBuffer[8]; + encodeVIODumpFlags(dataVIO, flagsDumpBuffer); + + logInfo(" kvio %" PRIptr " %s%s %s %s%s", + dataKVIO, vioBlockNumberDumpBuffer, vioFlushGenerationBuffer, + getOperationName(dataVIO), vioWorkItemDumpBuffer, flagsDumpBuffer); + // might want info on: wantAlbireoAnswer / operation / status + // might want info on: bio / bioToSubmit / biosMerged + + dumpVIOWaiters(&dataVIO->logical.waiters, "lbn"); + + // might want to dump more info from VIO here +} + +/**********************************************************************/ +int makeDataKVIOBufferPool(KernelLayer *layer, + uint32_t poolSize, + BufferPool **bufferPoolPtr) +{ + return makeBufferPool("DataKVIO Pool", poolSize, + makePooledDataKVIO, freePooledDataKVIO, + dumpPooledDataKVIO, layer, bufferPoolPtr); +} + +/**********************************************************************/ +DataLocation getDedupeAdvice(const DedupeContext *context) +{ + DataKVIO *dataKVIO = container_of(context, DataKVIO, dedupeContext); + return (DataLocation) { + .state = dataKVIO->dataVIO.newMapped.state, + .pbn = dataKVIO->dataVIO.newMapped.pbn, + }; +} + +/**********************************************************************/ +void setDedupeAdvice(DedupeContext *context, const DataLocation *advice) +{ + DataKVIO *dataKVIO = container_of(context, DataKVIO, dedupeContext); + receiveDedupeAdvice(&dataKVIO->dataVIO, advice); +} diff --git a/vdo/kernel/dataKVIO.h b/vdo/kernel/dataKVIO.h new file mode 100644 index 0000000..c3989f4 --- /dev/null +++ b/vdo/kernel/dataKVIO.h @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dataKVIO.h#5 $ + */ + +#ifndef DATA_KVIO_H +#define DATA_KVIO_H + +#include "dataVIO.h" +#include "kvio.h" +#include "uds-block.h" + +typedef struct { + /* + * The BIO which was received from the device mapper to initiate an I/O + * request. This field will be non-NULL only until the request is + * acknowledged. + */ + BIO *bio; + // Cached copies of fields from the bio which will need to be reset after + // we're done. + void *private; + void *endIO; + // This is a copy of the bi_rw field of the BIO which sadly is not just + // a boolean read-write flag, but also includes other flag bits. + unsigned long rw; +} ExternalIORequest; + +/* Dedupe support */ +struct dedupeContext { + UdsRequest udsRequest; + struct list_head pendingList; + Jiffies submissionTime; + Atomic32 requestState; + int status; + bool isPending; + /** Hash of the associated VIO (NULL if not calculated) */ + const UdsChunkName *chunkName; +}; + +typedef struct { + /** + * A pointer to a block that holds the data from the last read operation. + **/ + char *data; + /** + * Temporary storage for doing reads from the underlying device. 
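+   * The data pointer above may be redirected at the DataKVIO's
+   * scratchBlock once a compressed fragment has been inflated; buffer
+   * always holds the raw bytes that were read from the device.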
+ **/ + char *buffer; + /** + * A bio structure wrapping the buffer. + **/ + BIO *bio; + /** + * Callback to invoke after completing the read I/O operation. + **/ + DataKVIOCallback callback; + /** + * Mapping state passed to kvdoReadBlock(), used to determine whether + * the data must be uncompressed. + **/ + BlockMappingState mappingState; + /** + * The result code of the read attempt. + **/ + int status; +} ReadBlock; + +struct dataKVIO { + /* The embedded base code's DataVIO */ + DataVIO dataVIO; + /* The embedded KVIO */ + KVIO kvio; + /* The BIO from the request which is being serviced by this KVIO. */ + ExternalIORequest externalIORequest; + /* Dedupe */ + DedupeContext dedupeContext; + /* Read cache */ + ReadBlock readBlock; + /* partial block support */ + BlockSize offset; + bool isPartial; + /* discard support */ + bool hasDiscardPermit; + DiscardSize remainingDiscard; + /** + * A copy of user data written, so we can do additional processing + * (dedupe, compression) after acknowledging the I/O operation and + * thus losing access to the original data. + * + * Also used as buffer space for read-modify-write cycles when + * emulating smaller-than-blockSize I/O operations. + **/ + char *dataBlock; + /** A bio structure describing the #dataBlock buffer. */ + BIO *dataBlockBio; + /** A block used as output during compression or uncompression. */ + char *scratchBlock; +}; + +/** + * Convert a KVIO to a DataKVIO. + * + * @param kvio The KVIO to convert + * + * @return The KVIO as a DataKVIO + **/ +static inline DataKVIO *kvioAsDataKVIO(KVIO *kvio) +{ + ASSERT_LOG_ONLY(isData(kvio), "KVIO is a DataKVIO"); + return container_of(kvio, DataKVIO, kvio); +} + +/** + * Convert a DataKVIO to a KVIO. + * + * @param dataKVIO The DataKVIO to convert + * + * @return The DataKVIO as a KVIO + **/ +static inline KVIO *dataKVIOAsKVIO(DataKVIO *dataKVIO) +{ + return &dataKVIO->kvio; +} + +/** + * Returns a pointer to the DataKVIO wrapping a DataVIO. + * + * @param dataVIO the DataVIO + * + * @return the DataKVIO + **/ +static inline DataKVIO *dataVIOAsDataKVIO(DataVIO *dataVIO) +{ + return container_of(dataVIO, DataKVIO, dataVIO); +} + +/** + * Returns a pointer to the KVIO associated with a DataVIO. + * + * @param dataVIO the DataVIO + * + * @return the KVIO + **/ +static inline KVIO *dataVIOAsKVIO(DataVIO *dataVIO) +{ + return dataKVIOAsKVIO(dataVIOAsDataKVIO(dataVIO)); +} + +/** + * Returns a pointer to the DataKVIO wrapping a work item. + * + * @param item the work item + * + * @return the DataKVIO + **/ +static inline DataKVIO *workItemAsDataKVIO(KvdoWorkItem *item) +{ + return kvioAsDataKVIO(workItemAsKVIO(item)); +} + +/** + * Get the WorkItem from a DataKVIO. + * + * @param dataKVIO The DataKVIO + * + * @return the DataKVIO's work item + **/ +static inline KvdoWorkItem *workItemFromDataKVIO(DataKVIO *dataKVIO) +{ + return &dataKVIOAsKVIO(dataKVIO)->enqueueable.workItem; +} + +/** + * Get the BIO from a DataKVIO. + * + * @param dataKVIO The DataKVIO from which to get the BIO + * + * @return The DataKVIO's BIO + **/ +static inline BIO *getBIOFromDataKVIO(DataKVIO *dataKVIO) +{ + return dataKVIOAsKVIO(dataKVIO)->bio; +} + +/** + * Get the KernelLayer from a DataKVIO. + * + * @param dataKVIO The DataKVIO from which to get the KernelLayer + * + * @return The DataKVIO's KernelLayer + **/ +static inline KernelLayer *getLayerFromDataKVIO(DataKVIO *dataKVIO) +{ + return dataKVIOAsKVIO(dataKVIO)->layer; +} + +/** + * Set up and enqueue a DataKVIO's work item to be processed in the base code + * context. 
+ * + * @param dataKVIO The DataKVIO with the work item to be run + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + **/ +static inline void enqueueDataKVIO(DataKVIO *dataKVIO, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + enqueueKVIO(dataKVIOAsKVIO(dataKVIO), work, statsFunction, action); +} + +/** + * Enqueue a DataKVIO on a work queue. + * + * @param queue The queue + * @param dataKVIO The DataKVIO + **/ +static inline void enqueueDataKVIOWork(KvdoWorkQueue *queue, + DataKVIO *dataKVIO) +{ + enqueueKVIOWork(queue, dataKVIOAsKVIO(dataKVIO)); +} + +/** + * Add a trace record for the current source location. + * + * @param dataKVIO The DataKVIO structure to be updated + * @param location The source-location descriptor to be recorded + **/ +static inline void dataKVIOAddTraceRecord(DataKVIO *dataKVIO, + TraceLocation location) +{ + dataVIOAddTraceRecord(&dataKVIO->dataVIO, location); +} + +/** + * Set up and enqueue a DataKVIO on the CPU queue. + * + * @param dataKVIO The DataKVIO to set up + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + **/ +static inline void launchDataKVIOOnCPUQueue(DataKVIO *dataKVIO, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + KVIO *kvio = dataKVIOAsKVIO(dataKVIO); + launchKVIO(kvio, work, statsFunction, action, kvio->layer->cpuQueue); +} + +/** + * Set up and enqueue a DataKVIO on the BIO Ack queue. + * + * @param dataKVIO The DataKVIO to set up + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + **/ +static inline void launchDataKVIOOnBIOAckQueue(DataKVIO *dataKVIO, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + KVIO *kvio = dataKVIOAsKVIO(dataKVIO); + launchKVIO(kvio, work, statsFunction, action, kvio->layer->bioAckQueue); +} + +/** + * Move a DataKVIO back to the base threads. + * + * @param dataKVIO The DataKVIO to enqueue + **/ +static inline void kvdoEnqueueDataVIOCallback(DataKVIO *dataKVIO) +{ + kvdoEnqueueVIOCallback(dataKVIOAsKVIO(dataKVIO)); +} + +/** + * Check whether the external request bio had FUA set. + * + * @param dataKVIO The DataKVIO to check + * + * @return true if the external request bio had FUA set + **/ +static inline bool requestorSetFUA(DataKVIO *dataKVIO) +{ + return ((dataKVIO->externalIORequest.rw & REQ_FUA) == REQ_FUA); +} + +/** + * Associate a KVIO with a BIO passed in from the block layer, and start + * processing the KVIO. + * + * If setting up a KVIO fails, a message is logged, and the limiter permits + * (request and maybe discard) released, but the caller is responsible for + * disposing of the bio. + * + * @param layer The physical layer + * @param bio The bio for which to create KVIO + * @param arrivalTime The time (in jiffies) when the external request + * entered the device mapbio function + * @param hasDiscardPermit Whether we got a permit from the discardLimiter + * of the kernel layer + * + * @return VDO_SUCCESS or a system error code + **/ +int kvdoLaunchDataKVIOFromBio(KernelLayer *layer, + BIO *bio, + Jiffies arrivalTime, + bool hasDiscardPermit) + __attribute__((warn_unused_result)); + +/** + * Return a batch of DataKVIOs to the pool. 
+ * + *

Implements BatchProcessorCallback. + * + * @param batch The batch processor + * @param closure The kernal layer + **/ +void returnDataKVIOBatchToPool(BatchProcessor *batch, void *closure); + +/** + * Implements DataVIOZeroer. + * + * @param dataVIO The DataVIO to zero + **/ +void kvdoZeroDataVIO(DataVIO *dataVIO); + +/** + * Implements DataCopier. + * + * @param source The DataVIO to copy from + * @param destination The DataVIO to copy to + **/ +void kvdoCopyDataVIO(DataVIO *source, DataVIO *destination); + +/** + * Fetch the data for a block from storage. The fetched data will be + * uncompressed when the callback is called, and the result of the read + * operation will be stored in the ReadBlock's status field. On success, + * the data will be in the ReadBlock's data pointer. + * + * @param dataVIO The DataVIO to read a block in for + * @param location The physical block number to read from + * @param mappingState The mapping state of the block to read + * @param action The bio queue action + * @param callback The function to call when the read is done + **/ +void kvdoReadBlock(DataVIO *dataVIO, + PhysicalBlockNumber location, + BlockMappingState mappingState, + BioQAction action, + DataKVIOCallback callback); + +/** + * Implements DataReader. + * + * @param dataVIO The DataVIO to read + **/ +void kvdoReadDataVIO(DataVIO *dataVIO); + +/** + * Implements DataWriter. + * + * @param dataVIO The DataVIO to write + **/ +void kvdoWriteDataVIO(DataVIO *dataVIO); + +/** + * Implements DataModifier. + * + * @param dataVIO The DataVIO to modify + **/ +void kvdoModifyWriteDataVIO(DataVIO *dataVIO); + +/** + * Implements DataHasher. + * + * @param dataVIO The DataVIO to hash + **/ +void kvdoHashDataVIO(DataVIO *dataVIO); + +/** + * Implements DuplicationChecker. + * + * @param dataVIO The DataVIO containing the block to check + **/ +void kvdoCheckForDuplication(DataVIO *dataVIO); + +/** + * Implements DataAcknowledger. + * + * @param dataVIO The DataVIO to acknowledge + **/ +void kvdoAcknowledgeDataVIO(DataVIO *dataVIO); + +/** + * Implements DataCompressor. + * + * @param dataVIO The DataVIO to compress + **/ +void kvdoCompressDataVIO(DataVIO *dataVIO); + +/** + * Implements AlbireoUpdater. + * + * @param dataVIO The DataVIO which needs to change the entry for its data + **/ +void kvdoUpdateDedupeAdvice(DataVIO *dataVIO); + +/** + * Allocate a buffer pool of DataKVIOs. + * + * @param [in] layer The layer in which the DataKVIOs will operate + * @param [in] poolSize The number of DataKVIOs in the pool + * @param [out] bufferPoolPtr A pointer to hold the new buffer pool + * + * @return VDO_SUCCESS or an error + **/ +int makeDataKVIOBufferPool(KernelLayer *layer, + uint32_t poolSize, + BufferPool **bufferPoolPtr) + __attribute__((warn_unused_result)); + +/** + * Get the state needed to generate UDS metadata from the DataKVIO + * associated with a DedupeContext. + * + * @param context The DedupeContext + * + * @return the advice to store in the UDS index + **/ +DataLocation getDedupeAdvice(const DedupeContext *context) + __attribute__((warn_unused_result)); + +/** + * Set the result of a dedupe query for the DataKVIO associated with a + * DedupeContext. 
+ * + * @param context The context receiving advice + * @param advice A data location at which the chunk named in the context + * might be stored (will be NULL if no advice was found) + **/ +void setDedupeAdvice(DedupeContext *context, const DataLocation *advice); + +#endif /* DATA_KVIO_H */ diff --git a/vdo/kernel/deadlockQueue.c b/vdo/kernel/deadlockQueue.c new file mode 100644 index 0000000..2350b35 --- /dev/null +++ b/vdo/kernel/deadlockQueue.c @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deadlockQueue.c#1 $ + */ + +#include "deadlockQueue.h" + +/**********************************************************************/ +void initializeDeadlockQueue(DeadlockQueue *queue) +{ + spin_lock_init(&queue->lock); + bio_list_init(&queue->list); +} + +/**********************************************************************/ +void addToDeadlockQueue(DeadlockQueue *queue, BIO *bio, Jiffies arrivalTime) +{ + spin_lock(&queue->lock); + if (bio_list_empty(&queue->list)) { + /* + * If we get more than one pending at once, this will be inaccurate for + * some of them. Oh well. If we've gotten here, we're trying to avoid a + * deadlock; stats are a secondary concern. + */ + queue->arrivalTime = arrivalTime; + } + bio_list_add(&queue->list, bio); + spin_unlock(&queue->lock); +} diff --git a/vdo/kernel/deadlockQueue.h b/vdo/kernel/deadlockQueue.h new file mode 100644 index 0000000..85e0b46 --- /dev/null +++ b/vdo/kernel/deadlockQueue.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deadlockQueue.h#1 $ + */ + +#ifndef DEADLOCK_QUEUE_H +#define DEADLOCK_QUEUE_H + +#include + +#include "bio.h" + +/** + * A holding space for incoming bios if we're not able to block until VIOs + * become available to process them. + **/ +typedef struct deadlockQueue { + /* Protection for the other fields. */ + spinlock_t lock; + /* List of bios we had to accept but don't have VIOs for. 
*/ + struct bio_list list; + /* + * Arrival time to use for statistics tracking for the above bios, since we + * haven't the space to store individual arrival times for each. + */ + Jiffies arrivalTime; +} DeadlockQueue; + +/** + * Initialize the DeadlockQueue structure. + * + * @param queue The structure to initialize + **/ +void initializeDeadlockQueue(DeadlockQueue *queue); + +/** + * Add an incoming bio to the list of saved-up bios we're not ready to start + * processing yet. + * + * This excess buffering on top of what the caller implements is generally a + * bad idea, and should be used only when necessary, such as to avoid a + * possible deadlock situation. + * + * @param queue The incoming-bio queue structure + * @param bio The new incoming bio to save + * @param arrivalTime The arrival time of this new bio + **/ +void addToDeadlockQueue(DeadlockQueue *queue, BIO *bio, Jiffies arrivalTime); + +/** + * Pull an incoming bio off the queue. + * + * The arrival time returned may be incorrect if multiple bios were saved, as + * there is no per-bio storage used, only one saved arrival time for the whole + * queue. + * + * @param [in] queue The incoming-bio queue + * @param [out] arrivalTime The arrival time to use for this bio + * + * @return a BIO pointer, or NULL if none were queued + **/ +static inline BIO *pollDeadlockQueue(DeadlockQueue *queue, + Jiffies *arrivalTime) +{ + spin_lock(&queue->lock); + BIO *bio = bio_list_pop(&queue->list); + if (unlikely(bio != NULL)) { + *arrivalTime = queue->arrivalTime; + } + spin_unlock(&queue->lock); + return bio; +} + +#endif // DEADLOCK_QUEUE_H diff --git a/vdo/kernel/dedupeIndex.c b/vdo/kernel/dedupeIndex.c new file mode 100644 index 0000000..811cd93 --- /dev/null +++ b/vdo/kernel/dedupeIndex.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dedupeIndex.c#1 $ + */ + +#include "dedupeIndex.h" + +#include "numeric.h" + +#include "udsIndex.h" + +// These times are in milliseconds +unsigned int albireoTimeoutInterval = 5000; +unsigned int minAlbireoTimerInterval = 100; + +// These times are in jiffies +Jiffies albireoTimeoutJiffies = 0; +static Jiffies minAlbireoTimerJiffies = 0; + +/**********************************************************************/ +Jiffies getAlbireoTimeout(Jiffies startJiffies) +{ + return maxULong(startJiffies + albireoTimeoutJiffies, + jiffies + minAlbireoTimerJiffies); +} + +/**********************************************************************/ +void setAlbireoTimeoutInterval(unsigned int value) +{ + // Arbitrary maximum value is two minutes + if (value > 120000) { + value = 120000; + } + // Arbitrary minimum value is 2 jiffies + Jiffies albJiffies = msecs_to_jiffies(value); + if (albJiffies < 2) { + albJiffies = 2; + value = jiffies_to_msecs(albJiffies); + } + albireoTimeoutInterval = value; + albireoTimeoutJiffies = albJiffies; +} + +/**********************************************************************/ +void setMinAlbireoTimerInterval(unsigned int value) +{ + // Arbitrary maximum value is one second + if (value > 1000) { + value = 1000; + } + + // Arbitrary minimum value is 2 jiffies + Jiffies minJiffies = msecs_to_jiffies(value); + if (minJiffies < 2) { + minJiffies = 2; + value = jiffies_to_msecs(minJiffies); + } + + minAlbireoTimerInterval = value; + minAlbireoTimerJiffies = minJiffies; +} + +/**********************************************************************/ +int makeDedupeIndex(DedupeIndex **indexPtr, KernelLayer *layer) +{ + if (albireoTimeoutJiffies == 0) { + setAlbireoTimeoutInterval(albireoTimeoutInterval); + } + + if (minAlbireoTimerJiffies == 0) { + setMinAlbireoTimerInterval(minAlbireoTimerInterval); + } + + return makeUDSIndex(layer, indexPtr); +} diff --git a/vdo/kernel/dedupeIndex.h b/vdo/kernel/dedupeIndex.h new file mode 100644 index 0000000..31d7631 --- /dev/null +++ b/vdo/kernel/dedupeIndex.h @@ -0,0 +1,372 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dedupeIndex.h#5 $ + */ + +#ifndef DEDUPE_INDEX_H +#define DEDUPE_INDEX_H + +#include "dataKVIO.h" + +struct dedupeIndex { + + /** + * Do the dedupe section of dmsetup message vdo0 0 dump ... + * + * @param index The dedupe index + * @param showQueue true to dump a dedupe work queue + **/ + void (*dump)(DedupeIndex *index, bool showQueue); + + /** + * Free a dedupe index. The "finish" method must have been called + * first. 
+ * + * @param index The dedupe index + **/ + void (*free)(DedupeIndex *index); + + /** + * Get the name of the deduplication state + * + * @param index The dedupe index + * + * @return the dedupe state name + **/ + const char *(*getDedupeStateName)(DedupeIndex *index); + + /** + * Get the index statistics + * + * @param index The dedupe index + * @param stats The index statistics + **/ + void (*getStatistics)(DedupeIndex *index, IndexStatistics *stats); + + /** + * Process a dmsetup message directed to the index. + * + * @param index The dedupe index + * @param name The message name + * + * @return 0 or an error code + **/ + int (*message)(DedupeIndex *index, const char *name); + + /** + * Look up the chunkname of the DataKVIO. If found, return the PBN + * previously associated with the name. If not found, associate the + * new PBN with the name. + * + * @param dataKVIO The DataKVIO + **/ + void (*post)(DataKVIO *dataKVIO); + + /** + * Look up the chunkname of the DataKVIO. If found, return the PBN + * previously associated with the name. If not found, do nothing. + * + * @param dataKVIO The DataKVIO + **/ + void (*query)(DataKVIO *dataKVIO); + + /** + * Start the dedupe index. + * + * @param index The dedupe index + * @param createFlag If true, create a new index without first attempting + * to load an existing index + **/ + void (*start)(DedupeIndex *index, bool createFlag); + + /** + * Stop the dedupe index. May be called by any thread, but will wait for + * the shutdown to be completed. + * + * @param index The dedupe index + **/ + void (*stop)(DedupeIndex *index); + + /** + * Suspend the dedupe index. If there are any outstanding index + * requests, wait for them to finish. If the index is doing any + * asynchronous writing, wait for the I/O to complete. If the index + * is not open yet and we are doing a rebuild of the master index, + * pause the rebuild so that it can be resumed later. May be called + * from any thread. + * + * @param index The dedupe index + * @param saveFlag True if we should save the index + **/ + void (*suspend)(DedupeIndex *index, bool saveFlag); + + /** + * Resume a suspended dedupe index. May be called from any thread. + * + * @param index The dedupe index + **/ + void (*resume)(DedupeIndex *index); + + /** + * Finish the dedupe index; shuts it down for good and prepares to + * free resources. After this point, no more requests may be sent to + * it. + * + * @param index The dedupe index + **/ + void (*finish)(DedupeIndex *index); + + /** + * Look up the chunkname of the DataKVIO and associate the new PBN with the + * name. + * + * @param dataKVIO The DataKVIO + **/ + void (*update)(DataKVIO *dataKVIO); +}; + +/** + * Make a dedupe index + * + * @param indexPtr dedupe index returned here + * @param layer the kernel layer + * + * @return VDO_SUCCESS or an error code + **/ +int makeDedupeIndex(DedupeIndex **indexPtr, KernelLayer *layer) + __attribute__((warn_unused_result)); + + +/** + * Do the dedupe section of dmsetup message vdo0 0 dump ... 
+ * + * @param index The dedupe index + * @param showQueue true to dump a dedupe work queue + **/ +static inline void dumpDedupeIndex(DedupeIndex *index, bool showQueue) +{ + index->dump(index, showQueue); +} + +/** + * Free the dedupe index + * + * @param index The dedupe index + **/ +static inline void freeDedupeIndex(DedupeIndex **index) +{ + if (*index != NULL) { + (*index)->free(*index); + *index = NULL; + } +} + +/** + * Get the name of the deduplication state + * + * @param index The dedupe index + * + * @return the dedupe state name + **/ +static inline const char *getDedupeStateName(DedupeIndex *index) +{ + return index->getDedupeStateName(index); +} + +/** + * Get the index statistics + * + * @param index The dedupe index + * @param stats The index statistics + **/ +static inline void getIndexStatistics(DedupeIndex *index, + IndexStatistics *stats) +{ + return index->getStatistics(index, stats); +} + +/** + * Return from a dedupe operation by invoking the callback function + * + * @param dataKVIO The DataKVIO + **/ +static inline void invokeDedupeCallback(DataKVIO *dataKVIO) +{ + + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION("$F($dup);cb=dedupe($dup)")); + kvdoEnqueueDataVIOCallback(dataKVIO); +} + +/** + * Process a dmsetup message directed to the index. + * + * @param index The dedupe index + * @param name The message name + * + * @return 0 or an error code + **/ +static inline int messageDedupeIndex(DedupeIndex *index, const char *name) +{ + return index->message(index, name); +} + +/** + * Look up the chunkname of the DataKVIO and identify duplicated chunks. + * + * @param dataKVIO The DataKVIO. These fields are used: + * dedupeContext.chunkName is the chunk name. + * The advice to offer to the index will be obtained + * via getDedupeAdvice(). The advice found in the index + * (or NULL if none) will be returned via setDedupeAdvice(). + * dedupeContext.status is set to the return status code of + * any asynchronous index processing. + **/ +static inline void postDedupeAdvice(DataKVIO *dataKVIO) +{ + KernelLayer *layer = dataKVIOAsKVIO(dataKVIO)->layer; + layer->dedupeIndex->post(dataKVIO); +} + +/** + * Look up the chunkname of the DataKVIO and identify duplicated chunks. + * + * @param dataKVIO The DataKVIO. These fields are used: + * dedupeContext.chunkName is the chunk name. + * The advice found in the index (or NULL if none) will + * be returned via setDedupeAdvice(). + * dedupeContext.status is set to the return status code of + * any asynchronous index processing. + **/ +static inline void queryDedupeAdvice(DataKVIO *dataKVIO) +{ + KernelLayer *layer = dataKVIOAsKVIO(dataKVIO)->layer; + layer->dedupeIndex->query(dataKVIO); +} + +/** + * Start the dedupe index. + * + * @param index The dedupe index + * @param createFlag If true, create a new index without first attempting + * to load an existing index + **/ +static inline void startDedupeIndex(DedupeIndex *index, bool createFlag) +{ + index->start(index, createFlag); +} + +/** + * Stop the dedupe index. May be called by any thread, but will wait for + * the shutdown to be completed. + * + * @param index The dedupe index + **/ +static inline void stopDedupeIndex(DedupeIndex *index) +{ + return index->stop(index); +} + +/** + * Suspend the dedupe index. If there are any outstanding index + * requests, wait for them to finish. If the index is doing any + * asynchronous writing, wait for the I/O to complete. 
If the index is + * not open yet and we are doing a rebuild of the master index, pause + * the rebuild so that it can be resumed later. May be called from any + * thread. + * + * @param index The dedupe index + * @param saveFlag True if we should save the index + **/ +static inline void suspendDedupeIndex(DedupeIndex *index, bool saveFlag) +{ + index->suspend(index, saveFlag); +} + +/** + * Resume a suspended dedupe index. May be called from any thread. + * + * @param index The dedupe index + **/ +static inline void resumeDedupeIndex(DedupeIndex *index) +{ + index->resume(index); +} + +/** + * Finish the dedupe index. + * + * @param index The dedupe index + **/ +static inline void finishDedupeIndex(DedupeIndex *index) +{ + return index->finish(index); +} + +/** + * Look up the chunkname of the DataKVIO and associate the new PBN with the + * name. + * + * @param dataKVIO The DataKVIO. These fields are used: + * dedupeContext.chunkName is the chunk name. + * The advice to offer to the index will be obtained + * via getDedupeAdvice(). dedupeContext.status is set to the + * return status code of any asynchronous index processing. + **/ +static inline void updateDedupeAdvice(DataKVIO *dataKVIO) +{ + KernelLayer *layer = dataKVIOAsKVIO(dataKVIO)->layer; + layer->dedupeIndex->update(dataKVIO); +} + +// Interval (in milliseconds or jiffies) from submission until switching to +// fast path and skipping Albireo. +extern unsigned int albireoTimeoutInterval; +extern Jiffies albireoTimeoutJiffies; + +// Minimum time interval (in milliseconds) between timer invocations to +// check for requests waiting for Albireo that should now time out. +extern unsigned int minAlbireoTimerInterval; + +/** + * Calculate the actual end of a timer, taking into account the absolute + * start time and the present time. + * + * @param startJiffies The absolute start time, in jiffies + * + * @return the absolute end time for the timer, in jiffies + **/ +Jiffies getAlbireoTimeout(Jiffies startJiffies); + +/** + * Set the interval from submission until switching to fast path and + * skipping Albireo. + * + * @param value The number of milliseconds + **/ +void setAlbireoTimeoutInterval(unsigned int value); + +/** + * Set the minimum time interval between timer invocations to check for + * requests waiting for Albireo that should now time out. + * + * @param value The number of milliseconds + **/ +void setMinAlbireoTimerInterval(unsigned int value); + +#endif /* DEDUPE_INDEX_H */ diff --git a/vdo/kernel/deviceConfig.c b/vdo/kernel/deviceConfig.c new file mode 100644 index 0000000..08e864c --- /dev/null +++ b/vdo/kernel/deviceConfig.c @@ -0,0 +1,769 @@ +/** + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deviceConfig.c#14 $ + */ + +#include "deviceConfig.h" + +#include + +#include "logger.h" +#include "memoryAlloc.h" +#include "stringUtils.h" + +#include "kernelLayer.h" +#include "vdoStringUtils.h" + +#include "constants.h" + +enum { + // If we bump this, update the arrays below + TABLE_VERSION = 2, + // Limits used when parsing thread-count config spec strings + BIO_ROTATION_INTERVAL_LIMIT = 1024, + LOGICAL_THREAD_COUNT_LIMIT = 60, + PHYSICAL_THREAD_COUNT_LIMIT = 16, + THREAD_COUNT_LIMIT = 100, + // XXX The bio-submission queue configuration defaults are temporarily + // still being defined here until the new runtime-based thread + // configuration has been fully implemented for managed VDO devices. + + // How many bio submission work queues to use + DEFAULT_NUM_BIO_SUBMIT_QUEUES = 4, + // How often to rotate between bio submission work queues + DEFAULT_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL = 64, +}; + +// arrays for handling different table versions +static const uint8_t REQUIRED_ARGC[] = {10, 12, 9}; +static const uint8_t POOL_NAME_ARG_INDEX[] = {8, 10, 8}; + +/** + * Decide the version number from argv. + * + * @param [in] argc The number of table values + * @param [in] argv The array of table values + * @param [out] errorPtr A pointer to return a error string in + * @param [out] versionPtr A pointer to return the version + * + * @return VDO_SUCCESS or an error code + **/ +static int getVersionNumber(int argc, + char **argv, + char **errorPtr, + TableVersion *versionPtr) +{ + // version, if it exists, is in a form of V + if (sscanf(argv[0], "V%u", versionPtr) == 1) { + if (*versionPtr < 1 || *versionPtr > TABLE_VERSION) { + *errorPtr = "Unknown version number detected"; + return VDO_BAD_CONFIGURATION; + } + } else { + // V0 actually has no version number in the table string + *versionPtr = 0; + } + + // V0 and V1 have no optional parameters. There will always be + // a parameter for thread config, even if its a "." to show + // its an empty list. + if (*versionPtr <= 1) { + if (argc != REQUIRED_ARGC[*versionPtr]) { + *errorPtr = "Incorrect number of arguments for version"; + return VDO_BAD_CONFIGURATION; + } + } else if (argc < REQUIRED_ARGC[*versionPtr]) { + *errorPtr = "Incorrect number of arguments for version"; + return VDO_BAD_CONFIGURATION; + } + + if (*versionPtr != TABLE_VERSION) { + logWarning("Detected version mismatch between kernel module and tools " + " kernel: %d, tool: %d", TABLE_VERSION, *versionPtr); + logWarning("Please consider upgrading management tools to match kernel."); + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +int getPoolNameFromArgv(int argc, + char **argv, + char **errorPtr, + char **poolNamePtr) +{ + TableVersion version; + int result = getVersionNumber(argc, argv, errorPtr, &version); + if (result != VDO_SUCCESS) { + return result; + } + *poolNamePtr = argv[POOL_NAME_ARG_INDEX[version]]; + return VDO_SUCCESS; +} + +/** + * Resolve the config with write policy, physical size, and other unspecified + * fields based on the device, if needed. 
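+ *
+ * In particular, a write policy of "auto" is resolved to async or sync
+ * depending on whether the underlying device advertises flush support,
+ * and for version 0 tables the physical block count is derived from the
+ * size of the underlying device.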
+ * + * @param [in,out] config The config possibly missing values + * @param [in] verbose Whether to log about the underlying device + **/ +static void resolveConfigWithDevice(DeviceConfig *config, + bool verbose) +{ + struct dm_dev *dev = config->ownedDevice; + struct request_queue *requestQueue = bdev_get_queue(dev->bdev); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,7,0) + bool flushSupported + = ((requestQueue->queue_flags & (1ULL << QUEUE_FLAG_WC)) != 0); + bool fuaSupported + = ((requestQueue->queue_flags & (1ULL << QUEUE_FLAG_FUA)) != 0); +#else + bool flushSupported = ((requestQueue->flush_flags & REQ_FLUSH) == REQ_FLUSH); + bool fuaSupported = ((requestQueue->flush_flags & REQ_FUA) == REQ_FUA); +#endif + if (verbose) { + logInfo("underlying device, REQ_FLUSH: %s, REQ_FUA: %s", + (flushSupported ? "supported" : "not supported"), + (fuaSupported ? "supported" : "not supported")); + } else { + // We should probably always log, but need to make sure that makes sense + // before changing behavior. + } + + if (config->writePolicy == WRITE_POLICY_AUTO) { + config->writePolicy + = (flushSupported ? WRITE_POLICY_ASYNC : WRITE_POLICY_SYNC); + logInfo("Using write policy %s automatically.", + getConfigWritePolicyString(config)); + } else { + logInfo("Using write policy %s.", getConfigWritePolicyString(config)); + } + + if (flushSupported && (config->writePolicy == WRITE_POLICY_SYNC)) { + logWarning("WARNING: Running in sync mode atop a device supporting flushes" + " is dangerous!"); + } + + if (config->version == 0) { + uint64_t deviceSize = i_size_read(dev->bdev->bd_inode); + config->physicalBlocks = deviceSize / VDO_BLOCK_SIZE; + } +} + +/** + * Parse a two-valued option into a bool. + * + * @param [in] boolStr The string value to convert to a bool + * @param [in] trueStr The string value which should be converted to true + * @param [in] falseStr The string value which should be converted to false + * @param [out] boolPtr A pointer to return the bool value in + * + * @return VDO_SUCCESS or an error if boolStr is neither trueStr nor falseStr + **/ +__attribute__((warn_unused_result)) +static inline int parseBool(const char *boolStr, + const char *trueStr, + const char *falseStr, + bool *boolPtr) +{ + bool value = false; + if (strcmp(boolStr, trueStr) == 0) { + value = true; + } else if (strcmp(boolStr, falseStr) == 0) { + value = false; + } else { + return VDO_BAD_CONFIGURATION; + } + + *boolPtr = value; + return VDO_SUCCESS; +} + +/** + * Process one component of a thread parameter configuration string and + * update the configuration data structure. + * + * If the thread count requested is invalid, a message is logged and + * -EINVAL returned. If the thread name is unknown, a message is logged + * but no error is returned. 
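+ *
+ * Unknown names are tolerated so that newer user-space tools can pass
+ * parameters that an older kernel module does not yet understand.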
+ * + * @param threadParamType The type of thread specified + * @param count The thread count requested + * @param config The configuration data structure to update + * + * @return VDO_SUCCESS or -EINVAL + **/ +static int processOneThreadConfigSpec(const char *threadParamType, + unsigned int count, + ThreadCountConfig *config) +{ + // Handle limited thread parameters + if (strcmp(threadParamType, "bioRotationInterval") == 0) { + if (count == 0) { + logError("thread config string error:" + " 'bioRotationInterval' of at least 1 is required"); + return -EINVAL; + } else if (count > BIO_ROTATION_INTERVAL_LIMIT) { + logError("thread config string error:" + " 'bioRotationInterval' cannot be higher than %d", + BIO_ROTATION_INTERVAL_LIMIT); + return -EINVAL; + } + config->bioRotationInterval = count; + return VDO_SUCCESS; + } else if (strcmp(threadParamType, "logical") == 0) { + if (count > LOGICAL_THREAD_COUNT_LIMIT) { + logError("thread config string error: at most %d 'logical' threads" + " are allowed", + LOGICAL_THREAD_COUNT_LIMIT); + return -EINVAL; + } + config->logicalZones = count; + return VDO_SUCCESS; + } else if (strcmp(threadParamType, "physical") == 0) { + if (count > PHYSICAL_THREAD_COUNT_LIMIT) { + logError("thread config string error: at most %d 'physical' threads" + " are allowed", + PHYSICAL_THREAD_COUNT_LIMIT); + return -EINVAL; + } + config->physicalZones = count; + return VDO_SUCCESS; + } else { + // Handle other thread count parameters + if (count > THREAD_COUNT_LIMIT) { + logError("thread config string error: at most %d '%s' threads" + " are allowed", + THREAD_COUNT_LIMIT, threadParamType); + return -EINVAL; + } + + if (strcmp(threadParamType, "hash") == 0) { + config->hashZones = count; + return VDO_SUCCESS; + } else if (strcmp(threadParamType, "cpu") == 0) { + if (count == 0) { + logError("thread config string error:" + " at least one 'cpu' thread required"); + return -EINVAL; + } + config->cpuThreads = count; + return VDO_SUCCESS; + } else if (strcmp(threadParamType, "ack") == 0) { + config->bioAckThreads = count; + return VDO_SUCCESS; + } else if (strcmp(threadParamType, "bio") == 0) { + if (count == 0) { + logError("thread config string error:" + " at least one 'bio' thread required"); + return -EINVAL; + } + config->bioThreads = count; + return VDO_SUCCESS; + } + } + + // Don't fail, just log. This will handle version mismatches between + // user mode tools and kernel. + logInfo("unknown thread parameter type \"%s\"", threadParamType); + return VDO_SUCCESS; +} + +/** + * Parse one component of a thread parameter configuration string and + * update the configuration data structure. 
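+ *
+ * For example, a spec of "bio=4" requests four bio submission threads,
+ * and "logical=2" requests two logical zones.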
+ * + * @param spec The thread parameter specification string + * @param config The configuration data to be updated + **/ +static int parseOneThreadConfigSpec(const char *spec, + ThreadCountConfig *config) +{ + char **fields; + int result = splitString(spec, '=', &fields); + if (result != UDS_SUCCESS) { + return result; + } + if ((fields[0] == NULL) || (fields[1] == NULL) || (fields[2] != NULL)) { + logError("thread config string error:" + " expected thread parameter assignment, saw \"%s\"", + spec); + freeStringArray(fields); + return -EINVAL; + } + + unsigned int count; + result = stringToUInt(fields[1], &count); + if (result != UDS_SUCCESS) { + logError("thread config string error: integer value needed, found \"%s\"", + fields[1]); + freeStringArray(fields); + return result; + } + + result = processOneThreadConfigSpec(fields[0], count, config); + freeStringArray(fields); + return result; +} + +/** + * Parse the configuration string passed and update the specified + * counts and other parameters of various types of threads to be created. + * + * The configuration string should contain one or more comma-separated specs + * of the form "typename=number"; the supported type names are "cpu", "ack", + * "bio", "bioRotationInterval", "logical", "physical", and "hash". + * + * If an error occurs during parsing of a single key/value pair, we deem + * it serious enough to stop further parsing. + * + * This function can't set the "reason" value the caller wants to pass + * back, because we'd want to format it to say which field was + * invalid, and we can't allocate the "reason" strings dynamically. So + * if an error occurs, we'll log the details and pass back an error. + * + * @param string Thread parameter configuration string + * @param config The thread configuration data to update + * + * @return VDO_SUCCESS or -EINVAL or -ENOMEM + **/ +static int parseThreadConfigString(const char *string, + ThreadCountConfig *config) +{ + int result = VDO_SUCCESS; + + char **specs; + if (strcmp(".", string) != 0) { + result = splitString(string, ',', &specs); + if (result != UDS_SUCCESS) { + return result; + } + for (unsigned int i = 0; specs[i] != NULL; i++) { + result = parseOneThreadConfigSpec(specs[i], config); + if (result != VDO_SUCCESS) { + break; + } + } + freeStringArray(specs); + } + return result; +} + +/** + * Process one component of an optional parameter string and + * update the configuration data structure. + * + * If the value requested is invalid, a message is logged and + * -EINVAL returned. If the key is unknown, a message is logged + * but no error is returned. 
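+ *
+ * For example, a "maxDiscard" key sets the maximum discard size in VDO
+ * blocks; any key not handled here is passed on to the thread-count
+ * parser above.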
+ * + * @param key The optional parameter key name + * @param value The optional parameter value + * @param config The configuration data structure to update + * + * @return VDO_SUCCESS or -EINVAL + **/ +static int processOneKeyValuePair(const char *key, + unsigned int value, + DeviceConfig *config) +{ + // Non thread optional parameters + if (strcmp(key, "maxDiscard") == 0) { + if (value == 0) { + logError("optional parameter error:" + " at least one max discard block required"); + return -EINVAL; + } + // Max discard sectors in blkdev_issue_discard is UINT_MAX >> 9 + if (value > (UINT_MAX / VDO_BLOCK_SIZE)) { + logError("optional parameter error: at most %d max discard" + " blocks are allowed", UINT_MAX / VDO_BLOCK_SIZE); + return -EINVAL; + } + config->maxDiscardBlocks = value; + return VDO_SUCCESS; + } + // Handles unknown key names + return processOneThreadConfigSpec(key, value, &config->threadCounts); +} + +/** + * Parse one key/value pair and update the configuration + * data structure. + * + * @param key The optional key name + * @param value The optional value + * @param config The configuration data to be updated + * + * @return VDO_SUCCESS or error + **/ +static int parseOneKeyValuePair(const char *key, + const char *value, + DeviceConfig *config) +{ + if (strcmp(key, "deduplication") == 0) { + return parseBool(value, "on", "off", &config->deduplication); + } + + // The remaining arguments must have integral values. + unsigned int count; + int result = stringToUInt(value, &count); + if (result != UDS_SUCCESS) { + logError("optional config string error: integer value needed, found \"%s\"", + value); + return result; + } + return processOneKeyValuePair(key, count, config); +} + +/** + * Parse all key/value pairs from a list of arguments. + * + * If an error occurs during parsing of a single key/value pair, we deem + * it serious enough to stop further parsing. + * + * This function can't set the "reason" value the caller wants to pass + * back, because we'd want to format it to say which field was + * invalid, and we can't allocate the "reason" strings dynamically. So + * if an error occurs, we'll log the details and return the error. + * + * @param argc The total number of arguments in list + * @param argv The list of key/value pairs + * @param config The device configuration data to update + * + * @return VDO_SUCCESS or error + **/ +static int parseKeyValuePairs(int argc, + char **argv, + DeviceConfig *config) +{ + int result = VDO_SUCCESS; + while (argc) { + result = parseOneKeyValuePair(argv[0], argv[1], config); + if (result != VDO_SUCCESS) { + break; + } + + argc -= 2; + argv += 2; + } + + return result; +} + +/** + * Parse the configuration string passed in for optional arguments. + * + * For V0/V1 configurations, there will only be one optional parameter; + * the thread configuration. The configuration string should contain + * one or more comma-separated specs of the form "typename=number"; the + * supported type names are "cpu", "ack", "bio", "bioRotationInterval", + * "logical", "physical", and "hash". + * + * For V2 configurations and beyond, there could be any number of + * arguments. They should contain one or more key/value pairs + * separated by a space. 
+ * + * @param argSet The structure holding the arguments to parse + * @param errorPtr Pointer to a buffer to hold the error string + * @param config Pointer to device configuration data to update + * + * @return VDO_SUCCESS or error + */ +int parseOptionalArguments(struct dm_arg_set *argSet, + char **errorPtr, + DeviceConfig *config) +{ + int result = VDO_SUCCESS; + + if (config->version == 0 || config->version == 1) { + result = parseThreadConfigString(argSet->argv[0], + &config->threadCounts); + if (result != VDO_SUCCESS) { + *errorPtr = "Invalid thread-count configuration"; + return VDO_BAD_CONFIGURATION; + } + } else { + if ((argSet->argc % 2) != 0) { + *errorPtr = "Odd number of optional arguments given but they" + " should be pairs"; + return VDO_BAD_CONFIGURATION; + } + result = parseKeyValuePairs(argSet->argc, argSet->argv, config); + if (result != VDO_SUCCESS) { + *errorPtr = "Invalid optional argument configuration"; + return VDO_BAD_CONFIGURATION; + } + } + return result; +} + +/** + * Handle a parsing error. + * + * @param configPtr A pointer to the config to free + * @param errorPtr A place to store a constant string about the error + * @param errorStr A constant string to store in errorPtr + **/ +static void handleParseError(DeviceConfig **configPtr, + char **errorPtr, + char *errorStr) +{ + freeDeviceConfig(configPtr); + *errorPtr = errorStr; +} + +/**********************************************************************/ +int parseDeviceConfig(int argc, + char **argv, + struct dm_target *ti, + bool verbose, + DeviceConfig **configPtr) +{ + char **errorPtr = &ti->error; + DeviceConfig *config = NULL; + int result = ALLOCATE(1, DeviceConfig, "DeviceConfig", &config); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Could not allocate config structure"); + return VDO_BAD_CONFIGURATION; + } + + config->owningTarget = ti; + initializeRing(&config->configNode); + + // Save the original string. + result = joinStrings(argv, argc, ' ', &config->originalString); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Could not populate string"); + return VDO_BAD_CONFIGURATION; + } + + // Set defaults. + // + // XXX Defaults for bioThreads and bioRotationInterval are currently defined + // using the old configuration scheme of constants. These values are relied + // upon for performance testing on MGH machines currently. + // This should be replaced with the normally used testing defaults being + // defined in the file-based thread-configuration settings. The values used + // as defaults internally should really be those needed for VDO in its + // default shipped-product state. + config->threadCounts = (ThreadCountConfig) { + .bioAckThreads = 1, + .bioThreads = DEFAULT_NUM_BIO_SUBMIT_QUEUES, + .bioRotationInterval = DEFAULT_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL, + .cpuThreads = 1, + .logicalZones = 0, + .physicalZones = 0, + .hashZones = 0, + }; + config->maxDiscardBlocks = 1; + config->deduplication = true; + + struct dm_arg_set argSet; + + argSet.argc = argc; + argSet.argv = argv; + + result = getVersionNumber(argc, argv, errorPtr, &config->version); + if (result != VDO_SUCCESS) { + // getVersionNumber sets errorPtr itself. + handleParseError(&config, errorPtr, *errorPtr); + return result; + } + // Move the arg pointer forward only if the argument was there. 
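+  // (Illustrative values only: a V2 table line looks roughly like
+  // "V2 /dev/sdb1 20971520 4096 32768 16380 on auto vdo0 maxDiscard 1500",
+  // whereas a V0 line starts directly with the parent device name.)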
+ if (config->version >= 1) { + dm_shift_arg(&argSet); + } + + result = duplicateString(dm_shift_arg(&argSet), "parent device name", + &config->parentDeviceName); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Could not copy parent device name"); + return VDO_BAD_CONFIGURATION; + } + + // Get the physical blocks, if known. + if (config->version >= 1) { + result = kstrtoull(dm_shift_arg(&argSet), 10, &config->physicalBlocks); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Invalid physical block count"); + return VDO_BAD_CONFIGURATION; + } + } + + // Get the logical block size and validate + bool enable512e; + result = parseBool(dm_shift_arg(&argSet), "512", "4096", &enable512e); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Invalid logical block size"); + return VDO_BAD_CONFIGURATION; + } + config->logicalBlockSize = (enable512e ? 512 : 4096); + + // Skip past the two no longer used read cache options. + if (config->version <= 1) { + dm_consume_args(&argSet, 2); + } + + // Get the page cache size. + result = stringToUInt(dm_shift_arg(&argSet), &config->cacheSize); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Invalid block map page cache size"); + return VDO_BAD_CONFIGURATION; + } + + // Get the block map era length. + result = stringToUInt(dm_shift_arg(&argSet), &config->blockMapMaximumAge); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Invalid block map maximum age"); + return VDO_BAD_CONFIGURATION; + } + + // Get the MD RAID5 optimization mode and validate + result = parseBool(dm_shift_arg(&argSet), "on", "off", + &config->mdRaid5ModeEnabled); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Invalid MD RAID5 mode"); + return VDO_BAD_CONFIGURATION; + } + + // Get the write policy and validate. + if (strcmp(argSet.argv[0], "async") == 0) { + config->writePolicy = WRITE_POLICY_ASYNC; + } else if (strcmp(argSet.argv[0], "async-unsafe") == 0) { + config->writePolicy = WRITE_POLICY_ASYNC_UNSAFE; + } else if (strcmp(argSet.argv[0], "sync") == 0) { + config->writePolicy = WRITE_POLICY_SYNC; + } else if (strcmp(argSet.argv[0], "auto") == 0) { + config->writePolicy = WRITE_POLICY_AUTO; + } else { + handleParseError(&config, errorPtr, "Invalid write policy"); + return VDO_BAD_CONFIGURATION; + } + dm_shift_arg(&argSet); + + // Make sure the enum to get the pool name from argv directly is still in + // sync with the parsing of the table line. + if (&argSet.argv[0] != &argv[POOL_NAME_ARG_INDEX[config->version]]) { + handleParseError(&config, errorPtr, "Pool name not in expected location"); + return VDO_BAD_CONFIGURATION; + } + + // Get the address where the albserver is running. Check for validation + // is done in dedupe.c code during startKernelLayer call + result = duplicateString(dm_shift_arg(&argSet), "pool name", + &config->poolName); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Could not copy pool name"); + return VDO_BAD_CONFIGURATION; + } + + // Get the optional arguments and validate. + result = parseOptionalArguments(&argSet, errorPtr, config); + if (result != VDO_SUCCESS) { + // parseOptionalArguments sets errorPtr itself. + handleParseError(&config, errorPtr, *errorPtr); + return result; + } + + /* + * Logical, physical, and hash zone counts can all be zero; then we get one + * thread doing everything, our older configuration. If any zone count is + * non-zero, the others must be as well. 
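+   * For example, setting logical=4, physical=2, and hash=1 is accepted,
+   * but setting only logical=4 is rejected by the check below.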
+ */ + if (((config->threadCounts.logicalZones == 0) + != (config->threadCounts.physicalZones == 0)) + || ((config->threadCounts.physicalZones == 0) + != (config->threadCounts.hashZones == 0)) + ) { + handleParseError(&config, errorPtr, + "Logical, physical, and hash zones counts must all be" + " zero or all non-zero"); + return VDO_BAD_CONFIGURATION; + } + + result = dm_get_device(ti, config->parentDeviceName, + dm_table_get_mode(ti->table), &config->ownedDevice); + if (result != 0) { + logError("couldn't open device \"%s\": error %d", + config->parentDeviceName, result); + handleParseError(&config, errorPtr, "Unable to open storage device"); + return VDO_BAD_CONFIGURATION; + } + + resolveConfigWithDevice(config, verbose); + + *configPtr = config; + return result; +} + +/**********************************************************************/ +void freeDeviceConfig(DeviceConfig **configPtr) +{ + if (configPtr == NULL) { + return; + } + + DeviceConfig *config = *configPtr; + if (config == NULL) { + *configPtr = NULL; + return; + } + + if (config->ownedDevice != NULL) { + dm_put_device(config->owningTarget, config->ownedDevice); + } + + FREE(config->poolName); + FREE(config->parentDeviceName); + FREE(config->originalString); + + // Reduce the chance a use-after-free (as in BZ 1669960) happens to work. + memset(config, 0, sizeof(*config)); + + FREE(config); + *configPtr = NULL; +} + +/**********************************************************************/ +const char *getConfigWritePolicyString(DeviceConfig *config) +{ + switch (config->writePolicy) { + case WRITE_POLICY_AUTO: + return "auto"; + case WRITE_POLICY_ASYNC: + return "async"; + case WRITE_POLICY_ASYNC_UNSAFE: + return "async-unsafe"; + case WRITE_POLICY_SYNC: + return "sync"; + default: + return "unknown"; + } +} + +/**********************************************************************/ +void setDeviceConfigLayer(DeviceConfig *config, KernelLayer *layer) +{ + unspliceRingNode(&config->configNode); + if (layer != NULL) { + pushRingNode(&layer->deviceConfigRing, &config->configNode); + } + config->layer = layer; +} diff --git a/vdo/kernel/deviceConfig.h b/vdo/kernel/deviceConfig.h new file mode 100644 index 0000000..36199dd --- /dev/null +++ b/vdo/kernel/deviceConfig.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deviceConfig.h#11 $ + */ +#ifndef DEVICE_CONFIG_H +#define DEVICE_CONFIG_H + +#include + +#include "ringNode.h" + +#include "kernelTypes.h" + +// This structure is memcmp'd for equality. Keep it +// packed and don't add any fields that are not +// properly set in both extant and parsed configs. 
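+// (Padding bytes would have indeterminate contents, which could make two
+// logically identical configs compare unequal.)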
+typedef struct { + int bioAckThreads; + int bioThreads; + int bioRotationInterval; + int cpuThreads; + int logicalZones; + int physicalZones; + int hashZones; +} __attribute__((packed)) ThreadCountConfig; + +typedef uint32_t TableVersion; + +typedef struct { + struct dm_target *owningTarget; + struct dm_dev *ownedDevice; + KernelLayer *layer; + /** All configs referencing a layer are kept on a ring in the layer */ + RingNode configNode; + char *originalString; + TableVersion version; + char *parentDeviceName; + BlockCount physicalBlocks; + unsigned int logicalBlockSize; + WritePolicy writePolicy; + unsigned int cacheSize; + unsigned int blockMapMaximumAge; + bool mdRaid5ModeEnabled; + bool deduplication; + char *poolName; + ThreadCountConfig threadCounts; + BlockCount maxDiscardBlocks; +} DeviceConfig; + +/** + * Convert a RingNode to the DeviceConfig that contains it. + * + * @param node The RingNode to convert + * + * @return The DeviceConfig wrapping the RingNode + **/ +static inline DeviceConfig *asDeviceConfig(RingNode *node) +{ + if (node == NULL) { + return NULL; + } + return (DeviceConfig *) ((byte *) node - offsetof(DeviceConfig, configNode)); +} + +/** + * Grab a pointer to the pool name out of argv. + * + * @param [in] argc The number of table values + * @param [in] argv The array of table values + * @param [out] errorPtr A pointer to return a error string in + * @param [out] poolNamePtr A pointer to return the pool name + * + * @return VDO_SUCCESS or an error code + **/ +int getPoolNameFromArgv(int argc, + char **argv, + char **errorPtr, + char **poolNamePtr) + __attribute__((warn_unused_result)); + +/** + * Convert the dmsetup table into a DeviceConfig. + * + * @param [in] argc The number of table values + * @param [in] argv The array of table values + * @param [in] ti The target structure for this table + * @param [in] verbose Whether to log about the underlying device + * @param [out] configPtr A pointer to return the allocated config + * + * @return VDO_SUCCESS or an error code + **/ +int parseDeviceConfig(int argc, + char **argv, + struct dm_target *ti, + bool verbose, + DeviceConfig **configPtr) + __attribute__((warn_unused_result)); + +/** + * Free a device config created by parseDeviceConfig(). + * + * @param configPtr The pointer holding the config, which will be nulled + **/ +void freeDeviceConfig(DeviceConfig **configPtr); + +/** + * Get the text describing the write policy. + * + * @param config The device config + * + * @returns a pointer to a string describing the write policy + **/ +const char *getConfigWritePolicyString(DeviceConfig *config) + __attribute__((warn_unused_result)); + +/** + * Acquire or release a reference from the config to a kernel layer. + * + * @param config The config in question + * @param layer The kernel layer in question + **/ +void setDeviceConfigLayer(DeviceConfig *config, KernelLayer *layer); + +#endif // DEVICE_CONFIG_H diff --git a/vdo/kernel/deviceRegistry.c b/vdo/kernel/deviceRegistry.c new file mode 100644 index 0000000..13764b4 --- /dev/null +++ b/vdo/kernel/deviceRegistry.c @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deviceRegistry.c#3 $ + */ + +#include "deviceRegistry.h" + +#include +#include +#include + +#include "memoryAlloc.h" + +/* + * We don't expect this set to ever get really large, so a linked list + * is adequate. We can use a PointerMap if we need to later. + */ +typedef struct { + struct list_head links; + rwlock_t lock; +} DeviceRegistry; + +typedef struct { + struct list_head links; + KernelLayer *layer; +} RegisteredDevice; + +static DeviceRegistry registry; + +/**********************************************************************/ +void initializeDeviceRegistryOnce(void) +{ + INIT_LIST_HEAD(®istry.links); + rwlock_init(®istry.lock); +} + +/** + * Implements LayerFilter. + **/ +static bool layerIsEqual(KernelLayer *layer, void *context) +{ + return ((void *) layer == context); +} + +/** + * Find a layer in the registry if it exists there. Must be called holding + * the lock. + * + * @param filter The filter function to apply to devices + * @param context A bit of context to provide the filter. + * + * @return the layer object found, if any + **/ +__attribute__((warn_unused_result)) +static KernelLayer *filterLayersLocked(LayerFilter *filter, void *context) +{ + RegisteredDevice *device; + list_for_each_entry(device, ®istry.links, links) { + if (filter(device->layer, context)) { + return device->layer; + } + } + return NULL; +} + +/**********************************************************************/ +int addLayerToDeviceRegistry(KernelLayer *layer) +{ + RegisteredDevice *newDevice; + int result = ALLOCATE(1, RegisteredDevice, __func__, &newDevice); + if (result != VDO_SUCCESS) { + return result; + } + + INIT_LIST_HEAD(&newDevice->links); + newDevice->layer = layer; + + write_lock(®istry.lock); + KernelLayer *oldLayer = filterLayersLocked(layerIsEqual, layer); + result = ASSERT(oldLayer == NULL, "Layer not already registered"); + if (result == VDO_SUCCESS) { + list_add_tail(&newDevice->links, ®istry.links); + } + write_unlock(®istry.lock); + + return result; +} + +/**********************************************************************/ +void removeLayerFromDeviceRegistry(KernelLayer *layer) +{ + write_lock(®istry.lock); + RegisteredDevice *device = NULL; + list_for_each_entry(device, ®istry.links, links) { + if (device->layer == layer) { + list_del_init(&device->links); + FREE(device); + break; + } + } + write_unlock(®istry.lock); +} + +/**********************************************************************/ +KernelLayer *findLayerMatching(LayerFilter *filter, void *context) +{ + read_lock(®istry.lock); + KernelLayer *layer = filterLayersLocked(filter, context); + read_unlock(®istry.lock); + return layer; +} diff --git a/vdo/kernel/deviceRegistry.h b/vdo/kernel/deviceRegistry.h new file mode 100644 index 0000000..94c1635 --- /dev/null +++ b/vdo/kernel/deviceRegistry.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deviceRegistry.h#2 $ + */ + +#ifndef DEVICE_REGISTRY_H +#define DEVICE_REGISTRY_H + +#include "kernelTypes.h" + +/** + * Initialize the necessary structures for the device registry. + **/ +void initializeDeviceRegistryOnce(void); + +/** + * Add a layer to the device registry. The layer must not already exist in the + * registry. + * + * @param layer The layer to add + * + * @return VDO_SUCCESS or an error + **/ +int addLayerToDeviceRegistry(KernelLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Remove a layer from the device registry. + * + * @param layer The layer to remove + **/ +void removeLayerFromDeviceRegistry(KernelLayer *layer); + +/** + * Find and return the first (if any) layer matching a given filter function. + * + * @param filter The filter function to apply to layers + * @param context A bit of context to provide the filter. + **/ +KernelLayer *findLayerMatching(LayerFilter *filter, void *context); + +#endif // DEVICE_REGISTRY_H diff --git a/vdo/kernel/dmvdo.c b/vdo/kernel/dmvdo.c new file mode 100644 index 0000000..a6c7b98 --- /dev/null +++ b/vdo/kernel/dmvdo.c @@ -0,0 +1,889 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dmvdo.c#42 $ + */ + +#include "dmvdo.h" + +#include + +#include "logger.h" +#include "memoryAlloc.h" + +#include "constants.h" +#include "ringNode.h" +#include "threadConfig.h" +#include "vdo.h" + +#include "dedupeIndex.h" +#include "deviceRegistry.h" +#include "dump.h" +#include "instanceNumber.h" +#include "ioSubmitter.h" +#include "kernelLayer.h" +#include "kvdoFlush.h" +#include "memoryUsage.h" +#include "statusProcfs.h" +#include "stringUtils.h" +#include "sysfs.h" +#include "threadDevice.h" +#include "threadRegistry.h" + +struct kvdoDevice kvdoDevice; // global driver state (poorly named) + +/* + * Pre kernel version 4.3, we use the functionality in blkdev_issue_discard + * and the value in max_discard_sectors to split large discards into smaller + * ones. 
4.3 to 4.18 kernels have removed the code in blkdev_issue_discard + * and so in place of that, we use the code in device mapper itself to + * split the discards. Unfortunately, it uses the same value to split large + * discards as it does to split large data bios. + * + * In kernel version 4.18, support for splitting discards was added + * back into blkdev_issue_discard. Since this mode of splitting + * (based on max_discard_sectors) is preferable to splitting always + * on 4k, we are turning off the device mapper splitting from 4.18 + * on. + */ +#define HAS_NO_BLKDEV_SPLIT LINUX_VERSION_CODE >= KERNEL_VERSION(4,3,0) \ + && LINUX_VERSION_CODE < KERNEL_VERSION(4,18,0) + +/**********************************************************************/ + +/** + * Get the kernel layer associated with a dm target structure. + * + * @param ti The dm target structure + * + * @return The kernel layer, or NULL. + **/ +static KernelLayer *getKernelLayerForTarget(struct dm_target *ti) +{ + return ((DeviceConfig *) ti->private)->layer; +} + +/** + * Begin VDO processing of a bio. This is called by the device mapper + * through the "map" function, and has resulted from a call to either + * submit_bio or generic_make_request. + * + * @param ti The dm_target. We only need the "private" member to give + * us the KernelLayer. + * @param bio The bio. + * + * @return One of these values: + * + * negative A negative value is an error code. + * Usually -EIO. + * + * DM_MAPIO_SUBMITTED VDO will take care of this I/O, either + * processing it completely and calling + * bio_endio, or forwarding it onward by + * calling generic_make_request. + * + * DM_MAPIO_REMAPPED VDO has modified the bio and the device + * mapper will immediately forward the bio + * onward using generic_make_request. + * + * DM_MAPIO_REQUEUE We do not use this. It is used by device + * mapper devices to defer an I/O request + * during suspend/resume processing. + **/ +static int vdoMapBio(struct dm_target *ti, BIO *bio) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + return kvdoMapBio(layer, bio); +} + +/**********************************************************************/ +static void vdoIoHints(struct dm_target *ti, struct queue_limits *limits) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + + limits->logical_block_size = layer->deviceConfig->logicalBlockSize; + limits->physical_block_size = VDO_BLOCK_SIZE; + + // The minimum io size for random io + blk_limits_io_min(limits, VDO_BLOCK_SIZE); + // The optimal io size for streamed/sequential io + blk_limits_io_opt(limits, VDO_BLOCK_SIZE); + + /* + * Sets the maximum discard size that will be passed into VDO. This value + * comes from a table line value passed in during dmsetup create. + * + * The value 1024 is the largest usable value on HD systems. A 2048 sector + * discard on a busy HD system takes 31 seconds. We should use a value no + * higher than 1024, which takes 15 to 16 seconds on a busy HD system. + * + * But using large values results in 120 second blocked task warnings in + * /var/log/kern.log. In order to avoid these warnings, we choose to use the + * smallest reasonable value. See VDO-3062 and VDO-3087. + * + * We allow setting of the value for max_discard_sectors even in situations + * where we only split on 4k (see comments for HAS_NO_BLKDEV_SPLIT) as the + * value is still used in other code, like sysfs display of queue limits and + * most especially in dm-thin to determine whether to pass down discards. 
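+   *
+   * Note that maxDiscardBlocks is expressed in VDO blocks and is converted
+   * to 512-byte sectors (VDO_SECTORS_PER_BLOCK per block) below.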
+ */ + limits->max_discard_sectors + = layer->deviceConfig->maxDiscardBlocks * VDO_SECTORS_PER_BLOCK; + + limits->discard_granularity = VDO_BLOCK_SIZE; +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0) + limits->discard_zeroes_data = 1; +#endif +} + +/**********************************************************************/ +static int vdoIterateDevices(struct dm_target *ti, + iterate_devices_callout_fn fn, + void *data) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + sector_t len = blockToSector(layer, layer->deviceConfig->physicalBlocks); + + return fn(ti, layer->deviceConfig->ownedDevice, 0, len, data); +} + +/* + * Status line is: + * + * + */ + +/**********************************************************************/ +static void vdoStatus(struct dm_target *ti, + status_type_t status_type, + unsigned int status_flags, + char *result, + unsigned int maxlen) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + char nameBuffer[BDEVNAME_SIZE]; + // N.B.: The DMEMIT macro uses the variables named "sz", "result", "maxlen". + int sz = 0; + + switch (status_type) { + case STATUSTYPE_INFO: + // Report info for dmsetup status + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + VDOStatistics *stats = &layer->vdoStatsStorage; + DMEMIT("/dev/%s %s %s %s %s %llu %llu", + bdevname(getKernelLayerBdev(layer), nameBuffer), + stats->mode, + stats->inRecoveryMode ? "recovering" : "-", + getDedupeStateName(layer->dedupeIndex), + getKVDOCompressing(&layer->kvdo) ? "online" : "offline", + stats->dataBlocksUsed + stats->overheadBlocksUsed, + stats->physicalBlocks); + mutex_unlock(&layer->statsMutex); + break; + + case STATUSTYPE_TABLE: + // Report the string actually specified in the beginning. + DMEMIT("%s", ((DeviceConfig *) ti->private)->originalString); + break; + } + +// spin_unlock_irqrestore(&layer->lock, flags); +} + + +/** + * Get the size of the underlying device, in blocks. + * + * @param [in] layer The layer + * + * @return The size in blocks + **/ +static BlockCount getUnderlyingDeviceBlockCount(KernelLayer *layer) +{ + uint64_t physicalSize = i_size_read(getKernelLayerBdev(layer)->bd_inode); + return physicalSize / VDO_BLOCK_SIZE; +} + +/**********************************************************************/ +static int vdoPrepareToGrowLogical(KernelLayer *layer, char *sizeString) +{ + BlockCount logicalCount; + if (sscanf(sizeString, "%llu", &logicalCount) != 1) { + logWarning("Logical block count \"%s\" is not a number", sizeString); + return -EINVAL; + } + + if (logicalCount > MAXIMUM_LOGICAL_BLOCKS) { + logWarning("Logical block count \"%llu\" exceeds the maximum (%" + PRIu64 ")", logicalCount, MAXIMUM_LOGICAL_BLOCKS); + return -EINVAL; + } + + return prepareToResizeLogical(layer, logicalCount); +} + +/** + * Process a dmsetup message now that we know no other message is being + * processed. + * + * @param layer The layer to which the message was sent + * @param argc The argument count of the message + * @param argv The arguments to the message + * + * @return -EINVAL if the message is unrecognized or the result of processing + * the message + **/ +__attribute__((warn_unused_result)) +static int processVDOMessageLocked(KernelLayer *layer, + unsigned int argc, + char **argv) +{ + // Messages with variable numbers of arguments. 
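+  // Any message whose name starts with "x-" is forwarded, along with all
+  // of its arguments, to the extended command handler in the KVDO layer.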
+ if (strncasecmp(argv[0], "x-", 2) == 0) { + int result = performKVDOExtendedCommand(&layer->kvdo, argc, argv); + if (result == VDO_UNKNOWN_COMMAND) { + logWarning("unknown extended command '%s' to dmsetup message", argv[0]); + result = -EINVAL; + } + + return result; + } + + // Messages with fixed numbers of arguments. + switch (argc) { + case 1: + if (strcasecmp(argv[0], "sync-dedupe") == 0) { + waitForNoRequestsActive(layer); + return 0; + } + + if (strcasecmp(argv[0], "trace-on") == 0) { + logInfo("Tracing on"); + layer->traceLogging = true; + return 0; + } + + if (strcasecmp(argv[0], "trace-off") == 0) { + logInfo("Tracing off"); + layer->traceLogging = false; + return 0; + } + + if (strcasecmp(argv[0], "prepareToGrowPhysical") == 0) { + return prepareToResizePhysical(layer, + getUnderlyingDeviceBlockCount(layer)); + } + + if (strcasecmp(argv[0], "growPhysical") == 0) { + // The actual growPhysical will happen when the device is resumed. + + if (layer->deviceConfig->version != 0) { + // XXX Uncomment this branch when new VDO manager is updated to not + // send this message. + + // Old style message on new style table is unexpected; it means the + // user started the VDO with new manager and is growing with old. + // logInfo("Mismatch between growPhysical method and table version."); + // return -EINVAL; + } else { + layer->deviceConfig->physicalBlocks + = getUnderlyingDeviceBlockCount(layer); + } + return 0; + } + + break; + + case 2: + if (strcasecmp(argv[0], "compression") == 0) { + if (strcasecmp(argv[1], "on") == 0) { + setKVDOCompressing(&layer->kvdo, true); + return 0; + } + + if (strcasecmp(argv[1], "off") == 0) { + setKVDOCompressing(&layer->kvdo, false); + return 0; + } + + logWarning("invalid argument '%s' to dmsetup compression message", + argv[1]); + return -EINVAL; + } + + if (strcasecmp(argv[0], "prepareToGrowLogical") == 0) { + return vdoPrepareToGrowLogical(layer, argv[1]); + } + + break; + + + default: + break; + } + + logWarning("unrecognized dmsetup message '%s' received", argv[0]); + return -EINVAL; +} + +/** + * Process a dmsetup message. If the message is a dump, just do it. Otherwise, + * check that no other message is being processed, and only proceed if so. + * + * @param layer The layer to which the message was sent + * @param argc The argument count of the message + * @param argv The arguments to the message + * + * @return -EBUSY if another message is being processed or the result of + * processsing the message + **/ +__attribute__((warn_unused_result)) +static int processVDOMessage(KernelLayer *layer, + unsigned int argc, + char **argv) +{ + /* + * All messages which may be processed in parallel with other messages should + * be handled here before the atomic check below. Messages which should be + * exclusive should be processed in processVDOMessageLocked(). + */ + + // Dump messages should always be processed + if (strcasecmp(argv[0], "dump") == 0) { + return vdoDump(layer, argc, argv, "dmsetup message"); + } + + if (argc == 1) { + if (strcasecmp(argv[0], "dump-on-shutdown") == 0) { + layer->dumpOnShutdown = true; + return 0; + } + + // Index messages should always be processed + if ((strcasecmp(argv[0], "index-close") == 0) + || (strcasecmp(argv[0], "index-create") == 0) + || (strcasecmp(argv[0], "index-disable") == 0) + || (strcasecmp(argv[0], "index-enable") == 0)) { + return messageDedupeIndex(layer->dedupeIndex, argv[0]); + } + + // XXX - the "connect" messages are misnamed for the kernel index. 
These + // messages should go away when all callers have been fixed to use + // "index-enable" or "index-disable". + if (strcasecmp(argv[0], "reconnect") == 0) { + return messageDedupeIndex(layer->dedupeIndex, "index-enable"); + } + + if (strcasecmp(argv[0], "connect") == 0) { + return messageDedupeIndex(layer->dedupeIndex, "index-enable"); + } + + if (strcasecmp(argv[0], "disconnect") == 0) { + return messageDedupeIndex(layer->dedupeIndex, "index-disable"); + } + } + + if (!compareAndSwapBool(&layer->processingMessage, false, true)) { + return -EBUSY; + } + + int result = processVDOMessageLocked(layer, argc, argv); + atomicStoreBool(&layer->processingMessage, false); + return result; +} + +/**********************************************************************/ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,17,0) +static int vdoMessage(struct dm_target *ti, + unsigned int argc, + char **argv, + char *resultBuffer, + unsigned int maxlen) +#else +static int vdoMessage(struct dm_target *ti, unsigned int argc, char **argv) +#endif +{ + if (argc == 0) { + logWarning("unspecified dmsetup message"); + return -EINVAL; + } + + KernelLayer *layer = getKernelLayerForTarget(ti); + RegisteredThread allocatingThread, instanceThread; + registerAllocatingThread(&allocatingThread, NULL); + registerThreadDevice(&instanceThread, layer); + int result = processVDOMessage(layer, argc, argv); + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + return mapToSystemError(result); +} + +/** + * Configure the dm_target with our capabilities. + * + * @param ti The device mapper target representing our device + * @param layer The kernel layer to get the write policy from + **/ +static void configureTargetCapabilities(struct dm_target *ti, + KernelLayer *layer) +{ + ti->discards_supported = 1; + + /** + * This may appear to indicate we don't support flushes in sync mode. + * However, dm will set up the request queue to accept flushes if any + * device in the stack accepts flushes. Hence if the device under VDO + * accepts flushes, we will receive flushes. + **/ + ti->flush_supported = shouldProcessFlush(layer); + ti->num_discard_bios = 1; + ti->num_flush_bios = 1; + + // If this value changes, please make sure to update the + // value for maxDiscardSectors accordingly. + BUG_ON(dm_set_target_max_io_len(ti, VDO_SECTORS_PER_BLOCK) != 0); + +/* + * Please see comments above where the macro is defined. + */ +#if HAS_NO_BLKDEV_SPLIT + ti->split_discard_bios = 1; +#endif +} + +/** + * Handle a vdoInitialize failure, freeing all appropriate structures. + * + * @param ti The device mapper target representing our device + * @param threadConfig The thread config (possibly NULL) + * @param layer The kernel layer (possibly NULL) + * @param instance The instance number to be released + * @param why The reason for failure + **/ +static void cleanupInitialize(struct dm_target *ti, + ThreadConfig *threadConfig, + KernelLayer *layer, + unsigned int instance, + char *why) +{ + if (threadConfig != NULL) { + freeThreadConfig(&threadConfig); + } + if (layer != NULL) { + // This releases the instance number too. + freeKernelLayer(layer); + } else { + // With no KernelLayer taking ownership we have to release explicitly. 
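+    // (freeKernelLayer() cannot be used here because no layer was created;
+    // releasing the instance number directly avoids leaking it.)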
+ releaseKVDOInstance(instance); + } + + ti->error = why; +} + +/** + * Initializes a single VDO instance and loads the data from disk + * + * @param ti The device mapper target representing our device + * @param instance The device instantiation counter + * @param config The parsed config for the instance + * + * @return VDO_SUCCESS or an error code + * + **/ +static int vdoInitialize(struct dm_target *ti, + unsigned int instance, + DeviceConfig *config) +{ + logInfo("loading device '%s'", config->poolName); + + uint64_t blockSize = VDO_BLOCK_SIZE; + uint64_t logicalSize = to_bytes(ti->len); + BlockCount logicalBlocks = logicalSize / blockSize; + + logDebug("Logical block size = %llu", + (uint64_t) config->logicalBlockSize); + logDebug("Logical blocks = %llu", logicalBlocks); + logDebug("Physical block size = %llu", (uint64_t) blockSize); + logDebug("Physical blocks = %llu", config->physicalBlocks); + logDebug("Block map cache blocks = %u", config->cacheSize); + logDebug("Block map maximum age = %u", config->blockMapMaximumAge); + logDebug("MD RAID5 mode = %s", (config->mdRaid5ModeEnabled + ? "on" : "off")); + logDebug("Write policy = %s", getConfigWritePolicyString(config)); + logDebug("Deduplication = %s", (config->deduplication + ? "on" : "off")); + + // The threadConfig will be copied by the VDO if it's successfully + // created. + VDOLoadConfig loadConfig = { + .cacheSize = config->cacheSize, + .threadConfig = NULL, + .writePolicy = config->writePolicy, + .maximumAge = config->blockMapMaximumAge, + }; + + char *failureReason; + KernelLayer *layer; + int result = makeKernelLayer(ti->begin, instance, config, + &kvdoDevice.kobj, &loadConfig.threadConfig, + &failureReason, &layer); + if (result != VDO_SUCCESS) { + logError("Could not create kernel physical layer. (VDO error %d," + " message %s)", result, failureReason); + cleanupInitialize(ti, loadConfig.threadConfig, NULL, instance, + failureReason); + return result; + } + + // Now that we have read the geometry, we can finish setting up the + // VDOLoadConfig. + setLoadConfigFromGeometry(&layer->geometry, &loadConfig); + + if (config->cacheSize < (2 * MAXIMUM_USER_VIOS + * loadConfig.threadConfig->logicalZoneCount)) { + logWarning("Insufficient block map cache for logical zones"); + cleanupInitialize(ti, loadConfig.threadConfig, layer, instance, + "Insufficient block map cache for logical zones"); + return VDO_BAD_CONFIGURATION; + } + + // Henceforth it is the kernel layer's responsibility to clean up the + // ThreadConfig. + result = preloadKernelLayer(layer, &loadConfig, &failureReason); + if (result != VDO_SUCCESS) { + logError("Could not start kernel physical layer. 
(VDO error %d," + " message %s)", result, failureReason); + cleanupInitialize(ti, NULL, layer, instance, failureReason); + return result; + } + + setDeviceConfigLayer(config, layer); + setKernelLayerActiveConfig(layer, config); + ti->private = config; + configureTargetCapabilities(ti, layer); + return VDO_SUCCESS; +} + +/**********************************************************************/ +static int vdoCtr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int result = VDO_SUCCESS; + + RegisteredThread allocatingThread; + registerAllocatingThread(&allocatingThread, NULL); + + const char *deviceName = dm_device_name(dm_table_get_md(ti->table)); + KernelLayer *oldLayer = findLayerMatching(layerIsNamed, (void *)deviceName); + unsigned int instance; + if (oldLayer == NULL) { + result = allocateKVDOInstance(&instance); + if (result != VDO_SUCCESS) { + unregisterAllocatingThread(); + return -ENOMEM; + } + } else { + instance = oldLayer->instance; + } + + RegisteredThread instanceThread; + registerThreadDeviceID(&instanceThread, &instance); + + bool verbose = (oldLayer == NULL); + DeviceConfig *config = NULL; + result = parseDeviceConfig(argc, argv, ti, verbose, &config); + if (result != VDO_SUCCESS) { + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + if (oldLayer == NULL) { + releaseKVDOInstance(instance); + } + return -EINVAL; + } + + // Is there already a device of this name? + if (oldLayer != NULL) { + /* + * To preserve backward compatibility with old VDO Managers, we need to + * allow this to happen when either suspended or not. We could assert + * that if the config is version 0, we are suspended, and if not, we + * are not, but we can't do that till new VDO Manager does the right + * order. + */ + logInfo("preparing to modify device '%s'", config->poolName); + result = prepareToModifyKernelLayer(oldLayer, config, &ti->error); + if (result != VDO_SUCCESS) { + result = mapToSystemError(result); + freeDeviceConfig(&config); + } else { + setDeviceConfigLayer(config, oldLayer); + ti->private = config; + configureTargetCapabilities(ti, oldLayer); + } + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + return result; + } + + result = vdoInitialize(ti, instance, config); + if (result != VDO_SUCCESS) { + // vdoInitialize calls into various VDO routines, so map error + result = mapToSystemError(result); + freeDeviceConfig(&config); + } + + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + return result; +} + +/**********************************************************************/ +static void vdoDtr(struct dm_target *ti) +{ + DeviceConfig *config = ti->private; + KernelLayer *layer = config->layer; + + setDeviceConfigLayer(config, NULL); + + if (isRingEmpty(&layer->deviceConfigRing)) { + // This was the last config referencing the layer. Free it. + unsigned int instance = layer->instance; + RegisteredThread allocatingThread, instanceThread; + registerThreadDeviceID(&instanceThread, &instance); + registerAllocatingThread(&allocatingThread, NULL); + + waitForNoRequestsActive(layer); + logInfo("stopping device '%s'", config->poolName); + + if (layer->dumpOnShutdown) { + vdoDumpAll(layer, "device shutdown"); + } + + freeKernelLayer(layer); + logInfo("device '%s' stopped", config->poolName); + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + } else if (config == layer->deviceConfig) { + // The layer still references this config. Give it a reference to a + // config that isn't being destroyed. 
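+    // (isRingEmpty() returned false above, so the ring still holds at
+    // least one other config that can take over as the active one.)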
+ layer->deviceConfig = asDeviceConfig(layer->deviceConfigRing.next); + } + + freeDeviceConfig(&config); + ti->private = NULL; +} + +/**********************************************************************/ +static void vdoPresuspend(struct dm_target *ti) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + RegisteredThread instanceThread; + registerThreadDevice(&instanceThread, layer); + if (dm_noflush_suspending(ti)) { + layer->noFlushSuspend = true; + } + unregisterThreadDeviceID(); +} + +/**********************************************************************/ +static void vdoPostsuspend(struct dm_target *ti) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + RegisteredThread instanceThread; + registerThreadDevice(&instanceThread, layer); + const char *poolName = layer->deviceConfig->poolName; + logInfo("suspending device '%s'", poolName); + int result = suspendKernelLayer(layer); + if (result == VDO_SUCCESS) { + logInfo("device '%s' suspended", poolName); + } else { + logError("suspend of device '%s' failed with error: %d", poolName, result); + } + layer->noFlushSuspend = false; + unregisterThreadDeviceID(); +} + +/**********************************************************************/ +static int vdoPreresume(struct dm_target *ti) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + DeviceConfig *config = ti->private; + RegisteredThread instanceThread; + + BlockCount backingBlocks = getUnderlyingDeviceBlockCount(layer); + if (backingBlocks < config->physicalBlocks) { + logError("resume of device '%s' failed: backing device has %" PRIu64 + " blocks but VDO physical size is %llu blocks", + config->poolName, backingBlocks, config->physicalBlocks); + return -EINVAL; + } + + registerThreadDevice(&instanceThread, layer); + + if (getKernelLayerState(layer) == LAYER_STARTING) { + // This is the first time this device has been resumed, so run it. + logInfo("starting device '%s'", config->poolName); + char *failureReason; + int result = startKernelLayer(layer, &failureReason); + if (result != VDO_SUCCESS) { + logError("Could not run kernel physical layer. (VDO error %d," + " message %s)", result, failureReason); + setKVDOReadOnly(&layer->kvdo, result); + unregisterThreadDeviceID(); + return mapToSystemError(result); + } + + logInfo("device '%s' started", config->poolName); + } + + logInfo("resuming device '%s'", config->poolName); + + // This is a noop if nothing has changed, and by calling it every time + // we capture old-style growPhysicals, which change the config in place. 
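+  // (See the "growPhysical" dmsetup message handler: with a version 0
+  // table it updates config->physicalBlocks in place, and that change is
+  // only committed here, at resume time.)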
+ int result = modifyKernelLayer(layer, config); + if (result != VDO_SUCCESS) { + logErrorWithStringError(result, "Commit of modifications to device '%s'" + " failed", config->poolName); + setKernelLayerActiveConfig(layer, config); + setKVDOReadOnly(&layer->kvdo, result); + } else { + setKernelLayerActiveConfig(layer, config); + result = resumeKernelLayer(layer); + if (result != VDO_SUCCESS) { + logError("resume of device '%s' failed with error: %d", + layer->deviceConfig->poolName, result); + } + } + unregisterThreadDeviceID(); + return mapToSystemError(result); +} + +/**********************************************************************/ +static void vdoResume(struct dm_target *ti) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + RegisteredThread instanceThread; + registerThreadDevice(&instanceThread, layer); + logInfo("device '%s' resumed", layer->deviceConfig->poolName); + unregisterThreadDeviceID(); +} + +/* + * If anything changes that affects how user tools will interact + * with vdo, update the version number and make sure + * documentation about the change is complete so tools can + * properly update their management code. + */ +static struct target_type vdoTargetBio = { + .features = DM_TARGET_SINGLETON, + .name = "vdo", + .version = {6, 2, 3}, + .module = THIS_MODULE, + .ctr = vdoCtr, + .dtr = vdoDtr, + .io_hints = vdoIoHints, + .iterate_devices = vdoIterateDevices, + .map = vdoMapBio, + .message = vdoMessage, + .status = vdoStatus, + .presuspend = vdoPresuspend, + .postsuspend = vdoPostsuspend, + .preresume = vdoPreresume, + .resume = vdoResume, +}; + +static bool dmRegistered = false; +static bool sysfsInitialized = false; + +/**********************************************************************/ +static void vdoDestroy(void) +{ + logDebug("in %s", __func__); + + kvdoDevice.status = SHUTTING_DOWN; + + if (sysfsInitialized) { + vdoPutSysfs(&kvdoDevice.kobj); + } + vdoDestroyProcfs(); + + kvdoDevice.status = UNINITIALIZED; + + if (dmRegistered) { + dm_unregister_target(&vdoTargetBio); + } + + cleanUpInstanceNumberTracking(); + + logInfo("unloaded version %s", CURRENT_VERSION); +} + +/**********************************************************************/ +static int __init vdoInit(void) +{ + int result = 0; + + initializeThreadDeviceRegistry(); + initializeStandardErrorBlocks(); + initializeDeviceRegistryOnce(); + logInfo("loaded version %s", CURRENT_VERSION); + + result = dm_register_target(&vdoTargetBio); + if (result < 0) { + logError("dm_register_target failed %d", result); + vdoDestroy(); + return result; + } + dmRegistered = true; + + kvdoDevice.status = UNINITIALIZED; + + vdoInitProcfs(); + /* + * Set up global sysfs stuff + */ + result = vdoInitSysfs(&kvdoDevice.kobj); + if (result < 0) { + logError("sysfs initialization failed %d", result); + vdoDestroy(); + // vdoInitSysfs only returns system error codes + return result; + } + sysfsInitialized = true; + + initWorkQueueOnce(); + initializeTraceLoggingOnce(); + initKernelVDOOnce(); + initializeInstanceNumberTracking(); + + kvdoDevice.status = READY; + return result; +} + +/**********************************************************************/ +static void __exit vdoExit(void) +{ + vdoDestroy(); +} + +module_init(vdoInit); +module_exit(vdoExit); + +MODULE_DESCRIPTION(DM_NAME " target for transparent deduplication"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); +MODULE_VERSION(CURRENT_VERSION); diff --git a/vdo/kernel/dmvdo.h b/vdo/kernel/dmvdo.h new file mode 100644 index 0000000..a71e39d --- 
/dev/null +++ b/vdo/kernel/dmvdo.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dmvdo.h#2 $ + */ + +#ifndef DMVDO_H +#define DMVDO_H + +#include +#include +#include + +#include "kernelLayer.h" + +typedef enum { + UNINITIALIZED = 0, + READY, + SHUTTING_DOWN, +} KVDOStatus; + +/* + * The internal representation of our device. + */ +struct kvdoDevice { + KVDOStatus status; + struct kobject kobj; +}; + +extern struct kvdoDevice kvdoDevice; + +#endif /* DMVDO_H */ diff --git a/vdo/kernel/dump.c b/vdo/kernel/dump.c new file mode 100644 index 0000000..b9b02e2 --- /dev/null +++ b/vdo/kernel/dump.c @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dump.c#2 $ + */ + +#include "dump.h" + +#include + +#include "memoryAlloc.h" +#include "typeDefs.h" + +#include "constants.h" +#include "vdo.h" + +#include "dedupeIndex.h" +#include "histogram.h" +#include "ioSubmitter.h" +#include "logger.h" + +enum dumpOptions { + // WorkQueues + SHOW_ALBIREO_QUEUE, + SHOW_BIO_ACK_QUEUE, + SHOW_BIO_QUEUE, + SHOW_CPU_QUEUES, + SHOW_REQUEST_QUEUE, + // MemoryPools + SHOW_VIO_POOL, + // Others + SHOW_VDO_STATUS, + // This one means an option overrides the "default" choices, instead + // of altering them. 
+ SKIP_DEFAULT +}; + +enum dumpOptionFlags { + // WorkQueues + FLAG_SHOW_ALBIREO_QUEUE = (1 << SHOW_ALBIREO_QUEUE), + FLAG_SHOW_BIO_ACK_QUEUE = (1 << SHOW_BIO_ACK_QUEUE), + FLAG_SHOW_BIO_QUEUE = (1 << SHOW_BIO_QUEUE), + FLAG_SHOW_CPU_QUEUES = (1 << SHOW_CPU_QUEUES), + FLAG_SHOW_REQUEST_QUEUE = (1 << SHOW_REQUEST_QUEUE), + // MemoryPools + FLAG_SHOW_VIO_POOL = (1 << SHOW_VIO_POOL), + // Others + FLAG_SHOW_VDO_STATUS = (1 << SHOW_VDO_STATUS), + // Special + FLAG_SKIP_DEFAULT = (1 << SKIP_DEFAULT) + }; + +enum { + FLAGS_ALL_POOLS = (FLAG_SHOW_VIO_POOL), + FLAGS_ALL_QUEUES = (FLAG_SHOW_REQUEST_QUEUE + | FLAG_SHOW_ALBIREO_QUEUE + | FLAG_SHOW_BIO_ACK_QUEUE + | FLAG_SHOW_BIO_QUEUE + | FLAG_SHOW_CPU_QUEUES), + FLAGS_ALL_THREADS = (FLAGS_ALL_QUEUES), + DEFAULT_DUMP_FLAGS = (FLAGS_ALL_THREADS | FLAG_SHOW_VDO_STATUS) +}; + +/**********************************************************************/ +static inline bool isArgString(const char *arg, const char *thisOption) +{ + // device-mapper convention seems to be case-independent options + return strncasecmp(arg, thisOption, strlen(thisOption)) == 0; +} + +/**********************************************************************/ +static void doDump(KernelLayer *layer, + unsigned int dumpOptionsRequested, + const char *why) +{ + logInfo("%s dump triggered via %s", THIS_MODULE->name, why); + // XXX Add in number of outstanding requests being processed by vdo + uint32_t active, maximum; + getLimiterValuesAtomically(&layer->requestLimiter, &active, &maximum); + int64_t outstanding = atomic64_read(&layer->biosSubmitted) + - atomic64_read(&layer->biosCompleted); + logInfo("%" PRIu32 " device requests outstanding (max %" PRIu32 "), " + "%" PRId64 " bio requests outstanding, poolName '%s'", + active, maximum, outstanding, layer->deviceConfig->poolName); + if ((dumpOptionsRequested & FLAG_SHOW_REQUEST_QUEUE) != 0) { + dumpKVDOWorkQueue(&layer->kvdo); + } + if ((dumpOptionsRequested & FLAG_SHOW_BIO_QUEUE) != 0) { + dumpBioWorkQueue(layer->ioSubmitter); + } + if (useBioAckQueue(layer) + && ((dumpOptionsRequested & FLAG_SHOW_BIO_ACK_QUEUE) != 0)) { + dumpWorkQueue(layer->bioAckQueue); + } + if ((dumpOptionsRequested & FLAG_SHOW_CPU_QUEUES) != 0) { + dumpWorkQueue(layer->cpuQueue); + } + dumpDedupeIndex(layer->dedupeIndex, + (dumpOptionsRequested & FLAG_SHOW_ALBIREO_QUEUE) != 0); + dumpBufferPool(layer->dataKVIOPool, + (dumpOptionsRequested & FLAG_SHOW_VIO_POOL) != 0); + if ((dumpOptionsRequested & FLAG_SHOW_VDO_STATUS) != 0) { + // Options should become more fine-grained when we have more to + // display here. + dumpKVDOStatus(&layer->kvdo); + } + reportMemoryUsage(); + logInfo("end of %s dump", THIS_MODULE->name); +} + +/**********************************************************************/ +static int parseDumpOptions(unsigned int argc, + char * const *argv, + unsigned int *dumpOptionsRequestedPtr) +{ + unsigned int dumpOptionsRequested = 0; + + static const struct { + const char *name; + unsigned int flags; + } optionNames[] = { + // Should "albireo" mean sending queue + receiving thread + outstanding? 
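+    // (isArgString() matches by prefix, so longer names such as "bioack"
+    // must be listed before names that are their prefixes, like "bio".)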
+ { "dedupe", FLAG_SKIP_DEFAULT | FLAG_SHOW_ALBIREO_QUEUE }, + { "dedupeq", FLAG_SKIP_DEFAULT | FLAG_SHOW_ALBIREO_QUEUE }, + { "kvdodedupeq", FLAG_SKIP_DEFAULT | FLAG_SHOW_ALBIREO_QUEUE }, + { "bioack", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_ACK_QUEUE }, + { "kvdobioackq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_ACK_QUEUE }, + { "bioackq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_ACK_QUEUE }, + { "bio", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_QUEUE }, + { "kvdobioq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_QUEUE }, + { "bioq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_QUEUE }, + { "cpu", FLAG_SKIP_DEFAULT | FLAG_SHOW_CPU_QUEUES }, + { "kvdocpuq", FLAG_SKIP_DEFAULT | FLAG_SHOW_CPU_QUEUES }, + { "cpuq", FLAG_SKIP_DEFAULT | FLAG_SHOW_CPU_QUEUES }, + { "request", FLAG_SKIP_DEFAULT | FLAG_SHOW_REQUEST_QUEUE }, + { "kvdoreqq", FLAG_SKIP_DEFAULT | FLAG_SHOW_REQUEST_QUEUE }, + { "reqq", FLAG_SKIP_DEFAULT | FLAG_SHOW_REQUEST_QUEUE }, + { "viopool", FLAG_SKIP_DEFAULT | FLAG_SHOW_VIO_POOL }, + { "vdo", FLAG_SKIP_DEFAULT | FLAG_SHOW_VDO_STATUS }, + + { "pools", FLAG_SKIP_DEFAULT | FLAGS_ALL_POOLS }, + { "queues", FLAG_SKIP_DEFAULT | FLAGS_ALL_QUEUES }, + { "threads", FLAG_SKIP_DEFAULT | FLAGS_ALL_THREADS }, + { "default", FLAG_SKIP_DEFAULT | DEFAULT_DUMP_FLAGS }, + { "all", ~0 }, + }; + + bool optionsOkay = true; + for (int i = 1; i < argc; i++) { + int j; + for (j = 0; j < COUNT_OF(optionNames); j++) { + if (isArgString(argv[i], optionNames[j].name)) { + dumpOptionsRequested |= optionNames[j].flags; + break; + } + } + if (j == COUNT_OF(optionNames)) { + logWarning("dump option name '%s' unknown", argv[i]); + optionsOkay = false; + } + } + if (!optionsOkay) { + return -EINVAL; + } + if ((dumpOptionsRequested & FLAG_SKIP_DEFAULT) == 0) { + dumpOptionsRequested |= DEFAULT_DUMP_FLAGS; + } + *dumpOptionsRequestedPtr = dumpOptionsRequested; + return 0; +} + +/**********************************************************************/ +int vdoDump(KernelLayer *layer, + unsigned int argc, + char * const *argv, + const char *why) +{ + unsigned int dumpOptionsRequested = 0; + int result = parseDumpOptions(argc, argv, &dumpOptionsRequested); + if (result != 0) { + return result; + } + doDump(layer, dumpOptionsRequested, why); + return 0; +} + +/**********************************************************************/ +void vdoDumpAll(KernelLayer *layer, const char *why) +{ + doDump(layer, ~0, why); +} diff --git a/vdo/kernel/dump.h b/vdo/kernel/dump.h new file mode 100644 index 0000000..5187d4f --- /dev/null +++ b/vdo/kernel/dump.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dump.h#1 $ + */ + +#ifndef DUMP_H +#define DUMP_H + +#include "kernelLayer.h" + +/** + * Dump internal state and/or statistics to the kernel log, as + * specified by zero or more string arguments. + * + * @param layer The kernel layer + * @param argc Number of arguments + * @param argv The argument list + * @param why Reason for doing the dump + **/ +int vdoDump(KernelLayer *layer, + unsigned int argc, + char * const *argv, + const char *why); + +/** + * Dump lots of internal state and statistics to the kernel log. + * Identical to "dump all", without each caller needing to set up the + * argument list. + * + * @param layer The kernel layer + * @param why Reason for doing the dump + **/ +void vdoDumpAll(KernelLayer *layer, const char *why); + +#endif // DUMP_H diff --git a/vdo/kernel/errors.c b/vdo/kernel/errors.c new file mode 100644 index 0000000..dc9303e --- /dev/null +++ b/vdo/kernel/errors.c @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/errors.c#2 $ + */ + +#include "errors.h" + +#include +#include +#include + +#include "permassert.h" +#include "statusCodes.h" + +static const struct errorInfo errorList[] = { + { "UDS_UNINITIALIZED", "UDS library is not initialized" }, + { "UDS_SHUTTINGDOWN", "UDS library is shutting down" }, + { "UDS_EMODULE_LOAD", "Could not load modules" }, + { "UDS_ENOTHREADS", "Could not create a new thread" }, + { "UDS_NOCONTEXT", "Could not find the requested library context" }, + { "UDS_DISABLED", "UDS library context is disabled" }, + { "UDS_CORRUPT_FILE", "Corrupt file" }, + { "UDS_UNKNOWN_ERROR", "Unknown error" }, + { "UDS_GRID_NO_SERVERS", "No servers in grid configuration" }, + { "UDS_GRID_CONFIG_INCONSISTENT", "Grid configuration inconsistent" }, + { "UDS_UNSUPPORTED_VERSION", "Unsupported version" }, + { "UDS_NO_INDEXSESSION", "Index session not known" }, + { "UDS_CORRUPT_DATA", "Index data in memory is corrupt" }, + { "UDS_SHORT_READ", "Could not read requested number of bytes" }, + { "UDS_AI_ERROR", "Network address and service translation error" }, + { "UDS_RESOURCE_LIMIT_EXCEEDED", "Internal resource limits exceeded" }, + { "UDS_WRONG_CONTEXT_TYPE", "Context type mismatch" }, + { "UDS_BLOCK_ADDRESS_REQUIRED", "A block address is required" }, + { "UDS_CHUNK_DATA_REQUIRED", "Block data is required" }, + { "UDS_CHUNK_NAME_REQUIRED", "A chunk name is required" }, + { "UDS_CONF_PTR_REQUIRED", "A configuration pointer is required" }, + { "UDS_INDEX_STATS_PTR_REQUIRED", "An index stats pointer is required" }, + { "UDS_CONTEXT_STATS_PTR_REQUIRED", "A context stats pointer is required" }, + { "UDS_CONTEXT_PTR_REQUIRED", "A context pointer is required" }, + { "UDS_FILEID_REQUIRED", "A file ID is required" 
}, + { "UDS_STREAM_REQUIRED", "A stream is required" }, + { "UDS_STREAMID_REQUIRED", "A stream ID is required" }, + { "UDS_STREAM_PTR_REQUIRED", "A stream pointer is required" }, + { "UDS_INVALID_MEMORY_SIZE", + "Configured memory too small or unsupported size" }, + { "UDS_INVALID_METADATA_SIZE", "Invalid metadata size" }, + { "UDS_INDEX_NAME_REQUIRED", "An index name is required" }, + { "UDS_CONF_REQUIRED", "A configuration is required" }, + { "UDS_BAD_FILE_DESCRIPTOR", "Bad file descriptor" }, + { "UDS_INDEX_EXISTS", "Index already exists" }, + { "UDS_REQUESTS_OUT_OF_RANGE", "Maximum request value out of range" }, + { "UDS_BAD_NAMESPACE", "Bad namespace" }, + { "UDS_MIGRATOR_MISMATCH", + "Migrator arguments do not match reader arguments" }, + { "UDS_NO_INDEX", "No index found" }, + { "UDS_BAD_CHECKPOINT_FREQUENCY", "Checkpoint frequency out of range" }, + { "UDS_WRONG_INDEX_CONFIG", "Wrong type of index configuration" }, + { "UDS_INDEX_PATH_NOT_DIR", "Index path does not point to a directory" }, + { "UDS_ALREADY_OPEN", "Open invoked on already opened connection" }, + { "UDS_CALLBACK_ALREADY_REGISTERED", "Callback already registered" }, + { "UDS_INDEX_PATH_TOO_LONG", "Index path too long" }, + { "UDS_END_OF_FILE", "Unexpected end of file" }, + { "UDS_INDEX_NOT_SAVED_CLEANLY", "Index not saved cleanly" }, +}; + +static const struct errorInfo internalErrorList[] = { + { "UDS_PROTOCOL_ERROR", "Client/server protocol error" }, + { "UDS_OVERFLOW", "Index overflow" }, + { "UDS_FILLDONE", "Fill phase done" }, + { "UDS_INVALID_ARGUMENT", "Invalid argument passed to internal routine" }, + { "UDS_BAD_STATE", "UDS data structures are in an invalid state" }, + { "UDS_DUPLICATE_NAME", + "Attempt to enter the same name into a delta index twice" }, + { "UDS_UNEXPECTED_RESULT", "Unexpected result from internal routine" }, + { "UDS_INJECTED_ERROR", "Injected error" }, + { "UDS_ASSERTION_FAILED", "Assertion failed" }, + { "UDS_UNSCANNABLE", "Unscannable" }, + { "UDS_QUEUED", "Request queued" }, + { "UDS_QUEUE_ALREADY_CONNECTED", "Queue already connected" }, + { "UDS_BAD_FILL_PHASE", "Fill phase not supported" }, + { "UDS_BUFFER_ERROR", "Buffer error" }, + { "UDS_CONNECTION_LOST", "Lost connection to peer" }, + { "UDS_TIMEOUT", "A time out has occurred" }, + { "UDS_NO_DIRECTORY", "Expected directory is missing" }, + { "UDS_CHECKPOINT_INCOMPLETE", "Checkpoint not completed" }, + { "UDS_INVALID_RUN_ID", "Invalid albGenTest server run ID" }, + { "UDS_RUN_CANCELED", "albGenTest server run canceled" }, + { "UDS_ALREADY_REGISTERED", "error range already registered" }, +}; + +/** Error attributes - or into top half of error code */ +enum { + UDS_UNRECOVERABLE = (1 << 17) +}; + +typedef struct errorBlock { + const char *name; + int base; + int last; + int max; + const ErrorInfo *infos; +} ErrorBlock; + +enum { + MAX_ERROR_BLOCKS = 6 // needed for testing +}; + +static struct errorInformation { + int allocated; + int count; + ErrorBlock blocks[MAX_ERROR_BLOCKS]; +} registeredErrors; + +/**********************************************************************/ +void initializeStandardErrorBlocks(void) +{ + registeredErrors.allocated = MAX_ERROR_BLOCKS; + registeredErrors.count = 0; + + + registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { + .name = "UDS Error", + .base = UDS_ERROR_CODE_BASE, + .last = UDS_ERROR_CODE_LAST, + .max = UDS_ERROR_CODE_BLOCK_END, + .infos = errorList, + }; + + registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { + .name = "UDS Internal Error", + .base = 
UDS_INTERNAL_ERROR_CODE_BASE, + .last = UDS_INTERNAL_ERROR_CODE_LAST, + .max = UDS_INTERNAL_ERROR_CODE_BLOCK_END, + .infos = internalErrorList, + }; + + registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { + .name = THIS_MODULE->name, + .base = VDO_BLOCK_START, + .last = VDO_STATUS_CODE_LAST, + .max = VDO_BLOCK_END, + .infos = vdoStatusList, + }; +} + +/** + * Fetch the error info (if any) for the error number. + * + * @param errnum the error number + * @param infoPtr the place to store the info for this error (if known), + * otherwise set to NULL + * + * @return the name of the error block (if known), NULL otherwise + **/ +static const char *getErrorInfo(int errnum, const ErrorInfo **infoPtr) +{ + for (ErrorBlock *block = registeredErrors.blocks; + block < registeredErrors.blocks + registeredErrors.count; + ++block) { + if ((errnum >= block->base) && (errnum < block->last)) { + if (infoPtr != NULL) { + *infoPtr = block->infos + (errnum - block->base); + } + return block->name; + } else if ((errnum >= block->last) && (errnum < block->max)) { + if (infoPtr != NULL) { + *infoPtr = NULL; + } + return block->name; + } + } + if (infoPtr != NULL) { + *infoPtr = NULL; + } + return NULL; +} + +/*****************************************************************************/ +const char *stringError(int errnum, char *buf, size_t buflen) +{ + if (buf == NULL) { + return NULL; + } + + const ErrorInfo *info = NULL; + const char *blockName = getErrorInfo(errnum, &info); + + if (blockName != NULL) { + if (info != NULL) { + snprintf(buf, buflen, "%s: %s", blockName, info->message); + } else { + snprintf(buf, buflen, "Unknown %s %d", blockName, errnum); + } + } else { + snprintf(buf, buflen, "System error %d", errnum); + } + return buf; +} + +/*****************************************************************************/ +const char *stringErrorName(int errnum, char *buf, size_t buflen) +{ + const ErrorInfo *info = NULL; + const char *blockName = getErrorInfo(errnum, &info); + + if (blockName != NULL) { + if (info != NULL) { + snprintf(buf, buflen, "%s: %s", blockName, info->name); + } else { + snprintf(buf, buflen, "Unknown %s %d", blockName, errnum); + } + } else { + snprintf(buf, buflen, "System error %d", errnum); + } + return buf; +} + +/*****************************************************************************/ +int makeUnrecoverable(int resultCode) +{ + return ((resultCode == UDS_SUCCESS) + ? 
resultCode + : (resultCode | UDS_UNRECOVERABLE)); +} + +/*****************************************************************************/ +int sansUnrecoverable(int resultCode) +{ + return resultCode & ~UDS_UNRECOVERABLE; +} + +/*****************************************************************************/ +bool isUnrecoverable(int resultCode) +{ + return (bool)(resultCode & UDS_UNRECOVERABLE); +} + +/*****************************************************************************/ +int registerErrorBlock(const char *blockName, + int firstError, + int lastReservedError, + const ErrorInfo *infos, + size_t infoSize) +{ + int result = ASSERT(firstError < lastReservedError, + "bad error block range"); + if (result != UDS_SUCCESS) { + return result; + } + + if (registeredErrors.count == registeredErrors.allocated) { + // could reallocate and grow, but should never happen + return UDS_OVERFLOW; + } + + for (ErrorBlock *block = registeredErrors.blocks; + block < registeredErrors.blocks + registeredErrors.count; + ++block) { + if (strcmp(blockName, block->name) == 0) { + return UDS_DUPLICATE_NAME; + } + // check for overlap in error ranges + if ((firstError < block->max) && (lastReservedError > block->base)) { + return UDS_ALREADY_REGISTERED; + } + } + + registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { + .name = blockName, + .base = firstError, + .last = firstError + (infoSize / sizeof(ErrorInfo)), + .max = lastReservedError, + .infos = infos + }; + + return UDS_SUCCESS; +} diff --git a/vdo/kernel/errors.h b/vdo/kernel/errors.h new file mode 100644 index 0000000..acfb777 --- /dev/null +++ b/vdo/kernel/errors.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/errors.h#1 $ + */ + +#ifndef ERRORS_H +#define ERRORS_H + +#include +#include "uds-error.h" + +enum udsInternalErrorCodes { + /** Used as a base value for reporting internal errors */ + UDS_INTERNAL_ERROR_CODE_BASE = 66560, + /** Client/server protocol framing error */ + UDS_PROTOCOL_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 0, + /** Index overflow */ + UDS_OVERFLOW = UDS_INTERNAL_ERROR_CODE_BASE + 1, + /** Fill phase done (intended for albfill only) */ + UDS_FILLDONE = UDS_INTERNAL_ERROR_CODE_BASE + 2, + /** Invalid argument passed to internal routine */ + UDS_INVALID_ARGUMENT = UDS_INTERNAL_ERROR_CODE_BASE + 3, + /** UDS data structures are in an invalid state */ + UDS_BAD_STATE = UDS_INTERNAL_ERROR_CODE_BASE + 4, + /** Attempt to enter the same name into an internal structure twice */ + UDS_DUPLICATE_NAME = UDS_INTERNAL_ERROR_CODE_BASE + 5, + /** An internal protocol violation between system components */ + UDS_UNEXPECTED_RESULT = UDS_INTERNAL_ERROR_CODE_BASE + 6, + /** An error created by test case processing */ + UDS_INJECTED_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 7, + /** An assertion failed */ + UDS_ASSERTION_FAILED = UDS_INTERNAL_ERROR_CODE_BASE + 8, + /** A file or stream is not scannable with the current scanner */ + UDS_UNSCANNABLE = UDS_INTERNAL_ERROR_CODE_BASE + 9, + /** Not an actual error, but reporting that the result will be delayed */ + UDS_QUEUED = UDS_INTERNAL_ERROR_CODE_BASE + 10, + /** Queue already connected */ + UDS_QUEUE_ALREADY_CONNECTED = UDS_INTERNAL_ERROR_CODE_BASE + 11, + /** Fill phase not supported */ + UDS_BAD_FILL_PHASE = UDS_INTERNAL_ERROR_CODE_BASE + 12, + /** A problem has occurred with a Buffer */ + UDS_BUFFER_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 13, + /** A network connection was lost */ + UDS_CONNECTION_LOST = UDS_INTERNAL_ERROR_CODE_BASE + 14, + /** A time out has occurred */ + UDS_TIMEOUT = UDS_INTERNAL_ERROR_CODE_BASE + 15, + /** No directory was found where one was expected */ + UDS_NO_DIRECTORY = UDS_INTERNAL_ERROR_CODE_BASE + 16, + /** Checkpoint not completed */ + UDS_CHECKPOINT_INCOMPLETE = UDS_INTERNAL_ERROR_CODE_BASE + 17, + /** Invalid albGenTest server run ID */ + UDS_INVALID_RUN_ID = UDS_INTERNAL_ERROR_CODE_BASE + 18, + /** albGenTest server run canceled */ + UDS_RUN_CANCELED = UDS_INTERNAL_ERROR_CODE_BASE + 19, + /** this error range has already been registered */ + UDS_ALREADY_REGISTERED = UDS_INTERNAL_ERROR_CODE_BASE + 20, + /** One more than the last UDS_INTERNAL error code */ + UDS_INTERNAL_ERROR_CODE_LAST, + /** One more than the last error this block will ever use */ + UDS_INTERNAL_ERROR_CODE_BLOCK_END = UDS_INTERNAL_ERROR_CODE_BASE + 440 +}; + +enum { + ERRBUF_SIZE = 128 // default size for buffer passed to stringError +}; + +const char *stringError(int errnum, char *buf, size_t buflen); +const char *stringErrorName(int errnum, char *buf, size_t buflen); + +int makeUnrecoverable(int resultCode) __attribute__((warn_unused_result)); +bool isUnrecoverable(int resultCode) __attribute__((warn_unused_result)); +int sansUnrecoverable(int resultCode) __attribute__((warn_unused_result)); + +typedef struct errorInfo { + const char *name; + const char *message; +} ErrorInfo; + +/** + * Initialize UDS error code blocks. + * + * @note Must be called once, before any of the other routines in this + * file. + **/ +void initializeStandardErrorBlocks(void); + +/** + * Register an error code block for stringError and stringErrorName. 
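+ *
+ * A hypothetical registration (all names here are invented for
+ * illustration) might look like:
+ *
+ *   registerErrorBlock("Example Error", EXAMPLE_ERROR_BASE,
+ *                      EXAMPLE_ERROR_BASE + 100, exampleErrorInfos,
+ *                      sizeof(exampleErrorInfos));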
+ * + * @param blockName the name of the block of error codes + * @param firstError the first error code in the block + * @param lastReservedError one past the highest possible error in the block + * @param infos a pointer to the error info array for the block + * @param infoSize the size of the error info array, which determines + * the last actual error for which information is + * available + * + * @return a success or error code, particularly UDS_DUPLICATE_NAME if the + * block name is already present, or UDS_ALREADY_REGISTERED if a + * block with the specified error code is present + **/ +int registerErrorBlock(const char *blockName, + int firstError, + int lastReservedError, + const ErrorInfo *infos, + size_t infoSize); + +#endif /* ERRORS_H */ diff --git a/vdo/kernel/histogram.c b/vdo/kernel/histogram.c new file mode 100644 index 0000000..0e1a6ae --- /dev/null +++ b/vdo/kernel/histogram.c @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/histogram.c#2 $ + */ + +#include + +#include "memoryAlloc.h" +#include "typeDefs.h" + +#include "histogram.h" +#include "logger.h" +#include "numUtils.h" + +/* + * Set NO_BUCKETS to streamline the histogram code by reducing it to + * tracking just minimum, maximum, mean, etc. Only one bucket counter + * (the final one for "bigger" values) will be used, no range checking + * is needed to find the right bucket, and no histogram will be + * reported. With newer compilers, the histogram output code will be + * optimized out. + */ +enum { + NO_BUCKETS = 1 +}; + +/* + * Support histogramming in the VDO code. + * + * This is not a complete and general histogram package. It follows the XP + * practice of implementing the "customer" requirements, and no more. We can + * support other requirements after we know what they are. + * + * The code was originally borrowed from Albireo, and includes both linear and + * logarithmic histograms. VDO only uses the logarithmic histograms. + * + * All samples are uint64_t values. + * + * A unit conversion option is supported internally to allow sample values to + * be supplied in "jiffies" and results to be reported via /sys in + * milliseconds. Depending on the system configuration, this could mean a + * factor of four (a bucket for values of 1 jiffy is reported as 4-7 + * milliseconds). In theory it could be a non-integer ratio (including less + * than one), but as the x86-64 platforms we've encountered appear to use 1 or + * 4 milliseconds per jiffy, we don't support non-integer values yet. + * + * All internal processing uses the values as passed to enterHistogramSample. + * Conversions only affect the values seen or input through the /sys interface, + * including possibly rounding a "limit" value entered. 
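+ *
+ * As a worked example (assuming 4 milliseconds per jiffy): a sample of 3
+ * jiffies is counted internally in bucket 3, whose bounds are reported
+ * through /sys as 12 - 15 milliseconds.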
+ */ + +struct histogram { + // These fields are ordered so that enterHistogramSample touches + // only the first cache line. + atomic64_t *counters; // Counter for each bucket + uint64_t limit; // We want to know how many samples are larger + atomic64_t sum; // Sum of all the samples + atomic64_t count; // Number of samples + atomic64_t minimum; // Minimum value + atomic64_t maximum; // Maximum value + atomic64_t unacceptable; // Number of samples that exceed the limit + int numBuckets; // The number of buckets + bool logFlag; // True if the y scale should be logarithmic + // These fields are used only when reporting results. + const char *label; // Histogram label + const char *countedItems; // Name for things being counted + const char *metric; // Term for value used to divide into buckets + const char *sampleUnits; // Unit for measuring metric; NULL for count + unsigned int conversionFactor; // Converts input units to reporting units + struct kobject kobj; +}; + +/* + * Fixed table defining the top value for each bucket of a logarithmic + * histogram. We arbitrarily limit the histogram to 12 orders of magnitude. + */ +enum { MAX_LOG_SIZE = 12 }; +static const uint64_t bottomValue[1 + 10 * MAX_LOG_SIZE] = { + // 0 to 10 - The first 10 buckets are linear + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + // 10 to 100 - From this point on, the Nth entry of the table is + // floor(exp10((double)N/10.0)). + 12, 15, 19, 25, 31, 39, 50, 63, 79, 100, + // 100 to 1K + 125, 158, 199, 251, 316, 398, 501, 630, 794, 1000, + // 1K to 10K + 1258, 1584, 1995, 2511, 3162, 3981, 5011, 6309, 7943, 10000, + // 10K to 100K + 12589, 15848, 19952, 25118, 31622, 39810, 50118, 63095, 79432, 100000, + // 100K to 1M + 125892, 158489, 199526, 251188, 316227, + 398107, 501187, 630957, 794328, 1000000, + // 1M to 10M + 1258925, 1584893, 1995262, 2511886, 3162277, + 3981071, 5011872, 6309573, 7943282, 10000000, + // 10M to 100M + 12589254, 15848931, 19952623, 25118864, 31622776, + 39810717, 50118723, 63095734, 79432823, 100000000, + // 100M to 1G + 125892541, 158489319, 199526231, 251188643, 316227766, + 398107170, 501187233, 630957344, 794328234, 1000000000, + // 1G to 10G + 1258925411L, 1584893192L, 1995262314L, 2511886431L, 3162277660L, + 3981071705L, 5011872336L, 6309573444L, 7943282347L, 10000000000L, + // 10G to 100G + 12589254117L, 15848931924L, 19952623149L, 25118864315L, 31622776601L, + 39810717055L, 50118723362L, 63095734448L, 79432823472L, 100000000000L, + // 100G to 1T + 125892541179L, 158489319246L, 199526231496L, 251188643150L, 316227766016L, + 398107170553L, 501187233627L, 630957344480L, 794328234724L, 1000000000000L, +}; + +/***********************************************************************/ +static unsigned int divideRoundingToNearest(uint64_t number, uint64_t divisor) +{ + number += divisor / 2; + return number / divisor; +} + +/***********************************************************************/ +static int maxBucket(Histogram *h) +{ + int max = h->numBuckets; + while ((max >= 0) && (atomic64_read(&h->counters[max]) == 0)) { + max--; + } + // max == -1 means that there were no samples + return max; +} + +/***********************************************************************/ + +typedef struct { + struct attribute attr; + ssize_t (*show)(Histogram *h, char *buf); + ssize_t (*store)(Histogram *h, const char *buf, size_t length); +} HistogramAttribute; + +/***********************************************************************/ +static void histogramKobjRelease(struct kobject *kobj) +{ + Histogram *h = 
container_of(kobj, Histogram, kobj); + FREE(h->counters); + FREE(h); +} + +/***********************************************************************/ +static ssize_t histogramShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + HistogramAttribute *ha = container_of(attr, HistogramAttribute, attr); + if (ha->show == NULL) { + return -EINVAL; + } + Histogram *h = container_of(kobj, Histogram, kobj); + return ha->show(h, buf); +} + +/***********************************************************************/ +static ssize_t histogramStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + HistogramAttribute *ha = container_of(attr, HistogramAttribute, attr); + if (ha->show == NULL) { + return -EINVAL; + } + Histogram *h = container_of(kobj, Histogram, kobj); + return ha->store(h, buf, length); +} + +/***********************************************************************/ +static ssize_t histogramShowCount(Histogram *h, char *buf) +{ + int64_t count = atomic64_read(&h->count); + return sprintf(buf, "%" PRId64 "\n", count); +} + +/***********************************************************************/ +static ssize_t histogramShowHistogram(Histogram *h, char *buffer) +{ + /* + * We're given one page in which to write. The caller logs a complaint if we + * report that we've written too much, so we'll truncate to PAGE_SIZE-1. + */ + size_t bufferSize = PAGE_SIZE; + bool bars = true; + ssize_t length = 0; + int max = maxBucket(h); + // If max is -1, we'll fall through to reporting the total of zero. + + enum { BAR_SIZE = 50 }; + char bar[BAR_SIZE + 2]; + bar[0] = ' '; + memset(bar + 1, '=', BAR_SIZE); + bar[BAR_SIZE + 1] = '\0'; + + uint64_t total = 0; + for (int i = 0; i <= max; i++) { + total += atomic64_read(&h->counters[i]); + } + + length += snprintf(buffer, bufferSize, "%s Histogram - number of %s by %s", + h->label, h->countedItems, h->metric); + if (length >= (bufferSize - 1)) { + return bufferSize - 1; + } + if (h->sampleUnits != NULL) { + length += snprintf(buffer + length, bufferSize - length, " (%s)", + h->sampleUnits); + if (length >= (bufferSize - 1)) { + return bufferSize - 1; + } + } + length += snprintf(buffer + length, bufferSize - length, "\n"); + if (length >= (bufferSize - 1)) { + return bufferSize - 1; + } + for (int i = 0; i <= max; i++) { + uint64_t value = atomic64_read(&h->counters[i]); + + unsigned int barLength; + if (bars && (total != 0)) { + // +1 for the space at the beginning + barLength = (divideRoundingToNearest(value * BAR_SIZE, total) + 1); + if (barLength == 1) { + // Don't bother printing just the initial space. 
+ barLength = 0; + } + } else { + // 0 means skip the space and the bar + barLength = 0; + } + + if (h->logFlag) { + if (i == h->numBuckets) { + length += snprintf(buffer + length, bufferSize - length, "%-16s", + "Bigger"); + } else { + unsigned int lower = h->conversionFactor * bottomValue[i]; + unsigned int upper = h->conversionFactor * bottomValue[i + 1] - 1; + length += snprintf(buffer + length, bufferSize - length, "%6u - %7u", + lower, upper); + } + } else { + if (i == h->numBuckets) { + length += snprintf(buffer + length, bufferSize - length, "%6s", + "Bigger"); + } else { + length += snprintf(buffer + length, bufferSize - length, "%6d", i); + } + } + if (length >= (bufferSize - 1)) { + return bufferSize - 1; + } + length += snprintf(buffer + length, bufferSize - length, + " : %12llu%.*s\n", value, barLength, bar); + if (length >= (bufferSize - 1)) { + return bufferSize - 1; + } + } + + length += snprintf(buffer + length, bufferSize - length, + "total %llu\n", total); + return minSizeT(bufferSize - 1, length); +} + +/***********************************************************************/ +static ssize_t histogramShowMaximum(Histogram *h, char *buf) +{ + // Maximum is initialized to 0. + unsigned long value = atomic64_read(&h->maximum); + return sprintf(buf, "%lu\n", h->conversionFactor * value); +} + +/***********************************************************************/ +static ssize_t histogramShowMinimum(Histogram *h, char *buf) +{ + // Minimum is initialized to -1. + unsigned long value = ((atomic64_read(&h->count) > 0) + ? atomic64_read(&h->minimum) + : 0); + return sprintf(buf, "%lu\n", h->conversionFactor * value); +} + +/***********************************************************************/ +static ssize_t histogramShowLimit(Histogram *h, char *buf) +{ + // Display the limit in the reporting units + return sprintf(buf, "%u\n", (unsigned int)(h->conversionFactor * h->limit)); +} + +/***********************************************************************/ +static ssize_t histogramStoreLimit(Histogram *h, + const char *buf, + size_t length) +{ + unsigned int value; + if ((length > 12) || (sscanf(buf, "%u", &value) != 1)) { + return -EINVAL; + } + /* + * Convert input from reporting units (e.g., milliseconds) to internal + * recording units (e.g., jiffies). + * + * computeBucketCount could also be called "divideRoundingUp". 
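+   *
+   * For example, with a conversionFactor of 4 (milliseconds per jiffy), a
+   * limit entered as 10 is stored as computeBucketCount(10, 4) == 3.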
+ */ + h->limit = computeBucketCount(value, h->conversionFactor); + atomic64_set(&h->unacceptable, 0); + return length; +} + +/***********************************************************************/ +static ssize_t histogramShowMean(Histogram *h, char *buf) +{ + uint64_t count = atomic64_read(&h->count); + if (count == 0) { + return sprintf(buf, "0/0\n"); + } + // Compute mean, scaled up by 1000, in reporting units + unsigned long sumTimes1000InReportingUnits + = h->conversionFactor * atomic64_read(&h->sum) * 1000; + unsigned int meanTimes1000 + = divideRoundingToNearest(sumTimes1000InReportingUnits, count); + // Print mean with fractional part + return sprintf(buf, "%u.%03u\n", meanTimes1000 / 1000, + meanTimes1000 % 1000); +} + +/***********************************************************************/ +static ssize_t histogramShowUnacceptable(Histogram *h, char *buf) +{ + int64_t count = atomic64_read(&h->unacceptable); + return sprintf(buf, "%" PRId64 "\n", count); +} + +/***********************************************************************/ +static ssize_t histogramShowLabel(Histogram *h, char *buf) +{ + return sprintf(buf, "%s\n", h->label); +} + +/***********************************************************************/ +static ssize_t histogramShowUnit(Histogram *h, char *buf) +{ + if (h->sampleUnits != NULL) { + return sprintf(buf, "%s\n", h->sampleUnits); + } else { + *buf = 0; + return 0; + } +} + +/***********************************************************************/ + +static struct sysfs_ops histogramSysfsOps = { + .show = histogramShow, + .store = histogramStore, +}; + +static HistogramAttribute countAttribute = { + .attr = { .name = "count", .mode = 0444, }, + .show = histogramShowCount, +}; + +static HistogramAttribute histogramAttribute = { + .attr = { .name = "histogram", .mode = 0444, }, + .show = histogramShowHistogram, +}; + +static HistogramAttribute labelAttribute = { + .attr = { .name = "label", .mode = 0444, }, + .show = histogramShowLabel, +}; + +static HistogramAttribute maximumAttribute = { + .attr = { .name = "maximum", .mode = 0444, }, + .show = histogramShowMaximum, +}; + +static HistogramAttribute minimumAttribute = { + .attr = { .name = "minimum", .mode = 0444, }, + .show = histogramShowMinimum, +}; + +static HistogramAttribute limitAttribute = { + .attr = { .name = "limit", .mode = 0644, }, + .show = histogramShowLimit, + .store = histogramStoreLimit, +}; + +static HistogramAttribute meanAttribute = { + .attr = { .name = "mean", .mode = 0444, }, + .show = histogramShowMean, +}; + +static HistogramAttribute unacceptableAttribute = { + .attr = { .name = "unacceptable", .mode = 0444, }, + .show = histogramShowUnacceptable, +}; + +static HistogramAttribute unitAttribute = { + .attr = { .name = "unit", .mode = 0444, }, + .show = histogramShowUnit, +}; + +// "Real" histogram plotting. 
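+// These attributes apply when real buckets are allocated; the bucketless
+// variant further below exposes only count, label, maximum, mean, minimum,
+// and unit.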
+static struct attribute *histogramAttributes[] = { + &countAttribute.attr, + &histogramAttribute.attr, + &labelAttribute.attr, + &limitAttribute.attr, + &maximumAttribute.attr, + &meanAttribute.attr, + &minimumAttribute.attr, + &unacceptableAttribute.attr, + &unitAttribute.attr, + NULL, +}; + +static struct kobj_type histogramKobjType = { + .release = histogramKobjRelease, + .sysfs_ops = &histogramSysfsOps, + .default_attrs = histogramAttributes, +}; + +static struct attribute *bucketlessHistogramAttributes[] = { + &countAttribute.attr, + &labelAttribute.attr, + &maximumAttribute.attr, + &meanAttribute.attr, + &minimumAttribute.attr, + &unitAttribute.attr, + NULL, +}; + +static struct kobj_type bucketlessHistogramKobjType = { + .release = histogramKobjRelease, + .sysfs_ops = &histogramSysfsOps, + .default_attrs = bucketlessHistogramAttributes, +}; + +/***********************************************************************/ +static Histogram *makeHistogram(struct kobject *parent, + const char *name, + const char *label, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int numBuckets, + unsigned long conversionFactor, + bool logFlag) +{ + Histogram *h; + if (ALLOCATE(1, Histogram, "histogram", &h) != UDS_SUCCESS) { + return NULL; + } + + if (NO_BUCKETS) { + numBuckets = 0; // plus 1 for "bigger" bucket + } + + if (numBuckets <= 10) { + /* + * The first buckets in a "logarithmic" histogram are still + * linear, but the bucket-search mechanism is a wee bit slower + * than for linear, so change the type. + */ + logFlag = false; + } + + h->label = label; + h->countedItems = countedItems; + h->metric = metric; + h->sampleUnits = sampleUnits; + h->logFlag = logFlag; + h->numBuckets = numBuckets; + h->conversionFactor = conversionFactor; + atomic64_set(&h->minimum, -1UL); + + if (ALLOCATE(h->numBuckets + 1, atomic64_t, "histogram counters", + &h->counters) != UDS_SUCCESS) { + histogramKobjRelease(&h->kobj); + return NULL; + } + + kobject_init(&h->kobj, + ((numBuckets > 0) + ? &histogramKobjType + : &bucketlessHistogramKobjType)); + if (kobject_add(&h->kobj, parent, name) != 0) { + histogramKobjRelease(&h->kobj); + return NULL; + } + return h; +} + +/***********************************************************************/ +Histogram *makeLinearHistogram(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int size) +{ + return makeHistogram(parent, name, initLabel, countedItems, + metric, sampleUnits, size, 1, false); +} + + +/** + * Intermediate routine for creating logarithmic histograms. + * + * Limits the histogram size, and computes the bucket count from the + * orders-of-magnitude count. + * + * @param parent The parent kobject. + * @param name The short name of the histogram. This label is + * used for the sysfs node. + * @param initLabel The label for the sampled data. This label is used + * when we plot the data. + * @param countedItems A name (plural) for the things being counted. + * @param metric The measure being used to divide samples into + * buckets. + * @param sampleUnits The units (plural) for the metric, or NULL if it's + * a simple counter. + * @param logSize The number of buckets. There are buckets for a + * range of sizes up to 10^logSize, and an extra + * bucket for larger samples. + * @param conversionFactor Unit conversion factor for reporting. 
+ * + * @return the histogram + **/ +static Histogram * +makeLogarithmicHistogramWithConversionFactor(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int logSize, + uint64_t conversionFactor) +{ + if (logSize > MAX_LOG_SIZE) { + logSize = MAX_LOG_SIZE; + } + return makeHistogram(parent, name, + initLabel, countedItems, metric, sampleUnits, + 10 * logSize, conversionFactor, true); +} + +/***********************************************************************/ +Histogram *makeLogarithmicHistogram(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int logSize) +{ + return makeLogarithmicHistogramWithConversionFactor(parent, name, initLabel, + countedItems, + metric, sampleUnits, + logSize, 1); +} + +/***********************************************************************/ +Histogram *makeLogarithmicJiffiesHistogram(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + int logSize) +{ + /* + * If these fail, we have a jiffy duration that is not an integral number of + * milliseconds, and the unit conversion code needs updating. + */ + STATIC_ASSERT(HZ <= MSEC_PER_SEC); + STATIC_ASSERT((MSEC_PER_SEC % HZ) == 0); + return makeLogarithmicHistogramWithConversionFactor(parent, name, initLabel, + countedItems, + metric, "milliseconds", + logSize, + jiffies_to_msecs(1)); +} + +/***********************************************************************/ +void enterHistogramSample(Histogram *h, uint64_t sample) +{ + int bucket; + if (h->logFlag) { + int lo = 0; + int hi = h->numBuckets; + while (lo < hi) { + int middle = (lo + hi) / 2; + if (sample < bottomValue[middle + 1]) { + hi = middle; + } else { + lo = middle + 1; + } + } + bucket = lo; + } else { + bucket = sample < h->numBuckets ? sample : h->numBuckets; + } + atomic64_inc(&h->counters[bucket]); + atomic64_inc(&h->count); + atomic64_add(sample, &h->sum); + if ((h->limit > 0) && (sample > h->limit)) { + atomic64_inc(&h->unacceptable); + } + + /* + * Theoretically this could loop a lot; in practice it should rarely + * do more than a single read, with no memory barrier, from a cache + * line we've already referenced above. + */ + uint64_t oldMaximum = atomic64_read(&h->maximum); + while (oldMaximum < sample) { + uint64_t readValue = atomic64_cmpxchg(&h->maximum, oldMaximum, sample); + if (readValue == oldMaximum) { + break; + } + oldMaximum = readValue; + } + + uint64_t oldMinimum = atomic64_read(&h->minimum); + while (oldMinimum > sample) { + uint64_t readValue = atomic64_cmpxchg(&h->minimum, oldMinimum, sample); + if (readValue == oldMinimum) { + break; + } + oldMinimum = readValue; + } +} + +/***********************************************************************/ +void freeHistogram(Histogram **hp) +{ + if (*hp != NULL) { + Histogram *h = *hp; + kobject_put(&h->kobj); + *hp = NULL; + } +} diff --git a/vdo/kernel/histogram.h b/vdo/kernel/histogram.h new file mode 100644 index 0000000..a177e0a --- /dev/null +++ b/vdo/kernel/histogram.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/histogram.h#1 $ + */ + +#ifndef HISTOGRAM_H +#define HISTOGRAM_H + +#include + +typedef struct histogram Histogram; + +/** + * Allocate and initialize a histogram that uses linearly sized buckets. + * + * The histogram label reported via /sys is constructed from several of the + * values passed here; it will be something like "Init Label Histogram - number + * of countedItems grouped by metric (sampleUnits)", e.g., "Flush Forwarding + * Histogram - number of flushes grouped by latency (milliseconds)". Thus + * countedItems and sampleUnits should be plural. + * + * The sampleUnits string will also be reported separately via another /sys + * entry to aid in programmatic processing of the results, so the strings used + * should be consistent (e.g., always "milliseconds" and not "ms" for + * milliseconds). + * + * @param parent The parent kobject. + * @param name The short name of the histogram. This label is used + * for the sysfs node. + * @param initLabel The label for the sampled data. This label is used + * when we plot the data. + * @param countedItems A name (plural) for the things being counted. + * @param metric The measure being used to divide samples into buckets. + * @param sampleUnits The unit (plural) for the metric, or NULL if it's a + * simple counter. + * @param size The number of buckets. There are buckets for every + * value from 0 up to size (but not including) size. + * There is an extra bucket for larger samples. + * + * @return the histogram + **/ +Histogram *makeLinearHistogram(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int size); + +/** + * Allocate and initialize a histogram that uses logarithmically sized + * buckets. + * + * @param parent The parent kobject. + * @param name The short name of the histogram. This label is used + * for the sysfs node. + * @param initLabel The label for the sampled data. This label is used + * when we plot the data. + * @param countedItems A name (plural) for the things being counted. + * @param metric The measure being used to divide samples into buckets. + * @param sampleUnits The unit (plural) for the metric, or NULL if it's a + * simple counter. + * @param logSize The number of buckets. There are buckets for a range + * of sizes up to 10^logSize, and an extra bucket for + * larger samples. + * + * @return the histogram + **/ +Histogram *makeLogarithmicHistogram(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int logSize); + +/** + * Allocate and initialize a histogram that uses logarithmically sized + * buckets. Values are entered that count in jiffies, and they are + * reported in milliseconds. + * + * @param parent The parent kobject. + * @param name The short name of the histogram. This label is used + * for the sysfs node. + * @param initLabel The label for the sampled data. 
This label is used + * when we plot the data. + * @param countedItems A name (plural) for the things being counted. + * @param metric The measure being used to divide samples into buckets. + * @param logSize The number of buckets. There are buckets for a range + * of sizes up to 10^logSize, and an extra bucket for + * larger samples. + * + * @return the histogram + **/ +Histogram *makeLogarithmicJiffiesHistogram(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + int logSize); + +/** + * Enter a sample into a histogram + * + * @param h The histogram + * @param sample The sample + **/ +void enterHistogramSample(Histogram *h, uint64_t sample); + +/** + * Free a histogram and null out the reference to it. + * + * @param hp The reference to the histogram. + **/ +void freeHistogram(Histogram **hp); + +#endif /* HISTOGRAM_H */ diff --git a/vdo/kernel/instanceNumber.c b/vdo/kernel/instanceNumber.c new file mode 100644 index 0000000..178fd92 --- /dev/null +++ b/vdo/kernel/instanceNumber.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/instanceNumber.c#1 $ + */ + +#include "instanceNumber.h" + +#include +#include + +#include "memoryAlloc.h" +#include "numUtils.h" +#include "permassert.h" + +/* + * Track in-use instance numbers using a flat bit array. + * + * O(n) run time isn't ideal, but if we have 1000 VDO devices in use + * simultaneously we still only need to scan 16 words, so it's not + * likely to be a big deal compared to other resource usage. + */ + +enum { + /** + * This minimum size for the bit array creates a numbering space of 0-999, + * which allows successive starts of the same volume to have different + * instance numbers in any reasonably-sized test. Changing instances on + * restart allows vdoMonReport to detect that the ephemeral stats have reset + * to zero. + **/ + BIT_COUNT_MINIMUM = 1000, + /** Grow the bit array by this many bits when needed */ + BIT_COUNT_INCREMENT = 100, +}; + +static struct mutex instanceNumberLock; +static unsigned int bitCount; +static unsigned long *words; +static unsigned int instanceCount; +static unsigned int nextInstance; + +/** + * Return the number of bytes needed to store a bit array of the specified + * capacity in an array of unsigned longs. + * + * @param bitCount The number of bits the array must hold + * + * @return the number of bytes needed for the array reperesentation + **/ +static size_t getBitArraySize(unsigned int bitCount) +{ + // Round up to a multiple of the word size and convert to a byte count. 
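+  // For example, the BIT_COUNT_MINIMUM (1000) bit minimum with 64-bit longs
+  // works out to the 16 words mentioned above, i.e. 128 bytes.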
+ return (computeBucketCount(bitCount, BITS_PER_LONG) * sizeof(unsigned long)); +} + +/** + * Re-allocate the bitmap word array so there will more instance numbers that + * can be allocated. Since the array is initially NULL, this also initializes + * the array the first time we allocate an instance number. + * + * @return UDS_SUCCESS or an error code from the allocation + **/ +static int growBitArray(void) +{ + unsigned int newCount = maxUInt(bitCount + BIT_COUNT_INCREMENT, + BIT_COUNT_MINIMUM); + unsigned long *newWords; + int result = reallocateMemory(words, + getBitArraySize(bitCount), + getBitArraySize(newCount), + "instance number bit array", + &newWords); + if (result != UDS_SUCCESS) { + return result; + } + + bitCount = newCount; + words = newWords; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int allocateKVDOInstanceLocked(unsigned int *instancePtr) +{ + // If there are no unallocated instances, grow the bit array. + if (instanceCount >= bitCount) { + int result = growBitArray(); + if (result != UDS_SUCCESS) { + return result; + } + } + + // There must be a zero bit somewhere now. Find it, starting just after the + // last instance allocated. + unsigned int instance = find_next_zero_bit(words, bitCount, nextInstance); + if (instance >= bitCount) { + // Nothing free after nextInstance, so wrap around to instance zero. + instance = find_first_zero_bit(words, bitCount); + int result = ASSERT(instance < bitCount, "impossibly, no zero bit found"); + if (result != UDS_SUCCESS) { + return result; + } + } + + __set_bit(instance, words); + instanceCount += 1; + nextInstance = instance + 1; + *instancePtr = instance; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int allocateKVDOInstance(unsigned int *instancePtr) +{ + mutex_lock(&instanceNumberLock); + int result = allocateKVDOInstanceLocked(instancePtr); + mutex_unlock(&instanceNumberLock); + return result; +} + +/**********************************************************************/ +void releaseKVDOInstance(unsigned int instance) +{ + mutex_lock(&instanceNumberLock); + if (instance >= bitCount) { + ASSERT_LOG_ONLY(false, "instance number %u must be less than bit count %u", + instance, bitCount); + } else if (test_bit(instance, words) == 0) { + ASSERT_LOG_ONLY(false, "instance number %u must be allocated", instance); + } else { + __clear_bit(instance, words); + instanceCount -= 1; + } + mutex_unlock(&instanceNumberLock); +} + +/**********************************************************************/ +void initializeInstanceNumberTracking(void) +{ + mutex_init(&instanceNumberLock); +} + +/**********************************************************************/ +void cleanUpInstanceNumberTracking(void) +{ + ASSERT_LOG_ONLY(instanceCount == 0, + "should have no instance numbers still in use, but have %u", + instanceCount); + FREE(words); + words = NULL; + bitCount = 0; + instanceCount = 0; + nextInstance = 0; + mutex_destroy(&instanceNumberLock); +} diff --git a/vdo/kernel/instanceNumber.h b/vdo/kernel/instanceNumber.h new file mode 100644 index 0000000..6d96bad --- /dev/null +++ b/vdo/kernel/instanceNumber.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/instanceNumber.h#1 $ + */ + +/** + * Allocate an instance number. + * + * @param [out] instancePtr An integer to hold the allocated instance number + * + * @result UDS_SUCCESS or an error code + **/ +int allocateKVDOInstance(unsigned int *instancePtr); + +/** + * Release an instance number previously allocated. + * + * @param instance The instance number to release + **/ +void releaseKVDOInstance(unsigned int instance); + +/** + * Initialize the instance-number tracking data structures. + **/ +void initializeInstanceNumberTracking(void); + +/** + * Free up the instance-number tracking data structures. + **/ +void cleanUpInstanceNumberTracking(void); diff --git a/vdo/kernel/ioSubmitter.c b/vdo/kernel/ioSubmitter.c new file mode 100644 index 0000000..036bf25 --- /dev/null +++ b/vdo/kernel/ioSubmitter.c @@ -0,0 +1,668 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/ioSubmitter.c#8 $ + */ + +#include "ioSubmitter.h" + +#include + +#include "memoryAlloc.h" + +#include "bio.h" +#include "dataKVIO.h" +#include "kernelLayer.h" +#include "logger.h" + +enum { + /* + * Whether to use bio merging code. + * + * Merging I/O requests in the request queue below us is helpful for + * many devices, and VDO does a good job sometimes of shuffling up + * the I/O order (too much for some simple I/O schedulers to sort + * out) as we deal with dedupe advice etc. The bio map tracks the + * yet-to-be-submitted I/O requests by block number so that we can + * collect together and submit sequential I/O operations that should + * be easy to merge. (So we don't actually *merge* them here, we + * just arrange them so that merging can happen.) + * + * For some devices, merging may not help, and we may want to turn + * off this code and save compute/spinlock cycles. + */ + USE_BIOMAP = 1, +}; + +/* + * Submission of bio operations to the underlying storage device will + * go through a separate work queue thread (or more than one) to + * prevent blocking in other threads if the storage device has a full + * queue. The plug structure allows that thread to do better batching + * of requests to make the I/O more efficient. 
+ * + * When multiple worker threads are used, a thread is chosen for a + * I/O operation submission based on the PBN, so a given PBN will + * consistently wind up on the same thread. Flush operations are + * assigned round-robin. + * + * The map (protected by the mutex) collects pending I/O operations so + * that the worker thread can reorder them to try to encourage I/O + * request merging in the request queue underneath. + */ +typedef struct bioQueueData { + KvdoWorkQueue *queue; + struct blk_plug plug; + IntMap *map; + struct mutex lock; + unsigned int queueNumber; +} BioQueueData; + +struct ioSubmitter { + unsigned int numBioQueuesUsed; + unsigned int bioQueueRotationInterval; + unsigned int bioQueueRotor; + BioQueueData bioQueueData[]; +}; + +/**********************************************************************/ +static void startBioQueue(void *ptr) +{ + BioQueueData *bioQueueData = (BioQueueData *)ptr; + blk_start_plug(&bioQueueData->plug); +} + +/**********************************************************************/ +static void finishBioQueue(void *ptr) +{ + BioQueueData *bioQueueData = (BioQueueData *)ptr; + blk_finish_plug(&bioQueueData->plug); +} + +static const KvdoWorkQueueType bioQueueType = { + .start = startBioQueue, + .finish = finishBioQueue, + .actionTable = { + { .name = "bio_compressed_data", + .code = BIO_Q_ACTION_COMPRESSED_DATA, + .priority = 0 }, + { .name = "bio_data", + .code = BIO_Q_ACTION_DATA, + .priority = 0 }, + { .name = "bio_flush", + .code = BIO_Q_ACTION_FLUSH, + .priority = 2 }, + { .name = "bio_high", + .code = BIO_Q_ACTION_HIGH, + .priority = 2 }, + { .name = "bio_metadata", + .code = BIO_Q_ACTION_METADATA, + .priority = 1 }, + { .name = "bio_readcache", + .code = BIO_Q_ACTION_READCACHE, + .priority = 0 }, + { .name = "bio_verify", + .code = BIO_Q_ACTION_VERIFY, + .priority = 1 }, + }, +}; + +/** + * Check that we're running normally (i.e., not in an + * interrupt-servicing context) in an IOSubmitter bio thread. + **/ +static void assertRunningInBioQueue(void) +{ + ASSERT_LOG_ONLY(!in_interrupt(), "not in interrupt context"); + ASSERT_LOG_ONLY(strnstr(current->comm, "bioQ", TASK_COMM_LEN) != NULL, + "running in bio submission work queue thread"); +} + +/** + * Returns the BioQueueData pointer associated with the current thread. + * Results are undefined if called from any other thread. + * + * @return the BioQueueData pointer + **/ +static inline BioQueueData *getCurrentBioQueueData(void) +{ + BioQueueData *bioQueueData = (BioQueueData *) getWorkQueuePrivateData(); + // Does it look like a bio queue thread? + BUG_ON(bioQueueData == NULL); + BUG_ON(bioQueueData->queue != getCurrentWorkQueue()); + return bioQueueData; +} + +/**********************************************************************/ +static inline IOSubmitter *bioQueueToSubmitter(BioQueueData *bioQueue) +{ + BioQueueData *firstBioQueue = bioQueue - bioQueue->queueNumber; + IOSubmitter *submitter = container_of(firstBioQueue, IOSubmitter, + bioQueueData[0]); + return submitter; +} + +/** + * Return the bio thread number handling the specified physical block + * number. 
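+ *
+ * For example (illustrative numbers only), with numBioQueuesUsed = 4 and
+ * bioQueueRotationInterval = 64, PBNs 0-63 map to queue 0, PBNs 64-127 to
+ * queue 1, PBNs 128-191 to queue 2, PBNs 192-255 to queue 3, and PBN 256
+ * wraps around to queue 0 again.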
+ * + * @param ioSubmitter The I/O submitter data + * @param pbn The physical block number + * + * @return read cache zone number + **/ +static unsigned int bioQueueNumberForPBN(IOSubmitter *ioSubmitter, + PhysicalBlockNumber pbn) +{ + unsigned int bioQueueIndex + = ((pbn + % (ioSubmitter->numBioQueuesUsed + * ioSubmitter->bioQueueRotationInterval)) + / ioSubmitter->bioQueueRotationInterval); + + return bioQueueIndex; +} + +/** + * Check that we're running normally (i.e., not in an + * interrupt-servicing context) in an IOSubmitter bio thread. Also + * require that the thread we're running on is the correct one for the + * supplied physical block number. + * + * @param pbn The PBN that should have been used in thread selection + **/ +static void assertRunningInBioQueueForPBN(PhysicalBlockNumber pbn) +{ + assertRunningInBioQueue(); + + BioQueueData *thisQueue = getCurrentBioQueueData(); + IOSubmitter *submitter = bioQueueToSubmitter(thisQueue); + unsigned int computedQueueNumber = bioQueueNumberForPBN(submitter, pbn); + ASSERT_LOG_ONLY(thisQueue->queueNumber == computedQueueNumber, + "running in correct bio queue (%u vs %u) for PBN %llu", + thisQueue->queueNumber, computedQueueNumber, pbn); +} + +/** + * Increments appropriate counters for bio completions + * + * @param kvio the kvio associated with the bio + * @param bio the bio to count + */ +static void countAllBiosCompleted(KVIO *kvio, BIO *bio) +{ + KernelLayer *layer = kvio->layer; + if (isData(kvio)) { + countBios(&layer->biosOutCompleted, bio); + return; + } + + countBios(&layer->biosMetaCompleted, bio); + if (kvio->vio->type == VIO_TYPE_RECOVERY_JOURNAL) { + countBios(&layer->biosJournalCompleted, bio); + } else if (kvio->vio->type == VIO_TYPE_BLOCK_MAP) { + countBios(&layer->biosPageCacheCompleted, bio); + } +} + +/**********************************************************************/ +void countCompletedBios(BIO *bio) +{ + KVIO *kvio = (KVIO *)bio->bi_private; + KernelLayer *layer = kvio->layer; + atomic64_inc(&layer->biosCompleted); + countAllBiosCompleted(kvio, bio); +} + +/**********************************************************************/ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) +void completeAsyncBio(BIO *bio) +#else +void completeAsyncBio(BIO *bio, int error) +#endif +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) + int error = getBioResult(bio); +#endif + KVIO *kvio = (KVIO *) bio->bi_private; + kvioAddTraceRecord(kvio, THIS_LOCATION("$F($io);cb=io($io)")); + countCompletedBios(bio); + if ((error == 0) && isData(kvio) && isReadVIO(kvio->vio)) { + DataKVIO *dataKVIO = kvioAsDataKVIO(kvio); + if (!isCompressed(dataKVIO->dataVIO.mapped.state) + && !dataKVIO->isPartial) { + kvdoAcknowledgeDataVIO(&dataKVIO->dataVIO); + return; + } + } + kvdoContinueKvio(kvio, error); +} + +/** + * Determines which bio counter to use + * + * @param kvio the kvio associated with the bio + * @param bio the bio to count + */ +static void countAllBios(KVIO *kvio, BIO *bio) +{ + KernelLayer *layer = kvio->layer; + if (isData(kvio)) { + countBios(&layer->biosOut, bio); + return; + } + + countBios(&layer->biosMeta, bio); + if (kvio->vio->type == VIO_TYPE_RECOVERY_JOURNAL) { + countBios(&layer->biosJournal, bio); + } else if (kvio->vio->type == VIO_TYPE_BLOCK_MAP) { + countBios(&layer->biosPageCache, bio); + } +} + +/** + * Update stats and tracing info, then submit the supplied bio to the + * OS for processing. 
+ * + * @param kvio The KVIO associated with the bio + * @param bio The bio to submit to the OS + * @param location Call site location for tracing + **/ +static void sendBioToDevice(KVIO *kvio, BIO *bio, TraceLocation location) +{ + assertRunningInBioQueueForPBN(kvio->vio->physical); + + atomic64_inc(&kvio->layer->biosSubmitted); + countAllBios(kvio, bio); + kvioAddTraceRecord(kvio, location); + bio->bi_next = NULL; + generic_make_request(bio); +} + +/** + * Submits a bio to the underlying block device. May block if the + * device is busy. + * + * For metadata or if USE_BIOMAP is disabled, kvio->bioToSubmit holds + * the BIO pointer to submit to the target device. For normal + * data when USE_BIOMAP is enabled, kvio->biosMerged is the list of + * all bios collected together in this group; all of them get + * submitted. In both cases, the bi_end_io callback is invoked when + * each I/O operation completes. + * + * @param item The work item in the KVIO "owning" either the bio to + * submit, or the head of the bio_list to be submitted. + **/ +static void processBioMap(KvdoWorkItem *item) +{ + assertRunningInBioQueue(); + KVIO *kvio = workItemAsKVIO(item); + /* + * XXX Make these paths more regular: Should bi_bdev be set here, or + * in the caller, or in the callback function? Should we call + * finishBioQueue for the biomap case on old kernels? + */ + if (USE_BIOMAP && isData(kvio)) { + // We need to make sure to do two things here: + // 1. Use each bio's kvio when submitting. Any other kvio is not safe + // 2. Detach the bio list from the kvio before submitting, because it + // could get reused/free'd up before all bios are submitted. + BioQueueData *bioQueueData = getWorkQueuePrivateData(); + BIO *bio = NULL; + mutex_lock(&bioQueueData->lock); + if (!bio_list_empty(&kvio->biosMerged)) { + intMapRemove(bioQueueData->map, getBioSector(kvio->biosMerged.head)); + intMapRemove(bioQueueData->map, getBioSector(kvio->biosMerged.tail)); + } + bio = kvio->biosMerged.head; + bio_list_init(&kvio->biosMerged); + mutex_unlock(&bioQueueData->lock); + // Somewhere in the list we'll be submitting the current "kvio", + // so drop our handle on it now. + kvio = NULL; + + while (bio != NULL) { + KVIO *kvioBio = bio->bi_private; + BIO *next = bio->bi_next; + bio->bi_next = NULL; + setBioBlockDevice(bio, getKernelLayerBdev(kvioBio->layer)); + sendBioToDevice(kvioBio, bio, THIS_LOCATION("$F($io)")); + bio = next; + } + } else { + sendBioToDevice(kvio, kvio->bioToSubmit, THIS_LOCATION("$F($io)")); + } +} + +/** + * This function will attempt to find an already queued bio that the current + * bio can be merged with. There are two types of merging possible, forward + * and backward, which are distinguished by a flag that uses kernel + * elevator terminology. 
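+ *
+ * As an illustrative example (assuming the usual 4 KB VDO block, so
+ * VDO_SECTORS_PER_BLOCK is 8): for a bio starting at sector 4096, a back
+ * merge looks up sector 4088, hoping to find a queued kvio whose merged
+ * list ends there so this bio can follow its tail, while a front merge
+ * looks up sector 4104, hoping to find one whose merged list starts there
+ * so this bio can precede its head.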
+ * + * @param map The bio map to use for merging + * @param kvio The kvio we want to merge + * @param mergeType The type of merging we want to try + * + * @return the kvio to merge to, NULL if no merging is possible + */ +static KVIO *getMergeableLocked(IntMap *map, + KVIO *kvio, + unsigned int mergeType) +{ + BIO *bio = kvio->bioToSubmit; + sector_t mergeSector = getBioSector(bio); + switch (mergeType) { + case ELEVATOR_BACK_MERGE: + mergeSector -= VDO_SECTORS_PER_BLOCK; + break; + case ELEVATOR_FRONT_MERGE: + mergeSector += VDO_SECTORS_PER_BLOCK; + break; + } + + KVIO *kvioMerge = intMapGet(map, mergeSector); + + if (kvioMerge != NULL) { + if (!areWorkItemActionsEqual(&kvio->enqueueable.workItem, + &kvioMerge->enqueueable.workItem)) { + return NULL; + } else if (bio_data_dir(bio) != bio_data_dir(kvioMerge->bioToSubmit)) { + return NULL; + } else if (bio_list_empty(&kvioMerge->biosMerged)) { + return NULL; + } else { + switch (mergeType) { + case ELEVATOR_BACK_MERGE: + if (getBioSector(kvioMerge->biosMerged.tail) != mergeSector) { + return NULL; + } + break; + case ELEVATOR_FRONT_MERGE: + if (getBioSector(kvioMerge->biosMerged.head) != mergeSector) { + return NULL; + } + break; + } + } + } + + return kvioMerge; +} + +/**********************************************************************/ +static inline unsigned int advanceBioRotor(IOSubmitter *bioData) +{ + unsigned int index = bioData->bioQueueRotor++ + % (bioData->numBioQueuesUsed + * bioData->bioQueueRotationInterval); + index /= bioData->bioQueueRotationInterval; + return index; +} + +/**********************************************************************/ +static bool tryBioMapMerge(BioQueueData *bioQueueData, KVIO *kvio, BIO *bio) +{ + bool merged = false; + + mutex_lock(&bioQueueData->lock); + KVIO *prevKvio = getMergeableLocked(bioQueueData->map, kvio, + ELEVATOR_BACK_MERGE); + KVIO *nextKvio = getMergeableLocked(bioQueueData->map, kvio, + ELEVATOR_FRONT_MERGE); + if (prevKvio == nextKvio) { + nextKvio = NULL; + } + int result; + if ((prevKvio == NULL) && (nextKvio == NULL)) { + // no merge. just add to bioQueue + result = intMapPut(bioQueueData->map, getBioSector(bio), kvio, true, NULL); + // We don't care about failure of intMapPut in this case. + result = result; + mutex_unlock(&bioQueueData->lock); + } else { + if (nextKvio == NULL) { + // Only prev. merge to prev's tail + intMapRemove(bioQueueData->map, getBioSector(prevKvio->biosMerged.tail)); + bio_list_merge(&prevKvio->biosMerged, &kvio->biosMerged); + result = intMapPut(bioQueueData->map, + getBioSector(prevKvio->biosMerged.head), + prevKvio, true, NULL); + result = intMapPut(bioQueueData->map, + getBioSector(prevKvio->biosMerged.tail), + prevKvio, true, NULL); + } else { + // Only next. merge to next's head + // + // Handle "next merge" and "gap fill" cases the same way so as to + // reorder bios in a way that's compatible with using funnel queues + // in work queues. This avoids removing an existing work item. + intMapRemove(bioQueueData->map, getBioSector(nextKvio->biosMerged.head)); + bio_list_merge_head(&nextKvio->biosMerged, &kvio->biosMerged); + result = intMapPut(bioQueueData->map, + getBioSector(nextKvio->biosMerged.head), + nextKvio, true, NULL); + result = intMapPut(bioQueueData->map, + getBioSector(nextKvio->biosMerged.tail), + nextKvio, true, NULL); + } + + // We don't care about failure of intMapPut in this case. 
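+    // (The self-assignment below is presumably only here to quiet
+    // "set but not used" warnings; the failure really is ignored.)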
+ result = result; + mutex_unlock(&bioQueueData->lock); + merged = true; + } + return merged; +} + +/**********************************************************************/ +static BioQueueData *bioQueueDataForPBN(IOSubmitter *ioSubmitter, + PhysicalBlockNumber pbn) +{ + unsigned int bioQueueIndex = bioQueueNumberForPBN(ioSubmitter, pbn); + return &ioSubmitter->bioQueueData[bioQueueIndex]; +} + +/**********************************************************************/ +void submitBio(BIO *bio, BioQAction action) +{ + KVIO *kvio = bio->bi_private; + kvio->bioToSubmit = bio; + setupKVIOWork(kvio, processBioMap, (KvdoWorkFunction) bio->bi_end_io, + action); + + KernelLayer *layer = kvio->layer; + BioQueueData *bioQueueData + = bioQueueDataForPBN(layer->ioSubmitter, kvio->vio->physical); + + kvioAddTraceRecord(kvio, THIS_LOCATION("$F($io)")); + + bio->bi_next = NULL; + bio_list_init(&kvio->biosMerged); + bio_list_add(&kvio->biosMerged, bio); + + /* + * Enabling of MD RAID5 mode optimizes performance for MD RAID5 storage + * configurations. It clears the bits for sync I/O RW flags on data block + * bios and sets the bits for sync I/O RW flags on all journal-related + * bios. + * + * This increases the frequency of full-stripe writes by altering flags of + * submitted bios. For workloads with write requests this increases the + * likelihood that the MD RAID5 device will update a full stripe instead of + * a partial stripe, thereby avoiding making read requests to the underlying + * physical storage for purposes of parity chunk calculations. + * + * Setting the sync-flag on journal-related bios is expected to reduce + * latency on journal updates submitted to an MD RAID5 device. + */ + if (layer->deviceConfig->mdRaid5ModeEnabled) { + if (isData(kvio)) { + // Clear the bits for sync I/O RW flags on data block bios. + clearBioOperationFlagSync(bio); + } else if ((kvio->vio->type == VIO_TYPE_RECOVERY_JOURNAL) + || (kvio->vio->type == VIO_TYPE_SLAB_JOURNAL)) { + // Set the bits for sync I/O RW flags on all journal-related and + // slab-journal-related bios. + setBioOperationFlagSync(bio); + } + } + + /* + * Try to use the bio map to submit this bio earlier if we're already sending + * IO for an adjacent block. If we can't use an existing pending bio, enqueue + * an operation to run in a bio submission thread appropriate to the + * indicated physical block number. 
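+   * (bioQueueNumberForPBN() above is what makes that thread choice.)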
+ */ + + bool merged = false; + if (USE_BIOMAP && isData(kvio)) { + merged = tryBioMapMerge(bioQueueData, kvio, bio); + } + if (!merged) { + enqueueKVIOWork(bioQueueData->queue, kvio); + } +} + +/**********************************************************************/ +static int initializeBioQueue(BioQueueData *bioQueueData, + const char *threadNamePrefix, + const char *queueName, + unsigned int queueNumber, + KernelLayer *layer) +{ +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,38) + bioQueueData->bdev = layer->dev->bdev; +#endif + bioQueueData->queueNumber = queueNumber; + + return makeWorkQueue(threadNamePrefix, queueName, &layer->wqDirectory, + layer, bioQueueData, &bioQueueType, 1, + &bioQueueData->queue); +} + +/**********************************************************************/ +int makeIOSubmitter(const char *threadNamePrefix, + unsigned int threadCount, + unsigned int rotationInterval, + unsigned int maxRequestsActive, + KernelLayer *layer, + IOSubmitter **ioSubmitterPtr) +{ + IOSubmitter *ioSubmitter; + int result = ALLOCATE_EXTENDED(IOSubmitter, + threadCount, + BioQueueData, + "bio submission data", + &ioSubmitter); + if (result != UDS_SUCCESS) { + return result; + } + + // Setup for each bio-submission work queue + char queueName[MAX_QUEUE_NAME_LEN]; + ioSubmitter->bioQueueRotationInterval = rotationInterval; + for (unsigned int i=0; i < threadCount; i++) { + BioQueueData *bioQueueData = &ioSubmitter->bioQueueData[i]; + snprintf(queueName, sizeof(queueName), "bioQ%u", i); + + if (USE_BIOMAP) { + mutex_init(&bioQueueData->lock); + /* + * One I/O operation per request, but both first & last sector numbers. + * + * If requests are assigned to threads round-robin, they should + * be distributed quite evenly. But if they're assigned based on + * PBN, things can sometimes be very uneven. So for now, we'll + * assume that all requests *may* wind up on one thread, and + * thus all in the same map. + */ + result = makeIntMap(maxRequestsActive * 2, 0, &bioQueueData->map); + if (result != 0) { + // Clean up the partially initialized bio-queue entirely and + // indicate that initialization failed. + logError("bio map initialization failed %d", result); + cleanupIOSubmitter(ioSubmitter); + freeIOSubmitter(ioSubmitter); + return result; + } + } + + result = initializeBioQueue(bioQueueData, + threadNamePrefix, + queueName, + i, + layer); + if (result != VDO_SUCCESS) { + // Clean up the partially initialized bio-queue entirely and + // indicate that initialization failed. 
+ if (USE_BIOMAP) { + freeIntMap(&ioSubmitter->bioQueueData[i].map); + } + logError("bio queue initialization failed %d", result); + cleanupIOSubmitter(ioSubmitter); + freeIOSubmitter(ioSubmitter); + return result; + } + + ioSubmitter->numBioQueuesUsed++; + } + + *ioSubmitterPtr = ioSubmitter; + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void cleanupIOSubmitter(IOSubmitter *ioSubmitter) +{ + for (int i=ioSubmitter->numBioQueuesUsed - 1; i >= 0; i--) { + finishWorkQueue(ioSubmitter->bioQueueData[i].queue); + } +} + +/**********************************************************************/ +void freeIOSubmitter(IOSubmitter *ioSubmitter) +{ + for (int i = ioSubmitter->numBioQueuesUsed - 1; i >= 0; i--) { + ioSubmitter->numBioQueuesUsed--; + freeWorkQueue(&ioSubmitter->bioQueueData[i].queue); + if (USE_BIOMAP) { + freeIntMap(&ioSubmitter->bioQueueData[i].map); + } + } + FREE(ioSubmitter); +} + +/**********************************************************************/ +void dumpBioWorkQueue(IOSubmitter *ioSubmitter) +{ + for (int i=0; i < ioSubmitter->numBioQueuesUsed; i++) { + dumpWorkQueue(ioSubmitter->bioQueueData[i].queue); + } +} + + +/**********************************************************************/ +void enqueueBioWorkItem(IOSubmitter *ioSubmitter, KvdoWorkItem *workItem) +{ + unsigned int bioQueueIndex = advanceBioRotor(ioSubmitter); + enqueueWorkQueue(ioSubmitter->bioQueueData[bioQueueIndex].queue, + workItem); +} + diff --git a/vdo/kernel/ioSubmitter.h b/vdo/kernel/ioSubmitter.h new file mode 100644 index 0000000..c4fb5ce --- /dev/null +++ b/vdo/kernel/ioSubmitter.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/ioSubmitter.h#4 $ + */ + +#ifndef IOSUBMITTER_H +#define IOSUBMITTER_H + +#include + +#include "kernelLayer.h" +#include "kvio.h" + +/** + * Does all the appropriate accounting for bio completions + * + * @param bio the bio to count + **/ +void countCompletedBios(BIO *bio); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) +/** + * Completes a bio relating to a kvio, causing the completion callback + * to be invoked. + * + * This is used as the bi_end_io function for most of the bios created + * within VDO and submitted to the storage device. Exceptions are the + * flush code and the read-block code, both of which need to regain + * control in the kernel layer after the I/O is completed. + * + * @param bio The bio to complete + **/ +void completeAsyncBio(BIO *bio); +#else +/** + * Completes a bio relating to a kvio, causing the completion callback + * to be invoked. + * + * This is used as the bi_end_io function for most of the bios created + * within VDO and submitted to the storage device. 
Exceptions are the + * flush code and the read-block code, both of which need to regain + * control in the kernel layer after the I/O is completed. + * + * @param bio The bio to complete + * @param error Possible error from underlying block device + **/ +void completeAsyncBio(BIO *bio, int error); +#endif + +/** + * Create a IOSubmitter structure for a new physical layer. + * + * @param [in] threadNamePrefix The per-device prefix to use in process names + * @param [in] threadCount Number of bio-submission threads to set up + * @param [in] rotationInterval Interval to use when rotating between + * bio-submission threads when enqueuing work + * items + * @param [in] maxRequestsActive Number of bios for merge tracking + * @param [in] layer The kernel layer + * @param [out] ioSubmitter Pointer to the new data structure + * + * @return VDO_SUCCESS or an error + **/ +int makeIOSubmitter(const char *threadNamePrefix, + unsigned int threadCount, + unsigned int rotationInterval, + unsigned int maxRequestsActive, + KernelLayer *layer, + IOSubmitter **ioSubmitter); + +/** + * Tear down the IOSubmitter fields as needed for a physical layer. + * + * @param [in] ioSubmitter The I/O submitter data to tear down + **/ +void cleanupIOSubmitter(IOSubmitter *ioSubmitter); + +/** + * Free the IOSubmitter fields and structure as needed for a + * physical layer. This must be called after + * cleanupIOSubmitter(). It is used to release resources late in + * the shutdown process to avoid or reduce the chance of race + * conditions. + * + * @param [in] ioSubmitter The I/O submitter data to destroy + **/ +void freeIOSubmitter(IOSubmitter *ioSubmitter); + +/** + * Dump info to the kernel log about the work queue used by the + * physical layer. For debugging only. + * + * @param [in] ioSubmitter The I/O submitter data + **/ +void dumpBioWorkQueue(IOSubmitter *ioSubmitter); + + +/** + * Enqueue a work item to run in the work queue(s) used for bio + * submissions from the physical layer. + * + * Outside of IOSubmitter, used only for finishing processing of empty + * flush bios by sending them to the storage device. + * + * @param ioSubmitter The I/O submitter data to update + * @param workItem The new work item to run + **/ +void enqueueBioWorkItem(IOSubmitter *ioSubmitter, KvdoWorkItem *workItem); + +/** + * Submit bio but don't block. + * + * Submits the bio to a helper work queue which sits in a loop + * submitting bios. The worker thread may block if the target device + * is busy, which is why we don't want to do the submission in the + * original calling thread. + * + * The bi_private field of the bio must point to a KVIO associated + * with the operation. The bi_end_io callback is invoked when the I/O + * operation completes. + * + * @param bio the block I/O operation descriptor to submit + * @param action the action code specifying the priority for the operation + **/ +void submitBio(BIO *bio, BioQAction action); + +#endif // IOSUBMITTER_H diff --git a/vdo/kernel/kernelLayer.c b/vdo/kernel/kernelLayer.c new file mode 100644 index 0000000..8d4d4ed --- /dev/null +++ b/vdo/kernel/kernelLayer.c @@ -0,0 +1,1409 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelLayer.c#38 $ + */ + +#include "kernelLayer.h" + +#include +#include +#include +#include +#include + +#include "logger.h" +#include "memoryAlloc.h" +#include "murmur/MurmurHash3.h" + +#include "lz4.h" +#include "releaseVersions.h" +#include "volumeGeometry.h" +#include "statistics.h" +#include "vdo.h" + +#include "bio.h" +#include "dataKVIO.h" +#include "dedupeIndex.h" +#include "deviceConfig.h" +#include "deviceRegistry.h" +#include "instanceNumber.h" +#include "ioSubmitter.h" +#include "kvdoFlush.h" +#include "kvio.h" +#include "poolSysfs.h" +#include "statusProcfs.h" +#include "stringUtils.h" +#include "verify.h" + +enum { + DEDUPE_TIMEOUT_REPORT_INTERVAL = 1000, +}; + +static const KvdoWorkQueueType bioAckQType = { + .actionTable = { + { .name = "bio_ack", + .code = BIO_ACK_Q_ACTION_ACK, + .priority = 0 }, + }, +}; + +static const KvdoWorkQueueType cpuQType = { + .actionTable = { + { .name = "cpu_complete_kvio", + .code = CPU_Q_ACTION_COMPLETE_KVIO, + .priority = 0 }, + { .name = "cpu_compress_block", + .code = CPU_Q_ACTION_COMPRESS_BLOCK, + .priority = 0 }, + { .name = "cpu_hash_block", + .code = CPU_Q_ACTION_HASH_BLOCK, + .priority = 0 }, + { .name = "cpu_event_reporter", + .code = CPU_Q_ACTION_EVENT_REPORTER, + .priority = 0 }, + }, +}; + +// 2000 is half the number of entries currently in our page cache, +// to allow for each in-progress operation to update two pages. +int defaultMaxRequestsActive = 2000; + +/**********************************************************************/ +static CRC32Checksum kvdoUpdateCRC32(CRC32Checksum crc, + const byte *buffer, + size_t length) +{ + /* + * The kernel's CRC 32 implementation does not do pre- and post- + * conditioning, so do it ourselves. + */ + return crc32(crc ^ 0xffffffff, buffer, length) ^ 0xffffffff; +} + +/**********************************************************************/ +static BlockCount kvdoGetBlockCount(PhysicalLayer *header) +{ + return asKernelLayer(header)->deviceConfig->physicalBlocks; +} + +/**********************************************************************/ +bool layerIsNamed(KernelLayer *layer, void *context) +{ + struct dm_target *ti = layer->deviceConfig->owningTarget; + const char *deviceName = dm_device_name(dm_table_get_md(ti->table)); + return (strcmp(deviceName, (const char *) context) == 0); +} + +/** + * Implements LayerFilter. 
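+ *
+ * For example, makeKernelLayer() below passes this filter to
+ * findLayerMatching() so that a new target cannot share its storage device
+ * with an already-running VDO.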
+ **/ +static bool layerUsesDevice(KernelLayer *layer, void *context) +{ + DeviceConfig *config = context; + return (layer->deviceConfig->ownedDevice->bdev->bd_dev + == config->ownedDevice->bdev->bd_dev); +} + +int mapToSystemError(int error) +{ + // 0 is success, negative a system error code + if (likely(error <= 0)) { + return error; + } + if (error < 1024) { + // errno macro used without negating - may be a minor bug + return -error; + } + // VDO or UDS error + char errorName[80], errorMessage[ERRBUF_SIZE]; + switch (sansUnrecoverable(error)) { + case VDO_NO_SPACE: + return -ENOSPC; + case VDO_READ_ONLY: + return -EIO; + default: + logInfo("%s: mapping internal status code %d (%s: %s) to EIO", + __func__, error, + stringErrorName(error, errorName, sizeof(errorName)), + stringError(error, errorMessage, sizeof(errorMessage))); + return -EIO; + } +} + +/**********************************************************************/ +static void setKernelLayerState(KernelLayer *layer, KernelLayerState newState) +{ + atomicStore32(&layer->state, newState); +} + +/**********************************************************************/ +void waitForNoRequestsActive(KernelLayer *layer) +{ + // Do nothing if there are no requests active. This check is not necessary + // for correctness but does reduce log message traffic. + if (limiterIsIdle(&layer->requestLimiter)) { + return; + } + + // We have to make sure to flush the packer before waiting. We do this + // by turning off compression, which also means no new entries coming in + // while waiting will end up in the packer. + bool wasCompressing = setKVDOCompressing(&layer->kvdo, false); + // Now wait for there to be no active requests + limiterWaitForIdle(&layer->requestLimiter); + // Reset the compression state after all requests are done + if (wasCompressing) { + setKVDOCompressing(&layer->kvdo, true); + } +} + +/** + * Start processing a new data KVIO based on the supplied bio, but from within + * a VDO thread context, when we're not allowed to block. Using this path at + * all suggests a bug or erroneous usage, but we special-case it to avoid a + * deadlock that can apparently result. Message will be logged to alert the + * administrator that something has gone wrong, while we attempt to continue + * processing other requests. + * + * If a request permit can be acquired immediately, kvdoLaunchDataKVIOFromBio + * will be called. (If the bio is a discard operation, a permit from the + * discard limiter will be requested but the call will be made with or without + * it.) If the request permit is not available, the bio will be saved on a list + * to be launched later. Either way, this function will not block, and will + * take responsibility for processing the bio. + * + * @param layer The kernel layer + * @param bio The bio to launch + * @param arrivalTime The arrival time of the bio + * + * @return DM_MAPIO_SUBMITTED or a system error code + **/ +static int launchDataKVIOFromVDOThread(KernelLayer *layer, + BIO *bio, + Jiffies arrivalTime) +{ + logWarning("kvdoMapBio called from within a VDO thread!"); + /* + * We're not yet entirely sure what circumstances are causing this situation + * in [ESC-638], but it does appear to be happening and causing VDO to + * deadlock. 
+ * + * Somehow kvdoMapBio is being called from generic_make_request which is + * being called from the VDO code to pass a flush on down to the underlying + * storage system; we've got 2000 requests in progress, so we have to wait + * for one to complete, but none can complete while the bio thread is blocked + * from passing more I/O requests down. Near as we can tell, the flush bio + * should always have gotten updated to point to the storage system, so we + * shouldn't be calling back into VDO unless something's gotten messed up + * somewhere. + * + * To side-step this case, if the limiter says we're busy *and* we're running + * on one of VDO's own threads, we'll drop the I/O request in a special queue + * for processing as soon as KVIOs become free. + * + * We don't want to do this in general because it leads to unbounded + * buffering, arbitrarily high latencies, inability to push back in a way the + * caller can take advantage of, etc. If someone wants huge amounts of + * buffering on top of VDO, they're welcome to access it through the kernel + * page cache or roll their own. + */ + if (!limiterPoll(&layer->requestLimiter)) { + addToDeadlockQueue(&layer->deadlockQueue, bio, arrivalTime); + logWarning("queued an I/O request to avoid deadlock!"); + + return DM_MAPIO_SUBMITTED; + } + + bool hasDiscardPermit + = (isDiscardBio(bio) && limiterPoll(&layer->discardLimiter)); + int result = kvdoLaunchDataKVIOFromBio(layer, bio, arrivalTime, + hasDiscardPermit); + // Succeed or fail, kvdoLaunchDataKVIOFromBio owns the permit(s) now. + if (result != VDO_SUCCESS) { + return result; + } + + return DM_MAPIO_SUBMITTED; +} + +/**********************************************************************/ +int kvdoMapBio(KernelLayer *layer, BIO *bio) +{ + Jiffies arrivalTime = jiffies; + KernelLayerState state = getKernelLayerState(layer); + ASSERT_LOG_ONLY(state == LAYER_RUNNING, + "kvdoMapBio should not be called while in state %d", state); + + // Count all incoming bios. + countBios(&layer->biosIn, bio); + + // Handle empty bios. Empty flush bios are not associated with a VIO. + if (isFlushBio(bio)) { + if (ASSERT(getBioSize(bio) == 0, "Flush bio is size 0") != VDO_SUCCESS) { + // We expect flushes to be of size 0. + return -EINVAL; + } + if (shouldProcessFlush(layer)) { + launchKVDOFlush(layer, bio); + return DM_MAPIO_SUBMITTED; + } else { + // We're not acknowledging this bio now, but we'll never touch it + // again, so this is the last chance to account for it. + countBios(&layer->biosAcknowledged, bio); + atomic64_inc(&layer->flushOut); + setBioBlockDevice(bio, getKernelLayerBdev(layer)); + return DM_MAPIO_REMAPPED; + } + } + + if (ASSERT(getBioSize(bio) != 0, "Data bio is not size 0") != VDO_SUCCESS) { + // We expect non-flushes to be non-zero in size. + return -EINVAL; + } + + if (isDiscardBio(bio) && isReadBio(bio)) { + // Read and Discard should never occur together + return -EIO; + } + + KvdoWorkQueue *currentWorkQueue = getCurrentWorkQueue(); + if ((currentWorkQueue != NULL) + && (layer == getWorkQueueOwner(currentWorkQueue))) { + /* + * This prohibits sleeping during I/O submission to VDO from its own + * thread. 
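+     * (See launchDataKVIOFromVDOThread() above for the deadlock this
+     * avoids and for how the request is queued instead of blocking.)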
+ */ + return launchDataKVIOFromVDOThread(layer, bio, arrivalTime); + } + bool hasDiscardPermit = false; + if (isDiscardBio(bio)) { + limiterWaitForOneFree(&layer->discardLimiter); + hasDiscardPermit = true; + } + limiterWaitForOneFree(&layer->requestLimiter); + + int result = kvdoLaunchDataKVIOFromBio(layer, bio, arrivalTime, + hasDiscardPermit); + // Succeed or fail, kvdoLaunchDataKVIOFromBio owns the permit(s) now. + if (result != VDO_SUCCESS) { + return result; + } + + return DM_MAPIO_SUBMITTED; +} + +/**********************************************************************/ +struct block_device *getKernelLayerBdev(const KernelLayer *layer) +{ + return layer->deviceConfig->ownedDevice->bdev; +} + +/**********************************************************************/ +void completeManyRequests(KernelLayer *layer, uint32_t count) +{ + // If we had to buffer some requests to avoid deadlock, release them now. + while (count > 0) { + Jiffies arrivalTime = 0; + BIO *bio = pollDeadlockQueue(&layer->deadlockQueue, &arrivalTime); + if (likely(bio == NULL)) { + break; + } + + bool hasDiscardPermit + = (isDiscardBio(bio) && limiterPoll(&layer->discardLimiter)); + int result = kvdoLaunchDataKVIOFromBio(layer, bio, arrivalTime, + hasDiscardPermit); + if (result != VDO_SUCCESS) { + completeBio(bio, result); + } + // Succeed or fail, kvdoLaunchDataKVIOFromBio owns the permit(s) now. + count--; + } + // Notify the limiter, so it can wake any blocked processes. + if (count > 0) { + limiterReleaseMany(&layer->requestLimiter, count); + } +} + +/**********************************************************************/ +static void reportEvents(PeriodicEventReporter *reporter) +{ + atomic_set(&reporter->workItemQueued, 0); + uint64_t newValue = atomic64_read(&reporter->value); + uint64_t difference = newValue - reporter->lastReportedValue; + if (difference != 0) { + logDebug(reporter->format, difference); + reporter->lastReportedValue = newValue; + } +} + +/**********************************************************************/ +static void reportEventsWork(KvdoWorkItem *item) +{ + PeriodicEventReporter *reporter = container_of(item, PeriodicEventReporter, + workItem); + reportEvents(reporter); +} + +/**********************************************************************/ +static void initPeriodicEventReporter(PeriodicEventReporter *reporter, + const char *format, + unsigned long reportingInterval, + KernelLayer *layer) +{ + setupWorkItem(&reporter->workItem, reportEventsWork, NULL, + CPU_Q_ACTION_EVENT_REPORTER); + reporter->format = format; + reporter->reportingInterval = msecs_to_jiffies(reportingInterval); + reporter->layer = layer; +} + +/**********************************************************************/ +static void addEventCount(PeriodicEventReporter *reporter, unsigned int count) +{ + if (count > 0) { + atomic64_add(count, &reporter->value); + int oldWorkItemQueued = atomic_xchg(&reporter->workItemQueued, 1); + if (oldWorkItemQueued == 0) { + enqueueWorkQueueDelayed(reporter->layer->cpuQueue, + &reporter->workItem, + jiffies + reporter->reportingInterval); + } + } +} + +/**********************************************************************/ +static void stopPeriodicEventReporter(PeriodicEventReporter *reporter) +{ + reportEvents(reporter); +} + +/**********************************************************************/ +void kvdoReportDedupeTimeout(KernelLayer *layer, unsigned int expiredCount) +{ + addEventCount(&layer->albireoTimeoutReporter, expiredCount); +} + 
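+/*
+ * A minimal sketch of how the reporter machinery above is meant to be used,
+ * based on the albireoTimeoutReporter wiring in this file; "myReporter" and
+ * the format string are purely illustrative names, not part of the module:
+ *
+ *   initPeriodicEventReporter(&myReporter, "saw %llu events",
+ *                             DEDUPE_TIMEOUT_REPORT_INTERVAL, layer);
+ *   ...
+ *   addEventCount(&myReporter, 1);           // hot path: atomic add plus at
+ *                                            // most one delayed work item
+ *   ...
+ *   stopPeriodicEventReporter(&myReporter);  // log any final unreported count
+ *
+ * Because reporting is deferred to the CPU work queue and limited to roughly
+ * one log line per reporting interval, callers can report events from fast
+ * paths without blocking on the logger.
+ */
+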
+/**********************************************************************/ +static int kvdoCreateEnqueueable(VDOCompletion *completion) +{ + KvdoEnqueueable *kvdoEnqueueable; + int result = ALLOCATE(1, KvdoEnqueueable, "kvdoEnqueueable", + &kvdoEnqueueable); + if (result != VDO_SUCCESS) { + logError("kvdoEnqueueable allocation failure %d", result); + return result; + } + kvdoEnqueueable->enqueueable.completion = completion; + completion->enqueueable = &kvdoEnqueueable->enqueueable; + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void kvdoDestroyEnqueueable(Enqueueable **enqueueablePtr) +{ + Enqueueable *enqueueable = *enqueueablePtr; + if (enqueueable != NULL) { + KvdoEnqueueable *kvdoEnqueueable + = container_of(enqueueable, KvdoEnqueueable, enqueueable); + FREE(kvdoEnqueueable); + *enqueueablePtr = NULL; + } +} + +/** + * Implements BufferAllocator. + **/ +static int kvdoAllocateIOBuffer(PhysicalLayer *layer __attribute__((unused)), + size_t bytes, + const char *why, + char **bufferPtr) +{ + return ALLOCATE(bytes, char, why, bufferPtr); +} + +/** + * Implements ExtentReader. Exists only for the geometry block; is unset after + * it is read. + **/ +static int kvdoSynchronousRead(PhysicalLayer *layer, + PhysicalBlockNumber startBlock, + size_t blockCount, + char *buffer, + size_t *blocksRead) +{ + if (blockCount != 1) { + return VDO_NOT_IMPLEMENTED; + } + + KernelLayer *kernelLayer = asKernelLayer(layer); + + BIO *bio; + int result = createBio(kernelLayer, buffer, &bio); + if (result != VDO_SUCCESS) { + return result; + } + setBioBlockDevice(bio, getKernelLayerBdev(kernelLayer)); + setBioSector(bio, blockToSector(kernelLayer, startBlock)); + setBioOperationRead(bio); + result = submitBioAndWait(bio); + if (result != 0) { + logErrorWithStringError(result, "synchronous read failed"); + result = -EIO; + } + freeBio(bio, kernelLayer); + + if (result != VDO_SUCCESS) { + return result; + } + if (blocksRead != NULL) { + *blocksRead = blockCount; + } + return VDO_SUCCESS; +} + +/** + * Implements VIODestructor. + **/ +static void kvdoFreeVIO(VIO **vioPtr) +{ + VIO *vio = *vioPtr; + if (vio == NULL) { + return; + } + + BUG_ON(isDataVIO(vio)); + + if (isCompressedWriteVIO(vio)) { + CompressedWriteKVIO *compressedWriteKVIO + = allocatingVIOAsCompressedWriteKVIO(vioAsAllocatingVIO(vio)); + freeCompressedWriteKVIO(&compressedWriteKVIO); + } else { + MetadataKVIO *metadataKVIO = vioAsMetadataKVIO(vio); + freeMetadataKVIO(&metadataKVIO); + } + + *vioPtr = NULL; +} + +/**********************************************************************/ +static WritePolicy kvdoGetWritePolicy(PhysicalLayer *common) +{ + KernelLayer *layer = asKernelLayer(common); + return getKVDOWritePolicy(&layer->kvdo); +} + +/** + * Function that is called when a synchronous operation is completed. We let + * the waiting thread know it can continue. + * + *

+ * Implements OperationComplete.
+ *
+ * @param common  The kernel layer
+ **/
+static void kvdoCompleteSyncOperation(PhysicalLayer *common)
+{
+  KernelLayer *layer = asKernelLayer(common);
+  complete(&layer->callbackSync);
+}
+
+/**
+ * Wait for a synchronous operation to complete.
+ *
+ *

Implements OperationWaiter. + * + * @param common The kernel layer + **/ +static void waitForSyncOperation(PhysicalLayer *common) +{ + KernelLayer *layer = asKernelLayer(common); + // Using the "interruptible" interface means that Linux will not log a + // message when we wait for more than 120 seconds. + while (wait_for_completion_interruptible(&layer->callbackSync) != 0) { + // However, if we get a signal in a user-mode process, we could + // spin... + msleep(1); + } +} + +/** + * Make the bio set for allocating new bios. + * + * @param layer The kernel layer + * + * @returns VDO_SUCCESS if bio set created, error code otherwise + **/ +static int makeDedupeBioSet(KernelLayer *layer) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,18,0) + int result = ALLOCATE(1, struct bio_set, "bio set", &layer->bioset); + if (result != VDO_SUCCESS) { + return result; + } + + result = bioset_init(layer->bioset, 0, 0, BIOSET_NEED_BVECS); + if (result != 0) { + return result; + } +#else +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,13,0) + layer->bioset = bioset_create(0, 0, BIOSET_NEED_BVECS); +#else + layer->bioset = bioset_create(0, 0); +#endif + if (layer->bioset == NULL) { + return -ENOMEM; + } +#endif + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeKernelLayer(uint64_t startingSector, + unsigned int instance, + DeviceConfig *config, + struct kobject *parentKobject, + ThreadConfig **threadConfigPointer, + char **reason, + KernelLayer **layerPtr) +{ + // VDO-3769 - Set a generic reason so we don't ever return garbage. + *reason = "Unspecified error"; + + KernelLayer *oldLayer = findLayerMatching(layerUsesDevice, config); + if (oldLayer != NULL) { + logError("Existing layer named %s already uses device %s", + oldLayer->deviceConfig->poolName, + oldLayer->deviceConfig->parentDeviceName); + *reason = "Cannot share storage device with already-running VDO"; + return VDO_BAD_CONFIGURATION; + } + + /* + * Part 1 - Allocate the kernel layer, its essential parts, and setup up the + * sysfs node. These must come first so that the sysfs node works correctly + * through the freeing of the kernel layer. After this part you must use + * freeKernelLayer. + */ + KernelLayer *layer; + int result = ALLOCATE(1, KernelLayer, "VDO configuration", &layer); + if (result != UDS_SUCCESS) { + *reason = "Cannot allocate VDO configuration"; + return result; + } + + // Allow the base VDO to allocate buffers and construct or destroy + // enqueuables as part of its allocation. + layer->common.allocateIOBuffer = kvdoAllocateIOBuffer; + layer->common.createEnqueueable = kvdoCreateEnqueueable; + layer->common.destroyEnqueueable = kvdoDestroyEnqueueable; + + result = allocateVDO(&layer->common, &layer->kvdo.vdo); + if (result != VDO_SUCCESS) { + *reason = "Cannot allocate VDO"; + FREE(layer); + return result; + } + + // After this point, calling kobject_put on kobj will decrement its + // reference count, and when the count goes to 0 the KernelLayer will + // be freed. 
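+  // (The sysfs error paths immediately below therefore release the layer
+  // with kobject_put() rather than FREE().)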
+ kobject_init(&layer->kobj, &kernelLayerKobjType); + result = kobject_add(&layer->kobj, parentKobject, config->poolName); + if (result != 0) { + *reason = "Cannot add sysfs node"; + kobject_put(&layer->kobj); + return result; + } + kobject_init(&layer->wqDirectory, &workQueueDirectoryKobjType); + result = kobject_add(&layer->wqDirectory, &layer->kobj, "work_queues"); + if (result != 0) { + *reason = "Cannot add sysfs node"; + kobject_put(&layer->wqDirectory); + kobject_put(&layer->kobj); + return result; + } + + /* + * Part 2 - Do all the simple initialization. These initializations have no + * order dependencies and can be done in any order, but freeKernelLayer() + * cannot be called until all the simple layer properties are set. + * + * The KernelLayer structure starts as all zeros. Pointer initializations + * consist of replacing a NULL pointer with a non-NULL pointer, which can be + * easily undone by freeing all of the non-NULL pointers (using the proper + * free routine). + */ + setKernelLayerState(layer, LAYER_SIMPLE_THINGS_INITIALIZED); + + initializeDeadlockQueue(&layer->deadlockQueue); + + int requestLimit = defaultMaxRequestsActive; + initializeLimiter(&layer->requestLimiter, requestLimit); + initializeLimiter(&layer->discardLimiter, requestLimit * 3 / 4); + + layer->allocationsAllowed = true; + layer->instance = instance; + layer->deviceConfig = config; + layer->startingSectorOffset = startingSector; + initializeRing(&layer->deviceConfigRing); + + layer->common.updateCRC32 = kvdoUpdateCRC32; + layer->common.getBlockCount = kvdoGetBlockCount; + layer->common.getWritePolicy = kvdoGetWritePolicy; + layer->common.createMetadataVIO = kvdoCreateMetadataVIO; + layer->common.createCompressedWriteVIO = kvdoCreateCompressedWriteVIO; + layer->common.freeVIO = kvdoFreeVIO; + layer->common.completeFlush = kvdoCompleteFlush; + layer->common.enqueue = kvdoEnqueue; + layer->common.waitForAdminOperation = waitForSyncOperation; + layer->common.completeAdminOperation = kvdoCompleteSyncOperation; + layer->common.getCurrentThreadID = kvdoGetCurrentThreadID; + layer->common.zeroDataVIO = kvdoZeroDataVIO; + layer->common.compareDataVIOs = kvdoCompareDataVIOs; + layer->common.copyData = kvdoCopyDataVIO; + layer->common.readData = kvdoReadDataVIO; + layer->common.writeData = kvdoWriteDataVIO; + layer->common.writeCompressedBlock = kvdoWriteCompressedBlock; + layer->common.readMetadata = kvdoSubmitMetadataVIO; + layer->common.writeMetadata = kvdoSubmitMetadataVIO; + layer->common.applyPartialWrite = kvdoModifyWriteDataVIO; + layer->common.flush = kvdoFlushVIO; + layer->common.hashData = kvdoHashDataVIO; + layer->common.checkForDuplication = kvdoCheckForDuplication; + layer->common.verifyDuplication = kvdoVerifyDuplication; + layer->common.acknowledgeDataVIO = kvdoAcknowledgeDataVIO; + layer->common.compressDataVIO = kvdoCompressDataVIO; + layer->common.updateAlbireo = kvdoUpdateDedupeAdvice; + + spin_lock_init(&layer->flushLock); + mutex_init(&layer->statsMutex); + bio_list_init(&layer->waitingFlushes); + + result = addLayerToDeviceRegistry(layer); + if (result != VDO_SUCCESS) { + *reason = "Cannot add layer to device registry"; + freeKernelLayer(layer); + return result; + } + + snprintf(layer->threadNamePrefix, sizeof(layer->threadNamePrefix), "%s%u", + THIS_MODULE->name, instance); + + result = makeThreadConfig(config->threadCounts.logicalZones, + config->threadCounts.physicalZones, + config->threadCounts.hashZones, + threadConfigPointer); + if (result != VDO_SUCCESS) { + *reason = "Cannot create 
thread configuration"; + freeKernelLayer(layer); + return result; + } + + logInfo("zones: %d logical, %d physical, %d hash; base threads: %d", + config->threadCounts.logicalZones, + config->threadCounts.physicalZones, + config->threadCounts.hashZones, + (*threadConfigPointer)->baseThreadCount); + + result = makeBatchProcessor(layer, returnDataKVIOBatchToPool, layer, + &layer->dataKVIOReleaser); + if (result != UDS_SUCCESS) { + *reason = "Cannot allocate KVIO-freeing batch processor"; + freeKernelLayer(layer); + return result; + } + + // Spare KVDOFlush, so that we will always have at least one available + result = makeKVDOFlush(&layer->spareKVDOFlush); + if (result != UDS_SUCCESS) { + *reason = "Cannot allocate KVDOFlush record"; + freeKernelLayer(layer); + return result; + } + + // BIO pool (needed before the geometry block) + result = makeDedupeBioSet(layer); + if (result != VDO_SUCCESS) { + *reason = "Cannot allocate dedupe bioset"; + freeKernelLayer(layer); + return result; + } + + // Read the geometry block so we know how to set up the index. Allow it to + // do synchronous reads. + layer->common.reader = kvdoSynchronousRead; + result = loadVolumeGeometry(&layer->common, &layer->geometry); + layer->common.reader = NULL; + if (result != VDO_SUCCESS) { + *reason = "Could not load geometry block"; + freeKernelLayer(layer); + return result; + } + + // Albireo Timeout Reporter + initPeriodicEventReporter(&layer->albireoTimeoutReporter, + "Albireo timeout on %llu requests", + DEDUPE_TIMEOUT_REPORT_INTERVAL, layer); + + // Dedupe Index + BUG_ON(layer->threadNamePrefix[0] == '\0'); + result = makeDedupeIndex(&layer->dedupeIndex, layer); + if (result != UDS_SUCCESS) { + *reason = "Cannot initialize dedupe index"; + freeKernelLayer(layer); + return result; + } + + // Compression context storage + result = ALLOCATE(config->threadCounts.cpuThreads, char *, "LZ4 context", + &layer->compressionContext); + if (result != VDO_SUCCESS) { + *reason = "cannot allocate LZ4 context"; + freeKernelLayer(layer); + return result; + } + for (int i = 0; i < config->threadCounts.cpuThreads; i++) { + result = ALLOCATE(LZ4_context_size(), char, "LZ4 context", + &layer->compressionContext[i]); + if (result != VDO_SUCCESS) { + *reason = "cannot allocate LZ4 context"; + freeKernelLayer(layer); + return result; + } + } + + + /* + * Part 3 - Do initializations that depend upon other previous + * initializations, but have no order dependencies at freeing time. + * Order dependencies for initialization are identified using BUG_ON. + */ + setKernelLayerState(layer, LAYER_BUFFER_POOLS_INITIALIZED); + + // Trace pool + BUG_ON(layer->requestLimiter.limit <= 0); + result = traceKernelLayerInit(layer); + if (result != VDO_SUCCESS) { + *reason = "Cannot initialize trace data"; + freeKernelLayer(layer); + return result; + } + + // KVIO and VIO pool + BUG_ON(layer->deviceConfig->logicalBlockSize <= 0); + BUG_ON(layer->requestLimiter.limit <= 0); + BUG_ON(layer->bioset == NULL); + BUG_ON(layer->deviceConfig->ownedDevice == NULL); + result = makeDataKVIOBufferPool(layer, layer->requestLimiter.limit, + &layer->dataKVIOPool); + if (result != VDO_SUCCESS) { + *reason = "Cannot allocate vio data"; + freeKernelLayer(layer); + return result; + } + + /* + * Part 4 - Do initializations that depend upon other previous + * initialization, that may have order dependencies at freeing time. + * These are mostly starting up the workqueue threads. 
+ */ + + // Base-code thread, etc + result = initializeKVDO(&layer->kvdo, *threadConfigPointer, reason); + if (result != VDO_SUCCESS) { + freeKernelLayer(layer); + return result; + } + + setKernelLayerState(layer, LAYER_REQUEST_QUEUE_INITIALIZED); + + // Bio queue + result = makeIOSubmitter(layer->threadNamePrefix, + config->threadCounts.bioThreads, + config->threadCounts.bioRotationInterval, + layer->requestLimiter.limit, + layer, + &layer->ioSubmitter); + if (result != VDO_SUCCESS) { + // If initialization of the bio-queues failed, they are cleaned + // up already, so just free the rest of the kernel layer. + freeKernelLayer(layer); + *reason = "bio submission initialization failed"; + return result; + } + setKernelLayerState(layer, LAYER_BIO_DATA_INITIALIZED); + + // Bio ack queue + if (useBioAckQueue(layer)) { + result = makeWorkQueue(layer->threadNamePrefix, "ackQ", + &layer->wqDirectory, layer, layer, &bioAckQType, + config->threadCounts.bioAckThreads, + &layer->bioAckQueue); + if (result != VDO_SUCCESS) { + *reason = "bio ack queue initialization failed"; + freeKernelLayer(layer); + return result; + } + } + + setKernelLayerState(layer, LAYER_BIO_ACK_QUEUE_INITIALIZED); + + // CPU Queues + result = makeWorkQueue(layer->threadNamePrefix, "cpuQ", &layer->wqDirectory, + layer, NULL, &cpuQType, + config->threadCounts.cpuThreads, &layer->cpuQueue); + if (result != VDO_SUCCESS) { + *reason = "Albireo CPU queue initialization failed"; + freeKernelLayer(layer); + return result; + } + + setKernelLayerState(layer, LAYER_CPU_QUEUE_INITIALIZED); + + *layerPtr = layer; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int prepareToModifyKernelLayer(KernelLayer *layer, + DeviceConfig *config, + char **errorPtr) +{ + DeviceConfig *extantConfig = layer->deviceConfig; + if (config->owningTarget->begin != extantConfig->owningTarget->begin) { + *errorPtr = "Starting sector cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (strcmp(config->parentDeviceName, extantConfig->parentDeviceName) != 0) { + *errorPtr = "Underlying device cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (config->logicalBlockSize != extantConfig->logicalBlockSize) { + *errorPtr = "Logical block size cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (config->cacheSize != extantConfig->cacheSize) { + *errorPtr = "Block map cache size cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (config->blockMapMaximumAge != extantConfig->blockMapMaximumAge) { + *errorPtr = "Block map maximum age cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (config->mdRaid5ModeEnabled != extantConfig->mdRaid5ModeEnabled) { + *errorPtr = "mdRaid5Mode cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (memcmp(&config->threadCounts, &extantConfig->threadCounts, + sizeof(ThreadCountConfig)) != 0) { + *errorPtr = "Thread configuration cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + // Below here are the actions to take when a non-immutable property changes. + + if (config->writePolicy != extantConfig->writePolicy) { + // Nothing needs doing right now for a write policy change. 
+ } + + if (config->owningTarget->len != extantConfig->owningTarget->len) { + size_t logicalBytes = to_bytes(config->owningTarget->len); + if ((logicalBytes % VDO_BLOCK_SIZE) != 0) { + *errorPtr = "Logical size must be a multiple of 4096"; + return VDO_PARAMETER_MISMATCH; + } + + int result = prepareToResizeLogical(layer, logicalBytes / VDO_BLOCK_SIZE); + if (result != VDO_SUCCESS) { + *errorPtr = "Device prepareToGrowLogical failed"; + return result; + } + } + + if (config->physicalBlocks != extantConfig->physicalBlocks) { + int result = prepareToResizePhysical(layer, config->physicalBlocks); + if (result != VDO_SUCCESS) { + if (result == VDO_TOO_MANY_SLABS) { + *errorPtr = "Device prepareToGrowPhysical failed (specified physical" + " size too big based on formatted slab size)"; + } else { + *errorPtr = "Device prepareToGrowPhysical failed"; + } + return result; + } + } + + return VDO_SUCCESS; +} + +/********************************************************************** + * Modify the pool name of the device. + * + * @param layer The kernel layer + * @param oldName The old pool name + * @param newName The new pool name + * + * @return VDO_SUCCESS or an error + * + */ +int modifyPoolName(KernelLayer *layer, char *oldName, char *newName) +{ + // We use pool name for sysfs and procfs. Rename them accordingly + logInfo("Modify pool name from %s to %s", oldName, newName); + + void *procfsPrivate; + int result = vdoCreateProcfsEntry(layer, newName, &procfsPrivate); + if (result != VDO_SUCCESS) { + return result; + } + + result = kobject_rename(&layer->kobj, newName); + if (result != 0) { + vdoDestroyProcfsEntry(newName, procfsPrivate); + return result; + } + + void *tmpProcfs = layer->procfsPrivate; + layer->procfsPrivate = procfsPrivate; + + vdoDestroyProcfsEntry(oldName, tmpProcfs); + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int modifyKernelLayer(KernelLayer *layer, + DeviceConfig *config) +{ + KernelLayerState state = getKernelLayerState(layer); + if (state == LAYER_RUNNING) { + return VDO_SUCCESS; + } else if (state != LAYER_SUSPENDED) { + logError("pre-resume invoked while in unexpected kernel layer state %d", + state); + return -EINVAL; + } + + setKernelLayerState(layer, LAYER_RESUMING); + + DeviceConfig *extantConfig = layer->deviceConfig; + + // A failure here is unrecoverable. So there is no problem if it happens. + + if (config->writePolicy != extantConfig->writePolicy) { + /* + * Ordinarily, when going from async to sync, we must flush any metadata + * written. However, because the underlying storage must have gone into + * sync mode before we suspend VDO, and suspending VDO concludes by + * issuing a flush, all metadata written before the suspend is flushed + * by the suspend and all metadata between the suspend and the write + * policy change is written to synchronous storage. + */ + logInfo("Modifying device '%s' write policy from %s to %s", + config->poolName, getConfigWritePolicyString(extantConfig), + getConfigWritePolicyString(config)); + setWritePolicy(layer->kvdo.vdo, config->writePolicy); + } + + if (config->owningTarget->len != extantConfig->owningTarget->len) { + size_t logicalBytes = to_bytes(config->owningTarget->len); + int result = resizeLogical(layer, logicalBytes / VDO_BLOCK_SIZE); + if (result != VDO_SUCCESS) { + return result; + } + } + + // Grow physical if the version is 0, so we can't tell if we + // got an old-style growPhysical command, or if size changed. 
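+  //
+  // In other words: a version 0 table line carries no reliable signal for an
+  // explicit growPhysical request, so the resize below is attempted whenever
+  // the version is 0, in addition to whenever the configured physical size
+  // actually differs.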
+ if ((config->physicalBlocks != extantConfig->physicalBlocks) + || (config->version == 0)) { + int result = resizePhysical(layer, config->physicalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + } + + if (strcmp(config->poolName, extantConfig->poolName) != 0) { + logInfo("Modifying device '%s' pool name from %s to %s", + config->poolName, extantConfig->poolName, config->poolName); + int result = modifyPoolName(layer, extantConfig->poolName, + config->poolName); + if (result != VDO_SUCCESS) { + return result; + } + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeKernelLayer(KernelLayer *layer) +{ + // This is not the cleanest implementation, but given the current timing + // uncertainties in the shutdown process for work queues, we need to + // store information to enable a late-in-process deallocation of + // funnel-queue data structures in work queues. + bool usedBioAckQueue = false; + bool usedCpuQueue = false; + bool usedKVDO = false; + bool releaseInstance = false; + + KernelLayerState state = getKernelLayerState(layer); + switch (state) { + case LAYER_STOPPING: + logError("re-entered freeKernelLayer while stopping"); + break; + + case LAYER_RUNNING: + suspendKernelLayer(layer); + // fall through + + case LAYER_STARTING: + case LAYER_RESUMING: + case LAYER_SUSPENDED: + stopKernelLayer(layer); + // fall through + + case LAYER_STOPPED: + case LAYER_CPU_QUEUE_INITIALIZED: + finishWorkQueue(layer->cpuQueue); + usedCpuQueue = true; + releaseInstance = true; + // fall through + + case LAYER_BIO_ACK_QUEUE_INITIALIZED: + if (useBioAckQueue(layer)) { + finishWorkQueue(layer->bioAckQueue); + usedBioAckQueue = true; + } + // fall through + + case LAYER_BIO_DATA_INITIALIZED: + cleanupIOSubmitter(layer->ioSubmitter); + // fall through + + case LAYER_REQUEST_QUEUE_INITIALIZED: + finishKVDO(&layer->kvdo); + usedKVDO = true; + // fall through + + case LAYER_BUFFER_POOLS_INITIALIZED: + freeBufferPool(&layer->dataKVIOPool); + freeBufferPool(&layer->traceBufferPool); + // fall through + + case LAYER_SIMPLE_THINGS_INITIALIZED: + if (layer->compressionContext != NULL) { + for (int i = 0; i < layer->deviceConfig->threadCounts.cpuThreads; i++) { + FREE(layer->compressionContext[i]); + } + FREE(layer->compressionContext); + } + if (layer->dedupeIndex != NULL) { + finishDedupeIndex(layer->dedupeIndex); + } + FREE(layer->spareKVDOFlush); + layer->spareKVDOFlush = NULL; + freeBatchProcessor(&layer->dataKVIOReleaser); + removeLayerFromDeviceRegistry(layer); + break; + + default: + logError("Unknown Kernel Layer state: %d", state); + } + + // Late deallocation of resources in work queues. + if (usedCpuQueue) { + freeWorkQueue(&layer->cpuQueue); + } + if (usedBioAckQueue) { + freeWorkQueue(&layer->bioAckQueue); + } + if (layer->ioSubmitter) { + freeIOSubmitter(layer->ioSubmitter); + } + if (usedKVDO) { + destroyKVDO(&layer->kvdo); + } + if (layer->bioset != NULL) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,18,0) + bioset_exit(layer->bioset); + FREE(layer->bioset); +#else + bioset_free(layer->bioset); +#endif + layer->bioset = NULL; + } + + freeDedupeIndex(&layer->dedupeIndex); + + stopPeriodicEventReporter(&layer->albireoTimeoutReporter); + if (releaseInstance) { + releaseKVDOInstance(layer->instance); + } + + // The call to kobject_put on the kobj sysfs node will decrement its + // reference count; when the count goes to zero the VDO object and + // the kernel layer object will be freed as a side effect. 
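+  //
+  // Note the ordering: wqDirectory was added as a child of kobj, so it is
+  // put first; releasing the child also drops the reference it holds on its
+  // parent.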
+ kobject_put(&layer->wqDirectory); + kobject_put(&layer->kobj); +} + +/**********************************************************************/ +static void poolStatsRelease(struct kobject *kobj) +{ + KernelLayer *layer = container_of(kobj, KernelLayer, statsDirectory); + complete(&layer->statsShutdown); +} + +/**********************************************************************/ +int preloadKernelLayer(KernelLayer *layer, + const VDOLoadConfig *loadConfig, + char **reason) +{ + if (getKernelLayerState(layer) != LAYER_CPU_QUEUE_INITIALIZED) { + *reason = "preloadKernelLayer() may only be invoked after initialization"; + return UDS_BAD_STATE; + } + + setKernelLayerState(layer, LAYER_STARTING); + int result = preloadKVDO(&layer->kvdo, &layer->common, loadConfig, + layer->vioTraceRecording, reason); + if (result != VDO_SUCCESS) { + stopKernelLayer(layer); + return result; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int startKernelLayer(KernelLayer *layer, char **reason) +{ + if (getKernelLayerState(layer) != LAYER_STARTING) { + *reason = "Cannot start kernel from non-starting state"; + stopKernelLayer(layer); + return UDS_BAD_STATE; + } + + int result = startKVDO(&layer->kvdo, &layer->common, reason); + if (result != VDO_SUCCESS) { + stopKernelLayer(layer); + return result; + } + + setKernelLayerState(layer, LAYER_RUNNING); + static struct kobj_type statsDirectoryKobjType = { + .release = poolStatsRelease, + .sysfs_ops = &poolStatsSysfsOps, + .default_attrs = poolStatsAttrs, + }; + kobject_init(&layer->statsDirectory, &statsDirectoryKobjType); + result = kobject_add(&layer->statsDirectory, &layer->kobj, "statistics"); + if (result != 0) { + *reason = "Cannot add sysfs statistics node"; + stopKernelLayer(layer); + return result; + } + layer->statsAdded = true; + + if (layer->deviceConfig->deduplication) { + // Don't try to load or rebuild the index first (and log scary error + // messages) if this is known to be a newly-formatted volume. + startDedupeIndex(layer->dedupeIndex, wasNew(layer->kvdo.vdo)); + } + + result = vdoCreateProcfsEntry(layer, layer->deviceConfig->poolName, + &layer->procfsPrivate); + if (result != VDO_SUCCESS) { + *reason = "Could not create proc filesystem entry"; + stopKernelLayer(layer); + return result; + } + + layer->allocationsAllowed = false; + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void stopKernelLayer(KernelLayer *layer) +{ + layer->allocationsAllowed = true; + + // Stop services that need to gather VDO statistics from the worker threads. + if (layer->statsAdded) { + layer->statsAdded = false; + init_completion(&layer->statsShutdown); + kobject_put(&layer->statsDirectory); + wait_for_completion(&layer->statsShutdown); + } + vdoDestroyProcfsEntry(layer->deviceConfig->poolName, layer->procfsPrivate); + + switch (getKernelLayerState(layer)) { + case LAYER_RUNNING: + suspendKernelLayer(layer); + // fall through + + case LAYER_SUSPENDED: + setKernelLayerState(layer, LAYER_STOPPING); + stopDedupeIndex(layer->dedupeIndex); + // fall through + + case LAYER_STOPPING: + case LAYER_STOPPED: + default: + setKernelLayerState(layer, LAYER_STOPPED); + } +} + +/**********************************************************************/ +int suspendKernelLayer(KernelLayer *layer) +{ + // It's important to note any error here does not actually stop device-mapper + // from suspending the device. All this work is done post suspend. 
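+  //
+  // For context: the device-mapper .postsuspend hook returns void, so an
+  // error reported here cannot abort the suspend. A caller wiring this into
+  // the target might look roughly like the following (callback and helper
+  // names are illustrative, not taken from this file):
+  //
+  //   static void vdoPostsuspend(struct dm_target *ti)
+  //   {
+  //     KernelLayer *layer = getLayerForTarget(ti);   // illustrative helper
+  //     int result = suspendKernelLayer(layer);
+  //     if (result != VDO_SUCCESS) {
+  //       logError("suspend of device '%s' failed %d",
+  //                layer->deviceConfig->poolName, result);
+  //     }
+  //   }
+  //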
+ KernelLayerState state = getKernelLayerState(layer); + if (state == LAYER_SUSPENDED) { + return VDO_SUCCESS; + } + if (state != LAYER_RUNNING) { + logError("Suspend invoked while in unexpected kernel layer state %d", + state); + return -EINVAL; + } + + /* + * Attempt to flush all I/O before completing post suspend work. This is + * needed so that changing write policy upon resume is safe. Also, we think + * a suspended device is expected to have persisted all data written before + * the suspend, even if it hasn't been flushed yet. + */ + waitForNoRequestsActive(layer); + int result = synchronousFlush(layer); + if (result != VDO_SUCCESS) { + setKVDOReadOnly(&layer->kvdo, result); + } + + /* + * Suspend the VDO, writing out all dirty metadata if the no-flush flag + * was not set on the dmsetup suspend call. This will ensure that we don't + * have cause to write while suspended [VDO-4402]. + */ + int suspendResult = suspendKVDO(&layer->kvdo); + if (result == VDO_SUCCESS) { + result = suspendResult; + } + + suspendDedupeIndex(layer->dedupeIndex, !layer->noFlushSuspend); + setKernelLayerState(layer, LAYER_SUSPENDED); + return result; +} + +/**********************************************************************/ +int resumeKernelLayer(KernelLayer *layer) +{ + if (getKernelLayerState(layer) == LAYER_RUNNING) { + return VDO_SUCCESS; + } + + resumeDedupeIndex(layer->dedupeIndex); + int result = resumeKVDO(&layer->kvdo); + if (result != VDO_SUCCESS) { + return result; + } + + setKernelLayerState(layer, LAYER_RUNNING); + return VDO_SUCCESS; +} + +/***********************************************************************/ +int prepareToResizePhysical(KernelLayer *layer, BlockCount physicalCount) +{ + logInfo("Preparing to resize physical to %llu", physicalCount); + // Allocations are allowed and permissible through this non-VDO thread, + // since IO triggered by this allocation to VDO can finish just fine. + int result = kvdoPrepareToGrowPhysical(&layer->kvdo, physicalCount); + if (result != VDO_SUCCESS) { + // kvdoPrepareToGrowPhysical logs errors. + if (result == VDO_PARAMETER_MISMATCH) { + // If we don't trap this case, mapToSystemError() will remap it to -EIO, + // which is misleading and ahistorical. + return -EINVAL; + } else { + return result; + } + } + + logInfo("Done preparing to resize physical"); + return VDO_SUCCESS; +} + +/***********************************************************************/ +int resizePhysical(KernelLayer *layer, BlockCount physicalCount) +{ + // We must not mark the layer as allowing allocations when it is suspended + // lest an allocation attempt block on writing IO to the suspended VDO. + int result = kvdoResizePhysical(&layer->kvdo, physicalCount); + if (result != VDO_SUCCESS) { + // kvdoResizePhysical logs errors + return result; + } + return VDO_SUCCESS; +} + +/***********************************************************************/ +int prepareToResizeLogical(KernelLayer *layer, BlockCount logicalCount) +{ + logInfo("Preparing to resize logical to %llu", logicalCount); + // Allocations are allowed and permissible through this non-VDO thread, + // since IO triggered by this allocation to VDO can finish just fine. 
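+  //
+  // This is normally reached from a device-mapper table reload that specifies
+  // a larger logical size, followed by a resume; roughly (the device name and
+  // table arguments below are illustrative):
+  //
+  //   dmsetup reload vdo0 --table "0 <new-larger-sector-count> vdo <args...>"
+  //   dmsetup resume vdo0
+  //
+  // The reload path runs prepareToModifyKernelLayer(), which lands here, and
+  // the pre-resume path then performs the actual grow via resizeLogical() in
+  // modifyKernelLayer().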
+ int result = kvdoPrepareToGrowLogical(&layer->kvdo, logicalCount); + if (result != VDO_SUCCESS) { + // kvdoPrepareToGrowLogical logs errors + return result; + } + + logInfo("Done preparing to resize logical"); + return VDO_SUCCESS; +} + +/***********************************************************************/ +int resizeLogical(KernelLayer *layer, BlockCount logicalCount) +{ + logInfo("Resizing logical to %llu", logicalCount); + // We must not mark the layer as allowing allocations when it is suspended + // lest an allocation attempt block on writing IO to the suspended VDO. + int result = kvdoResizeLogical(&layer->kvdo, logicalCount); + if (result != VDO_SUCCESS) { + // kvdoResizeLogical logs errors + return result; + } + + logInfo("Logical blocks now %llu", logicalCount); + return VDO_SUCCESS; +} + diff --git a/vdo/kernel/kernelLayer.h b/vdo/kernel/kernelLayer.h new file mode 100644 index 0000000..4e0bf8c --- /dev/null +++ b/vdo/kernel/kernelLayer.h @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelLayer.h#18 $ + */ + +#ifndef KERNELLAYER_H +#define KERNELLAYER_H + +#include + +#include "atomic.h" +#include "constants.h" +#include "flush.h" +#include "intMap.h" +#include "physicalLayer.h" +#include "ringNode.h" +#include "volumeGeometry.h" +#include "waitQueue.h" + +#include "batchProcessor.h" +#include "bufferPool.h" +#include "deadlockQueue.h" +#include "deviceConfig.h" +#include "histogram.h" +#include "kernelStatistics.h" +#include "kernelTypes.h" +#include "kernelVDO.h" +#include "ktrace.h" +#include "limiter.h" +#include "statistics.h" +#include "workQueue.h" + +enum { + VDO_SECTORS_PER_BLOCK = (VDO_BLOCK_SIZE >> SECTOR_SHIFT) +}; + +typedef enum { + LAYER_SIMPLE_THINGS_INITIALIZED, + LAYER_BUFFER_POOLS_INITIALIZED, + LAYER_REQUEST_QUEUE_INITIALIZED, + LAYER_CPU_QUEUE_INITIALIZED, + LAYER_BIO_ACK_QUEUE_INITIALIZED, + LAYER_BIO_DATA_INITIALIZED, + LAYER_STARTING, + LAYER_RUNNING, + LAYER_SUSPENDED, + LAYER_STOPPING, + LAYER_STOPPED, + LAYER_RESUMING, +} KernelLayerState; + +/* Keep BIO statistics atomically */ +struct atomicBioStats { + atomic64_t read; // Number of not REQ_WRITE bios + atomic64_t write; // Number of REQ_WRITE bios + atomic64_t discard; // Number of REQ_DISCARD bios + atomic64_t flush; // Number of REQ_FLUSH bios + atomic64_t fua; // Number of REQ_FUA bios +}; + +// Data managing the reporting of Albireo timeouts +typedef struct periodicEventReporter { + uint64_t lastReportedValue; + const char *format; + atomic64_t value; + Jiffies reportingInterval; // jiffies + /* + * Just an approximation. 
If nonzero, then either the work item has + * been queued to run, or some other thread currently has + * responsibility for enqueueing it, or the reporter function is + * running but hasn't looked at the current value yet. + * + * If this is set, don't set the timer again, because we don't want + * the work item queued twice. Use an atomic xchg or cmpxchg to + * test-and-set it, and an atomic store to clear it. + */ + atomic_t workItemQueued; + KvdoWorkItem workItem; + KernelLayer *layer; +} PeriodicEventReporter; + +static inline uint64_t getEventCount(PeriodicEventReporter *reporter) +{ + return atomic64_read(&reporter->value); +} + +/** + * The VDO representation of the target device + **/ +struct kernelLayer { + PhysicalLayer common; + // Layer specific info + DeviceConfig *deviceConfig; + /** A ring of all DeviceConfigs referencing this layer */ + RingNode deviceConfigRing; + char threadNamePrefix[MAX_QUEUE_NAME_LEN]; + struct kobject kobj; + struct kobject wqDirectory; + struct kobject statsDirectory; + /** + * A counter value to attach to thread names and log messages to + * identify the individual device. + **/ + unsigned int instance; + /** Contains the current KernelLayerState, which rarely changes */ + Atomic32 state; + bool noFlushSuspend; + bool allocationsAllowed; + AtomicBool processingMessage; + /** Limit the number of requests that are being processed. */ + Limiter requestLimiter; + Limiter discardLimiter; + KVDO kvdo; + /** Incoming bios we've had to buffer to avoid deadlock. */ + DeadlockQueue deadlockQueue; + // for REQ_FLUSH processing + struct bio_list waitingFlushes; + KVDOFlush *spareKVDOFlush; + spinlock_t flushLock; + Jiffies flushArrivalTime; + /** + * Bio submission manager used for sending bios to the storage + * device. + **/ + IOSubmitter *ioSubmitter; + /** + * Work queue (possibly with multiple threads) for miscellaneous + * CPU-intensive, non-blocking work. + **/ + KvdoWorkQueue *cpuQueue; + /** N blobs of context data for LZ4 code, one per CPU thread. */ + char **compressionContext; + Atomic32 compressionContextIndex; + /** Optional work queue for calling bio_endio. */ + KvdoWorkQueue *bioAckQueue; + /** Underlying block device info. */ + uint64_t startingSectorOffset; + VolumeGeometry geometry; + // Memory allocation + BufferPool *dataKVIOPool; + struct bio_set *bioset; + // Albireo specific info + DedupeIndex *dedupeIndex; + // Statistics + atomic64_t biosSubmitted; + atomic64_t biosCompleted; + atomic64_t dedupeContextBusy; + atomic64_t flushOut; + AtomicBioStats biosIn; + AtomicBioStats biosInPartial; + AtomicBioStats biosOut; + AtomicBioStats biosOutCompleted; + AtomicBioStats biosAcknowledged; + AtomicBioStats biosAcknowledgedPartial; + AtomicBioStats biosMeta; + AtomicBioStats biosMetaCompleted; + AtomicBioStats biosJournal; + AtomicBioStats biosPageCache; + AtomicBioStats biosJournalCompleted; + AtomicBioStats biosPageCacheCompleted; + // for reporting Albireo timeouts + PeriodicEventReporter albireoTimeoutReporter; + // Debugging + /* Whether to dump VDO state on shutdown */ + bool dumpOnShutdown; + /** + * Whether we should collect tracing info. (Actually, this controls + * allocations; non-null record pointers cause recording.) + **/ + bool vioTraceRecording; + SampleCounter traceSampleCounter; + /* Should we log tracing info? */ + bool traceLogging; + /* Storage for trace data. */ + BufferPool *traceBufferPool; + /* Private storage for procfs. 
*/ + void *procfsPrivate; + /* For returning batches of DataKVIOs to their pool */ + BatchProcessor *dataKVIOReleaser; + + // Administrative operations + /* The object used to wait for administrative operations to complete */ + struct completion callbackSync; + + // Statistics reporting + /* Protects the *statsStorage structs */ + struct mutex statsMutex; + /* Used when shutting down the sysfs statistics */ + struct completion statsShutdown;; + /* true if sysfs statistics directory is set up */ + bool statsAdded; + /* Used to gather statistics without allocating memory */ + VDOStatistics vdoStatsStorage; + KernelStatistics kernelStatsStorage; +}; + +typedef enum bioQAction { + BIO_Q_ACTION_COMPRESSED_DATA, + BIO_Q_ACTION_DATA, + BIO_Q_ACTION_FLUSH, + BIO_Q_ACTION_HIGH, + BIO_Q_ACTION_METADATA, + BIO_Q_ACTION_READCACHE, + BIO_Q_ACTION_VERIFY +} BioQAction; + +typedef enum cpuQAction { + CPU_Q_ACTION_COMPLETE_KVIO, + CPU_Q_ACTION_COMPRESS_BLOCK, + CPU_Q_ACTION_EVENT_REPORTER, + CPU_Q_ACTION_HASH_BLOCK, +} CPUQAction; + +typedef enum bioAckQAction { + BIO_ACK_Q_ACTION_ACK, +} BioAckQAction; + +typedef void (*DedupeShutdownCallbackFunction)(KernelLayer *layer); + +/* + * Wrapper for the Enqueueable object, to associate it with a kernel + * layer work item. + */ +typedef struct kvdoEnqueueable { + KvdoWorkItem workItem; + Enqueueable enqueueable; +} KvdoEnqueueable; + +/** + * Implements LayerFilter. + **/ +bool layerIsNamed(KernelLayer *layer, void *context) + __attribute__((warn_unused_result)); + +/** + * Creates a kernel specific physical layer to be used by VDO + * + * @param startingSector The sector offset of our table entry in the + * DM device + * @param instance Device instantiation counter + * @param parentKobject The parent sysfs node + * @param config The device configuration + * @param threadConfigPointer Where to store the new threadConfig handle + * @param reason The reason for any failure during this call + * @param layerPtr A pointer to hold the created layer + * + * @return VDO_SUCCESS or an error + **/ +int makeKernelLayer(uint64_t startingSector, + unsigned int instance, + DeviceConfig *config, + struct kobject *parentKobject, + ThreadConfig **threadConfigPointer, + char **reason, + KernelLayer **layerPtr) + __attribute__((warn_unused_result)); + +/** + * Prepare to modify a kernel layer. + * + * @param layer The layer to modify + * @param config The new device configuration + * @param errorPtr A pointer to store the reason for any failure + * + * @return VDO_SUCCESS or an error + **/ +int prepareToModifyKernelLayer(KernelLayer *layer, + DeviceConfig *config, + char **errorPtr) + __attribute__((warn_unused_result)); + +/** + * Modify a kernel physical layer. + * + * @param layer The layer to modify + * @param config The new device configuration + * + * @return VDO_SUCCESS or an error + **/ +int modifyKernelLayer(KernelLayer *layer, + DeviceConfig *config) + __attribute__((warn_unused_result)); + +/** + * Free a kernel physical layer. + * + * @param layer The layer, which must have been created by + * makeKernelLayer + **/ +void freeKernelLayer(KernelLayer *layer); + +/** + * Make and configure a kernel layer. This method does not alter the VDO state + * on disk. It should be run from the VDO constructor for devices which have + * not been started. 
+ * + * @param layer The kernel layer + * @param loadConfig Load-time parameters for the VDO + * @param reason The reason for any failure during this call + * + * @return VDO_SUCCESS or an error + * + * @note redundant starts are silently ignored + **/ +int preloadKernelLayer(KernelLayer *layer, + const VDOLoadConfig *loadConfig, + char **reason); + +/** + * Start the kernel layer. This method finishes bringing a VDO online now that + * a table is being resumed for the first time. + * + * @param layer The kernel layer + * @param reason The reason for any failure during this call + * + * @return VDO_SUCCESS or an error + **/ +int startKernelLayer(KernelLayer *layer, char **reason); + +/** + * Stop the kernel layer. + * + * @param layer The kernel layer + **/ +void stopKernelLayer(KernelLayer *layer); + +/** + * Suspend the kernel layer. + * + * @param layer The kernel layer + * + * @return VDO_SUCCESS or an error + **/ +int suspendKernelLayer(KernelLayer *layer); + +/** + * Resume the kernel layer. + * + * @param layer The kernel layer + * + * @return VDO_SUCCESS or an error + **/ +int resumeKernelLayer(KernelLayer *layer); + +/** + * Get the kernel layer state. + * + * @param layer The kernel layer + * + * @return the instantaneously correct kernel layer state + **/ +static inline KernelLayerState getKernelLayerState(const KernelLayer *layer) +{ + return atomicLoad32(&layer->state); +} + +/** + * Function call to begin processing a bio passed in from the block layer + * + * @param layer The physical layer + * @param bio The bio from the block layer + * + * @return value to return from the VDO map function. Either an error code + * or DM_MAPIO_REMAPPED or DM_MAPPED_SUBMITTED (see vdoMapBio for + * details). + **/ +int kvdoMapBio(KernelLayer *layer, BIO *bio); + +/** + * Convert a generic PhysicalLayer to a kernelLayer. + * + * @param layer The PhysicalLayer to convert + * + * @return The PhysicalLayer as a KernelLayer + **/ +static inline KernelLayer *asKernelLayer(PhysicalLayer *layer) +{ + return container_of(layer, KernelLayer, common); +} + +/** + * Convert a block number (or count) to a (512-byte-)sector number. + * + * The argument type is sector_t to force conversion to the type we + * want, although the actual values passed are of various integral + * types. It's just too easy to forget and do the multiplication + * without casting, resulting in 32-bit arithmetic that accidentally + * produces wrong results in devices over 2TB (2**32 sectors). + * + * @param [in] layer the physical layer + * @param [in] blockNumber the block number/count + * + * @return the sector number/count + **/ +static inline sector_t blockToSector(KernelLayer *layer, sector_t blockNumber) +{ + return (blockNumber * VDO_SECTORS_PER_BLOCK); +} + +/** + * Convert a sector number (or count) to a block number. Does not + * check to make sure the sector number is an integral number of + * blocks. + * + * @param [in] layer the physical layer + * @param [in] sectorNumber the sector number/count + * + * @return the block number/count + **/ +static inline sector_t sectorToBlock(KernelLayer *layer, sector_t sectorNumber) +{ + return (sectorNumber / VDO_SECTORS_PER_BLOCK); +} + +/** + * Convert a sector number to an offset within a block. 
+ * + * @param [in] layer the physical layer + * @param [in] sectorNumber the sector number + * + * @return the offset within the block + **/ +static inline BlockSize sectorToBlockOffset(KernelLayer *layer, + sector_t sectorNumber) +{ + unsigned int sectorsPerBlockMask = VDO_SECTORS_PER_BLOCK - 1; + return to_bytes(sectorNumber & sectorsPerBlockMask); +} + +/** + * Get the block device object currently underlying a kernel layer. + * + * @param layer The kernel layer in question + * + * @return The block device object under the layer + **/ +struct block_device *getKernelLayerBdev(const KernelLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Set the layer's active config. + * + * @param layer The kernel layer in question + * @param config The config in question + **/ +static inline void setKernelLayerActiveConfig(KernelLayer *layer, + DeviceConfig *config) +{ + layer->deviceConfig = config; +} + +/** + * Given an error code, return a value we can return to the OS. The + * input error code may be a system-generated value (such as -EIO), an + * errno macro used in our code (such as EIO), or a UDS or VDO status + * code; the result must be something the rest of the OS can consume + * (negative errno values such as -EIO, in the case of the kernel). + * + * @param error the error code to convert + * + * @return a system error code value + **/ +int mapToSystemError(int error); + +/** + * Record and eventually report that some number of dedupe requests + * reached their expiration time without getting an answer, so we + * timed out on them. + * + * This is called in a timer context, so it shouldn't do the reporting + * directly. + * + * @param layer The kernel layer for the device + * @param expiredCount The number of expired requests we timed out on + **/ +void kvdoReportDedupeTimeout(KernelLayer *layer, unsigned int expiredCount); + +/** + * Wait until there are no requests in progress. + * + * @param layer The kernel layer for the device + **/ +void waitForNoRequestsActive(KernelLayer *layer); + +/** + * Enqueues an item on our internal "cpu queues". Since there is more than + * one, we rotate through them in hopes of creating some general balance. + * + * @param layer The kernel layer + * @param item The work item to enqueue + */ +static inline void enqueueCPUWorkQueue(KernelLayer *layer, KvdoWorkItem *item) +{ + enqueueWorkQueue(layer->cpuQueue, item); +} + +/** + * Adjust parameters to prepare to use a larger physical space. + * The size must be larger than the current size. + * + * @param layer the kernel layer + * @param physicalCount the new physical size in blocks + * + * @return VDO_SUCCESS or an error + */ +int prepareToResizePhysical(KernelLayer *layer, BlockCount physicalCount); + +/** + * Adjusts parameters to reflect resizing the underlying device. + * The size must be larger than the current size. + * + * @param layer the kernel layer + * @param physicalCount the new physical count in blocks + * + * @return VDO_SUCCESS or an error + */ +int resizePhysical(KernelLayer *layer, BlockCount physicalCount); + +/** + * Adjust parameters to prepare to present a larger logical space. + * The size must be larger than the current size. + * + * @param layer the kernel layer + * @param logicalCount the new logical size in blocks + * + * @return VDO_SUCCESS or an error + */ +int prepareToResizeLogical(KernelLayer *layer, BlockCount logicalCount); + +/** + * Adjust parameters to present a larger logical space. + * The size must be larger than the current size. 
+ * + * @param layer the kernel layer + * @param logicalCount the new logical size in blocks + * + * @return VDO_SUCCESS or an error + */ +int resizeLogical(KernelLayer *layer, BlockCount logicalCount); + +/** + * Indicate whether the kernel layer is configured to use a separate + * work queue for acknowledging received and processed bios. + * + * Note that this directly controls handling of write operations, but + * the compile-time flag USE_BIO_ACK_QUEUE_FOR_READ is also checked + * for read operations. + * + * @param layer The kernel layer + * + * @return Whether a bio-acknowledgement work queue is in use + **/ +static inline bool useBioAckQueue(KernelLayer *layer) +{ + return layer->deviceConfig->threadCounts.bioAckThreads > 0; +} + +/** + * Update bookkeeping for the completion of some number of requests, so that + * more incoming requests can be accepted. + * + * @param layer The kernel layer + * @param count The number of completed requests + **/ +void completeManyRequests(KernelLayer *layer, uint32_t count); + +#endif /* KERNELLAYER_H */ diff --git a/vdo/kernel/kernelStatistics.h b/vdo/kernel/kernelStatistics.h new file mode 100644 index 0000000..a5c1210 --- /dev/null +++ b/vdo/kernel/kernelStatistics.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef KERNEL_STATISTICS_H +#define KERNEL_STATISTICS_H + +#include "header.h" +#include "types.h" + +typedef struct { + /** Number of not REQ_WRITE bios */ + uint64_t read; + /** Number of REQ_WRITE bios */ + uint64_t write; + /** Number of REQ_DISCARD bios */ + uint64_t discard; + /** Number of REQ_FLUSH bios */ + uint64_t flush; + /** Number of REQ_FUA bios */ + uint64_t fua; +} BioStats; + +typedef struct { + /** Tracked bytes currently allocated. */ + uint64_t bytesUsed; + /** Maximum tracked bytes allocated. 
*/ + uint64_t peakBytesUsed; +} MemoryUsage; + +/** UDS index statistics */ +typedef struct { + /** Number of chunk names stored in the index */ + uint64_t entriesIndexed; + /** Number of post calls that found an existing entry */ + uint64_t postsFound; + /** Number of post calls that added a new entry */ + uint64_t postsNotFound; + /** Number of query calls that found an existing entry */ + uint64_t queriesFound; + /** Number of query calls that added a new entry */ + uint64_t queriesNotFound; + /** Number of update calls that found an existing entry */ + uint64_t updatesFound; + /** Number of update calls that added a new entry */ + uint64_t updatesNotFound; + /** Current number of dedupe queries that are in flight */ + uint32_t currDedupeQueries; + /** Maximum number of dedupe queries that have been in flight */ + uint32_t maxDedupeQueries; +} IndexStatistics; + +typedef struct { + uint32_t version; + uint32_t releaseVersion; + /** The VDO instance */ + uint32_t instance; + /** Current number of active VIOs */ + uint32_t currentVIOsInProgress; + /** Maximum number of active VIOs */ + uint32_t maxVIOs; + /** Number of times the UDS index was too slow in responding */ + uint64_t dedupeAdviceTimeouts; + /** Number of flush requests submitted to the storage device */ + uint64_t flushOut; + /** Logical block size */ + uint64_t logicalBlockSize; + /** Bios submitted into VDO from above */ + BioStats biosIn; + BioStats biosInPartial; + /** Bios submitted onward for user data */ + BioStats biosOut; + /** Bios submitted onward for metadata */ + BioStats biosMeta; + BioStats biosJournal; + BioStats biosPageCache; + BioStats biosOutCompleted; + BioStats biosMetaCompleted; + BioStats biosJournalCompleted; + BioStats biosPageCacheCompleted; + BioStats biosAcknowledged; + BioStats biosAcknowledgedPartial; + /** Current number of bios in progress */ + BioStats biosInProgress; + /** Memory usage stats. */ + MemoryUsage memoryUsage; + /** The statistics for the UDS index */ + IndexStatistics index; +} KernelStatistics; + +/** + * Get the root for all stats proc files. + * + * @return The proc root + **/ +static inline const char *getProcRoot(void) { + return "vdo"; +} + +/** + * Get the proc file path for reading KernelStatistics. + * + * @return The proc file path + **/ +static inline const char *getKernelStatisticsProcFile(void) { + return "kernel_stats"; +} + +#endif /* not KERNEL_STATISTICS_H */ diff --git a/vdo/kernel/kernelTypes.h b/vdo/kernel/kernelTypes.h new file mode 100644 index 0000000..b338440 --- /dev/null +++ b/vdo/kernel/kernelTypes.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelTypes.h#3 $ + */ + +#ifndef KERNEL_TYPES_H +#define KERNEL_TYPES_H + +#include "types.h" + +/** + * The size of a discard request in bytes. + **/ +typedef uint32_t DiscardSize; + +/** + * A time in jiffies. + **/ +typedef uint64_t Jiffies; + +/** + * A timeout in jiffies. + **/ +typedef int64_t TimeoutJiffies; + +typedef struct atomicBioStats AtomicBioStats; +typedef struct bio BIO; +typedef struct dataKVIO DataKVIO; +typedef struct dedupeContext DedupeContext; +typedef struct dedupeIndex DedupeIndex; +typedef struct ioSubmitter IOSubmitter; +typedef struct kernelLayer KernelLayer; +typedef struct kvdo KVDO; +typedef struct kvdoFlush KVDOFlush; +typedef struct kvdoWorkItem KvdoWorkItem; +typedef struct kvdoWorkQueue KvdoWorkQueue; +typedef struct kvio KVIO; + +typedef void (*KVIOCallback)(KVIO *kvio); +typedef void (*DataKVIOCallback)(DataKVIO *dataKVIO); +typedef void (*KvdoWorkFunction)(KvdoWorkItem *workItem); + +/** + * Method type for layer matching methods. + * + * A LayerFilter method returns false if the layer doesn't match. + **/ +typedef bool LayerFilter(KernelLayer *layer, void *context); + +#endif /* KERNEL_TYPES_H */ diff --git a/vdo/kernel/kernelVDO.c b/vdo/kernel/kernelVDO.c new file mode 100644 index 0000000..5e1a72e --- /dev/null +++ b/vdo/kernel/kernelVDO.c @@ -0,0 +1,578 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelVDO.c#7 $ + */ + +#include "kernelVDOInternals.h" + +#include + +#include "memoryAlloc.h" + +#include "statistics.h" +#include "threadConfig.h" +#include "vdo.h" +#include "vdoDebug.h" +#include "vdoLoad.h" +#include "vdoResize.h" +#include "vdoResizeLogical.h" +#include "vdoResume.h" +#include "vdoSuspend.h" + +#include "kernelLayer.h" +#include "kvio.h" +#include "logger.h" + +enum { PARANOID_THREAD_CONSISTENCY_CHECKS = 0 }; + +/**********************************************************************/ +static void startKVDORequestQueue(void *ptr) +{ + KVDOThread *thread = ptr; + KVDO *kvdo = thread->kvdo; + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + registerAllocatingThread(&thread->allocatingThread, + &layer->allocationsAllowed); + setWorkQueuePrivateData(thread); +} + +/**********************************************************************/ +static void finishKVDORequestQueue(void *ptr) +{ + unregisterAllocatingThread(); +} + +/**********************************************************************/ +static const KvdoWorkQueueType requestQueueType = { + .start = startKVDORequestQueue, + .finish = finishKVDORequestQueue, + .actionTable = { + { .name = "req_completion", + .code = REQ_Q_ACTION_COMPLETION, + .priority = 1 }, + { .name = "req_flush", + .code = REQ_Q_ACTION_FLUSH, + .priority = 2 }, + { .name = "req_map_bio", + .code = REQ_Q_ACTION_MAP_BIO, + .priority = 0 }, + { .name = "req_sync", + .code = REQ_Q_ACTION_SYNC, + .priority = 2 }, + { .name = "req_vio_callback", + .code = REQ_Q_ACTION_VIO_CALLBACK, + .priority = 1 }, + }, +}; + +/**********************************************************************/ +int initializeKVDO(KVDO *kvdo, + const ThreadConfig *threadConfig, + char **reason) +{ + unsigned int baseThreads = threadConfig->baseThreadCount; + int result = ALLOCATE(baseThreads, KVDOThread, + "request processing work queue", + &kvdo->threads); + if (result != VDO_SUCCESS) { + *reason = "Cannot allocation thread structures"; + return result; + } + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + for (kvdo->initializedThreadCount = 0; + kvdo->initializedThreadCount < baseThreads; + kvdo->initializedThreadCount++) { + KVDOThread *thread = &kvdo->threads[kvdo->initializedThreadCount]; + + thread->kvdo = kvdo; + thread->threadID = kvdo->initializedThreadCount; + + char queueName[MAX_QUEUE_NAME_LEN]; + // Copy only LEN - 1 bytes and ensure NULL termination. 
+ getVDOThreadName(threadConfig, kvdo->initializedThreadCount, + queueName, sizeof(queueName)); + int result = makeWorkQueue(layer->threadNamePrefix, queueName, + &layer->wqDirectory, layer, thread, + &requestQueueType, 1, &thread->requestQueue); + if (result != VDO_SUCCESS) { + *reason = "Cannot initialize request queue"; + while (kvdo->initializedThreadCount > 0) { + unsigned int threadToDestroy = kvdo->initializedThreadCount - 1; + thread = &kvdo->threads[threadToDestroy]; + finishWorkQueue(thread->requestQueue); + freeWorkQueue(&thread->requestQueue); + kvdo->initializedThreadCount--; + } + FREE(kvdo->threads); + return result; + } + + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +int preloadKVDO(KVDO *kvdo, + PhysicalLayer *common, + const VDOLoadConfig *loadConfig, + bool vioTraceRecording, + char **reason) +{ + KernelLayer *layer = asKernelLayer(common); + init_completion(&layer->callbackSync); + int result = prepareToLoadVDO(kvdo->vdo, loadConfig); + if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { + *reason = "Cannot load metadata from device"; + return result; + } + + setVDOTracingFlags(kvdo->vdo, vioTraceRecording); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int startKVDO(KVDO *kvdo, PhysicalLayer *common, char **reason) +{ + KernelLayer *layer = asKernelLayer(common); + init_completion(&layer->callbackSync); + int result = performVDOLoad(kvdo->vdo); + if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { + *reason = "Cannot load metadata from device"; + return result; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int suspendKVDO(KVDO *kvdo) +{ + if (kvdo->vdo == NULL) { + return VDO_SUCCESS; + } + + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + init_completion(&layer->callbackSync); + int result = performVDOSuspend(kvdo->vdo, !layer->noFlushSuspend); + if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { + char errorName[80] = ""; + char errorMessage[ERRBUF_SIZE] = ""; + logError("%s: Suspend device failed %d (%s: %s)", + __func__, result, + stringErrorName(result, errorName, sizeof(errorName)), + stringError(result, errorMessage, sizeof(errorMessage))); + return result; + } + + // Convert VDO_READ_ONLY to VDO_SUCCESS since a read-only suspension still + // leaves the VDO suspended. 
+ return VDO_SUCCESS; +} + +/**********************************************************************/ +int resumeKVDO(KVDO *kvdo) +{ + if (kvdo->vdo == NULL) { + return VDO_SUCCESS; + } + + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + init_completion(&layer->callbackSync); + return performVDOResume(kvdo->vdo); +} + +/**********************************************************************/ +void finishKVDO(KVDO *kvdo) +{ + for (int i = 0; i < kvdo->initializedThreadCount; i++) { + finishWorkQueue(kvdo->threads[i].requestQueue); + } +} + +/**********************************************************************/ +void destroyKVDO(KVDO *kvdo) +{ + destroyVDO(kvdo->vdo); + for (int i = 0; i < kvdo->initializedThreadCount; i++) { + freeWorkQueue(&kvdo->threads[i].requestQueue); + } + FREE(kvdo->threads); + kvdo->threads = NULL; +} + + +/**********************************************************************/ +void dumpKVDOWorkQueue(KVDO *kvdo) +{ + for (int i = 0; i < kvdo->initializedThreadCount; i++) { + dumpWorkQueue(kvdo->threads[i].requestQueue); + } +} + +/**********************************************************************/ +typedef struct { + KvdoWorkItem workItem; + KVDO *kvdo; + void *data; + struct completion *completion; +} SyncQueueWork; + +/** + * Initiate an arbitrary asynchronous base-code operation and wait for + * it. + * + * An async queue operation is performed and we wait for completion. + * + * @param kvdo The kvdo data handle + * @param action The operation to perform + * @param data Unique data that can be used by the operation + * @param threadID The thread on which to perform the operation + * @param completion The completion to wait on + * + * @return VDO_SUCCESS of an error code + **/ +static void performKVDOOperation(KVDO *kvdo, + KvdoWorkFunction action, + void *data, + ThreadID threadID, + struct completion *completion) +{ + SyncQueueWork sync; + + memset(&sync, 0, sizeof(sync)); + setupWorkItem(&sync.workItem, action, NULL, REQ_Q_ACTION_SYNC); + sync.kvdo = kvdo; + sync.data = data; + sync.completion = completion; + + init_completion(completion); + enqueueKVDOWork(kvdo, &sync.workItem, threadID); + wait_for_completion(completion); +} + +/**********************************************************************/ +typedef struct { + bool enable; + bool wasEnabled; +} VDOCompressData; + +/** + * Does the work of calling the base code to set compress state, then + * tells the function waiting on completion to go ahead. 
+ * + * @param item The work item + **/ +static void setCompressingWork(KvdoWorkItem *item) +{ + SyncQueueWork *work = container_of(item, SyncQueueWork, workItem); + VDOCompressData *data = (VDOCompressData *)work->data; + data->wasEnabled = setVDOCompressing(getVDO(work->kvdo), data->enable); + complete(work->completion); +} + +/***********************************************************************/ +bool setKVDOCompressing(KVDO *kvdo, bool enableCompression) +{ + struct completion compressWait; + VDOCompressData data; + data.enable = enableCompression; + performKVDOOperation(kvdo, setCompressingWork, &data, + getPackerZoneThread(getThreadConfig(kvdo->vdo)), + &compressWait); + return data.wasEnabled; +} + +/**********************************************************************/ +typedef struct { + int result; +} VDOReadOnlyData; + +/**********************************************************************/ +static void enterReadOnlyModeWork(KvdoWorkItem *item) +{ + SyncQueueWork *work = container_of(item, SyncQueueWork, workItem); + VDOReadOnlyData *data = work->data; + makeVDOReadOnly(getVDO(work->kvdo), data->result); + complete(work->completion); +} + +/***********************************************************************/ +void setKVDOReadOnly(KVDO *kvdo, int result) +{ + struct completion readOnlyWait; + VDOReadOnlyData data; + data.result = result; + performKVDOOperation(kvdo, enterReadOnlyModeWork, &data, + getAdminThread(getThreadConfig(kvdo->vdo)), + &readOnlyWait); +} + +/** + * Does the work of calling the vdo statistics gathering tool + * + * @param item The work item + **/ +static void getVDOStatisticsWork(KvdoWorkItem *item) +{ + SyncQueueWork *work = container_of(item, SyncQueueWork, workItem); + VDOStatistics *stats = (VDOStatistics *)work->data; + getVDOStatistics(getVDO(work->kvdo), stats); + complete(work->completion); +} + +/***********************************************************************/ +void getKVDOStatistics(KVDO *kvdo, VDOStatistics *stats) +{ + struct completion statsWait; + memset(stats, 0, sizeof(VDOStatistics)); + performKVDOOperation(kvdo, getVDOStatisticsWork, stats, + getAdminThread(getThreadConfig(kvdo->vdo)), + &statsWait); +} + +/** + * A structure to invoke an arbitrary VDO action. + **/ +typedef struct vdoActionData { + VDOAction *action; + VDOCompletion *vdoCompletion; + struct completion waiter; +} VDOActionData; + +/** + * Initialize a VDOActionData structure so that the specified action + * can be invoked on the specified completion. + * + * @param data A VDOActionData. + * @param action The VDOAction to execute. + * @param vdoCompletion The VDO completion upon which the action acts. + **/ +static void initializeVDOActionData(VDOActionData *data, + VDOAction *action, + VDOCompletion *vdoCompletion) +{ + *data = (VDOActionData) { + .action = action, + .vdoCompletion = vdoCompletion, + }; +} + +/** + * The VDO callback that completes the KVDO completion. + * + * @param vdoCompletion The VDO completion which was acted upon. + **/ +static void finishVDOAction(VDOCompletion *vdoCompletion) +{ + SyncQueueWork *work = vdoCompletion->parent; + complete(work->completion); +} + +/** + * Perform a VDO base code action as specified by a VDOActionData. + * + * Sets the completion callback and parent inside the VDOActionData + * so that the corresponding kernel completion is completed when + * the VDO completion is. + * + * @param item A KVDO work queue item. 
+ **/ +static void performVDOActionWork(KvdoWorkItem *item) +{ + SyncQueueWork *work = container_of(item, SyncQueueWork, workItem); + VDOActionData *data = work->data; + ThreadID id = getPhysicalLayer()->getCurrentThreadID(); + + setCallbackWithParent(data->vdoCompletion, finishVDOAction, id, work); + data->action(data->vdoCompletion); +} + +/**********************************************************************/ +int performKVDOExtendedCommand(KVDO *kvdo, int argc, char **argv) +{ + VDOActionData data; + VDOCommandCompletion cmd; + + int result = initializeVDOCommandCompletion(&cmd, getVDO(kvdo), argc, argv); + if (result != VDO_SUCCESS) { + return result; + } + + initializeVDOActionData(&data, executeVDOExtendedCommand, &cmd.completion); + performKVDOOperation(kvdo, performVDOActionWork, &data, + getAdminThread(getThreadConfig(kvdo->vdo)), + &data.waiter); + + return destroyVDOCommandCompletion(&cmd); +} + +/**********************************************************************/ +void dumpKVDOStatus(KVDO *kvdo) +{ + dumpVDOStatus(kvdo->vdo); +} + +/**********************************************************************/ +bool getKVDOCompressing(KVDO *kvdo) +{ + return getVDOCompressing(kvdo->vdo); +} + +/**********************************************************************/ +int kvdoPrepareToGrowPhysical(KVDO *kvdo, BlockCount physicalCount) +{ + VDO *vdo = kvdo->vdo; + return prepareToGrowPhysical(vdo, physicalCount); +} + +/**********************************************************************/ +int kvdoResizePhysical(KVDO *kvdo, BlockCount physicalCount) +{ + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + init_completion(&layer->callbackSync); + int result = performGrowPhysical(kvdo->vdo, physicalCount); + if (result != VDO_SUCCESS) { + logError("resize operation failed, result = %d", result); + return result; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int kvdoPrepareToGrowLogical(KVDO *kvdo, BlockCount logicalCount) +{ + VDO *vdo = kvdo->vdo; + return prepareToGrowLogical(vdo, logicalCount); +} + +/**********************************************************************/ +int kvdoResizeLogical(KVDO *kvdo, BlockCount logicalCount) +{ + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + init_completion(&layer->callbackSync); + int result = performGrowLogical(kvdo->vdo, logicalCount); + if (result != VDO_SUCCESS) { + logError("grow logical operation failed, result = %d", result); + } + + return result; +} + +/**********************************************************************/ +WritePolicy getKVDOWritePolicy(KVDO *kvdo) +{ + return getWritePolicy(kvdo->vdo); +} + +/**********************************************************************/ +void enqueueKVDOThreadWork(KVDOThread *thread, + KvdoWorkItem *item) +{ + enqueueWorkQueue(thread->requestQueue, item); +} + +/**********************************************************************/ +void enqueueKVDOWork(KVDO *kvdo, KvdoWorkItem *item, ThreadID threadID) +{ + enqueueKVDOThreadWork(&kvdo->threads[threadID], item); +} + +/**********************************************************************/ +void enqueueKVIO(KVIO *kvio, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + ThreadID threadID = vioAsCompletion(kvio->vio)->callbackThreadID; + BUG_ON(threadID >= kvio->layer->kvdo.initializedThreadCount); + launchKVIO(kvio, work, statsFunction, action, + kvio->layer->kvdo.threads[threadID].requestQueue); +} + 
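/*
 * [Editor's note -- illustrative sketch, not part of this patch.]
 *
 * The work functions above all follow the performKVDOOperation() pattern:
 * package the arguments in a small struct, hop to the proper base thread,
 * make the base-code call there, then complete() the kernel completion
 * that performKVDOOperation() is blocked on.  The sketch below shows the
 * shape a new synchronous operation would take.  queryVDOState() and
 * VDOStateData are hypothetical names used only for illustration; the
 * real callers (setKVDOCompressing, setKVDOReadOnly, getKVDOStatistics)
 * have exactly this structure.
 */
typedef struct {
  int state;                       // filled in on the base thread
} VDOStateData;

static void queryVDOStateWork(KvdoWorkItem *item)
{
  SyncQueueWork *work = container_of(item, SyncQueueWork, workItem);
  VDOStateData  *data = work->data;
  // Hypothetical base-code call, made on the admin thread.
  data->state = queryVDOState(getVDO(work->kvdo));
  // Wake the thread blocked in performKVDOOperation().
  complete(work->completion);
}

int getKVDOState(KVDO *kvdo)
{
  struct completion stateWait;
  VDOStateData      data;
  performKVDOOperation(kvdo, queryVDOStateWork, &data,
                       getAdminThread(getThreadConfig(kvdo->vdo)),
                       &stateWait);
  return data.state;
}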
+/**********************************************************************/ +static void kvdoEnqueueWork(KvdoWorkItem *workItem) +{ + KvdoEnqueueable *kvdoEnqueueable = container_of(workItem, + KvdoEnqueueable, + workItem); + runCallback(kvdoEnqueueable->enqueueable.completion); +} + +/**********************************************************************/ +void kvdoEnqueue(Enqueueable *enqueueable) +{ + KvdoEnqueueable *kvdoEnqueueable = container_of(enqueueable, + KvdoEnqueueable, + enqueueable); + KernelLayer *layer = asKernelLayer(enqueueable->completion->layer); + ThreadID threadID = enqueueable->completion->callbackThreadID; + if (ASSERT(threadID < layer->kvdo.initializedThreadCount, + "threadID %u (completion type %d) is less than thread count %u", + threadID, enqueueable->completion->type, + layer->kvdo.initializedThreadCount) != UDS_SUCCESS) { + BUG(); + } + + if (enqueueable->completion->type == VIO_COMPLETION) { + vioAddTraceRecord(asVIO(enqueueable->completion), + THIS_LOCATION("$F($cb)")); + } + setupWorkItem(&kvdoEnqueueable->workItem, kvdoEnqueueWork, + (KvdoWorkFunction) enqueueable->completion->callback, + REQ_Q_ACTION_COMPLETION); + enqueueKVDOThreadWork(&layer->kvdo.threads[threadID], + &kvdoEnqueueable->workItem); +} + +/**********************************************************************/ +ThreadID kvdoGetCurrentThreadID(void) +{ + KVDOThread *thread = getWorkQueuePrivateData(); + if (thread == NULL) { + return INVALID_THREAD_ID; + } + + ThreadID threadID = thread->threadID; + if (PARANOID_THREAD_CONSISTENCY_CHECKS) { + KVDO *kvdo = thread->kvdo; + KernelLayer *kernelLayer = asKernelLayer(getPhysicalLayer()); + BUG_ON(&kernelLayer->kvdo != kvdo); + BUG_ON(threadID >= kvdo->initializedThreadCount); + BUG_ON(thread != &kvdo->threads[threadID]); + } + return threadID; +} + +/**********************************************************************/ +static PhysicalLayer *getKernelPhysicalLayer(void) +{ + KVDOThread *thread = getWorkQueuePrivateData(); + if (thread == NULL) { + return NULL; + } + KVDO *kvdo = thread->kvdo; + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + return &layer->common; +} + +void initKernelVDOOnce(void) +{ + registerPhysicalLayerGetter(getKernelPhysicalLayer); +} diff --git a/vdo/kernel/kernelVDO.h b/vdo/kernel/kernelVDO.h new file mode 100644 index 0000000..b65534d --- /dev/null +++ b/vdo/kernel/kernelVDO.h @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelVDO.h#4 $ + */ + +#ifndef KERNEL_VDO_H +#define KERNEL_VDO_H + +#include "completion.h" +#include "kernelTypes.h" +#include "threadRegistry.h" +#include "workQueue.h" + +typedef struct { + KVDO *kvdo; + ThreadID threadID; + KvdoWorkQueue *requestQueue; + RegisteredThread allocatingThread; +} KVDOThread; + +struct kvdo { + KVDOThread *threads; + ThreadID initializedThreadCount; + KvdoWorkItem workItem; + VDOAction *action; + VDOCompletion *completion; + // Base-code device info + VDO *vdo; +}; + +typedef enum reqQAction { + REQ_Q_ACTION_COMPLETION, + REQ_Q_ACTION_FLUSH, + REQ_Q_ACTION_MAP_BIO, + REQ_Q_ACTION_SYNC, + REQ_Q_ACTION_VIO_CALLBACK +} ReqQAction; + +/** + * Initialize the base code interface. + * + * @param [in] kvdo The KVDO to be initialized + * @param [in] threadConfig The base-code thread configuration + * @param [out] reason The reason for failure + * + * @return VDO_SUCCESS or an error code + **/ +int initializeKVDO(KVDO *kvdo, + const ThreadConfig *threadConfig, + char **reason); + +/** + * Load the VDO state from disk but don't alter the on-disk state. This method + * is ultimately called from the constructor for devices which have not been + * resumed. + * + * @param [in] kvdo The KVDO to be started + * @param [in] common The physical layer pointer + * @param [in] loadConfig Load-time parameters for the VDO + * @param [in] vioTraceRecording Debug flag to store + * @param [out] reason The reason for failure + **/ +int preloadKVDO(KVDO *kvdo, + PhysicalLayer *common, + const VDOLoadConfig *loadConfig, + bool vioTraceRecording, + char **reason); + +/** + * Starts the base VDO instance associated with the kernel layer. This method + * is ultimately called from preresume the first time an instance is resumed. + * + * @param [in] kvdo The KVDO to be started + * @param [in] common The physical layer pointer + * @param [out] reason The reason for failure + * + * @return VDO_SUCCESS if started, otherwise error + */ +int startKVDO(KVDO *kvdo, PhysicalLayer *common, char **reason); + +/** + * Suspend the base VDO instance associated with the kernel layer. + * + * @param kvdo The KVDO to be suspended + * + * @return VDO_SUCCESS if stopped, otherwise error + **/ +int suspendKVDO(KVDO *kvdo); + +/** + * Resume the base VDO instance associated with the kernel layer. + * + * @param kvdo The KVDO to be resumed + * + * @return VDO_SUCCESS or an error + **/ +int resumeKVDO(KVDO *kvdo); + +/** + * Shut down the base code interface. The kvdo object must first be + * stopped. + * + * @param kvdo The KVDO to be shut down + **/ +void finishKVDO(KVDO *kvdo); + +/** + * Free up storage of the base code interface. The KVDO object must + * first have been "finished". + * + * @param kvdo The KVDO object to be destroyed + **/ +void destroyKVDO(KVDO *kvdo); + + +/** + * Dump to the kernel log any work-queue info associated with the base + * code. + * + * @param kvdo The KVDO object to be examined + **/ +void dumpKVDOWorkQueue(KVDO *kvdo); + +/** + * Get the VDO pointer for a kvdo object + * + * @param kvdo The KVDO object + * + * @return the VDO pointer + */ +static inline VDO *getVDO(KVDO *kvdo) +{ + return kvdo->vdo; +} + +/** + * Set whether compression is enabled. 
+ * + * @param kvdo The KVDO object + * @param enableCompression The new compression mode + * + * @return state of compression before new value is set + **/ +bool setKVDOCompressing(KVDO *kvdo, bool enableCompression); + +/** + * Get the current compression mode + * + * @param kvdo The KVDO object to be queried + * + * @return whether compression is currently enabled + */ +bool getKVDOCompressing(KVDO *kvdo); + +/** + * Gets the latest statistics gathered by the base code. + * + * @param kvdo the KVDO object + * @param stats the statistics struct to fill in + */ +void getKVDOStatistics(KVDO *kvdo, VDOStatistics *stats); + +/** + * Get the current write policy + * + * @param kvdo The KVDO to be queried + * + * @return the write policy in effect + */ +WritePolicy getKVDOWritePolicy(KVDO *kvdo); + +/** + * Dump base code status information to the kernel log for debugging. + * + * @param kvdo The KVDO to be examined + */ +void dumpKVDOStatus(KVDO *kvdo); + +/** + * Request the base code prepare to grow the physical space. + * + * @param kvdo The KVDO to be updated + * @param physicalCount The new size + * + * @return VDO_SUCCESS or error + */ +int kvdoPrepareToGrowPhysical(KVDO *kvdo, BlockCount physicalCount); + +/** + * Notify the base code of resized physical storage. + * + * @param kvdo The KVDO to be updated + * @param physicalCount The new size + * + * @return VDO_SUCCESS or error + */ +int kvdoResizePhysical(KVDO *kvdo, BlockCount physicalCount); + +/** + * Request the base code prepare to grow the logical space. + * + * @param kvdo The KVDO to be updated + * @param logicalCount The new size + * + * @return VDO_SUCCESS or error + */ +int kvdoPrepareToGrowLogical(KVDO *kvdo, BlockCount logicalCount); + +/** + * Request the base code grow the logical space. + * + * @param kvdo The KVDO to be updated + * @param logicalCount The new size + * + * @return VDO_SUCCESS or error + */ +int kvdoResizeLogical(KVDO *kvdo, BlockCount logicalCount); + +/** + * Request the base code go read-only. + * + * @param kvdo The KVDO to be updated + * @param result The error code causing the read only + */ +void setKVDOReadOnly(KVDO *kvdo, int result); + +/** + * Perform an extended base-code command + * + * @param kvdo The KVDO upon which to perform the operation. + * @param argc The number of arguments to the command. + * @param argv The command arguments. Note that all extended + * command argv[0] strings start with "x-". + * + * @return VDO_SUCCESS or an error code + **/ +int performKVDOExtendedCommand(KVDO *kvdo, int argc, char **argv); + +/** + * Enqueue a work item to be processed in the base code context. + * + * @param kvdo The KVDO object in which to run the work item + * @param item The work item to be run + * @param threadID The thread on which to run the work item + **/ +void enqueueKVDOWork(KVDO *kvdo, KvdoWorkItem *item, ThreadID threadID); + +/** + * Set up and enqueue a VIO's work item to be processed in the base code + * context. + * + * @param kvio The VIO with the work item to be run + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + **/ +void enqueueKVIO(KVIO *kvio, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action); + +/** + * Enqueue an arbitrary completion for execution on its indicated + * thread. 
+ * + * @param enqueueable The Enqueueable object containing the completion pointer + **/ +void kvdoEnqueue(Enqueueable *enqueueable); + +/** + * Get the base-code thread index for the current execution context. + * + * @return The thread ID, or (ThreadID)-1 if the current thread is + * not a base-code thread, or in an interrupt context. + **/ +ThreadID kvdoGetCurrentThreadID(void); + +/** + * Do one-time initialization of kernelVDO interface. + **/ +void initKernelVDOOnce(void); + +#endif // KERNEL_VDO_H diff --git a/vdo/kernel/kernelVDOInternals.h b/vdo/kernel/kernelVDOInternals.h new file mode 100644 index 0000000..aefe05a --- /dev/null +++ b/vdo/kernel/kernelVDOInternals.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelVDOInternals.h#1 $ + */ + +#ifndef KERNEL_VDO_INTERNALS_H +#define KERNEL_VDO_INTERNALS_H + +#include "kernelVDO.h" + +/** + * Enqueue a work item to be performed in the base code in a + * particular thread. + * + * @param thread The KVDO thread on which to run the work item + * @param item The work item to be run + **/ +void enqueueKVDOThreadWork(KVDOThread *thread, KvdoWorkItem *item); + +#endif // KERNEL_VDO_INTERNALS_H diff --git a/vdo/kernel/ktrace.c b/vdo/kernel/ktrace.c new file mode 100644 index 0000000..ebc654a --- /dev/null +++ b/vdo/kernel/ktrace.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/ktrace.c#2 $ + */ + +#include "ktrace.h" + +#include "memoryAlloc.h" + +#include "dataVIO.h" + +#include "kvio.h" +#include "logger.h" + +enum { + // How much data from a trace can we log in one call without messing + // up the log or losing data? 
+ TRACE_LOG_MAX = 820, + + // What fraction (1 out of TRACE_SAMPLE_INTERVAL VIOs) to trace + TRACE_SAMPLE_INTERVAL = 3, +}; + +bool traceRecording = false; + +static struct { + char buffer[2000]; + unsigned int counter; + struct mutex lock; +} traceLoggingState; + +/** + * Initialize a SampleCounter structure with the given sampling interval. + * + * @param counter The counter to initialize + * @param interval The desired sampling interval + **/ +static void initializeSampleCounter(SampleCounter *counter, + unsigned int interval) +{ + spin_lock_init(&counter->lock); + counter->tick = 0; + counter->interval = interval; +} + +/*************************************************************************/ +bool sampleThisOne(SampleCounter *counter) +{ + bool wantTracing = false; + spin_lock(&counter->lock); + counter->tick++; + if (counter->tick >= counter->interval) { + counter->tick = 0; + wantTracing = true; + } + spin_unlock(&counter->lock); + return wantTracing; +} + +/*************************************************************************/ +static void freeTraceDataBuffer(void *poolData, void *data) +{ + Trace *trace = (Trace *) data; + FREE(trace); +} + +/*************************************************************************/ +static int allocTraceDataBuffer(void *poolData, void **dataPtr) +{ + Trace *trace; + int result = ALLOCATE(1, Trace, __func__, &trace); + if (result != VDO_SUCCESS) { + logError("trace data allocation failure %d", result); + return result; + } + + *dataPtr = trace; + return VDO_SUCCESS; +} + +/*************************************************************************/ +int allocTraceFromPool(KernelLayer *layer, Trace **tracePointer) +{ + int result = allocBufferFromPool(layer->traceBufferPool, + (void **) tracePointer); + if (result == VDO_SUCCESS) { + (*tracePointer)->used = 0; + } + return result; +} + +/*************************************************************************/ +void freeTraceToPool(KernelLayer *layer, Trace *trace) +{ + freeBufferToPool(layer->traceBufferPool, trace); +} + +/*************************************************************************/ +int traceKernelLayerInit(KernelLayer *layer) +{ + layer->vioTraceRecording = traceRecording; + initializeSampleCounter(&layer->traceSampleCounter, TRACE_SAMPLE_INTERVAL); + unsigned int traceRecordsNeeded = 0; + if (layer->vioTraceRecording) { + traceRecordsNeeded += layer->requestLimiter.limit; + } + if (traceRecordsNeeded > 0) { + return makeBufferPool("KVDO Trace Data Pool", traceRecordsNeeded, + allocTraceDataBuffer, freeTraceDataBuffer, NULL, + layer, &layer->traceBufferPool); + } + return VDO_SUCCESS; +} + +/*************************************************************************/ +void initializeTraceLoggingOnce(void) +{ + mutex_init(&traceLoggingState.lock); +} + +/*************************************************************************/ +void logKvioTrace(KVIO *kvio) +{ + KernelLayer *layer = kvio->layer; + + mutex_lock(&traceLoggingState.lock); + traceLoggingState.counter++; + // Log about 0.1% to avoid spewing data faster than syslog can keep up + // (on certain of Permabit's test machines). + // Yes, the 37 is arbitrary and meaningless. 
+ + if (layer->traceLogging && ((traceLoggingState.counter % 1024) == 37)) { + kvioAddTraceRecord(kvio, THIS_LOCATION(NULL)); + size_t traceLen = 0; + formatTrace(kvio->vio->trace, traceLoggingState.buffer, + sizeof(traceLoggingState.buffer), &traceLen); + + if (isMetadata(kvio)) { + logInfo("finishing kvio %s meta @%" PRIptr " %s", + (isWriteVIO(kvio->vio) ? "read" : "write"), + kvio, traceLoggingState.buffer); + } else if (isCompressedWriter(kvio)) { + logInfo("finishing kvio write comp @%" PRIptr " %s", + kvio, traceLoggingState.buffer); + } else { + const char *dupeLabel = ""; + if (isWriteVIO(kvio->vio)) { + DataVIO *dataVIO = vioAsDataVIO(kvio->vio); + if (isTrimDataVIO(dataVIO)) { + dupeLabel = "trim "; + } else if (dataVIO->isZeroBlock) { + dupeLabel = "zero "; + } else if (dataVIO->isDuplicate) { + dupeLabel = "dupe "; + } else { + dupeLabel = "new "; + } + } + + logInfo("finishing kvio %s data %s@%" PRIptr " %.*s", + (isWriteVIO(kvio->vio) ? "read" : "write"), + dupeLabel, kvio, TRACE_LOG_MAX, traceLoggingState.buffer); + char *buf = traceLoggingState.buffer; + while (traceLen > TRACE_LOG_MAX) { + traceLen -= TRACE_LOG_MAX; + buf += TRACE_LOG_MAX; + logInfo("more kvio %" PRIptr " path: %.*s", kvio, TRACE_LOG_MAX, buf); + } + } + } + + mutex_unlock(&traceLoggingState.lock); +} diff --git a/vdo/kernel/ktrace.h b/vdo/kernel/ktrace.h new file mode 100644 index 0000000..99cda7a --- /dev/null +++ b/vdo/kernel/ktrace.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/ktrace.h#1 $ + */ + +#ifndef KTRACE_H +#define KTRACE_H + +#include + +#include "common.h" +#include "trace.h" + +struct kernelLayer; +struct kvio; + +// Implement event sampling once per N. +typedef struct { + unsigned int interval; + unsigned int tick; + spinlock_t lock; +} SampleCounter; + +/** + * Flag indicating whether newly created VDO devices should record trace info. + **/ +extern bool traceRecording; + +/** + * Updates the counter state and returns true once each time the + * sampling interval is reached. + * + * @param counter The sampling counter info + * + * @return whether to do sampling on this invocation + **/ +bool sampleThisOne(SampleCounter *counter); + +/** + * Initialize trace data in the KernelLayer + * + * @param layer The KernelLayer + * + * @return VDO_SUCCESS, or an error code + **/ +int traceKernelLayerInit(struct kernelLayer *layer); + +/** + * Initialize the mutex used when logging latency tracing data. 
+ **/ +void initializeTraceLoggingOnce(void); + +/** + * Allocate a trace buffer + * + * @param layer The KernelLayer + * @param tracePointer The trace buffer is returned here + * + * @return VDO_SUCCESS or an error code + **/ +int allocTraceFromPool(struct kernelLayer *layer, Trace **tracePointer); + +/** + * Free a trace buffer + * + * @param layer The KernelLayer + * @param trace The trace buffer + **/ +void freeTraceToPool(struct kernelLayer *layer, Trace *trace); + +/** + * Log the trace at kvio freeing time + * + * @param kvio The kvio structure + **/ +void logKvioTrace(struct kvio *kvio); + +#endif /* KTRACE_H */ diff --git a/vdo/kernel/kvdoFlush.c b/vdo/kernel/kvdoFlush.c new file mode 100644 index 0000000..7b38af1 --- /dev/null +++ b/vdo/kernel/kvdoFlush.c @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kvdoFlush.c#6 $ + */ + +#include "kvdoFlush.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "threadConfig.h" + +#include "bio.h" +#include "ioSubmitter.h" + +/** + * A specific (concrete) encapsulation of flush requests. + * + *

We attempt to allocate a KVDOFlush object for each incoming flush bio.
+ * In case the allocation fails, a spare object is pre-allocated by and stored
+ * in the kernel layer. The first time an allocation fails, the spare is used.
+ * If another allocation fails while the spare is in use, it will merely be
+ * queued for later processing.
+ *
+ *

When a KVDOFlush is complete, it will either be freed, immediately + * re-used for queued flushes, or stashed in the kernel layer as the new spare + * object. This ensures that we will always make forward progress. + **/ +struct kvdoFlush { + KvdoWorkItem workItem; + KernelLayer *layer; + struct bio_list bios; + Jiffies arrivalTime; // Time when earliest bio appeared + VDOFlush vdoFlush; +}; + +/**********************************************************************/ +int makeKVDOFlush(KVDOFlush **flushPtr) +{ + return ALLOCATE(1, KVDOFlush, __func__, flushPtr); +} + +/**********************************************************************/ +bool shouldProcessFlush(KernelLayer *layer) +{ + return (getKVDOWritePolicy(&layer->kvdo) != WRITE_POLICY_SYNC); +} + +/** + * Function call to handle an empty flush request from the request queue. + * + * @param item The work item representing the flush request + **/ +static void kvdoFlushWork(KvdoWorkItem *item) +{ + KVDOFlush *kvdoFlush = container_of(item, KVDOFlush, workItem); + flush(kvdoFlush->layer->kvdo.vdo, &kvdoFlush->vdoFlush); +} + +/** + * Initialize a KVDOFlush object, transferring all the bios in the kernel + * layer's waitingFlushes list to it. The caller MUST already hold the layer's + * flushLock. + * + * @param kvdoFlush The flush to initialize + * @param layer The kernel layer on which the flushLock is held + **/ +static void initializeKVDOFlush(KVDOFlush *kvdoFlush, KernelLayer *layer) +{ + kvdoFlush->layer = layer; + bio_list_init(&kvdoFlush->bios); + bio_list_merge(&kvdoFlush->bios, &layer->waitingFlushes); + bio_list_init(&layer->waitingFlushes); + kvdoFlush->arrivalTime = layer->flushArrivalTime; +} + +/**********************************************************************/ +static void enqueueKVDOFlush(KVDOFlush *kvdoFlush) +{ + setupWorkItem(&kvdoFlush->workItem, kvdoFlushWork, NULL, REQ_Q_ACTION_FLUSH); + KVDO *kvdo = &kvdoFlush->layer->kvdo; + enqueueKVDOWork(kvdo, &kvdoFlush->workItem, + getPackerZoneThread(getThreadConfig(kvdo->vdo))); +} + +/**********************************************************************/ +void launchKVDOFlush(KernelLayer *layer, BIO *bio) +{ + // Try to allocate a KVDOFlush to represent the flush request. If the + // allocation fails, we'll deal with it later. + KVDOFlush *kvdoFlush = ALLOCATE_NOWAIT(KVDOFlush, __func__); + + spin_lock(&layer->flushLock); + + // We have a new bio to start. Add it to the list. If it becomes the + // only entry on the list, record the time. + if (bio_list_empty(&layer->waitingFlushes)) { + layer->flushArrivalTime = jiffies; + } + bio_list_add(&layer->waitingFlushes, bio); + + if (kvdoFlush == NULL) { + // The KVDOFlush allocation failed. Try to use the spare KVDOFlush object. + if (layer->spareKVDOFlush == NULL) { + // The spare is already in use. This bio is on waitingFlushes and it + // will be handled by a flush completion or by a bio that can allocate. + spin_unlock(&layer->flushLock); + return; + } + + // Take and use the spare KVDOFlush object. + kvdoFlush = layer->spareKVDOFlush; + layer->spareKVDOFlush = NULL; + } + + // We have flushes to start. Capture them in the KVDOFlush object. + initializeKVDOFlush(kvdoFlush, layer); + + spin_unlock(&layer->flushLock); + + // Finish launching the flushes. + enqueueKVDOFlush(kvdoFlush); +} + +/** + * Release a KVDOFlush object that has completed its work. If there are any + * pending flush requests whose KVDOFlush allocation failed, they will be + * launched by immediately re-using the released KVDOFlush. 
If there is no + * spare KVDOFlush, the released object will become the spare. Otherwise, the + * KVDOFlush will be freed. + * + * @param kvdoFlush The completed flush object to re-use or free + **/ +static void releaseKVDOFlush(KVDOFlush *kvdoFlush) +{ + KernelLayer *layer = kvdoFlush->layer; + bool relaunchFlush = false; + bool freeFlush = false; + + spin_lock(&layer->flushLock); + if (bio_list_empty(&layer->waitingFlushes)) { + // Nothing needs to be started. Save one spare KVDOFlush object. + if (layer->spareKVDOFlush == NULL) { + // Make the new spare all zero, just like a newly allocated one. + memset(kvdoFlush, 0, sizeof(*kvdoFlush)); + layer->spareKVDOFlush = kvdoFlush; + } else { + freeFlush = true; + } + } else { + // We have flushes to start. Capture them in the KVDOFlush object. + initializeKVDOFlush(kvdoFlush, layer); + relaunchFlush = true; + } + spin_unlock(&layer->flushLock); + + if (relaunchFlush) { + // Finish launching the flushes. + enqueueKVDOFlush(kvdoFlush); + } else if (freeFlush) { + FREE(kvdoFlush); + } +} + +/** + * Function called to complete and free a flush request + * + * @param item The flush-request work item + **/ +static void kvdoCompleteFlushWork(KvdoWorkItem *item) +{ + KVDOFlush *kvdoFlush = container_of(item, KVDOFlush, workItem); + KernelLayer *layer = kvdoFlush->layer; + + BIO *bio; + while ((bio = bio_list_pop(&kvdoFlush->bios)) != NULL) { + // We're not acknowledging this bio now, but we'll never touch it + // again, so this is the last chance to account for it. + countBios(&layer->biosAcknowledged, bio); + + // Make sure the bio is a empty flush bio. + prepareFlushBIO(bio, bio->bi_private, getKernelLayerBdev(layer), + bio->bi_end_io); + atomic64_inc(&layer->flushOut); + generic_make_request(bio); + } + + + // Release the KVDOFlush object, freeing it, re-using it as the spare, or + // using it to launch any flushes that had to wait when allocations failed. + releaseKVDOFlush(kvdoFlush); +} + +/**********************************************************************/ +void kvdoCompleteFlush(VDOFlush **kfp) +{ + if (*kfp != NULL) { + KVDOFlush *kvdoFlush = container_of(*kfp, KVDOFlush, vdoFlush); + setupWorkItem(&kvdoFlush->workItem, kvdoCompleteFlushWork, NULL, + BIO_Q_ACTION_FLUSH); + enqueueBioWorkItem(kvdoFlush->layer->ioSubmitter, + &kvdoFlush->workItem); + *kfp = NULL; + } +} + +/**********************************************************************/ +int synchronousFlush(KernelLayer *layer) +{ + BIO bio; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio_init(&bio, 0, 0); +#else + bio_init(&bio); +#endif + int result = 0; + + prepareFlushBIO(&bio, layer, getKernelLayerBdev(layer), NULL); + result = submitBioAndWait(&bio); + atomic64_inc(&layer->flushOut); + if (result != 0) { + logErrorWithStringError(result, "synchronous flush failed"); + result = -EIO; + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,12,0) + bio_uninit(&bio); +#endif + return result; +} diff --git a/vdo/kernel/kvdoFlush.h b/vdo/kernel/kvdoFlush.h new file mode 100644 index 0000000..2d90953 --- /dev/null +++ b/vdo/kernel/kvdoFlush.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kvdoFlush.h#1 $ + */ + +#ifndef KVDO_FLUSH_H +#define KVDO_FLUSH_H + +#include "flush.h" + +#include "kernelLayer.h" + +/** + * Create a KVDOFlush. + * + * @param flushPtr A pointer to hold the new flush + **/ +int makeKVDOFlush(KVDOFlush **flushPtr); + +/** + * Answer the question as to whether VDO should be processing REQ_FLUSH + * requests or not. + * + * @param layer The layer + * + * @return true if VDO should process empty flush requests, or false if + * they should just be forwarded to our storage device. + **/ +bool shouldProcessFlush(KernelLayer *layer); + +/** + * Function called to start processing a flush request. It is called when we + * receive an empty flush bio from the block layer, and before acknowledging a + * non-empty bio with the FUA flag set. + * + * @param layer The physical layer + * @param bio The bio containing an empty flush request + **/ +void launchKVDOFlush(KernelLayer *layer, BIO *bio); + +/** + * Function called from base VDO to complete and free a flush request. + * + * @param kfp Pointer to the flush request + **/ +void kvdoCompleteFlush(VDOFlush **kfp); + +/** + * Issue a flush request and wait for it to complete. + * + * @param layer The kernel layer + * + * @return VDO_SUCCESS or an error + */ +int synchronousFlush(KernelLayer *layer); + +#endif /* KVDO_FLUSH_H */ diff --git a/vdo/kernel/kvio.c b/vdo/kernel/kvio.c new file mode 100644 index 0000000..336f86e --- /dev/null +++ b/vdo/kernel/kvio.c @@ -0,0 +1,415 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kvio.c#7 $ + */ + +#include "kvio.h" + + +#include "logger.h" +#include "memoryAlloc.h" + +#include "numUtils.h" +#include "vdo.h" +#include "waitQueue.h" + +#include "bio.h" +#include "ioSubmitter.h" +#include "kvdoFlush.h" + +/** + * A function to tell vdo that we have completed the requested async + * operation for a vio + * + * @param item The work item of the VIO to complete + **/ +static void kvdoHandleVIOCallback(KvdoWorkItem *item) +{ + KVIO *kvio = workItemAsKVIO(item); + runCallback(vioAsCompletion(kvio->vio)); +} + +/**********************************************************************/ +void kvdoEnqueueVIOCallback(KVIO *kvio) +{ + enqueueKVIO(kvio, kvdoHandleVIOCallback, + (KvdoWorkFunction) vioAsCompletion(kvio->vio)->callback, + REQ_Q_ACTION_VIO_CALLBACK); +} + +/**********************************************************************/ +void kvdoContinueKvio(KVIO *kvio, int error) +{ + if (unlikely(error != VDO_SUCCESS)) { + setCompletionResult(vioAsCompletion(kvio->vio), error); + } + kvdoEnqueueVIOCallback(kvio); +} + +/**********************************************************************/ +// noinline ensures systemtap can hook in here +static noinline void maybeLogKvioTrace(KVIO *kvio) +{ + if (kvio->layer->traceLogging) { + logKvioTrace(kvio); + } +} + +/**********************************************************************/ +static void freeKVIO(KVIO **kvioPtr) +{ + KVIO *kvio = *kvioPtr; + if (kvio == NULL) { + return; + } + + if (unlikely(kvio->vio->trace != NULL)) { + maybeLogKvioTrace(kvio); + FREE(kvio->vio->trace); + } + + freeBio(kvio->bio, kvio->layer); + FREE(kvio); + *kvioPtr = NULL; +} + +/**********************************************************************/ +void freeMetadataKVIO(MetadataKVIO **metadataKVIOPtr) +{ + freeKVIO((KVIO **) metadataKVIOPtr); +} + +/**********************************************************************/ +void freeCompressedWriteKVIO(CompressedWriteKVIO **compressedWriteKVIOPtr) +{ + freeKVIO((KVIO **) compressedWriteKVIOPtr); +} + +/**********************************************************************/ +void kvdoWriteCompressedBlock(AllocatingVIO *allocatingVIO) +{ + // This method assumes that compressed writes never set the flush or FUA + // bits. + CompressedWriteKVIO *compressedWriteKVIO + = allocatingVIOAsCompressedWriteKVIO(allocatingVIO); + KVIO *kvio = compressedWriteKVIOAsKVIO(compressedWriteKVIO); + BIO *bio = kvio->bio; + resetBio(bio, kvio->layer); + setBioOperationWrite(bio); + setBioSector(bio, blockToSector(kvio->layer, kvio->vio->physical)); + submitBio(bio, BIO_Q_ACTION_COMPRESSED_DATA); +} + +/** + * Get the BioQueue action for a metadata VIO based on that VIO's priority. + * + * @param vio The VIO + * + * @return The action with which to submit the VIO's BIO. + **/ +static inline BioQAction getMetadataAction(VIO *vio) +{ + return ((vio->priority == VIO_PRIORITY_HIGH) + ? BIO_Q_ACTION_HIGH : BIO_Q_ACTION_METADATA); +} + +/**********************************************************************/ +void kvdoSubmitMetadataVIO(VIO *vio) +{ + KVIO *kvio = metadataKVIOAsKVIO(vioAsMetadataKVIO(vio)); + BIO *bio = kvio->bio; + resetBio(bio, kvio->layer); + + setBioSector(bio, blockToSector(kvio->layer, vio->physical)); + + // Metadata I/Os bypass the read cache. 
+ if (isReadVIO(vio)) { + ASSERT_LOG_ONLY(!vioRequiresFlushBefore(vio), + "read VIO does not require flush before"); + vioAddTraceRecord(vio, THIS_LOCATION("$F;io=readMeta")); + setBioOperationRead(bio); + } else { + KernelLayerState state = getKernelLayerState(kvio->layer); + ASSERT_LOG_ONLY(((state == LAYER_RUNNING) + || (state == LAYER_RESUMING) + || (state = LAYER_STARTING)), + "write metadata in allowed state %d", state); + if (vioRequiresFlushBefore(vio)) { + setBioOperationWrite(bio); + setBioOperationFlagPreflush(bio); + vioAddTraceRecord(vio, THIS_LOCATION("$F;io=flushWriteMeta")); + } else { + setBioOperationWrite(bio); + vioAddTraceRecord(vio, THIS_LOCATION("$F;io=writeMeta")); + } + } + + if (vioRequiresFlushAfter(vio)) { + setBioOperationFlagFua(bio); + } + submitBio(bio, getMetadataAction(vio)); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) +/** + * Handle the completion of a base-code initiated flush by continuing the flush + * VIO. + * + * @param bio The bio to complete + **/ +static void completeFlushBio(BIO *bio) +#else +/** + * Handle the completion of a base-code initiated flush by continuing the flush + * VIO. + * + * @param bio The bio to complete + * @param error Possible error from underlying block device + **/ +static void completeFlushBio(BIO *bio, int error) +#endif +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) + int error = getBioResult(bio); +#endif + KVIO *kvio = (KVIO *) bio->bi_private; + // XXX This assumes a VDO-created bio around a buffer contains exactly 1 + // page, which we believe is true, but do not assert. + bio->bi_vcnt = 1; + // Restore the bio's notion of its own data. + resetBio(bio, kvio->layer); + kvdoContinueKvio(kvio, error); +} + +/**********************************************************************/ +void kvdoFlushVIO(VIO *vio) +{ + KVIO *kvio = metadataKVIOAsKVIO(vioAsMetadataKVIO(vio)); + BIO *bio = kvio->bio; + KernelLayer *layer = kvio->layer; + resetBio(bio, layer); + prepareFlushBIO(bio, kvio, getKernelLayerBdev(layer), completeFlushBio); + submitBio(bio, getMetadataAction(vio)); +} + +/* + * Hook for a SystemTap probe to potentially restrict the choices + * of which VIOs should have their latencies tracked. + * + * Normally returns true. Even if true is returned, sampleThisOne may + * cut down the monitored VIOs by some fraction so as to reduce the + * impact on system performance. + * + * Must be "noinline" so that SystemTap can find the return + * instruction and modify the return value. + * + * @param kvio The KVIO being initialized + * @param layer The kernel layer + * @param bio The incoming I/O request + * + * @return whether it's useful to track latency for VIOs looking like + * this one + */ +static noinline bool +sampleThisVIO(KVIO *kvio, KernelLayer *layer, BIO *bio) +{ + bool result = true; + // Ensure the arguments and result exist at the same time, for SystemTap. + __asm__ __volatile__("" + : "=g" (result) + : "0" (result), + "g" (kvio), + "g" (layer), + "g" (bio) + : "memory"); + return result; +} + +/**********************************************************************/ +void initializeKVIO(KVIO *kvio, + KernelLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + BIO *bio) +{ + if (layer->vioTraceRecording + && sampleThisVIO(kvio, layer, bio) + && sampleThisOne(&layer->traceSampleCounter)) { + int result = (isDataVIOType(vioType) + ? 
allocTraceFromPool(layer, &kvio->vio->trace) + : ALLOCATE(1, Trace, "trace", &kvio->vio->trace)); + if (result != VDO_SUCCESS) { + logError("trace record allocation failure %d", result); + } + } + + kvio->bio = bio; + kvio->layer = layer; + if (bio != NULL) { + bio->bi_private = kvio; + } + + initializeVIO(kvio->vio, vioType, priority, parent, getVDO(&layer->kvdo), + &layer->common); + + // XXX: The "init" label should be replaced depending on the + // write/read/flush path followed. + kvioAddTraceRecord(kvio, THIS_LOCATION("$F;io=?init;j=normal")); + + VDOCompletion *completion = vioAsCompletion(kvio->vio); + kvio->enqueueable.enqueueable.completion = completion; + completion->enqueueable = &kvio->enqueueable.enqueueable; +} + +/** + * Construct a metadata KVIO. + * + * @param [in] layer The physical layer + * @param [in] vioType The type of VIO to create + * @param [in] priority The relative priority to assign to the + * MetadataKVIO + * @param [in] parent The parent of the MetadataKVIO completion + * @param [in] bio The bio to associate with this MetadataKVIO + * @param [out] metadataKVIOPtr A pointer to hold the new MetadataKVIO + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int makeMetadataKVIO(KernelLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + BIO *bio, + MetadataKVIO **metadataKVIOPtr) +{ + // If MetadataKVIO grows past 256 bytes, we'll lose benefits of VDOSTORY-176. + STATIC_ASSERT(sizeof(MetadataKVIO) <= 256); + + // Metadata VIOs should use direct allocation and not use the buffer pool, + // which is reserved for submissions from the linux block layer. + MetadataKVIO *metadataKVIO; + int result = ALLOCATE(1, MetadataKVIO, __func__, &metadataKVIO); + if (result != VDO_SUCCESS) { + logError("metadata KVIO allocation failure %d", result); + return result; + } + + KVIO *kvio = &metadataKVIO->kvio; + kvio->vio = &metadataKVIO->vio; + initializeKVIO(kvio, layer, vioType, priority, parent, bio); + *metadataKVIOPtr = metadataKVIO; + return VDO_SUCCESS; +} + +/** + * Construct a CompressedWriteKVIO. + * + * @param [in] layer The physical layer + * @param [in] parent The parent of the CompressedWriteKVIO + * completion + * @param [in] bio The bio to associate with this + * CompressedWriteKVIO + * @param [out] compressedWriteKVIOPtr A pointer to hold the new + * CompressedWriteKVIO + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int +makeCompressedWriteKVIO(KernelLayer *layer, + void *parent, + BIO *bio, + CompressedWriteKVIO **compressedWriteKVIOPtr) +{ + // Compressed write VIOs should use direct allocation and not use the buffer + // pool, which is reserved for submissions from the linux block layer. 
+ CompressedWriteKVIO *compressedWriteKVIO; + int result = ALLOCATE(1, CompressedWriteKVIO, __func__, + &compressedWriteKVIO); + if (result != VDO_SUCCESS) { + logError("compressed write KVIO allocation failure %d", result); + return result; + } + + KVIO *kvio = &compressedWriteKVIO->kvio; + kvio->vio = allocatingVIOAsVIO(&compressedWriteKVIO->allocatingVIO); + initializeKVIO(kvio, layer, VIO_TYPE_COMPRESSED_BLOCK, + VIO_PRIORITY_COMPRESSED_DATA, parent, bio); + *compressedWriteKVIOPtr = compressedWriteKVIO; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int kvdoCreateMetadataVIO(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + char *data, + VIO **vioPtr) +{ + int result = ASSERT(isMetadataVIOType(vioType), + "%d is a metadata type", vioType); + if (result != VDO_SUCCESS) { + return result; + } + + BIO *bio; + KernelLayer *kernelLayer = asKernelLayer(layer); + result = createBio(kernelLayer, data, &bio); + if (result != VDO_SUCCESS) { + return result; + } + + MetadataKVIO *metadataKVIO; + result = makeMetadataKVIO(kernelLayer, vioType, priority, parent, bio, + &metadataKVIO); + if (result != VDO_SUCCESS) { + freeBio(bio, kernelLayer); + return result; + } + + *vioPtr = &metadataKVIO->vio; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int kvdoCreateCompressedWriteVIO(PhysicalLayer *layer, + void *parent, + char *data, + AllocatingVIO **allocatingVIOPtr) +{ + BIO *bio; + KernelLayer *kernelLayer = asKernelLayer(layer); + int result = createBio(kernelLayer, data, &bio); + if (result != VDO_SUCCESS) { + return result; + } + + CompressedWriteKVIO *compressedWriteKVIO; + result = makeCompressedWriteKVIO(kernelLayer, parent, bio, + &compressedWriteKVIO); + if (result != VDO_SUCCESS) { + freeBio(bio, kernelLayer); + return result; + } + + *allocatingVIOPtr = &compressedWriteKVIO->allocatingVIO; + return VDO_SUCCESS; +} diff --git a/vdo/kernel/kvio.h b/vdo/kernel/kvio.h new file mode 100644 index 0000000..64200cd --- /dev/null +++ b/vdo/kernel/kvio.h @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kvio.h#3 $ + */ + +#ifndef KVIO_H +#define KVIO_H + +#include "allocatingVIO.h" +#include "vio.h" + +#include "kernelLayer.h" + +/** + * A specific (semi-opaque) encapsulation of a single block + **/ +struct kvio { + KvdoEnqueueable enqueueable; + VIO *vio; + KernelLayer *layer; + BIO *bio; + + /** + * A bio pointer used in enqueueBioMap (used via submitBio etc), to + * pass information -- which bio to submit to the storage device -- + * across a thread switch. This may match another bio pointer in + * this structure, or could point somewhere else. 
+ **/ + BIO *bioToSubmit; + /** + * A list of enqueued bios with consecutive block numbers, stored by + * enqueueBioMap under the first-enqueued KVIO. The other KVIOs are + * found via their bio entries in this list, and are not added to + * the work queue as separate work items. + **/ + struct bio_list biosMerged; + /** A slot for an arbitrary bit of data, for use by systemtap. */ + long debugSlot; +}; + +typedef struct { + KVIO kvio; + VIO vio; +} MetadataKVIO; + +typedef struct { + KVIO kvio; + AllocatingVIO allocatingVIO; +} CompressedWriteKVIO; + +/** + * Determine whether a KVIO is a data VIO or not + * + * @param kvio The KVIO to check + * + * @return true if a data KVIO + */ +static inline bool isData(KVIO *kvio) +{ + return isDataVIO(kvio->vio); +} + +/** + * Determine whether a KVIO is a compressed block write VIO or not + * + * @param kvio The KVIO to check + * + * @return true if a compressed block writer + */ +static inline bool isCompressedWriter(KVIO *kvio) +{ + return isCompressedWriteVIO(kvio->vio); +} + +/** + * Determine whether a KVIO is a metadata VIO or not + * + * @param kvio The KVIO to check + * + * @return true if a metadata KVIO + */ +static inline bool isMetadata(KVIO *kvio) +{ + return isMetadataVIO(kvio->vio); +} + +/** + * Convert a VIO to a MetadataKVIO. + * + * @param vio The VIO to convert + * + * @return the VIO as a KVIO + **/ +static inline MetadataKVIO *vioAsMetadataKVIO(VIO *vio) +{ + ASSERT_LOG_ONLY(isMetadataVIO(vio), "VIO is a metadata VIO"); + return container_of(vio, MetadataKVIO, vio); +} + +/** + * Convert a MetadataKVIO to a KVIO. + * + * @param metadataKVIO The MetadataKVIO to convert + * + * @return The MetadataKVIO as a KVIO + **/ +static inline KVIO *metadataKVIOAsKVIO(MetadataKVIO *metadataKVIO) +{ + return &metadataKVIO->kvio; +} + +/** + * Returns a pointer to the CompressedWriteKVIO wrapping an AllocatingVIO. + * + * @param allocatingVIO The AllocatingVIO to convert + * + * @return the CompressedWriteKVIO + **/ +static inline CompressedWriteKVIO * +allocatingVIOAsCompressedWriteKVIO(AllocatingVIO *allocatingVIO) +{ + ASSERT_LOG_ONLY(isCompressedWriteAllocatingVIO(allocatingVIO), + "AllocatingVIO is a compressed write"); + return container_of(allocatingVIO, CompressedWriteKVIO, allocatingVIO); +} + +/** + * Convert a CompressedWriteKVIO to a KVIO. + * + * @param compressedWriteKVIO The CompressedWriteKVIO to convert + * + * @return The CompressedWriteKVIO as a KVIO + **/ +static inline +KVIO *compressedWriteKVIOAsKVIO(CompressedWriteKVIO *compressedWriteKVIO) +{ + return &compressedWriteKVIO->kvio; +} + +/** + * Returns a pointer to the KVIO wrapping a work item + * + * @param item the work item + * + * @return the KVIO + **/ +static inline KVIO *workItemAsKVIO(KvdoWorkItem *item) +{ + return container_of(item, KVIO, enqueueable.workItem); +} + +/** + * Enqueue a KVIO on a work queue. + * + * @param queue The queue + * @param kvio The KVIO + **/ +static inline void enqueueKVIOWork(KvdoWorkQueue *queue, KVIO *kvio) +{ + enqueueWorkQueue(queue, &kvio->enqueueable.workItem); +} + +/** + * Add a trace record for the current source location. + * + * @param kvio The KVIO structure to be updated + * @param location The source-location descriptor to be recorded + **/ +static inline void kvioAddTraceRecord(KVIO *kvio, TraceLocation location) +{ + vioAddTraceRecord(kvio->vio, location); +} + +/** + * Set up the work item for a KVIO. 
+ * + * @param kvio The KVIO to set up + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + **/ +static inline void setupKVIOWork(KVIO *kvio, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + setupWorkItem(&kvio->enqueueable.workItem, work, statsFunction, action); +} + +/** + * Set up and enqueue a KVIO. + * + * @param kvio The KVIO to set up + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + * @param queue The queue on which to enqueue the KVIO + **/ +static inline void launchKVIO(KVIO *kvio, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action, + KvdoWorkQueue *queue) +{ + setupKVIOWork(kvio, work, statsFunction, action); + enqueueKVIOWork(queue, kvio); +} + +/** + * Move a KVIO back to the base threads. + * + * @param kvio The KVIO to enqueue + **/ +void kvdoEnqueueVIOCallback(KVIO *kvio); + +/** + * Handles kvio-related I/O post-processing. + * + * @param kvio The kvio to finalize + * @param error Possible error + **/ +void kvdoContinueKvio(KVIO *kvio, int error); + +/** + * Initialize a KVIO. + * + * @param kvio The KVIO to initialize + * @param layer The physical layer + * @param vioType The type of VIO to create + * @param priority The relative priority to assign to the KVIO + * @param parent The parent of the KVIO completion + * @param bio The bio to associate with this KVIO + **/ +void initializeKVIO(KVIO *kvio, + KernelLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + BIO *bio); + +/** + * Destroy a MetadataKVIO and NULL out the pointer to it. + * + * @param metadataKVIOPtr A pointer to the MetadataKVIO to destroy + **/ +void freeMetadataKVIO(MetadataKVIO **metadataKVIOPtr); + +/** + * Destroy a CompressedWriteKVIO and NULL out the pointer to it. + * + * @param compressedWriteKVIOPtr A pointer to the CompressedWriteKVIO to + * destroy + **/ +void freeCompressedWriteKVIO(CompressedWriteKVIO **compressedWriteKVIOPtr); + +/** + * Create a new VIO (and its enclosing KVIO) for metadata operations. + * + *

Implements MetadataVIOCreator. + * + * @param [in] layer The physical layer + * @param [in] vioType The type of VIO to create + * @param [in] priority The relative priority to assign to the VIO + * @param [in] parent The parent to assign to the VIO's completion + * @param [in] data The buffer + * @param [out] vioPtr A pointer to hold new VIO + * + * @return VDO_SUCCESS or an error + **/ +int kvdoCreateMetadataVIO(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + char *data, + VIO **vioPtr) + __attribute__((warn_unused_result)); + +/** + * Create a new AllocatingVIO (and its enclosing KVIO) for compressed writes. + * + *

Implements CompressedWriteVIOCreator. + * + * @param [in] layer The physical layer + * @param [in] parent The parent to assign to the AllocatingVIO's + * completion + * @param [in] data The buffer + * @param [out] allocatingVIOPtr A pointer to hold new AllocatingVIO + * + * @return VDO_SUCCESS or an error + **/ +int kvdoCreateCompressedWriteVIO(PhysicalLayer *layer, + void *parent, + char *data, + AllocatingVIO **allocatingVIOPtr) + __attribute__((warn_unused_result)); + +/** + * Submit a compressed block write. + * + *

Implements CompressedWriter. + * + * @param allocatingVIO The AllocatingVIO for the compressed write + **/ +void kvdoWriteCompressedBlock(AllocatingVIO *allocatingVIO); + +/** + * Read or write a single metadata VIO. + * + *

Implements MetadataReader and MetadataWriter + * + * @param vio The VIO to read or write + **/ +void kvdoSubmitMetadataVIO(VIO *vio); + +/** + * Issue an empty flush to the lower layer using the BIO in a metadata VIO. + * + *

Implements MetadataWriter. + * + * @param vio The VIO to flush + **/ +void kvdoFlushVIO(VIO *vio); + +#endif /* KVIO_H */ diff --git a/vdo/kernel/limiter.c b/vdo/kernel/limiter.c new file mode 100644 index 0000000..72a4bb5 --- /dev/null +++ b/vdo/kernel/limiter.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/limiter.c#2 $ + */ + +#include "limiter.h" + +#include + +/**********************************************************************/ +void getLimiterValuesAtomically(Limiter *limiter, + uint32_t *active, + uint32_t *maximum) +{ + spin_lock(&limiter->lock); + *active = limiter->active; + *maximum = limiter->maximum; + spin_unlock(&limiter->lock); +} + +/**********************************************************************/ +void initializeLimiter(Limiter *limiter, uint32_t limit) +{ + limiter->active = 0; + limiter->limit = limit; + limiter->maximum = 0; + init_waitqueue_head(&limiter->waiterQueue); + spin_lock_init(&limiter->lock); +} + +/**********************************************************************/ +bool limiterIsIdle(Limiter *limiter) +{ + spin_lock(&limiter->lock); + bool idle = limiter->active == 0; + spin_unlock(&limiter->lock); + return idle; +} + +/**********************************************************************/ +void limiterReleaseMany(Limiter *limiter, uint32_t count) +{ + spin_lock(&limiter->lock); + limiter->active -= count; + spin_unlock(&limiter->lock); + if (waitqueue_active(&limiter->waiterQueue)) { + wake_up_nr(&limiter->waiterQueue, count); + } +} + +/**********************************************************************/ +void limiterWaitForIdle(Limiter *limiter) +{ + spin_lock(&limiter->lock); + while (limiter->active > 0) { + DEFINE_WAIT(wait); + prepare_to_wait_exclusive(&limiter->waiterQueue, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&limiter->lock); + io_schedule(); + spin_lock(&limiter->lock); + finish_wait(&limiter->waiterQueue, &wait); + }; + spin_unlock(&limiter->lock); +} + +/** + * Take one permit from the limiter, if one is available, and update + * the maximum active count if appropriate. + * + * The limiter's lock must already be locked. 
+ * + * @param limiter The limiter to update + * + * @return true iff the permit was acquired + **/ +static bool takePermitLocked(Limiter *limiter) +{ + if (limiter->active >= limiter->limit) { + return false; + } + limiter->active += 1; + if (limiter->active > limiter->maximum) { + limiter->maximum = limiter->active; + } + return true; +} + +/**********************************************************************/ +void limiterWaitForOneFree(Limiter *limiter) +{ + spin_lock(&limiter->lock); + while (!takePermitLocked(limiter)) { + DEFINE_WAIT(wait); + prepare_to_wait_exclusive(&limiter->waiterQueue, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&limiter->lock); + io_schedule(); + spin_lock(&limiter->lock); + finish_wait(&limiter->waiterQueue, &wait); + }; + spin_unlock(&limiter->lock); +} + +/**********************************************************************/ +bool limiterPoll(Limiter *limiter) +{ + spin_lock(&limiter->lock); + bool acquired = takePermitLocked(limiter); + spin_unlock(&limiter->lock); + return acquired; +} diff --git a/vdo/kernel/limiter.h b/vdo/kernel/limiter.h new file mode 100644 index 0000000..a9ee8fc --- /dev/null +++ b/vdo/kernel/limiter.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/limiter.h#2 $ + */ + +#ifndef LIMITER_H +#define LIMITER_H + +#include + +/* + * A Limiter is a fancy counter used to limit resource usage. We have a + * limit to number of resources that we are willing to use, and a Limiter + * holds us to that limit. 
+ */ + +typedef struct limiter { + // A spinlock controlling access to the contents of this struct + spinlock_t lock; + // The queue of threads waiting for a resource to become available + wait_queue_head_t waiterQueue; + // The number of resources in use + uint32_t active; + // The maximum number number of resources that have ever been in use + uint32_t maximum; + // The limit to the number of resources that are allowed to be used + uint32_t limit; +} Limiter; + +/** + * Get the Limiter variable values (atomically under the lock) + * + * @param limiter The limiter + * @param active The number of requests in progress + * @param maximum The maximum number of requests that have ever been active + **/ +void getLimiterValuesAtomically(Limiter *limiter, + uint32_t *active, + uint32_t *maximum); + +/** + * Initialize a Limiter + * + * @param limiter The limiter + * @param limit The limit to the number of active resources + **/ +void initializeLimiter(Limiter *limiter, uint32_t limit); + +/** + * Determine whether there are any active resources + * + * @param limiter The limiter + * + * @return true if there are no active resources + **/ +bool limiterIsIdle(Limiter *limiter); + +/** + * Release resources, making them available for other uses + * + * @param limiter The limiter + * @param count The number of resources to release + **/ +void limiterReleaseMany(Limiter *limiter, uint32_t count); + +/** + * Release one resource, making it available for another use + * + * @param limiter The limiter + **/ +static inline void limiterRelease(Limiter *limiter) +{ + limiterReleaseMany(limiter, 1); +} + +/** + * Wait until there are no active resources + * + * @param limiter The limiter + **/ +void limiterWaitForIdle(Limiter *limiter); + +/** + * Prepare to start using one resource, waiting if there are too many resources + * already in use. After returning from this routine, the caller may use the + * resource, and must call limiterRelease after freeing the resource. + * + * @param limiter The limiter + **/ +void limiterWaitForOneFree(Limiter *limiter); + +/** + * Attempt to reserve one resource, without waiting. After returning from this + * routine, if allocation was successful, the caller may use the resource, and + * must call limiterRelease after freeing the resource. + * + * @param limiter The limiter + * + * @return true iff the resource was allocated + **/ +bool limiterPoll(Limiter *limiter); + +#endif /* LIMITER_H */ diff --git a/vdo/kernel/logger.c b/vdo/kernel/logger.c new file mode 100644 index 0000000..d18f5ea --- /dev/null +++ b/vdo/kernel/logger.c @@ -0,0 +1,520 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
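To make the intended calling pattern of the Limiter API above concrete, here is a minimal usage sketch; exampleLimiter, the limit of 16, and the example* functions are hypothetical and only illustrate the interface declared in limiter.h:

#include "limiter.h"

static Limiter exampleLimiter;

static void exampleInit(void)
{
  // Allow at most 16 concurrent users of the hypothetical resource.
  initializeLimiter(&exampleLimiter, 16);
}

static void exampleBlockingUse(void)
{
  // Blocks (uninterruptibly) until a permit is available.
  limiterWaitForOneFree(&exampleLimiter);
  /* ... use the resource ... */
  limiterRelease(&exampleLimiter);
}

static bool exampleOpportunisticUse(void)
{
  // Non-blocking variant: gives up if the limit has already been reached.
  if (!limiterPoll(&exampleLimiter)) {
    return false;
  }
  /* ... use the resource ... */
  limiterRelease(&exampleLimiter);
  return true;
}

static void exampleDrain(void)
{
  // Before teardown, wait until every permit has been returned.
  limiterWaitForIdle(&exampleLimiter);
}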
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/logger.c#4 $ + */ + +#include "logger.h" + +#include +#include +#include + +#include "errors.h" +#include "threadDevice.h" + +static const int DEFAULT_PRIORITY = LOG_INFO; + +typedef struct { + const char *name; + const int priority; +} PRIORITY_NAMES; + +static const PRIORITY_NAMES PRIORITIES[] = { + { "ALERT", LOG_ALERT }, + { "CRIT", LOG_CRIT }, + { "CRITICAL", LOG_CRIT }, + { "DEBUG", LOG_DEBUG }, + { "EMERG", LOG_EMERG }, + { "EMERGENCY", LOG_EMERG }, + { "ERR", LOG_ERR }, + { "ERROR", LOG_ERR }, + { "INFO", LOG_INFO }, + { "NOTICE", LOG_NOTICE }, + { "PANIC", LOG_EMERG }, + { "WARN", LOG_WARNING }, + { "WARNING", LOG_WARNING }, + { NULL, -1 }, +}; + +enum { + PRIORITY_COUNT = 8 +}; + +static const char *PRIORITY_STRINGS[] = { + "EMERGENCY", + "ALERT", + "CRITICAL", + "ERROR", + "WARN", + "NOTICE", + "INFO", + "DEBUG", +}; + +static int logLevel = LOG_INFO; + +/**********************************************************************/ +int stringToPriority(const char *string) +{ + for (int i = 0; PRIORITIES[i].name != NULL; i++) { + if (strcasecmp(string, PRIORITIES[i].name) == 0) { + return PRIORITIES[i].priority; + } + } + return DEFAULT_PRIORITY; +} + +/**********************************************************************/ +int getLogLevel(void) +{ + return logLevel; +} + +/**********************************************************************/ +void setLogLevel(int newLogLevel) +{ + logLevel = newLogLevel; +} + +/**********************************************************************/ +const char *priorityToString(int priority) +{ + if ((priority < 0) || (priority >= PRIORITY_COUNT)) { + return "unknown"; + } + return PRIORITY_STRINGS[priority]; +} + +/**********************************************************************/ +static const char *priorityToLogLevel(int priority) +{ + switch (priority) { + case LOG_EMERG: + case LOG_ALERT: + case LOG_CRIT: + return KERN_CRIT; + case LOG_ERR: + return KERN_ERR; + case LOG_WARNING: + return KERN_WARNING; + case LOG_NOTICE: + return KERN_NOTICE; + case LOG_INFO: + return KERN_INFO; + case LOG_DEBUG: + return KERN_DEBUG; + default: + return ""; + } +} + +/**********************************************************************/ +static const char *getCurrentInterruptType(void) +{ + if (in_nmi()) { + return "NMI"; + } + if (in_irq()) { + return "HI"; + } + if (in_softirq()) { + return "SI"; + } + return "INTR"; +} + +/** + * Emit a log message to the kernel log in a format suited to the current + * thread context. Context info formats: + * + * interrupt: kvdo[NMI]: blah + * thread w/dev id: kvdo12:myprog: blah + * kvdo thread: kvdo12:foobarQ: blah + * other thread: kvdo: myprog: blah + * + * Fields: module name, interrupt level, process name, device ID. + * + * @param level A string describing the logging level + * @param moduleName The name of the module doing the logging + * @param prefix The prefix of the log message + * @param vaf1 The first message format descriptor + * @param vaf2 The second message format descriptor + **/ +static void emitLogMessage(const char *level, + const char *moduleName, + const char *prefix, + const struct va_format *vaf1, + const struct va_format *vaf2) +{ + if (in_interrupt()) { + printk("%s%s[%s]: %s%pV%pV\n", + level, moduleName, getCurrentInterruptType(), + prefix, vaf1, vaf2); + return; + } + + // Not at interrupt level; we have a process we can look at, and + // might have a device ID. 
+ int deviceInstance = getThreadDeviceID(); + if (deviceInstance != -1) { + printk("%s%s%u:%s: %s%pV%pV\n", + level, moduleName, deviceInstance, current->comm, + prefix, vaf1, vaf2); + return; + } + + if (((current->flags & PF_KTHREAD) != 0) + && (strncmp(moduleName, current->comm, strlen(moduleName)) == 0)) { + /* + * It's a kernel thread starting with "kvdo" (or whatever). Assume it's + * ours and that its name is sufficient. + */ + printk("%s%s: %s%pV%pV\n", + level, current->comm, + prefix, vaf1, vaf2); + return; + } + + // Identify the module and the process. + printk("%s%s: %s: %s%pV%pV\n", + level, moduleName, current->comm, + prefix, vaf1, vaf2); +} + +/**********************************************************************/ +void logMessagePack(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + va_list args2) +{ + if (priority > getLogLevel()) { + return; + } + + /* + * The kernel's printk has some magic for indirection to a secondary + * va_list. It wants us to supply a pointer to the va_list. + * + * However, va_list varies across platforms and can be an array + * type, which makes passing it around as an argument kind of + * tricky, due to the automatic conversion to a pointer. This makes + * taking the address of the argument a dicey thing; if we use "&a" + * it works fine for non-array types, but for array types we get the + * address of a pointer. Functions like va_copy and sprintf don't + * care as they get "va_list" values passed and are written to do + * the right thing, but printk explicitly wants the address of the + * va_list. + * + * So, we copy the va_list values to ensure that "&" consistently + * works the way we want. + */ + va_list args1Copy; + va_copy(args1Copy, args1); + va_list args2Copy; + va_copy(args2Copy, args2); + struct va_format vaf1 = { + .fmt = (fmt1 != NULL) ? fmt1 : "", + .va = &args1Copy, + }; + struct va_format vaf2 = { + .fmt = (fmt2 != NULL) ? fmt2 : "", + .va = &args2Copy, + }; + + if (prefix == NULL) { + prefix = ""; + } + + emitLogMessage(priorityToLogLevel(priority), THIS_MODULE->name, + prefix, &vaf1, &vaf2); + + va_end(args1Copy); + va_end(args2Copy); +} + +/**********************************************************************/ +void logEmbeddedMessage(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + ...) +{ + va_list ap; + va_start(ap, fmt2); + logMessagePack(priority, prefix, fmt1, args1, fmt2, ap); + va_end(ap); +} + +#pragma GCC diagnostic push +/* + * GCC (version 8.1.1 20180502 (Red Hat 8.1.1-1)) on Fedora 28 seems + * to think that this function should get a printf format + * attribute. But we have no second format string, and no additional + * arguments at the call site, and GCC also gets unhappy trying to + * analyze the format and values when there are none. So we'll just + * shut it up. + */ +#pragma GCC diagnostic ignored "-Wsuggest-attribute=format" +/** + * Log a message. + * + * This helper function exists solely to create a valid va_list with + * no useful info. It does the real work of vLogMessage, which wants a + * second va_list object to pass down. + * + * @param priority The syslog priority value for the message. + * @param format The format of the message (a printf style format) + * @param args The variadic argument list of format parameters. + **/ +static void vLogMessageHelper(int priority, + const char *format, + va_list args, + ...) 
+{ + va_list dummy; + va_start(dummy, args); + logMessagePack(priority, NULL, format, args, NULL, dummy); + va_end(dummy); +} +#pragma GCC diagnostic pop + +/*****************************************************************************/ +void vLogMessage(int priority, const char *format, va_list args) +{ + vLogMessageHelper(priority, format, args); +} + +/**********************************************************************/ +void logMessage(int priority, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(priority, format, args); + va_end(args); +} + +/**********************************************************************/ +__attribute__((format(printf, 2, 3))) +static void logAtLevel(int priority, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(priority, format, args); + va_end(args); +} + +/**********************************************************************/ +void logDebug(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_DEBUG, format, args); + va_end(args); +} + +/**********************************************************************/ +void logInfo(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_INFO, format, args); + va_end(args); +} + +/**********************************************************************/ +void logNotice(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_NOTICE, format, args); + va_end(args); +} + +/**********************************************************************/ +void logWarning(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_WARNING, format, args); + va_end(args); +} + +/**********************************************************************/ +void logError(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_ERR, format, args); + va_end(args); +} + +/**********************************************************************/ +void vLogError(const char *format, va_list args) +{ + vLogMessage(LOG_ERR, format, args); +} + +/**********************************************************************/ +void logBacktrace(int priority) +{ + logAtLevel(priority, "[backtrace]"); + if (priority > logLevel) { + return; + } + dump_stack(); +} + +/**********************************************************************/ +int vLogWithStringError(int priority, + int errnum, + const char *format, + va_list args) +{ + char errbuf[ERRBUF_SIZE] = ""; + logEmbeddedMessage(priority, NULL, format, args, ": %s (%d)", + stringError(errnum, errbuf, sizeof(errbuf)), + errnum); + return errnum; +} + +/**********************************************************************/ +int logWithStringError(int priority, int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(priority, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logErrorWithStringError(int errnum, const char *format, ...) 
+{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_ERR, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int vLogErrorWithStringError(int errnum, const char *format, va_list args) +{ + vLogWithStringError(LOG_ERR, errnum, format, args); + return errnum; +} + +/**********************************************************************/ +int logWarningWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_WARNING, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logDebugWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_DEBUG, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logInfoWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_INFO, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logNoticeWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_NOTICE, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logFatalWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_CRIT, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logUnrecoverable(int errnum, const char *format, ...) +{ + if ((errnum == UDS_SUCCESS || errnum == UDS_QUEUED) || (errnum == 0)) { + return errnum; + } + + va_list args; + va_start(args, format); + vLogWithStringError(LOG_CRIT, errnum, format, args); + va_end(args); + return makeUnrecoverable(errnum); +} + +/**********************************************************************/ +void logFatal(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_CRIT, format, args); + va_end(args); +} + +/**********************************************************************/ +void pauseForLogger(void) +{ + // Hopefully, a few milliseconds of sleep will be large enough + // for the kernel log buffer to be flushed. + msleep(4); +} diff --git a/vdo/kernel/logger.h b/vdo/kernel/logger.h new file mode 100644 index 0000000..6e8088e --- /dev/null +++ b/vdo/kernel/logger.h @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
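As a quick illustration of how the logging helpers above compose, a hypothetical caller might look like the following sketch; exampleStartDevice and its parameters are made-up names, and the result argument stands in for the outcome of some hardware call:

#include "logger.h"

static int exampleStartDevice(const char *name, int result)
{
  // 'result' is assumed to be 0 on success, or an errno/UDS_* style
  // code on failure, as expected by logErrorWithStringError().
  logInfo("starting device %s", name);

  if (result != 0) {
    // Logs at ERROR priority with ": <error text> (<code>)" appended,
    // then hands the error code back unchanged.
    return logErrorWithStringError(result, "cannot start device %s", name);
  }

  // Dropped unless the log level has been raised to LOG_DEBUG
  // (see setLogLevel()).
  logDebug("device %s started", name);
  return 0;
}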
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/logger.h#2 $ + */ + +#ifndef LOGGER_H +#define LOGGER_H 1 + +#include +#include +#include +#include + +#define LOG_EMERG 0 /* system is unusable */ +#define LOG_ALERT 1 /* action must be taken immediately */ +#define LOG_CRIT 2 /* critical conditions */ +#define LOG_ERR 3 /* error conditions */ +#define LOG_WARNING 4 /* warning conditions */ +#define LOG_NOTICE 5 /* normal but significant condition */ +#define LOG_INFO 6 /* informational */ +#define LOG_DEBUG 7 /* debug-level messages */ + +// Make it easy to log real pointer values using %px when in development. +#define PRIptr "pK" + +/** + * @file + * + * The functions in this file are not thread safe in the sense that nothing + * prevents multiple threads from opening or closing loggers out from under + * other threads. In reality this isn't a problem since the only calls in + * production code to openLogger() and closeLogger() are made in uds.c while + * uds mutex is held, and uds does not make any logging calls before it calls + * openLogger or after it calls closeLogger(). + * + * All of the log() functions will preserve the callers value of errno. + **/ + +/** + * Get the current logging level. + * + * @return the current logging priority level. + **/ +int getLogLevel(void); + +/** + * Set the current logging level. + * + * @param newLogLevel the new value for the logging priority level. + **/ +void setLogLevel(int newLogLevel); + +/** + * Return the integer logging priority represented by a name. + * + * @param string the name of the logging priority (case insensitive). + * + * @return the integer priority named by string, or DEFAULT_PRIORITY + * if not recognized. + **/ +int stringToPriority(const char *string); + +/** + * Return the printable name of a logging priority. + * + * @return the priority name + **/ +const char *priorityToString(int priority); + +/** + * Log a debug message. + * + * @param format The format of the message (a printf style format) + **/ +void logDebug(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log an informational message. + * + * @param format The format of the message (a printf style format) + **/ +void logInfo(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a normal (but notable) condition. + * + * @param format The format of the message (a printf style format) + **/ +void logNotice(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a warning. + * + * @param format The format of the message (a printf style format) + **/ +void logWarning(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log an error. + * + * @param format The format of the message (a printf style format) + **/ +void logError(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log an error. + * + * @param format The format of the message (a printf style format) + * @param args args for format. + **/ + +void vLogError(const char *format, va_list args) + __attribute__((format(printf, 1, 0))); + +/** + * Log a message embedded within another message. 
+ * + * @param priority the priority at which to log the message + * @param prefix optional string prefix to message, may be NULL + * @param fmt1 format of message first part, may be NULL + * @param args1 arguments for message first part + * @param fmt2 format of message second part + **/ +void logEmbeddedMessage(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + ...) + __attribute__((format(printf, 3, 0), format(printf, 5, 6))); + +/** + * Log a message pack consisting of multiple variable sections. + * + * @param priority the priority at which to log the message + * @param prefix optional string prefix to message, may be NULL + * @param fmt1 format of message first part, may be NULL + * @param args1 arguments for message first part + * @param fmt2 format of message second part, may be NULL + * @param args2 arguments for message second part + **/ +void logMessagePack(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + va_list args2) + __attribute__((format(printf, 3, 0))); + +/** + * Log a stack backtrace. + * + * @param priority The priority at which to log the backtrace + **/ +void logBacktrace(int priority); + +/** + * Log a message with an error from an error code. + * + * @param priority The priority of the logging entry + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * + * @return errnum + **/ +int logWithStringError(int priority, int errnum, const char *format, ...) + __attribute__((format(printf, 3, 4))); + +/** + * Log a message with an error from an error code. + * + * @param priority The priority of the logging entry + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * @param args The list of arguments with format. + * + * @return errnum + **/ +int vLogWithStringError(int priority, + int errnum, + const char *format, + va_list args) + __attribute__((format(printf, 3, 0))); + +/** + * Log an error prefixed with the string associated with the errnum. + * + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * + * @return errnum + **/ +int logErrorWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logDebugWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logInfoWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logNoticeWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logWarningWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logFatalWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/** + * Log an error prefixed with the string associated with the errnum. + * + * @param errnum Int value of errno or a UDS_* value. 
+ * @param format The format of the message (a printf style format) + * @param args a va_list of args for the format. + * @return errnum + **/ +int vLogErrorWithStringError(int errnum, const char *format, va_list args) + __attribute__((format(printf, 2, 0))); + +/** + * Log an ERROR level message and return makeUnrecoverable(errnum) + * UDS_SUCCESS is ignored and returned. + * + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * @return makeUnrecoverable(errnum) or UDS_SUCCESS. + **/ +int logUnrecoverable(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/** + * Log a fatal error. + * + * @param format The format of the message (a printf style format) + **/ +void logFatal(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a message -- for internal use only. + * + * @param priority The syslog priority value for the message. + * @param format The format of the message (a printf style format) + * @param args The variadic argument list of format parameters. + **/ +void vLogMessage(int priority, const char *format, va_list args) + __attribute__((format(printf, 2, 0))); + +/** + * Log a message. + * + * @param priority The syslog priority value for the message. + * @param format The format of the message (a printf style format) + **/ +void logMessage(int priority, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/** + * Sleep or delay a short time (likely a few milliseconds) in an attempt allow + * the log buffers to be written out in case they might be overrun. This is + * unnecessary in user-space (and is a no-op there), but is needed when + * quickly issuing a lot of log output in the Linux kernel, as when dumping a + * large number of data structures. + **/ +void pauseForLogger(void); + +#endif /* LOGGER_H */ diff --git a/vdo/kernel/memoryUsage.c b/vdo/kernel/memoryUsage.c new file mode 100644 index 0000000..86521a4 --- /dev/null +++ b/vdo/kernel/memoryUsage.c @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/memoryUsage.c#3 $ + */ + +#include "memoryUsage.h" + +#include "memoryAlloc.h" + +#include "kernelStatistics.h" + +/**********************************************************************/ +MemoryUsage getMemoryUsage() +{ + MemoryUsage memoryUsage; + getMemoryStats(&memoryUsage.bytesUsed, &memoryUsage.peakBytesUsed); + return memoryUsage; +} + diff --git a/vdo/kernel/memoryUsage.h b/vdo/kernel/memoryUsage.h new file mode 100644 index 0000000..336ab0a --- /dev/null +++ b/vdo/kernel/memoryUsage.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/memoryUsage.h#1 $ + */ + +#ifndef MEMORY_USAGE_H +#define MEMORY_USAGE_H 1 + +#include "memoryAlloc.h" + +#include "kernelStatistics.h" + +/** + * Get the memory usage for statistics reporting. + * + * @return The memory usage + **/ +MemoryUsage getMemoryUsage(void) + __attribute__((warn_unused_result)); + +#endif /* MEMORY_USAGE_H */ diff --git a/vdo/kernel/poolSysfs.c b/vdo/kernel/poolSysfs.c new file mode 100644 index 0000000..7f37480 --- /dev/null +++ b/vdo/kernel/poolSysfs.c @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
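A small sketch of how the snapshot returned by getMemoryUsage() above might be consumed; exampleLogMemoryUsage() and its message text are illustrative only:

#include "memoryUsage.h"
#include "logger.h"

static void exampleLogMemoryUsage(void)
{
  // getMemoryUsage() returns a by-value snapshot filled in from getMemoryStats().
  MemoryUsage usage = getMemoryUsage();
  logInfo("kvdo memory: %llu bytes in use, %llu bytes at peak",
          (unsigned long long) usage.bytesUsed,
          (unsigned long long) usage.peakBytesUsed);
}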
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/poolSysfs.c#1 $ + */ + +#include "poolSysfs.h" + +#include "memoryAlloc.h" + +#include "vdo.h" + +#include "dedupeIndex.h" + +typedef struct poolAttribute { + struct attribute attr; + ssize_t (*show)(KernelLayer *layer, char *buf); + ssize_t (*store)(KernelLayer *layer, const char *value, size_t count); +} PoolAttribute; + +/**********************************************************************/ +static ssize_t vdoPoolAttrShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + PoolAttribute *poolAttr = container_of(attr, PoolAttribute, attr); + if (poolAttr->show == NULL) { + return -EINVAL; + } + KernelLayer *layer = container_of(kobj, KernelLayer, kobj); + return poolAttr->show(layer, buf); +} + +/**********************************************************************/ +static ssize_t vdoPoolAttrStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + PoolAttribute *poolAttr = container_of(attr, PoolAttribute, attr); + if (poolAttr->store == NULL) { + return -EINVAL; + } + KernelLayer *layer = container_of(kobj, KernelLayer, kobj); + return poolAttr->store(layer, buf, length); +} + +static struct sysfs_ops vdoPoolSysfsOps = { + .show = vdoPoolAttrShow, + .store = vdoPoolAttrStore, +}; + +/**********************************************************************/ +static ssize_t poolCompressingShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%s\n", (getKVDOCompressing(&layer->kvdo) ? "1" : "0")); +} + +/**********************************************************************/ +static ssize_t poolDiscardsActiveShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->discardLimiter.active); +} + +/**********************************************************************/ +static ssize_t poolDiscardsLimitShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->discardLimiter.limit); +} + +/**********************************************************************/ +static ssize_t poolDiscardsLimitStore(KernelLayer *layer, + const char *buf, + size_t length) +{ + unsigned int value; + if ((length > 12) || (sscanf(buf, "%u", &value) != 1) || (value < 1)) { + return -EINVAL; + } + layer->discardLimiter.limit = value; + return length; +} + +/**********************************************************************/ +static ssize_t poolDiscardsMaximumShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->discardLimiter.maximum); +} + +/**********************************************************************/ +static ssize_t poolInstanceShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%u\n", layer->instance); +} + +/**********************************************************************/ +static ssize_t poolRequestsActiveShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->requestLimiter.active); +} + +/**********************************************************************/ +static ssize_t poolRequestsLimitShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->requestLimiter.limit); +} + +/**********************************************************************/ +static ssize_t poolRequestsMaximumShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->requestLimiter.maximum); +} + +/**********************************************************************/ +static void vdoPoolRelease(struct kobject *kobj) +{ + 
KernelLayer *layer = container_of(kobj, KernelLayer, kobj); + freeVDO(&layer->kvdo.vdo); + FREE(layer); +} + +static PoolAttribute vdoPoolCompressingAttr = { + .attr = { .name = "compressing", .mode = 0444, }, + .show = poolCompressingShow, +}; + +static PoolAttribute vdoPoolDiscardsActiveAttr = { + .attr = { .name = "discards_active", .mode = 0444, }, + .show = poolDiscardsActiveShow, +}; + +static PoolAttribute vdoPoolDiscardsLimitAttr = { + .attr = { .name = "discards_limit", .mode = 0644, }, + .show = poolDiscardsLimitShow, + .store = poolDiscardsLimitStore, +}; + +static PoolAttribute vdoPoolDiscardsMaximumAttr = { + .attr = { .name = "discards_maximum", .mode = 0444, }, + .show = poolDiscardsMaximumShow, +}; + +static PoolAttribute vdoPoolInstanceAttr = { + .attr = { .name = "instance", .mode = 0444, }, + .show = poolInstanceShow, +}; + +static PoolAttribute vdoPoolRequestsActiveAttr = { + .attr = { .name = "requests_active", .mode = 0444, }, + .show = poolRequestsActiveShow, +}; + +static PoolAttribute vdoPoolRequestsLimitAttr = { + .attr = { .name = "requests_limit", .mode = 0444, }, + .show = poolRequestsLimitShow, +}; + +static PoolAttribute vdoPoolRequestsMaximumAttr = { + .attr = { .name = "requests_maximum", .mode = 0444, }, + .show = poolRequestsMaximumShow, +}; + +static struct attribute *poolAttrs[] = { + &vdoPoolCompressingAttr.attr, + &vdoPoolDiscardsActiveAttr.attr, + &vdoPoolDiscardsLimitAttr.attr, + &vdoPoolDiscardsMaximumAttr.attr, + &vdoPoolInstanceAttr.attr, + &vdoPoolRequestsActiveAttr.attr, + &vdoPoolRequestsLimitAttr.attr, + &vdoPoolRequestsMaximumAttr.attr, + NULL, +}; + +struct kobj_type kernelLayerKobjType = { + .release = vdoPoolRelease, + .sysfs_ops = &vdoPoolSysfsOps, + .default_attrs = poolAttrs, +}; + +/**********************************************************************/ +static void workQueueDirectoryRelease(struct kobject *kobj) +{ + /* + * The workQueueDirectory holds an implicit reference to its parent, + * the kernelLayer object (->kobj), so even if there are some + * external references held to the workQueueDirectory when work + * queue shutdown calls kobject_put on the kernelLayer object, the + * kernelLayer object won't actually be released and won't free the + * KernelLayer storage until the workQueueDirectory object is + * released first. + * + * So, we don't need to do any additional explicit management here. + * + * (But we aren't allowed to use a NULL function pointer to indicate + * a no-op.) + */ +} + +/**********************************************************************/ +static struct attribute *noAttrs[] = { + NULL, +}; + +static struct sysfs_ops noSysfsOps = { + // These should never be reachable since there are no attributes. + .show = NULL, + .store = NULL, +}; + +struct kobj_type workQueueDirectoryKobjType = { + .release = workQueueDirectoryRelease, + .sysfs_ops = &noSysfsOps, + .default_attrs = noAttrs, +}; diff --git a/vdo/kernel/poolSysfs.h b/vdo/kernel/poolSysfs.h new file mode 100644 index 0000000..85fe11c --- /dev/null +++ b/vdo/kernel/poolSysfs.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/poolSysfs.h#1 $ + */ + +#ifndef POOL_SYSFS_H +#define POOL_SYSFS_H + +#include + +// The kobj_type used for setting up the kernel layer kobject. +extern struct kobj_type kernelLayerKobjType; +// The kobj_type used for the "work_queues" subdirectory. +extern struct kobj_type workQueueDirectoryKobjType; + +// The sysfs_ops used for the "statistics" subdirectory. +extern struct sysfs_ops poolStatsSysfsOps; +// The attribute used for the "statistics" subdirectory. +extern struct attribute *poolStatsAttrs[]; + +#endif /* POOL_SYSFS_H */ diff --git a/vdo/kernel/poolSysfsStats.c b/vdo/kernel/poolSysfsStats.c new file mode 100644 index 0000000..daa0cf0 --- /dev/null +++ b/vdo/kernel/poolSysfsStats.c @@ -0,0 +1,2628 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
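To show how the pieces in poolSysfs.c and poolSysfs.h above fit together, here is a hypothetical wiring sketch; the real registration lives elsewhere in the module, and exampleStatsRelease, exampleStatsDirectoryKobjType, and exampleAddPoolSysfs are illustrative names:

#include "poolSysfs.h"
// Assumes the KernelLayer definition (with its kobj, statsDirectory, and
// instance fields) is in scope, e.g. via kernelLayer.h.

static void exampleStatsRelease(struct kobject *kobj)
{
  // Lifetime is managed by the enclosing KernelLayer; nothing to free here.
}

static struct kobj_type exampleStatsDirectoryKobjType = {
  .release       = exampleStatsRelease,
  .sysfs_ops     = &poolStatsSysfsOps,
  .default_attrs = poolStatsAttrs,
};

static int exampleAddPoolSysfs(KernelLayer *layer, struct kobject *parent)
{
  // The pool's own kobject uses kernelLayerKobjType, so the PoolAttribute
  // show/store callbacks in poolSysfs.c become reachable via sysfs.
  kobject_init(&layer->kobj, &kernelLayerKobjType);
  int result = kobject_add(&layer->kobj, parent, "vdo%u", layer->instance);
  if (result != 0) {
    return result;
  }
  // The per-pool statistics defined in poolSysfsStats.c appear in a
  // "statistics" subdirectory under the pool's kobject.
  kobject_init(&layer->statsDirectory, &exampleStatsDirectoryKobjType);
  return kobject_add(&layer->statsDirectory, &layer->kobj, "statistics");
}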
+ */ + +#include "dedupeIndex.h" +#include "logger.h" +#include "poolSysfs.h" +#include "statistics.h" +#include "statusProcfs.h" +#include "threadDevice.h" +#include "vdo.h" + +typedef struct poolStatsAttribute { + struct attribute attr; + ssize_t (*show)(KernelLayer *layer, char *buf); +} PoolStatsAttribute; + +static ssize_t poolStatsAttrShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + PoolStatsAttribute *poolStatsAttr = container_of(attr, PoolStatsAttribute, + attr); + + if (poolStatsAttr->show == NULL) { + return -EINVAL; + } + KernelLayer *layer = container_of(kobj, KernelLayer, statsDirectory); + return poolStatsAttr->show(layer, buf); +} + +struct sysfs_ops poolStatsSysfsOps = { + .show = poolStatsAttrShow, + .store = NULL, +}; + +/**********************************************************************/ +/** Number of blocks used for data */ +static ssize_t poolStatsDataBlocksUsedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.dataBlocksUsed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsDataBlocksUsedAttr = { + .attr = { .name = "data_blocks_used", .mode = 0444, }, + .show = poolStatsDataBlocksUsedShow, +}; + +/**********************************************************************/ +/** Number of blocks used for VDO metadata */ +static ssize_t poolStatsOverheadBlocksUsedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.overheadBlocksUsed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsOverheadBlocksUsedAttr = { + .attr = { .name = "overhead_blocks_used", .mode = 0444, }, + .show = poolStatsOverheadBlocksUsedShow, +}; + +/**********************************************************************/ +/** Number of logical blocks that are currently mapped to physical blocks */ +static ssize_t poolStatsLogicalBlocksUsedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.logicalBlocksUsed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsLogicalBlocksUsedAttr = { + .attr = { .name = "logical_blocks_used", .mode = 0444, }, + .show = poolStatsLogicalBlocksUsedShow, +}; + +/**********************************************************************/ +/** number of physical blocks */ +static ssize_t poolStatsPhysicalBlocksShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.physicalBlocks); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsPhysicalBlocksAttr = { + .attr = { .name = "physical_blocks", .mode = 0444, }, + .show = poolStatsPhysicalBlocksShow, +}; + +/**********************************************************************/ +/** number of logical blocks */ +static ssize_t poolStatsLogicalBlocksShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, 
"%llu\n", layer->vdoStatsStorage.logicalBlocks); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsLogicalBlocksAttr = { + .attr = { .name = "logical_blocks", .mode = 0444, }, + .show = poolStatsLogicalBlocksShow, +}; + +/**********************************************************************/ +/** Size of the block map page cache, in bytes */ +static ssize_t poolStatsBlockMapCacheSizeShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMapCacheSize); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapCacheSizeAttr = { + .attr = { .name = "block_map_cache_size", .mode = 0444, }, + .show = poolStatsBlockMapCacheSizeShow, +}; + +/**********************************************************************/ +/** String describing the active write policy of the VDO */ +static ssize_t poolStatsWritePolicyShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%s\n", layer->vdoStatsStorage.writePolicy); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsWritePolicyAttr = { + .attr = { .name = "write_policy", .mode = 0444, }, + .show = poolStatsWritePolicyShow, +}; + +/**********************************************************************/ +/** The physical block size */ +static ssize_t poolStatsBlockSizeShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockSize); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockSizeAttr = { + .attr = { .name = "block_size", .mode = 0444, }, + .show = poolStatsBlockSizeShow, +}; + +/**********************************************************************/ +/** Number of times the VDO has successfully recovered */ +static ssize_t poolStatsCompleteRecoveriesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.completeRecoveries); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsCompleteRecoveriesAttr = { + .attr = { .name = "complete_recoveries", .mode = 0444, }, + .show = poolStatsCompleteRecoveriesShow, +}; + +/**********************************************************************/ +/** Number of times the VDO has recovered from read-only mode */ +static ssize_t poolStatsReadOnlyRecoveriesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.readOnlyRecoveries); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsReadOnlyRecoveriesAttr = { + .attr = { .name = "read_only_recoveries", .mode = 0444, }, + .show = poolStatsReadOnlyRecoveriesShow, +}; + +/**********************************************************************/ +/** String describing the operating mode of the VDO */ +static ssize_t poolStatsModeShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + 
mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%s\n", layer->vdoStatsStorage.mode); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsModeAttr = { + .attr = { .name = "mode", .mode = 0444, }, + .show = poolStatsModeShow, +}; + +/**********************************************************************/ +/** Whether the VDO is in recovery mode */ +static ssize_t poolStatsInRecoveryModeShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%d\n", layer->vdoStatsStorage.inRecoveryMode); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsInRecoveryModeAttr = { + .attr = { .name = "in_recovery_mode", .mode = 0444, }, + .show = poolStatsInRecoveryModeShow, +}; + +/**********************************************************************/ +/** What percentage of recovery mode work has been completed */ +static ssize_t poolStatsRecoveryPercentageShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%u\n", layer->vdoStatsStorage.recoveryPercentage); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsRecoveryPercentageAttr = { + .attr = { .name = "recovery_percentage", .mode = 0444, }, + .show = poolStatsRecoveryPercentageShow, +}; + +/**********************************************************************/ +/** Number of compressed data items written since startup */ +static ssize_t poolStatsPackerCompressedFragmentsWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.packer.compressedFragmentsWritten); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsPackerCompressedFragmentsWrittenAttr = { + .attr = { .name = "packer_compressed_fragments_written", .mode = 0444, }, + .show = poolStatsPackerCompressedFragmentsWrittenShow, +}; + +/**********************************************************************/ +/** Number of blocks containing compressed items written since startup */ +static ssize_t poolStatsPackerCompressedBlocksWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.packer.compressedBlocksWritten); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsPackerCompressedBlocksWrittenAttr = { + .attr = { .name = "packer_compressed_blocks_written", .mode = 0444, }, + .show = poolStatsPackerCompressedBlocksWrittenShow, +}; + +/**********************************************************************/ +/** Number of VIOs that are pending in the packer */ +static ssize_t poolStatsPackerCompressedFragmentsInPackerShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.packer.compressedFragmentsInPacker); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute 
poolStatsPackerCompressedFragmentsInPackerAttr = { + .attr = { .name = "packer_compressed_fragments_in_packer", .mode = 0444, }, + .show = poolStatsPackerCompressedFragmentsInPackerShow, +}; + +/**********************************************************************/ +/** The total number of slabs from which blocks may be allocated */ +static ssize_t poolStatsAllocatorSlabCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.allocator.slabCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsAllocatorSlabCountAttr = { + .attr = { .name = "allocator_slab_count", .mode = 0444, }, + .show = poolStatsAllocatorSlabCountShow, +}; + +/**********************************************************************/ +/** The total number of slabs from which blocks have ever been allocated */ +static ssize_t poolStatsAllocatorSlabsOpenedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.allocator.slabsOpened); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsAllocatorSlabsOpenedAttr = { + .attr = { .name = "allocator_slabs_opened", .mode = 0444, }, + .show = poolStatsAllocatorSlabsOpenedShow, +}; + +/**********************************************************************/ +/** The number of times since loading that a slab has been re-opened */ +static ssize_t poolStatsAllocatorSlabsReopenedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.allocator.slabsReopened); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsAllocatorSlabsReopenedAttr = { + .attr = { .name = "allocator_slabs_reopened", .mode = 0444, }, + .show = poolStatsAllocatorSlabsReopenedShow, +}; + +/**********************************************************************/ +/** Number of times the on-disk journal was full */ +static ssize_t poolStatsJournalDiskFullShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.diskFull); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalDiskFullAttr = { + .attr = { .name = "journal_disk_full", .mode = 0444, }, + .show = poolStatsJournalDiskFullShow, +}; + +/**********************************************************************/ +/** Number of times the recovery journal requested slab journal commits. 
*/ +static ssize_t poolStatsJournalSlabJournalCommitsRequestedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.slabJournalCommitsRequested); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalSlabJournalCommitsRequestedAttr = { + .attr = { .name = "journal_slab_journal_commits_requested", .mode = 0444, }, + .show = poolStatsJournalSlabJournalCommitsRequestedShow, +}; + +/**********************************************************************/ +/** The total number of items on which processing has started */ +static ssize_t poolStatsJournalEntriesStartedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.entries.started); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalEntriesStartedAttr = { + .attr = { .name = "journal_entries_started", .mode = 0444, }, + .show = poolStatsJournalEntriesStartedShow, +}; + +/**********************************************************************/ +/** The total number of items for which a write operation has been issued */ +static ssize_t poolStatsJournalEntriesWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.entries.written); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalEntriesWrittenAttr = { + .attr = { .name = "journal_entries_written", .mode = 0444, }, + .show = poolStatsJournalEntriesWrittenShow, +}; + +/**********************************************************************/ +/** The total number of items for which a write operation has completed */ +static ssize_t poolStatsJournalEntriesCommittedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.entries.committed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalEntriesCommittedAttr = { + .attr = { .name = "journal_entries_committed", .mode = 0444, }, + .show = poolStatsJournalEntriesCommittedShow, +}; + +/**********************************************************************/ +/** The total number of items on which processing has started */ +static ssize_t poolStatsJournalBlocksStartedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.blocks.started); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalBlocksStartedAttr = { + .attr = { .name = "journal_blocks_started", .mode = 0444, }, + .show = poolStatsJournalBlocksStartedShow, +}; + +/**********************************************************************/ +/** The total number of items for which a write operation has been issued */ +static ssize_t poolStatsJournalBlocksWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + 
getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.blocks.written); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalBlocksWrittenAttr = { + .attr = { .name = "journal_blocks_written", .mode = 0444, }, + .show = poolStatsJournalBlocksWrittenShow, +}; + +/**********************************************************************/ +/** The total number of items for which a write operation has completed */ +static ssize_t poolStatsJournalBlocksCommittedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.blocks.committed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalBlocksCommittedAttr = { + .attr = { .name = "journal_blocks_committed", .mode = 0444, }, + .show = poolStatsJournalBlocksCommittedShow, +}; + +/**********************************************************************/ +/** Number of times the on-disk journal was full */ +static ssize_t poolStatsSlabJournalDiskFullCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.diskFullCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsSlabJournalDiskFullCountAttr = { + .attr = { .name = "slab_journal_disk_full_count", .mode = 0444, }, + .show = poolStatsSlabJournalDiskFullCountShow, +}; + +/**********************************************************************/ +/** Number of times an entry was added over the flush threshold */ +static ssize_t poolStatsSlabJournalFlushCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.flushCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsSlabJournalFlushCountAttr = { + .attr = { .name = "slab_journal_flush_count", .mode = 0444, }, + .show = poolStatsSlabJournalFlushCountShow, +}; + +/**********************************************************************/ +/** Number of times an entry was added over the block threshold */ +static ssize_t poolStatsSlabJournalBlockedCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.blockedCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsSlabJournalBlockedCountAttr = { + .attr = { .name = "slab_journal_blocked_count", .mode = 0444, }, + .show = poolStatsSlabJournalBlockedCountShow, +}; + +/**********************************************************************/ +/** Number of times a tail block was written */ +static ssize_t poolStatsSlabJournalBlocksWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.blocksWritten); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute 
poolStatsSlabJournalBlocksWrittenAttr = { + .attr = { .name = "slab_journal_blocks_written", .mode = 0444, }, + .show = poolStatsSlabJournalBlocksWrittenShow, +}; + +/**********************************************************************/ +/** Number of times we had to wait for the tail to write */ +static ssize_t poolStatsSlabJournalTailBusyCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.tailBusyCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsSlabJournalTailBusyCountAttr = { + .attr = { .name = "slab_journal_tail_busy_count", .mode = 0444, }, + .show = poolStatsSlabJournalTailBusyCountShow, +}; + +/**********************************************************************/ +/** Number of blocks written */ +static ssize_t poolStatsSlabSummaryBlocksWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabSummary.blocksWritten); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsSlabSummaryBlocksWrittenAttr = { + .attr = { .name = "slab_summary_blocks_written", .mode = 0444, }, + .show = poolStatsSlabSummaryBlocksWrittenShow, +}; + +/**********************************************************************/ +/** Number of reference blocks written */ +static ssize_t poolStatsRefCountsBlocksWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.refCounts.blocksWritten); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsRefCountsBlocksWrittenAttr = { + .attr = { .name = "ref_counts_blocks_written", .mode = 0444, }, + .show = poolStatsRefCountsBlocksWrittenShow, +}; + +/**********************************************************************/ +/** number of dirty (resident) pages */ +static ssize_t poolStatsBlockMapDirtyPagesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.dirtyPages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapDirtyPagesAttr = { + .attr = { .name = "block_map_dirty_pages", .mode = 0444, }, + .show = poolStatsBlockMapDirtyPagesShow, +}; + +/**********************************************************************/ +/** number of clean (resident) pages */ +static ssize_t poolStatsBlockMapCleanPagesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.cleanPages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapCleanPagesAttr = { + .attr = { .name = "block_map_clean_pages", .mode = 0444, }, + .show = poolStatsBlockMapCleanPagesShow, +}; + +/**********************************************************************/ +/** number of free pages */ +static ssize_t poolStatsBlockMapFreePagesShow(KernelLayer *layer, char *buf) 
+{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.freePages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFreePagesAttr = { + .attr = { .name = "block_map_free_pages", .mode = 0444, }, + .show = poolStatsBlockMapFreePagesShow, +}; + +/**********************************************************************/ +/** number of pages in failed state */ +static ssize_t poolStatsBlockMapFailedPagesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.failedPages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFailedPagesAttr = { + .attr = { .name = "block_map_failed_pages", .mode = 0444, }, + .show = poolStatsBlockMapFailedPagesShow, +}; + +/**********************************************************************/ +/** number of pages incoming */ +static ssize_t poolStatsBlockMapIncomingPagesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.incomingPages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapIncomingPagesAttr = { + .attr = { .name = "block_map_incoming_pages", .mode = 0444, }, + .show = poolStatsBlockMapIncomingPagesShow, +}; + +/**********************************************************************/ +/** number of pages outgoing */ +static ssize_t poolStatsBlockMapOutgoingPagesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.outgoingPages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapOutgoingPagesAttr = { + .attr = { .name = "block_map_outgoing_pages", .mode = 0444, }, + .show = poolStatsBlockMapOutgoingPagesShow, +}; + +/**********************************************************************/ +/** how many times free page not avail */ +static ssize_t poolStatsBlockMapCachePressureShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.cachePressure); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapCachePressureAttr = { + .attr = { .name = "block_map_cache_pressure", .mode = 0444, }, + .show = poolStatsBlockMapCachePressureShow, +}; + +/**********************************************************************/ +/** number of getVDOPageAsync() for read */ +static ssize_t poolStatsBlockMapReadCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.readCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapReadCountAttr = { + .attr = { .name = "block_map_read_count", .mode = 0444, }, + .show = 
poolStatsBlockMapReadCountShow, +}; + +/**********************************************************************/ +/** number of getVDOPageAsync() for write */ +static ssize_t poolStatsBlockMapWriteCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.writeCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapWriteCountAttr = { + .attr = { .name = "block_map_write_count", .mode = 0444, }, + .show = poolStatsBlockMapWriteCountShow, +}; + +/**********************************************************************/ +/** number of times pages failed to read */ +static ssize_t poolStatsBlockMapFailedReadsShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.failedReads); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFailedReadsAttr = { + .attr = { .name = "block_map_failed_reads", .mode = 0444, }, + .show = poolStatsBlockMapFailedReadsShow, +}; + +/**********************************************************************/ +/** number of times pages failed to write */ +static ssize_t poolStatsBlockMapFailedWritesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.failedWrites); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFailedWritesAttr = { + .attr = { .name = "block_map_failed_writes", .mode = 0444, }, + .show = poolStatsBlockMapFailedWritesShow, +}; + +/**********************************************************************/ +/** number of gets that are reclaimed */ +static ssize_t poolStatsBlockMapReclaimedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.reclaimed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapReclaimedAttr = { + .attr = { .name = "block_map_reclaimed", .mode = 0444, }, + .show = poolStatsBlockMapReclaimedShow, +}; + +/**********************************************************************/ +/** number of gets for outgoing pages */ +static ssize_t poolStatsBlockMapReadOutgoingShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.readOutgoing); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapReadOutgoingAttr = { + .attr = { .name = "block_map_read_outgoing", .mode = 0444, }, + .show = poolStatsBlockMapReadOutgoingShow, +}; + +/**********************************************************************/ +/** number of gets that were already there */ +static ssize_t poolStatsBlockMapFoundInCacheShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", 
layer->vdoStatsStorage.blockMap.foundInCache); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFoundInCacheAttr = { + .attr = { .name = "block_map_found_in_cache", .mode = 0444, }, + .show = poolStatsBlockMapFoundInCacheShow, +}; + +/**********************************************************************/ +/** number of gets requiring discard */ +static ssize_t poolStatsBlockMapDiscardRequiredShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.discardRequired); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapDiscardRequiredAttr = { + .attr = { .name = "block_map_discard_required", .mode = 0444, }, + .show = poolStatsBlockMapDiscardRequiredShow, +}; + +/**********************************************************************/ +/** number of gets enqueued for their page */ +static ssize_t poolStatsBlockMapWaitForPageShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.waitForPage); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapWaitForPageAttr = { + .attr = { .name = "block_map_wait_for_page", .mode = 0444, }, + .show = poolStatsBlockMapWaitForPageShow, +}; + +/**********************************************************************/ +/** number of gets that have to fetch */ +static ssize_t poolStatsBlockMapFetchRequiredShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.fetchRequired); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFetchRequiredAttr = { + .attr = { .name = "block_map_fetch_required", .mode = 0444, }, + .show = poolStatsBlockMapFetchRequiredShow, +}; + +/**********************************************************************/ +/** number of page fetches */ +static ssize_t poolStatsBlockMapPagesLoadedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.pagesLoaded); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapPagesLoadedAttr = { + .attr = { .name = "block_map_pages_loaded", .mode = 0444, }, + .show = poolStatsBlockMapPagesLoadedShow, +}; + +/**********************************************************************/ +/** number of page saves */ +static ssize_t poolStatsBlockMapPagesSavedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.pagesSaved); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapPagesSavedAttr = { + .attr = { .name = "block_map_pages_saved", .mode = 0444, }, + .show = poolStatsBlockMapPagesSavedShow, +}; + +/**********************************************************************/ +/** the number of flushes issued */ +static ssize_t 
poolStatsBlockMapFlushCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.flushCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFlushCountAttr = { + .attr = { .name = "block_map_flush_count", .mode = 0444, }, + .show = poolStatsBlockMapFlushCountShow, +}; + +/**********************************************************************/ +/** Number of times the UDS advice proved correct */ +static ssize_t poolStatsHashLockDedupeAdviceValidShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.hashLock.dedupeAdviceValid); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsHashLockDedupeAdviceValidAttr = { + .attr = { .name = "hash_lock_dedupe_advice_valid", .mode = 0444, }, + .show = poolStatsHashLockDedupeAdviceValidShow, +}; + +/**********************************************************************/ +/** Number of times the UDS advice proved incorrect */ +static ssize_t poolStatsHashLockDedupeAdviceStaleShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.hashLock.dedupeAdviceStale); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsHashLockDedupeAdviceStaleAttr = { + .attr = { .name = "hash_lock_dedupe_advice_stale", .mode = 0444, }, + .show = poolStatsHashLockDedupeAdviceStaleShow, +}; + +/**********************************************************************/ +/** Number of writes with the same data as another in-flight write */ +static ssize_t poolStatsHashLockConcurrentDataMatchesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.hashLock.concurrentDataMatches); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsHashLockConcurrentDataMatchesAttr = { + .attr = { .name = "hash_lock_concurrent_data_matches", .mode = 0444, }, + .show = poolStatsHashLockConcurrentDataMatchesShow, +}; + +/**********************************************************************/ +/** Number of writes whose hash collided with an in-flight write */ +static ssize_t poolStatsHashLockConcurrentHashCollisionsShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.hashLock.concurrentHashCollisions); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsHashLockConcurrentHashCollisionsAttr = { + .attr = { .name = "hash_lock_concurrent_hash_collisions", .mode = 0444, }, + .show = poolStatsHashLockConcurrentHashCollisionsShow, +}; + +/**********************************************************************/ +/** number of times VDO got an invalid dedupe advice PBN from UDS */ +static ssize_t poolStatsErrorsInvalidAdvicePBNCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + 
getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.errors.invalidAdvicePBNCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsErrorsInvalidAdvicePBNCountAttr = { + .attr = { .name = "errors_invalid_advicePBNCount", .mode = 0444, }, + .show = poolStatsErrorsInvalidAdvicePBNCountShow, +}; + +/**********************************************************************/ +/** number of times a VIO completed with a VDO_NO_SPACE error */ +static ssize_t poolStatsErrorsNoSpaceErrorCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.errors.noSpaceErrorCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsErrorsNoSpaceErrorCountAttr = { + .attr = { .name = "errors_no_space_error_count", .mode = 0444, }, + .show = poolStatsErrorsNoSpaceErrorCountShow, +}; + +/**********************************************************************/ +/** number of times a VIO completed with a VDO_READ_ONLY error */ +static ssize_t poolStatsErrorsReadOnlyErrorCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.errors.readOnlyErrorCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsErrorsReadOnlyErrorCountAttr = { + .attr = { .name = "errors_read_only_error_count", .mode = 0444, }, + .show = poolStatsErrorsReadOnlyErrorCountShow, +}; + +/**********************************************************************/ +/** The VDO instance */ +static ssize_t poolStatsInstanceShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.instance); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsInstanceAttr = { + .attr = { .name = "instance", .mode = 0444, }, + .show = poolStatsInstanceShow, +}; + +/**********************************************************************/ +/** Current number of active VIOs */ +static ssize_t poolStatsCurrentVIOsInProgressShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.currentVIOsInProgress); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsCurrentVIOsInProgressAttr = { + .attr = { .name = "currentVIOs_in_progress", .mode = 0444, }, + .show = poolStatsCurrentVIOsInProgressShow, +}; + +/**********************************************************************/ +/** Maximum number of active VIOs */ +static ssize_t poolStatsMaxVIOsShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.maxVIOs); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsMaxVIOsAttr = { + .attr = { .name = "maxVIOs", .mode = 0444, }, + .show = poolStatsMaxVIOsShow, +}; + 
+/**********************************************************************/ +/** Number of times the UDS index was too slow in responding */ +static ssize_t poolStatsDedupeAdviceTimeoutsShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.dedupeAdviceTimeouts); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsDedupeAdviceTimeoutsAttr = { + .attr = { .name = "dedupe_advice_timeouts", .mode = 0444, }, + .show = poolStatsDedupeAdviceTimeoutsShow, +}; + +/**********************************************************************/ +/** Number of flush requests submitted to the storage device */ +static ssize_t poolStatsFlushOutShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.flushOut); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsFlushOutAttr = { + .attr = { .name = "flush_out", .mode = 0444, }, + .show = poolStatsFlushOutShow, +}; + +/**********************************************************************/ +/** Logical block size */ +static ssize_t poolStatsLogicalBlockSizeShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.logicalBlockSize); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsLogicalBlockSizeAttr = { + .attr = { .name = "logical_block_size", .mode = 0444, }, + .show = poolStatsLogicalBlockSizeShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosInReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInReadAttr = { + .attr = { .name = "bios_in_read", .mode = 0444, }, + .show = poolStatsBiosInReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosInWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInWriteAttr = { + .attr = { .name = "bios_in_write", .mode = 0444, }, + .show = poolStatsBiosInWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosInDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInDiscardAttr = { + .attr = { .name = "bios_in_discard", .mode = 0444, }, + .show = poolStatsBiosInDiscardShow, +}; + 
+/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosInFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInFlushAttr = { + .attr = { .name = "bios_in_flush", .mode = 0444, }, + .show = poolStatsBiosInFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosInFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInFuaAttr = { + .attr = { .name = "bios_in_fua", .mode = 0444, }, + .show = poolStatsBiosInFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosInPartialReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInPartialReadAttr = { + .attr = { .name = "bios_in_partial_read", .mode = 0444, }, + .show = poolStatsBiosInPartialReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosInPartialWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInPartialWriteAttr = { + .attr = { .name = "bios_in_partial_write", .mode = 0444, }, + .show = poolStatsBiosInPartialWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosInPartialDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInPartialDiscardAttr = { + .attr = { .name = "bios_in_partial_discard", .mode = 0444, }, + .show = poolStatsBiosInPartialDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosInPartialFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInPartialFlushAttr = { + .attr = { .name = "bios_in_partial_flush", .mode = 0444, }, + .show = 
poolStatsBiosInPartialFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosInPartialFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInPartialFuaAttr = { + .attr = { .name = "bios_in_partial_fua", .mode = 0444, }, + .show = poolStatsBiosInPartialFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosOutReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutReadAttr = { + .attr = { .name = "bios_out_read", .mode = 0444, }, + .show = poolStatsBiosOutReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosOutWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutWriteAttr = { + .attr = { .name = "bios_out_write", .mode = 0444, }, + .show = poolStatsBiosOutWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosOutDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutDiscardAttr = { + .attr = { .name = "bios_out_discard", .mode = 0444, }, + .show = poolStatsBiosOutDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosOutFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutFlushAttr = { + .attr = { .name = "bios_out_flush", .mode = 0444, }, + .show = poolStatsBiosOutFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosOutFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutFuaAttr = { + .attr = { .name = "bios_out_fua", .mode = 0444, }, + .show = poolStatsBiosOutFuaShow, +}; + 
+/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosMetaReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaReadAttr = { + .attr = { .name = "bios_meta_read", .mode = 0444, }, + .show = poolStatsBiosMetaReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosMetaWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaWriteAttr = { + .attr = { .name = "bios_meta_write", .mode = 0444, }, + .show = poolStatsBiosMetaWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosMetaDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaDiscardAttr = { + .attr = { .name = "bios_meta_discard", .mode = 0444, }, + .show = poolStatsBiosMetaDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosMetaFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaFlushAttr = { + .attr = { .name = "bios_meta_flush", .mode = 0444, }, + .show = poolStatsBiosMetaFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosMetaFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaFuaAttr = { + .attr = { .name = "bios_meta_fua", .mode = 0444, }, + .show = poolStatsBiosMetaFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosJournalReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalReadAttr = { + .attr = { .name = "bios_journal_read", .mode = 0444, }, + .show = poolStatsBiosJournalReadShow, +}; + 
+/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosJournalWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalWriteAttr = { + .attr = { .name = "bios_journal_write", .mode = 0444, }, + .show = poolStatsBiosJournalWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosJournalDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalDiscardAttr = { + .attr = { .name = "bios_journal_discard", .mode = 0444, }, + .show = poolStatsBiosJournalDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosJournalFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalFlushAttr = { + .attr = { .name = "bios_journal_flush", .mode = 0444, }, + .show = poolStatsBiosJournalFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosJournalFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalFuaAttr = { + .attr = { .name = "bios_journal_fua", .mode = 0444, }, + .show = poolStatsBiosJournalFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosPageCacheReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheReadAttr = { + .attr = { .name = "bios_page_cache_read", .mode = 0444, }, + .show = poolStatsBiosPageCacheReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosPageCacheWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheWriteAttr = { + .attr = { .name = "bios_page_cache_write", .mode = 0444, }, 
+ .show = poolStatsBiosPageCacheWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosPageCacheDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheDiscardAttr = { + .attr = { .name = "bios_page_cache_discard", .mode = 0444, }, + .show = poolStatsBiosPageCacheDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosPageCacheFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheFlushAttr = { + .attr = { .name = "bios_page_cache_flush", .mode = 0444, }, + .show = poolStatsBiosPageCacheFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosPageCacheFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheFuaAttr = { + .attr = { .name = "bios_page_cache_fua", .mode = 0444, }, + .show = poolStatsBiosPageCacheFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosOutCompletedReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutCompletedReadAttr = { + .attr = { .name = "bios_out_completed_read", .mode = 0444, }, + .show = poolStatsBiosOutCompletedReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosOutCompletedWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutCompletedWriteAttr = { + .attr = { .name = "bios_out_completed_write", .mode = 0444, }, + .show = poolStatsBiosOutCompletedWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosOutCompletedDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.discard); + mutex_unlock(&layer->statsMutex); + return 
retval; +} + +static PoolStatsAttribute poolStatsBiosOutCompletedDiscardAttr = { + .attr = { .name = "bios_out_completed_discard", .mode = 0444, }, + .show = poolStatsBiosOutCompletedDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosOutCompletedFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutCompletedFlushAttr = { + .attr = { .name = "bios_out_completed_flush", .mode = 0444, }, + .show = poolStatsBiosOutCompletedFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosOutCompletedFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutCompletedFuaAttr = { + .attr = { .name = "bios_out_completed_fua", .mode = 0444, }, + .show = poolStatsBiosOutCompletedFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosMetaCompletedReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaCompletedReadAttr = { + .attr = { .name = "bios_meta_completed_read", .mode = 0444, }, + .show = poolStatsBiosMetaCompletedReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosMetaCompletedWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaCompletedWriteAttr = { + .attr = { .name = "bios_meta_completed_write", .mode = 0444, }, + .show = poolStatsBiosMetaCompletedWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosMetaCompletedDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaCompletedDiscardAttr = { + .attr = { .name = "bios_meta_completed_discard", .mode = 0444, }, + .show = poolStatsBiosMetaCompletedDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosMetaCompletedFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + 
mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaCompletedFlushAttr = { + .attr = { .name = "bios_meta_completed_flush", .mode = 0444, }, + .show = poolStatsBiosMetaCompletedFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosMetaCompletedFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaCompletedFuaAttr = { + .attr = { .name = "bios_meta_completed_fua", .mode = 0444, }, + .show = poolStatsBiosMetaCompletedFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosJournalCompletedReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalCompletedReadAttr = { + .attr = { .name = "bios_journal_completed_read", .mode = 0444, }, + .show = poolStatsBiosJournalCompletedReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosJournalCompletedWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalCompletedWriteAttr = { + .attr = { .name = "bios_journal_completed_write", .mode = 0444, }, + .show = poolStatsBiosJournalCompletedWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosJournalCompletedDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalCompletedDiscardAttr = { + .attr = { .name = "bios_journal_completed_discard", .mode = 0444, }, + .show = poolStatsBiosJournalCompletedDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosJournalCompletedFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalCompletedFlushAttr = { + .attr = { .name = "bios_journal_completed_flush", .mode = 0444, 
}, + .show = poolStatsBiosJournalCompletedFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosJournalCompletedFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalCompletedFuaAttr = { + .attr = { .name = "bios_journal_completed_fua", .mode = 0444, }, + .show = poolStatsBiosJournalCompletedFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosPageCacheCompletedReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheCompletedReadAttr = { + .attr = { .name = "bios_page_cache_completed_read", .mode = 0444, }, + .show = poolStatsBiosPageCacheCompletedReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosPageCacheCompletedWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheCompletedWriteAttr = { + .attr = { .name = "bios_page_cache_completed_write", .mode = 0444, }, + .show = poolStatsBiosPageCacheCompletedWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosPageCacheCompletedDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheCompletedDiscardAttr = { + .attr = { .name = "bios_page_cache_completed_discard", .mode = 0444, }, + .show = poolStatsBiosPageCacheCompletedDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosPageCacheCompletedFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheCompletedFlushAttr = { + .attr = { .name = "bios_page_cache_completed_flush", .mode = 0444, }, + .show = poolStatsBiosPageCacheCompletedFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosPageCacheCompletedFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + 
mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheCompletedFuaAttr = { + .attr = { .name = "bios_page_cache_completed_fua", .mode = 0444, }, + .show = poolStatsBiosPageCacheCompletedFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosAcknowledgedReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedReadAttr = { + .attr = { .name = "bios_acknowledged_read", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosAcknowledgedWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedWriteAttr = { + .attr = { .name = "bios_acknowledged_write", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosAcknowledgedDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedDiscardAttr = { + .attr = { .name = "bios_acknowledged_discard", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosAcknowledgedFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedFlushAttr = { + .attr = { .name = "bios_acknowledged_flush", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosAcknowledgedFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedFuaAttr = { + .attr = { .name = "bios_acknowledged_fua", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedFuaShow, +}; + 
+/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosAcknowledgedPartialReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedPartialReadAttr = { + .attr = { .name = "bios_acknowledged_partial_read", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedPartialReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosAcknowledgedPartialWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedPartialWriteAttr = { + .attr = { .name = "bios_acknowledged_partial_write", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedPartialWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosAcknowledgedPartialDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedPartialDiscardAttr = { + .attr = { .name = "bios_acknowledged_partial_discard", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedPartialDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosAcknowledgedPartialFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedPartialFlushAttr = { + .attr = { .name = "bios_acknowledged_partial_flush", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedPartialFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosAcknowledgedPartialFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedPartialFuaAttr = { + .attr = { .name = "bios_acknowledged_partial_fua", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedPartialFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosInProgressReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + 
getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInProgressReadAttr = { + .attr = { .name = "bios_in_progress_read", .mode = 0444, }, + .show = poolStatsBiosInProgressReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosInProgressWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInProgressWriteAttr = { + .attr = { .name = "bios_in_progress_write", .mode = 0444, }, + .show = poolStatsBiosInProgressWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosInProgressDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInProgressDiscardAttr = { + .attr = { .name = "bios_in_progress_discard", .mode = 0444, }, + .show = poolStatsBiosInProgressDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosInProgressFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInProgressFlushAttr = { + .attr = { .name = "bios_in_progress_flush", .mode = 0444, }, + .show = poolStatsBiosInProgressFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosInProgressFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInProgressFuaAttr = { + .attr = { .name = "bios_in_progress_fua", .mode = 0444, }, + .show = poolStatsBiosInProgressFuaShow, +}; + +/**********************************************************************/ +/** Tracked bytes currently allocated. 
*/ +static ssize_t poolStatsMemoryUsageBytesUsedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.memoryUsage.bytesUsed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsMemoryUsageBytesUsedAttr = { + .attr = { .name = "memory_usage_bytes_used", .mode = 0444, }, + .show = poolStatsMemoryUsageBytesUsedShow, +}; + +/**********************************************************************/ +/** Maximum tracked bytes allocated. */ +static ssize_t poolStatsMemoryUsagePeakBytesUsedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.memoryUsage.peakBytesUsed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsMemoryUsagePeakBytesUsedAttr = { + .attr = { .name = "memory_usage_peak_bytes_used", .mode = 0444, }, + .show = poolStatsMemoryUsagePeakBytesUsedShow, +}; + +/**********************************************************************/ +/** Number of chunk names stored in the index */ +static ssize_t poolStatsIndexEntriesIndexedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.entriesIndexed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexEntriesIndexedAttr = { + .attr = { .name = "index_entries_indexed", .mode = 0444, }, + .show = poolStatsIndexEntriesIndexedShow, +}; + +/**********************************************************************/ +/** Number of post calls that found an existing entry */ +static ssize_t poolStatsIndexPostsFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.postsFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexPostsFoundAttr = { + .attr = { .name = "index_posts_found", .mode = 0444, }, + .show = poolStatsIndexPostsFoundShow, +}; + +/**********************************************************************/ +/** Number of post calls that added a new entry */ +static ssize_t poolStatsIndexPostsNotFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.postsNotFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexPostsNotFoundAttr = { + .attr = { .name = "index_posts_not_found", .mode = 0444, }, + .show = poolStatsIndexPostsNotFoundShow, +}; + +/**********************************************************************/ +/** Number of query calls that found an existing entry */ +static ssize_t poolStatsIndexQueriesFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.queriesFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexQueriesFoundAttr = { 
+ .attr = { .name = "index_queries_found", .mode = 0444, }, + .show = poolStatsIndexQueriesFoundShow, +}; + +/**********************************************************************/ +/** Number of query calls that added a new entry */ +static ssize_t poolStatsIndexQueriesNotFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.queriesNotFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexQueriesNotFoundAttr = { + .attr = { .name = "index_queries_not_found", .mode = 0444, }, + .show = poolStatsIndexQueriesNotFoundShow, +}; + +/**********************************************************************/ +/** Number of update calls that found an existing entry */ +static ssize_t poolStatsIndexUpdatesFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.updatesFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexUpdatesFoundAttr = { + .attr = { .name = "index_updates_found", .mode = 0444, }, + .show = poolStatsIndexUpdatesFoundShow, +}; + +/**********************************************************************/ +/** Number of update calls that added a new entry */ +static ssize_t poolStatsIndexUpdatesNotFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.updatesNotFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexUpdatesNotFoundAttr = { + .attr = { .name = "index_updates_not_found", .mode = 0444, }, + .show = poolStatsIndexUpdatesNotFoundShow, +}; + +/**********************************************************************/ +/** Current number of dedupe queries that are in flight */ +static ssize_t poolStatsIndexCurrDedupeQueriesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.index.currDedupeQueries); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexCurrDedupeQueriesAttr = { + .attr = { .name = "index_curr_dedupe_queries", .mode = 0444, }, + .show = poolStatsIndexCurrDedupeQueriesShow, +}; + +/**********************************************************************/ +/** Maximum number of dedupe queries that have been in flight */ +static ssize_t poolStatsIndexMaxDedupeQueriesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.index.maxDedupeQueries); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexMaxDedupeQueriesAttr = { + .attr = { .name = "index_max_dedupe_queries", .mode = 0444, }, + .show = poolStatsIndexMaxDedupeQueriesShow, +}; + +struct attribute *poolStatsAttrs[] = { + &poolStatsDataBlocksUsedAttr.attr, + &poolStatsOverheadBlocksUsedAttr.attr, + &poolStatsLogicalBlocksUsedAttr.attr, + &poolStatsPhysicalBlocksAttr.attr, + 
&poolStatsLogicalBlocksAttr.attr, + &poolStatsBlockMapCacheSizeAttr.attr, + &poolStatsWritePolicyAttr.attr, + &poolStatsBlockSizeAttr.attr, + &poolStatsCompleteRecoveriesAttr.attr, + &poolStatsReadOnlyRecoveriesAttr.attr, + &poolStatsModeAttr.attr, + &poolStatsInRecoveryModeAttr.attr, + &poolStatsRecoveryPercentageAttr.attr, + &poolStatsPackerCompressedFragmentsWrittenAttr.attr, + &poolStatsPackerCompressedBlocksWrittenAttr.attr, + &poolStatsPackerCompressedFragmentsInPackerAttr.attr, + &poolStatsAllocatorSlabCountAttr.attr, + &poolStatsAllocatorSlabsOpenedAttr.attr, + &poolStatsAllocatorSlabsReopenedAttr.attr, + &poolStatsJournalDiskFullAttr.attr, + &poolStatsJournalSlabJournalCommitsRequestedAttr.attr, + &poolStatsJournalEntriesStartedAttr.attr, + &poolStatsJournalEntriesWrittenAttr.attr, + &poolStatsJournalEntriesCommittedAttr.attr, + &poolStatsJournalBlocksStartedAttr.attr, + &poolStatsJournalBlocksWrittenAttr.attr, + &poolStatsJournalBlocksCommittedAttr.attr, + &poolStatsSlabJournalDiskFullCountAttr.attr, + &poolStatsSlabJournalFlushCountAttr.attr, + &poolStatsSlabJournalBlockedCountAttr.attr, + &poolStatsSlabJournalBlocksWrittenAttr.attr, + &poolStatsSlabJournalTailBusyCountAttr.attr, + &poolStatsSlabSummaryBlocksWrittenAttr.attr, + &poolStatsRefCountsBlocksWrittenAttr.attr, + &poolStatsBlockMapDirtyPagesAttr.attr, + &poolStatsBlockMapCleanPagesAttr.attr, + &poolStatsBlockMapFreePagesAttr.attr, + &poolStatsBlockMapFailedPagesAttr.attr, + &poolStatsBlockMapIncomingPagesAttr.attr, + &poolStatsBlockMapOutgoingPagesAttr.attr, + &poolStatsBlockMapCachePressureAttr.attr, + &poolStatsBlockMapReadCountAttr.attr, + &poolStatsBlockMapWriteCountAttr.attr, + &poolStatsBlockMapFailedReadsAttr.attr, + &poolStatsBlockMapFailedWritesAttr.attr, + &poolStatsBlockMapReclaimedAttr.attr, + &poolStatsBlockMapReadOutgoingAttr.attr, + &poolStatsBlockMapFoundInCacheAttr.attr, + &poolStatsBlockMapDiscardRequiredAttr.attr, + &poolStatsBlockMapWaitForPageAttr.attr, + &poolStatsBlockMapFetchRequiredAttr.attr, + &poolStatsBlockMapPagesLoadedAttr.attr, + &poolStatsBlockMapPagesSavedAttr.attr, + &poolStatsBlockMapFlushCountAttr.attr, + &poolStatsHashLockDedupeAdviceValidAttr.attr, + &poolStatsHashLockDedupeAdviceStaleAttr.attr, + &poolStatsHashLockConcurrentDataMatchesAttr.attr, + &poolStatsHashLockConcurrentHashCollisionsAttr.attr, + &poolStatsErrorsInvalidAdvicePBNCountAttr.attr, + &poolStatsErrorsNoSpaceErrorCountAttr.attr, + &poolStatsErrorsReadOnlyErrorCountAttr.attr, + &poolStatsInstanceAttr.attr, + &poolStatsCurrentVIOsInProgressAttr.attr, + &poolStatsMaxVIOsAttr.attr, + &poolStatsDedupeAdviceTimeoutsAttr.attr, + &poolStatsFlushOutAttr.attr, + &poolStatsLogicalBlockSizeAttr.attr, + &poolStatsBiosInReadAttr.attr, + &poolStatsBiosInWriteAttr.attr, + &poolStatsBiosInDiscardAttr.attr, + &poolStatsBiosInFlushAttr.attr, + &poolStatsBiosInFuaAttr.attr, + &poolStatsBiosInPartialReadAttr.attr, + &poolStatsBiosInPartialWriteAttr.attr, + &poolStatsBiosInPartialDiscardAttr.attr, + &poolStatsBiosInPartialFlushAttr.attr, + &poolStatsBiosInPartialFuaAttr.attr, + &poolStatsBiosOutReadAttr.attr, + &poolStatsBiosOutWriteAttr.attr, + &poolStatsBiosOutDiscardAttr.attr, + &poolStatsBiosOutFlushAttr.attr, + &poolStatsBiosOutFuaAttr.attr, + &poolStatsBiosMetaReadAttr.attr, + &poolStatsBiosMetaWriteAttr.attr, + &poolStatsBiosMetaDiscardAttr.attr, + &poolStatsBiosMetaFlushAttr.attr, + &poolStatsBiosMetaFuaAttr.attr, + &poolStatsBiosJournalReadAttr.attr, + &poolStatsBiosJournalWriteAttr.attr, + &poolStatsBiosJournalDiscardAttr.attr, 
+ &poolStatsBiosJournalFlushAttr.attr, + &poolStatsBiosJournalFuaAttr.attr, + &poolStatsBiosPageCacheReadAttr.attr, + &poolStatsBiosPageCacheWriteAttr.attr, + &poolStatsBiosPageCacheDiscardAttr.attr, + &poolStatsBiosPageCacheFlushAttr.attr, + &poolStatsBiosPageCacheFuaAttr.attr, + &poolStatsBiosOutCompletedReadAttr.attr, + &poolStatsBiosOutCompletedWriteAttr.attr, + &poolStatsBiosOutCompletedDiscardAttr.attr, + &poolStatsBiosOutCompletedFlushAttr.attr, + &poolStatsBiosOutCompletedFuaAttr.attr, + &poolStatsBiosMetaCompletedReadAttr.attr, + &poolStatsBiosMetaCompletedWriteAttr.attr, + &poolStatsBiosMetaCompletedDiscardAttr.attr, + &poolStatsBiosMetaCompletedFlushAttr.attr, + &poolStatsBiosMetaCompletedFuaAttr.attr, + &poolStatsBiosJournalCompletedReadAttr.attr, + &poolStatsBiosJournalCompletedWriteAttr.attr, + &poolStatsBiosJournalCompletedDiscardAttr.attr, + &poolStatsBiosJournalCompletedFlushAttr.attr, + &poolStatsBiosJournalCompletedFuaAttr.attr, + &poolStatsBiosPageCacheCompletedReadAttr.attr, + &poolStatsBiosPageCacheCompletedWriteAttr.attr, + &poolStatsBiosPageCacheCompletedDiscardAttr.attr, + &poolStatsBiosPageCacheCompletedFlushAttr.attr, + &poolStatsBiosPageCacheCompletedFuaAttr.attr, + &poolStatsBiosAcknowledgedReadAttr.attr, + &poolStatsBiosAcknowledgedWriteAttr.attr, + &poolStatsBiosAcknowledgedDiscardAttr.attr, + &poolStatsBiosAcknowledgedFlushAttr.attr, + &poolStatsBiosAcknowledgedFuaAttr.attr, + &poolStatsBiosAcknowledgedPartialReadAttr.attr, + &poolStatsBiosAcknowledgedPartialWriteAttr.attr, + &poolStatsBiosAcknowledgedPartialDiscardAttr.attr, + &poolStatsBiosAcknowledgedPartialFlushAttr.attr, + &poolStatsBiosAcknowledgedPartialFuaAttr.attr, + &poolStatsBiosInProgressReadAttr.attr, + &poolStatsBiosInProgressWriteAttr.attr, + &poolStatsBiosInProgressDiscardAttr.attr, + &poolStatsBiosInProgressFlushAttr.attr, + &poolStatsBiosInProgressFuaAttr.attr, + &poolStatsMemoryUsageBytesUsedAttr.attr, + &poolStatsMemoryUsagePeakBytesUsedAttr.attr, + &poolStatsIndexEntriesIndexedAttr.attr, + &poolStatsIndexPostsFoundAttr.attr, + &poolStatsIndexPostsNotFoundAttr.attr, + &poolStatsIndexQueriesFoundAttr.attr, + &poolStatsIndexQueriesNotFoundAttr.attr, + &poolStatsIndexUpdatesFoundAttr.attr, + &poolStatsIndexUpdatesNotFoundAttr.attr, + &poolStatsIndexCurrDedupeQueriesAttr.attr, + &poolStatsIndexMaxDedupeQueriesAttr.attr, + NULL, +}; diff --git a/vdo/kernel/statusCodeBlocks.h b/vdo/kernel/statusCodeBlocks.h new file mode 100644 index 0000000..bca19c5 --- /dev/null +++ b/vdo/kernel/statusCodeBlocks.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/statusCodeBlocks.h#1 $ + */ + +#ifndef STATUS_CODE_BLOCKS_H +#define STATUS_CODE_BLOCKS_H + +enum { + UDS_BLOCK_SIZE = UDS_ERROR_CODE_BLOCK_END - UDS_ERROR_CODE_BASE, + VDO_BLOCK_START = UDS_ERROR_CODE_BLOCK_END, + VDO_BLOCK_END = VDO_BLOCK_START + UDS_BLOCK_SIZE, + PRP_BLOCK_START = VDO_BLOCK_END, + PRP_BLOCK_END = PRP_BLOCK_START + UDS_BLOCK_SIZE, +}; + +#endif // STATUS_CODE_BLOCKS_H diff --git a/vdo/kernel/statusProcfs.c b/vdo/kernel/statusProcfs.c new file mode 100644 index 0000000..70e8c9b --- /dev/null +++ b/vdo/kernel/statusProcfs.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/statusProcfs.c#4 $ + * + * Proc filesystem interface to the old GET_DEDUPE_STATS and + * GET_KERNEL_STATS ioctls, which can no longer be supported in 4.4 + * and later kernels. These files return the same data as the old + * ioctls do, in order to require minimal changes to our (and + * customers') utilities and test code.
+ * + * +--+----- /proc/vdo procfsRoot + * | + * +-+----- vdo config->poolName + * | + * +------- dedupe_stats GET_DEDUPE_STATS ioctl + * +------- kernel_stats GET_KERNEL_STATS ioctl + * + */ +#include "statusProcfs.h" + +#include + +#include "memoryAlloc.h" + +#include "releaseVersions.h" +#include "statistics.h" +#include "vdo.h" + +#include "dedupeIndex.h" +#include "ioSubmitter.h" +#include "kernelStatistics.h" +#include "logger.h" +#include "memoryUsage.h" +#include "threadDevice.h" +#include "vdoCommon.h" + +static struct proc_dir_entry *procfsRoot = NULL; + +/**********************************************************************/ +static int statusDedupeShow(struct seq_file *m, void *v) +{ + KernelLayer *layer = (KernelLayer *) m->private; + VDOStatistics *stats; + size_t len = sizeof(VDOStatistics); + RegisteredThread allocatingThread, instanceThread; + registerAllocatingThread(&allocatingThread, NULL); + registerThreadDevice(&instanceThread, layer); + int result = ALLOCATE(1, VDOStatistics, __func__, &stats); + if (result == VDO_SUCCESS) { + getKVDOStatistics(&layer->kvdo, stats); + seq_write(m, stats, len); + FREE(stats); + } + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + return result; +} + +/**********************************************************************/ +static int statusDedupeOpen(struct inode *inode, struct file *file) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) + return single_open(file, statusDedupeShow, PDE_DATA(inode)); +#else + return single_open(file, statusDedupeShow, PDE(inode)->data); +#endif +} + +static const struct file_operations vdoProcfsDedupeOps = { + .open = statusDedupeOpen, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/**********************************************************************/ +static void copyBioStat(BioStats *b, const AtomicBioStats *a) +{ + b->read = atomic64_read(&a->read); + b->write = atomic64_read(&a->write); + b->discard = atomic64_read(&a->discard); + b->flush = atomic64_read(&a->flush); + b->fua = atomic64_read(&a->fua); +} + +/**********************************************************************/ +static BioStats subtractBioStats(BioStats minuend, BioStats subtrahend) +{ + return (BioStats) { + .read = minuend.read - subtrahend.read, + .write = minuend.write - subtrahend.write, + .discard = minuend.discard - subtrahend.discard, + .flush = minuend.flush - subtrahend.flush, + .fua = minuend.fua - subtrahend.fua, + }; +} + +/**********************************************************************/ +void getKernelStats(KernelLayer *layer, KernelStatistics *stats) +{ + stats->version = STATISTICS_VERSION; + stats->releaseVersion = CURRENT_RELEASE_VERSION_NUMBER; + stats->instance = layer->instance; + getLimiterValuesAtomically(&layer->requestLimiter, + &stats->currentVIOsInProgress, &stats->maxVIOs); + // albireoTimeoutReport gives the number of timeouts, and dedupeContextBusy + // gives the number of queries not made because of earlier timeouts. 
+ stats->dedupeAdviceTimeouts = (getEventCount(&layer->albireoTimeoutReporter) + + atomic64_read(&layer->dedupeContextBusy)); + stats->flushOut = atomic64_read(&layer->flushOut); + stats->logicalBlockSize = layer->deviceConfig->logicalBlockSize; + copyBioStat(&stats->biosIn, &layer->biosIn); + copyBioStat(&stats->biosInPartial, &layer->biosInPartial); + copyBioStat(&stats->biosOut, &layer->biosOut); + copyBioStat(&stats->biosMeta, &layer->biosMeta); + copyBioStat(&stats->biosJournal, &layer->biosJournal); + copyBioStat(&stats->biosPageCache, &layer->biosPageCache); + copyBioStat(&stats->biosOutCompleted, &layer->biosOutCompleted); + copyBioStat(&stats->biosMetaCompleted, &layer->biosMetaCompleted); + copyBioStat(&stats->biosJournalCompleted, &layer->biosJournalCompleted); + copyBioStat(&stats->biosPageCacheCompleted, + &layer->biosPageCacheCompleted); + copyBioStat(&stats->biosAcknowledged, &layer->biosAcknowledged); + copyBioStat(&stats->biosAcknowledgedPartial, + &layer->biosAcknowledgedPartial); + stats->biosInProgress = subtractBioStats(stats->biosIn, + stats->biosAcknowledged); + stats->memoryUsage = getMemoryUsage(); + getIndexStatistics(layer->dedupeIndex, &stats->index); +} + +/**********************************************************************/ +static int statusKernelShow(struct seq_file *m, void *v) +{ + KernelLayer *layer = (KernelLayer *) m->private; + KernelStatistics *stats; + size_t len = sizeof(KernelStatistics); + RegisteredThread allocatingThread, instanceThread; + registerAllocatingThread(&allocatingThread, NULL); + registerThreadDevice(&instanceThread, layer); + int result = ALLOCATE(1, KernelStatistics, __func__, &stats); + if (result == VDO_SUCCESS) { + getKernelStats(layer, stats); + seq_write(m, stats, len); + FREE(stats); + } + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + return result; +} + +/**********************************************************************/ +static int statusKernelOpen(struct inode *inode, struct file *file) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) + return single_open(file, statusKernelShow, PDE_DATA(inode)); +#else + return single_open(file, statusKernelShow, PDE(inode)->data); +#endif +} + +static const struct file_operations vdoProcfsKernelOps = { + .open = statusKernelOpen, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/**********************************************************************/ +int vdoInitProcfs() +{ + const char *procfsName = getProcRoot(); + procfsRoot = proc_mkdir(procfsName, NULL); + if (procfsRoot == NULL) { + logWarning("Could not create proc filesystem root %s\n", procfsName); + return -ENOMEM; + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +void vdoDestroyProcfs() +{ + remove_proc_entry(getProcRoot(), NULL); + procfsRoot = NULL; +} + +/**********************************************************************/ +int vdoCreateProcfsEntry(KernelLayer *layer, const char *name, void **private) +{ + int result = VDO_SUCCESS; + + if (procfsRoot != NULL) { + struct proc_dir_entry *fsDir; + fsDir = proc_mkdir(name, procfsRoot); + if (fsDir == NULL) { + result = -ENOMEM; + } else { + if (proc_create_data(getVDOStatisticsProcFile(), 0644, fsDir, + &vdoProcfsDedupeOps, layer) == NULL) { + result = -ENOMEM; + } else if (proc_create_data(getKernelStatisticsProcFile(), 0644, fsDir, + &vdoProcfsKernelOps, layer) == NULL) { + result = -ENOMEM; + } + } + if (result < 0) { + vdoDestroyProcfsEntry(name, fsDir); + } 
else { + *private = fsDir; + } + } else { + logWarning("No proc filesystem root set, skipping %s\n", name); + } + return result; +} + +/**********************************************************************/ +void vdoDestroyProcfsEntry(const char *name, void *private) +{ + if (procfsRoot != NULL) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) + remove_proc_subtree(name, procfsRoot); +#else + struct proc_dir_entry *fsDir = (struct proc_dir_entry *) private; + remove_proc_entry(getVDOStatisticsProcFile(), fsDir); + remove_proc_entry(getKernelStatisticsProcFile(), fsDir); + remove_proc_entry(name, procfsRoot); +#endif + } +} diff --git a/vdo/kernel/statusProcfs.h b/vdo/kernel/statusProcfs.h new file mode 100644 index 0000000..a884c8e --- /dev/null +++ b/vdo/kernel/statusProcfs.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/statusProcfs.h#1 $ + * + */ + +#ifndef STATUS_PROC_H +#define STATUS_PROC_H + +#include +#include +#include "kernelLayer.h" + +/** + * Initializes the /proc/vdo directory. Should be called once when the + * module is loaded. + * + * @return 0 on success, nonzero on failure + */ +int vdoInitProcfs(void); + +/** + * Destroys the /proc/vdo directory. Should be called once when the + * module is unloaded. + */ +void vdoDestroyProcfs(void); + +/** + * Creates a subdirectory in the /proc/vdo filesystem for a particular + * vdo. + * + * @param layer the kernel layer + * @param name the subdirectory name + * @param private pointer to private storage for procfs data + * + * @return 0 on success, nonzero on failure + */ +int vdoCreateProcfsEntry(KernelLayer *layer, const char *name, void **private); + +/** + * Destroys a subdirectory in the /proc/vdo filesystem for a + * particular vdo. + * + * @param name the subdirectory name + * @param private private storage for procfs data + */ +void vdoDestroyProcfsEntry(const char *name, void *private); + +/** + * Retrieves the current kernel statistics. + * + * @param layer the kernel layer + * @param stats pointer to the structure to fill in + */ +void getKernelStats(KernelLayer *layer, KernelStatistics *stats); + +#endif /* STATUS_PROC_H */ diff --git a/vdo/kernel/sysfs.c b/vdo/kernel/sysfs.c new file mode 100644 index 0000000..9244bf1 --- /dev/null +++ b/vdo/kernel/sysfs.c @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/sysfs.c#5 $ + */ + +#include "sysfs.h" + +#include +#include + +#include "dedupeIndex.h" +#include "dmvdo.h" +#include "logger.h" + +extern int defaultMaxRequestsActive; + +typedef struct vdoAttribute { + struct attribute attr; + ssize_t (*show)(struct kvdoDevice *d, struct attribute *attr, char *buf); + ssize_t (*store)(struct kvdoDevice *d, const char *value, size_t count); + // Location of value, if .show == showInt or showUInt or showBool. + void *valuePtr; +} VDOAttribute; + +static char *statusStrings[] = { + "UNINITIALIZED", + "READY", + "SHUTTING DOWN", +}; + +/**********************************************************************/ +static ssize_t vdoStatusShow(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", statusStrings[device->status]); +} + +/**********************************************************************/ +static ssize_t vdoLogLevelShow(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", priorityToString(getLogLevel())); +} + +/**********************************************************************/ +static ssize_t vdoLogLevelStore(struct kvdoDevice *device, + const char *buf, size_t n) +{ + static char internalBuf[11]; + + if (n > 10) { + return -EINVAL; + } + + memset(internalBuf, '\000', sizeof(internalBuf)); + memcpy(internalBuf, buf, n); + if (internalBuf[n - 1] == '\n') { + internalBuf[n - 1] = '\000'; + } + setLogLevel(stringToPriority(internalBuf)); + return n; +} + +/**********************************************************************/ +static ssize_t scanInt(const char *buf, + size_t n, + int *valuePtr, + int minimum, + int maximum) +{ + if (n > 12) { + return -EINVAL; + } + unsigned int value; + if (sscanf(buf, "%d", &value) != 1) { + return -EINVAL; + } + if (value < minimum) { + value = minimum; + } else if (value > maximum) { + value = maximum; + } + *valuePtr = value; + return n; +} + +/**********************************************************************/ +static ssize_t showInt(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); + + return sprintf(buf, "%d\n", *(int *)vdoAttr->valuePtr); +} + +/**********************************************************************/ +static ssize_t scanUInt(const char *buf, + size_t n, + unsigned int *valuePtr, + unsigned int minimum, + unsigned int maximum) +{ + if (n > 12) { + return -EINVAL; + } + unsigned int value; + if (sscanf(buf, "%u", &value) != 1) { + return -EINVAL; + } + if (value < minimum) { + value = minimum; + } else if (value > maximum) { + value = maximum; + } + *valuePtr = value; + return n; +} + +/**********************************************************************/ +static ssize_t showUInt(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); + + return sprintf(buf, "%u\n", *(unsigned int *)vdoAttr->valuePtr); 
+} + +/**********************************************************************/ +static ssize_t scanBool(const char *buf, size_t n, bool *valuePtr) +{ + unsigned int intValue = 0; + n = scanUInt(buf, n, &intValue, 0, 1); + if (n > 0) { + *valuePtr = (intValue != 0); + } + return n; +} + +/**********************************************************************/ +static ssize_t showBool(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); + + return sprintf(buf, "%u\n", *(bool *)vdoAttr->valuePtr ? 1 : 0); +} + +/**********************************************************************/ +static ssize_t vdoTraceRecordingStore(struct kvdoDevice *device, + const char *buf, + size_t n) +{ + return scanBool(buf, n, &traceRecording); +} + +/**********************************************************************/ +static ssize_t vdoMaxReqActiveStore(struct kvdoDevice *device, + const char *buf, + size_t n) +{ + /* + * The base code has some hardcoded assumptions about the maximum + * number of requests that can be in progress. Maybe someday we'll + * do calculations with the actual number; for now, just make sure + * the assumption holds. + */ + return scanInt(buf, n, &defaultMaxRequestsActive, 1, MAXIMUM_USER_VIOS); +} + +/**********************************************************************/ +static ssize_t vdoAlbireoTimeoutIntervalStore(struct kvdoDevice *device, + const char *buf, + size_t n) +{ + unsigned int value; + ssize_t result = scanUInt(buf, n, &value, 0, UINT_MAX); + if (result > 0) { + setAlbireoTimeoutInterval(value); + } + return result; +} + +/**********************************************************************/ +static ssize_t vdoMinAlbireoTimerIntervalStore(struct kvdoDevice *device, + const char *buf, + size_t n) +{ + unsigned int value; + ssize_t result = scanUInt(buf, n, &value, 0, UINT_MAX); + if (result > 0) { + setMinAlbireoTimerInterval(value); + } + return result; +} + +/**********************************************************************/ +static ssize_t vdoVersionShow(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", CURRENT_VERSION); +} + +/**********************************************************************/ +static ssize_t vdoAttrShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); + if (vdoAttr->show == NULL) { + return -EINVAL; + } + + struct kvdoDevice *device = container_of(kobj, struct kvdoDevice, kobj); + return (*vdoAttr->show)(device, attr, buf); +} + +/**********************************************************************/ +static ssize_t vdoAttrStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); + if (vdoAttr->store == NULL) { + return -EINVAL; + } + + struct kvdoDevice *device = container_of(kobj, struct kvdoDevice, kobj); + return (*vdoAttr->store)(device, buf, length); +} + +static VDOAttribute vdoStatusAttr = { + .attr = { .name = "status", .mode = 0444, }, + .show = vdoStatusShow, +}; + +static VDOAttribute vdoLogLevelAttr = { + .attr = {.name = "log_level", .mode = 0644, }, + .show = vdoLogLevelShow, + .store = vdoLogLevelStore, +}; + +static VDOAttribute vdoMaxReqActiveAttr = { + .attr = {.name = "max_requests_active", .mode = 0644, }, + .show = showInt, + .store = vdoMaxReqActiveStore, + .valuePtr = &defaultMaxRequestsActive, +}; + 
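These module-level attributes all combine a generic show helper (showInt, showUInt, showBool) with a store hook that parses and range-clamps the written value through scanInt/scanUInt/scanBool. A rough sketch (not part of the patch itself) of one more writable tunable follows; exampleRetryLimit, its 1..100 bounds, and the attribute name are invented for illustration, and a real attribute would also have to be listed in the defaultAttrs[] array defined just below.

/* Sketch only: exampleRetryLimit and "example_retry_limit" are hypothetical;
 * showUInt and scanUInt are the helpers defined earlier in this file. */
static unsigned int exampleRetryLimit = 4;

static ssize_t exampleRetryLimitStore(struct kvdoDevice *device,
                                      const char *buf,
                                      size_t n)
{
  // Parse an unsigned decimal value and clamp it to the 1..100 range.
  return scanUInt(buf, n, &exampleRetryLimit, 1, 100);
}

static VDOAttribute vdoExampleRetryLimitAttr = {
  .attr = {.name = "example_retry_limit", .mode = 0644, },
  .show = showUInt,
  .store = exampleRetryLimitStore,
  .valuePtr = &exampleRetryLimit,
};

Once listed in defaultAttrs[], such a tunable would appear as a writable file under the kobject registered by vdoInitSysfs() (added with a NULL parent and THIS_MODULE->name, so typically /sys/kvdo/ for this module).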
+static VDOAttribute vdoAlbireoTimeoutInterval = { + .attr = {.name = "deduplication_timeout_interval", .mode = 0644, }, + .show = showUInt, + .store = vdoAlbireoTimeoutIntervalStore, + .valuePtr = &albireoTimeoutInterval, +}; + +static VDOAttribute vdoMinAlbireoTimerInterval = { + .attr = {.name = "min_deduplication_timer_interval", .mode = 0644, }, + .show = showUInt, + .store = vdoMinAlbireoTimerIntervalStore, + .valuePtr = &minAlbireoTimerInterval, +}; + +static VDOAttribute vdoTraceRecording = { + .attr = {.name = "trace_recording", .mode = 0644, }, + .show = showBool, + .store = vdoTraceRecordingStore, + .valuePtr = &traceRecording, +}; + +static VDOAttribute vdoVersionAttr = { + .attr = { .name = "version", .mode = 0444, }, + .show = vdoVersionShow, +}; + +static struct attribute *defaultAttrs[] = { + &vdoStatusAttr.attr, + &vdoLogLevelAttr.attr, + &vdoMaxReqActiveAttr.attr, + &vdoAlbireoTimeoutInterval.attr, + &vdoMinAlbireoTimerInterval.attr, + &vdoTraceRecording.attr, + &vdoVersionAttr.attr, + NULL +}; + +static struct sysfs_ops vdoSysfsOps = { + .show = vdoAttrShow, + .store = vdoAttrStore, +}; + +/**********************************************************************/ +static void vdoRelease(struct kobject *kobj) +{ + return; +} + +struct kobj_type vdo_ktype = { + .release = vdoRelease, + .sysfs_ops = &vdoSysfsOps, + .default_attrs = defaultAttrs, +}; + +/**********************************************************************/ +int vdoInitSysfs(struct kobject *deviceObject) +{ + kobject_init(deviceObject, &vdo_ktype); + int result = kobject_add(deviceObject, NULL, THIS_MODULE->name); + if (result < 0) { + logError("kobject_add failed with status %d", -result); + kobject_put(deviceObject); + } + logDebug("added sysfs objects"); + return result; +}; + +/**********************************************************************/ +void vdoPutSysfs(struct kobject *deviceObject) +{ + kobject_put(deviceObject); +} diff --git a/vdo/kernel/sysfs.h b/vdo/kernel/sysfs.h new file mode 100644 index 0000000..3dbac04 --- /dev/null +++ b/vdo/kernel/sysfs.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/sysfs.h#2 $ + */ + +#ifndef ALBIREO_SYSFS_H +#define ALBIREO_SYSFS_H + +#include "kernelLayer.h" + +struct kvdoDevice; + +/** +* Initializes the sysfs objects global to all vdo devices. +* +* @param deviceObject the kobject of the kvdoDevice to initialize. +*/ +int vdoInitSysfs(struct kobject *deviceObject); + +/** + * Releases the global sysfs objects. + * + * @param deviceObject the kobject of the kvdoDevice to release. 
+ */ +void vdoPutSysfs(struct kobject *deviceObject); + +#endif /* ALBIREO_SYSFS_H */ diff --git a/vdo/kernel/threadDevice.c b/vdo/kernel/threadDevice.c new file mode 100644 index 0000000..49fb909 --- /dev/null +++ b/vdo/kernel/threadDevice.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threadDevice.c#1 $ + */ + +#include "threadDevice.h" + +#include "threadRegistry.h" + +/* + * A registry of all threads temporarily associated with particular + * VDO devices. + */ +static ThreadRegistry deviceIDThreadRegistry; + +/**********************************************************************/ +void registerThreadDeviceID(RegisteredThread *newThread, unsigned int *idPtr) +{ + registerThread(&deviceIDThreadRegistry, newThread, idPtr); +} + +/**********************************************************************/ +void unregisterThreadDeviceID(void) +{ + unregisterThread(&deviceIDThreadRegistry); +} + +/**********************************************************************/ +int getThreadDeviceID(void) +{ + const unsigned int *pointer = lookupThread(&deviceIDThreadRegistry); + return pointer ? *pointer : -1; +} + +/**********************************************************************/ +void initializeThreadDeviceRegistry(void) +{ + initializeThreadRegistry(&deviceIDThreadRegistry); +} diff --git a/vdo/kernel/threadDevice.h b/vdo/kernel/threadDevice.h new file mode 100644 index 0000000..61b4ce6 --- /dev/null +++ b/vdo/kernel/threadDevice.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threadDevice.h#1 $ + */ + +#include "kernelLayer.h" + +/** + * Temporarily register the current thread as being associated with a + * VDO device id number, for logging purposes. + * + * Any such registered thread must later be unregistered via + * unregisterThreadDeviceID. + * + * The pointed-to ID number should be nonzero. 
+ * + * @param newThread RegisteredThread structure to use for the current thread + * @param idPtr Location where the ID number is stored + **/ +void registerThreadDeviceID(RegisteredThread *newThread, unsigned int *idPtr); + +/** + * Temporarily register the current thread as being associated with an + * existing VDO device, for logging purposes. + * + * Any such registered thread must later be unregistered via + * unregisterThreadDeviceID. + * + * @param newThread RegisteredThread structure to use for the current thread + * @param layer The KernelLayer object for the VDO device + **/ +static inline void registerThreadDevice(RegisteredThread *newThread, + KernelLayer *layer) +{ + registerThreadDeviceID(newThread, &layer->instance); +} + +/** + * Cancel registration of the current thread as being associated with + * a VDO device or device ID number. + **/ +void unregisterThreadDeviceID(void); + +/** + * Get the VDO device ID number temporarily associated with the + * current thread, if any. + * + * @return the device ID number, if any, or -1 + **/ +int getThreadDeviceID(void); + +/** + * Initialize the thread device-ID registry. + **/ +void initializeThreadDeviceRegistry(void); diff --git a/vdo/kernel/threadRegistry.c b/vdo/kernel/threadRegistry.c new file mode 100644 index 0000000..6184d3c --- /dev/null +++ b/vdo/kernel/threadRegistry.c @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threadRegistry.c#1 $ + */ + +#include "threadRegistry.h" + +#include +#include + +#include "permassert.h" + +/* + * We need to be careful when using other facilities that may use + * threadRegistry functions in their normal operation. For example, + * we do not want to invoke the logger while holding a lock. + */ + +/*****************************************************************************/ +void registerThread(ThreadRegistry *registry, + RegisteredThread *newThread, + const void *pointer) +{ + INIT_LIST_HEAD(&newThread->links); + newThread->pointer = pointer; + newThread->task = current; + + bool foundIt = false; + RegisteredThread *thread; + write_lock(®istry->lock); + list_for_each_entry(thread, ®istry->links, links) { + if (thread->task == current) { + // This should not have been there. + // We'll complain after releasing the lock. 
+ list_del_init(&thread->links); + foundIt = true; + break; + } + } + list_add_tail(&newThread->links, ®istry->links); + write_unlock(®istry->lock); + ASSERT_LOG_ONLY(!foundIt, "new thread not already in registry"); +} + +/*****************************************************************************/ +void unregisterThread(ThreadRegistry *registry) +{ + bool foundIt = false; + RegisteredThread *thread; + write_lock(®istry->lock); + list_for_each_entry(thread, ®istry->links, links) { + if (thread->task == current) { + list_del_init(&thread->links); + foundIt = true; + break; + } + } + write_unlock(®istry->lock); + ASSERT_LOG_ONLY(foundIt, "thread found in registry"); +} + +/*****************************************************************************/ +void initializeThreadRegistry(ThreadRegistry *registry) +{ + INIT_LIST_HEAD(®istry->links); + rwlock_init(®istry->lock); +} + +/*****************************************************************************/ +const void *lookupThread(ThreadRegistry *registry) +{ + const void *result = NULL; + read_lock(®istry->lock); + RegisteredThread *thread; + list_for_each_entry(thread, ®istry->links, links) { + if (thread->task == current) { + result = thread->pointer; + break; + } + } + read_unlock(®istry->lock); + return result; +} diff --git a/vdo/kernel/threadRegistry.h b/vdo/kernel/threadRegistry.h new file mode 100644 index 0000000..f32325e --- /dev/null +++ b/vdo/kernel/threadRegistry.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threadRegistry.h#1 $ + */ + +#ifndef THREAD_REGISTRY_H +#define THREAD_REGISTRY_H 1 + +#include +#include + +/* + * We don't expect this set to ever get really large, so a linked list + * is adequate. + */ + +typedef struct threadRegistry { + struct list_head links; + rwlock_t lock; +} ThreadRegistry; + +typedef struct registeredThread { + struct list_head links; + const void *pointer; + struct task_struct *task; +} RegisteredThread; + +/*****************************************************************************/ + +/** + * Initialize a registry of threads and associated data pointers. + * + * @param registry The registry to initialize + **/ +void initializeThreadRegistry(ThreadRegistry *registry); + +/** + * Register the current thread and associate it with a data pointer. + * + * This call will log messages if the thread is already registered. + * + * @param registry The thread registry + * @param newThread RegisteredThread structure to use for the current thread + * @param pointer The value to associated with the current thread + **/ +void registerThread(ThreadRegistry *registry, + RegisteredThread *newThread, + const void *pointer); + +/** + * Remove the registration for the current thread. 
+ * + * A message may be logged if the thread was not registered. + * + * @param registry The thread registry + **/ +void unregisterThread(ThreadRegistry *registry); + +/** + * Fetch a pointer that may have been registered for the current + * thread. If the thread is not registered, a null pointer is + * returned. + * + * @param registry The thread registry + * + * @return the registered pointer, if any, or NULL + **/ +const void *lookupThread(ThreadRegistry *registry); + +#endif /* THREAD_REGISTRY_H */ diff --git a/vdo/kernel/threads.c b/vdo/kernel/threads.c new file mode 100644 index 0000000..2f905ed --- /dev/null +++ b/vdo/kernel/threads.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threads.c#1 $ + */ + +#include "threads.h" + +#include +#include + +/**********************************************************************/ +pid_t getThreadId(void) +{ + return in_interrupt() ? -1 : current->pid; +} diff --git a/vdo/kernel/threads.h b/vdo/kernel/threads.h new file mode 100644 index 0000000..25f8b47 --- /dev/null +++ b/vdo/kernel/threads.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threads.h#1 $ + */ + +#ifndef THREADS_H +#define THREADS_H + +#include + +/** + * Return the id of the current thread. + * In kernel interrupt context, returns -1. + * + * @return the thread id + **/ +pid_t getThreadId(void) + __attribute__((warn_unused_result)); + +#endif /* THREADS_H */ diff --git a/vdo/kernel/udsIndex.c b/vdo/kernel/udsIndex.c new file mode 100644 index 0000000..a202446 --- /dev/null +++ b/vdo/kernel/udsIndex.c @@ -0,0 +1,835 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/udsIndex.c#16 $ + */ + +#include "udsIndex.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "murmur/MurmurHash3.h" +#include "numeric.h" +#include "stringUtils.h" +#include "uds-block.h" + +/*****************************************************************************/ + +typedef struct udsAttribute { + struct attribute attr; + const char *(*showString)(DedupeIndex *); +} UDSAttribute; + +/*****************************************************************************/ + +enum { UDS_Q_ACTION }; + +/*****************************************************************************/ + +// These are the values in the atomic dedupeContext.requestState field +enum { + // The UdsRequest object is not in use. + UR_IDLE = 0, + // The UdsRequest object is in use, and VDO is waiting for the result. + UR_BUSY = 1, + // The UdsRequest object is in use, but has timed out. + UR_TIMED_OUT = 2, +}; + +/*****************************************************************************/ + +typedef enum { + // The UDS index is closed + IS_CLOSED = 0, + // The UDS index session is opening or closing + IS_CHANGING = 1, + // The UDS index is open. There is a UDS index session. + IS_OPENED = 2, +} IndexState; + +/*****************************************************************************/ + +typedef struct udsIndex { + DedupeIndex common; + struct kobject dedupeObject; + RegisteredThread allocatingThread; + char *indexName; + UdsConfiguration configuration; + struct uds_parameters udsParams; + struct uds_index_session *indexSession; + atomic_t active; + // This spinlock protects the state fields and the starting of dedupe + // requests.
+ spinlock_t stateLock; + KvdoWorkItem workItem; // protected by stateLock + KvdoWorkQueue *udsQueue; // protected by stateLock + unsigned int maximum; // protected by stateLock + IndexState indexState; // protected by stateLock + IndexState indexTarget; // protected by stateLock + bool changing; // protected by stateLock + bool createFlag; // protected by stateLock + bool dedupeFlag; // protected by stateLock + bool deduping; // protected by stateLock + bool errorFlag; // protected by stateLock + bool suspended; // protected by stateLock + // This spinlock protects the pending list, the pending flag in each KVIO, + // and the timeout list. + spinlock_t pendingLock; + struct list_head pendingHead; // protected by pendingLock + struct timer_list pendingTimer; // protected by pendingLock + bool startedTimer; // protected by pendingLock +} UDSIndex; + +/*****************************************************************************/ + +// Version 1: user space albireo index (limited to 32 bytes) +// Version 2: kernel space albireo index (limited to 16 bytes) +enum { + UDS_ADVICE_VERSION = 2, + // version byte + state byte + 64-bit little-endian PBN + UDS_ADVICE_SIZE = 1 + 1 + sizeof(uint64_t), +}; + +/*****************************************************************************/ + + // We want to ensure that there is only one copy of the following constants. +static const char *CLOSED = "closed"; +static const char *CLOSING = "closing"; +static const char *ERROR = "error"; +static const char *OFFLINE = "offline"; +static const char *ONLINE = "online"; +static const char *OPENING = "opening"; +static const char *SUSPENDED = "suspended"; +static const char *UNKNOWN = "unknown"; + +/*****************************************************************************/ +static const char *indexStateToString(UDSIndex *index, IndexState state) +{ + if (index->suspended) { + return SUSPENDED; + } + + switch (state) { + case IS_CLOSED: + // Closed. The errorFlag tells if it is because of an error. + return index->errorFlag ? ERROR : CLOSED; + case IS_CHANGING: + // The indexTarget tells if we are opening or closing the index. + return index->indexTarget == IS_OPENED ? OPENING : CLOSING; + case IS_OPENED: + // Opened. The dedupeFlag tells if we are online or offline. + return index->dedupeFlag ? ONLINE : OFFLINE; + default: + return UNKNOWN; + } +} + +/** + * Encode VDO duplicate advice into the newMetadata field of a UDS request. + * + * @param request The UDS request to receive the encoding + * @param advice The advice to encode + **/ +static void encodeUDSAdvice(UdsRequest *request, DataLocation advice) +{ + size_t offset = 0; + struct udsChunkData *encoding = &request->newMetadata; + encoding->data[offset++] = UDS_ADVICE_VERSION; + encoding->data[offset++] = advice.state; + encodeUInt64LE(encoding->data, &offset, advice.pbn); + BUG_ON(offset != UDS_ADVICE_SIZE); +} + +/** + * Decode VDO duplicate advice from the oldMetadata field of a UDS request. 
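+ *
+ * The encoded advice occupies UDS_ADVICE_SIZE (10) bytes: byte 0 is
+ * UDS_ADVICE_VERSION, byte 1 is the mapping state, and bytes 2-9 hold
+ * the physical block number in little-endian order, mirroring the
+ * layout produced by encodeUDSAdvice() above.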
+ * + * @param request The UDS request containing the encoding + * @param advice The DataLocation to receive the decoded advice + * + * @return true if valid advice was found and decoded + **/ +static bool decodeUDSAdvice(const UdsRequest *request, DataLocation *advice) +{ + if ((request->status != UDS_SUCCESS) || !request->found) { + return false; + } + + size_t offset = 0; + const struct udsChunkData *encoding = &request->oldMetadata; + byte version = encoding->data[offset++]; + if (version != UDS_ADVICE_VERSION) { + logError("invalid UDS advice version code %u", version); + return false; + } + + advice->state = encoding->data[offset++]; + decodeUInt64LE(encoding->data, &offset, &advice->pbn); + BUG_ON(offset != UDS_ADVICE_SIZE); + return true; +} + +/*****************************************************************************/ +static void finishIndexOperation(UdsRequest *udsRequest) +{ + DataKVIO *dataKVIO = container_of(udsRequest, DataKVIO, + dedupeContext.udsRequest); + DedupeContext *dedupeContext = &dataKVIO->dedupeContext; + if (compareAndSwap32(&dedupeContext->requestState, UR_BUSY, UR_IDLE)) { + KVIO *kvio = dataKVIOAsKVIO(dataKVIO); + UDSIndex *index = container_of(kvio->layer->dedupeIndex, UDSIndex, common); + + spin_lock_bh(&index->pendingLock); + if (dedupeContext->isPending) { + list_del(&dedupeContext->pendingList); + dedupeContext->isPending = false; + } + spin_unlock_bh(&index->pendingLock); + + dedupeContext->status = udsRequest->status; + if ((udsRequest->type == UDS_POST) || (udsRequest->type == UDS_QUERY)) { + DataLocation advice; + if (decodeUDSAdvice(udsRequest, &advice)) { + setDedupeAdvice(dedupeContext, &advice); + } else { + setDedupeAdvice(dedupeContext, NULL); + } + } + invokeDedupeCallback(dataKVIO); + atomic_dec(&index->active); + } else { + compareAndSwap32(&dedupeContext->requestState, UR_TIMED_OUT, UR_IDLE); + } +} + +/*****************************************************************************/ +static void startExpirationTimer(UDSIndex *index, DataKVIO *dataKVIO) +{ + if (!index->startedTimer) { + index->startedTimer = true; + mod_timer(&index->pendingTimer, + getAlbireoTimeout(dataKVIO->dedupeContext.submissionTime)); + } +} + +/*****************************************************************************/ +static void startIndexOperation(KvdoWorkItem *item) +{ + KVIO *kvio = workItemAsKVIO(item); + DataKVIO *dataKVIO = kvioAsDataKVIO(kvio); + UDSIndex *index = container_of(kvio->layer->dedupeIndex, UDSIndex, common); + DedupeContext *dedupeContext = &dataKVIO->dedupeContext; + + spin_lock_bh(&index->pendingLock); + list_add_tail(&dedupeContext->pendingList, &index->pendingHead); + dedupeContext->isPending = true; + startExpirationTimer(index, dataKVIO); + spin_unlock_bh(&index->pendingLock); + + UdsRequest *udsRequest = &dedupeContext->udsRequest; + int status = udsStartChunkOperation(udsRequest); + if (status != UDS_SUCCESS) { + udsRequest->status = status; + finishIndexOperation(udsRequest); + } +} + +/*****************************************************************************/ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) +static void timeoutIndexOperations(struct timer_list *t) +#else +static void timeoutIndexOperations(unsigned long arg) +#endif +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) + UDSIndex *index = from_timer(index, t, pendingTimer); +#else + UDSIndex *index = (UDSIndex *) arg; +#endif + LIST_HEAD(expiredHead); + uint64_t timeoutJiffies = msecs_to_jiffies(albireoTimeoutInterval); + unsigned long 
earliestSubmissionAllowed = jiffies - timeoutJiffies; + spin_lock_bh(&index->pendingLock); + index->startedTimer = false; + while (!list_empty(&index->pendingHead)) { + DataKVIO *dataKVIO = list_first_entry(&index->pendingHead, DataKVIO, + dedupeContext.pendingList); + DedupeContext *dedupeContext = &dataKVIO->dedupeContext; + if (earliestSubmissionAllowed <= dedupeContext->submissionTime) { + startExpirationTimer(index, dataKVIO); + break; + } + list_del(&dedupeContext->pendingList); + dedupeContext->isPending = false; + list_add_tail(&dedupeContext->pendingList, &expiredHead); + } + spin_unlock_bh(&index->pendingLock); + while (!list_empty(&expiredHead)) { + DataKVIO *dataKVIO = list_first_entry(&expiredHead, DataKVIO, + dedupeContext.pendingList); + DedupeContext *dedupeContext = &dataKVIO->dedupeContext; + list_del(&dedupeContext->pendingList); + if (compareAndSwap32(&dedupeContext->requestState, + UR_BUSY, UR_TIMED_OUT)) { + dedupeContext->status = ETIMEDOUT; + invokeDedupeCallback(dataKVIO); + atomic_dec(&index->active); + kvdoReportDedupeTimeout(dataKVIOAsKVIO(dataKVIO)->layer, 1); + } + } +} + +/*****************************************************************************/ +static void enqueueIndexOperation(DataKVIO *dataKVIO, + UdsCallbackType operation) +{ + KVIO *kvio = dataKVIOAsKVIO(dataKVIO); + DedupeContext *dedupeContext = &dataKVIO->dedupeContext; + UDSIndex *index = container_of(kvio->layer->dedupeIndex, UDSIndex, common); + dedupeContext->status = UDS_SUCCESS; + dedupeContext->submissionTime = jiffies; + if (compareAndSwap32(&dedupeContext->requestState, UR_IDLE, UR_BUSY)) { + UdsRequest *udsRequest = &dataKVIO->dedupeContext.udsRequest; + udsRequest->chunkName = *dedupeContext->chunkName; + udsRequest->callback = finishIndexOperation; + udsRequest->session = index->indexSession; + udsRequest->type = operation; + udsRequest->update = true; + if ((operation == UDS_POST) || (operation == UDS_UPDATE)) { + encodeUDSAdvice(udsRequest, getDedupeAdvice(dedupeContext)); + } + + setupWorkItem(&kvio->enqueueable.workItem, startIndexOperation, NULL, + UDS_Q_ACTION); + + spin_lock(&index->stateLock); + if (index->deduping) { + enqueueWorkQueue(index->udsQueue, &kvio->enqueueable.workItem); + unsigned int active = atomic_inc_return(&index->active); + if (active > index->maximum) { + index->maximum = active; + } + kvio = NULL; + } else { + atomicStore32(&dedupeContext->requestState, UR_IDLE); + } + spin_unlock(&index->stateLock); + } else { + // A previous user of the KVIO had a dedupe timeout + // and its request is still outstanding. + atomic64_inc(&kvio->layer->dedupeContextBusy); + } + if (kvio != NULL) { + invokeDedupeCallback(dataKVIO); + } +} + +/*****************************************************************************/ +static void closeIndex(UDSIndex *index) +{ + // Change the index state so that getIndexStatistics will not try to + // use the index session we are closing. + index->indexState = IS_CHANGING; + spin_unlock(&index->stateLock); + int result = udsCloseIndex(index->indexSession); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Error closing index %s", + index->indexName); + } + spin_lock(&index->stateLock); + index->indexState = IS_CLOSED; + index->errorFlag |= result != UDS_SUCCESS; + // ASSERTION: We leave in IS_CLOSED state. +} + +/*****************************************************************************/ +static void openIndex(UDSIndex *index) +{ + // ASSERTION: We enter in IS_CLOSED state. 
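+  //
+  // The stateLock is held on entry. It is dropped around the blocking
+  // udsOpenIndex() call below and re-acquired before the state fields
+  // are updated. If loading fails with UDS_NO_INDEX or
+  // UDS_CORRUPT_COMPONENT, createFlag is set so that the next pass of
+  // the changeDedupeState() loop creates a fresh index instead.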
+ bool createFlag = index->createFlag; + index->createFlag = false; + // Change the index state so that the it will be reported to the outside + // world as "opening". + index->indexState = IS_CHANGING; + index->errorFlag = false; + // Open the index session, while not holding the stateLock + spin_unlock(&index->stateLock); + + int result = udsOpenIndex(createFlag ? UDS_CREATE : UDS_LOAD, + index->indexName, &index->udsParams, + index->configuration, index->indexSession); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Error opening index %s", + index->indexName); + } + spin_lock(&index->stateLock); + if (!createFlag) { + switch (result) { + case UDS_CORRUPT_COMPONENT: + case UDS_NO_INDEX: + // Either there is no index, or there is no way we can recover the index. + // We will be called again and try to create a new index. + index->indexState = IS_CLOSED; + index->createFlag = true; + return; + default: + break; + } + } + if (result == UDS_SUCCESS) { + index->indexState = IS_OPENED; + } else { + index->indexState = IS_CLOSED; + index->indexTarget = IS_CLOSED; + index->errorFlag = true; + spin_unlock(&index->stateLock); + logInfo("Setting UDS index target state to error"); + spin_lock(&index->stateLock); + } + // ASSERTION: On success, we leave in IS_OPEN state. + // ASSERTION: On failure, we leave in IS_CLOSED state. +} + +/*****************************************************************************/ +static void changeDedupeState(KvdoWorkItem *item) +{ + UDSIndex *index = container_of(item, UDSIndex, workItem); + spin_lock(&index->stateLock); + // Loop until the index is in the target state and the create flag is + // clear. + while (!index->suspended && + ((index->indexState != index->indexTarget) || + index->createFlag)) { + if (index->indexState == IS_OPENED) { + closeIndex(index); + } else { + openIndex(index); + } + } + index->changing = false; + index->deduping = index->dedupeFlag && (index->indexState == IS_OPENED); + spin_unlock(&index->stateLock); +} + + +/*****************************************************************************/ +static void launchDedupeStateChange(UDSIndex *index) +{ + // ASSERTION: We enter with the state_lock held. + if (index->changing || index->suspended) { + // Either a change is already in progress, or changes are + // not allowed. + return; + } + + if (index->createFlag || + (index->indexState != index->indexTarget)) { + index->changing = true; + index->deduping = false; + setupWorkItem(&index->workItem, + changeDedupeState, + NULL, + UDS_Q_ACTION); + enqueueWorkQueue(index->udsQueue, &index->workItem); + return; + } + + // Online vs. offline changes happen immediately + index->deduping = (index->dedupeFlag && !index->suspended && + (index->indexState == IS_OPENED)); + + // ASSERTION: We exit with the state_lock held. 
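+  //
+  // Note that deduping can become true only when the index is open,
+  // the dedupe flag is set, and the index is not suspended. All
+  // open/close transitions are funneled through changeDedupeState()
+  // on the udsQueue, and the changing flag keeps more than one such
+  // work item from being queued at a time.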
+} + +/*****************************************************************************/ +static void setTargetState(UDSIndex *index, + IndexState target, + bool changeDedupe, + bool dedupe, + bool setCreate) +{ + spin_lock(&index->stateLock); + const char *oldState = indexStateToString(index, index->indexTarget); + if (changeDedupe) { + index->dedupeFlag = dedupe; + } + if (setCreate) { + index->createFlag = true; + } + index->indexTarget = target; + launchDedupeStateChange(index); + const char *newState = indexStateToString(index, index->indexTarget); + spin_unlock(&index->stateLock); + if (oldState != newState) { + logInfo("Setting UDS index target state to %s", newState); + } +} + +/*****************************************************************************/ +static void suspendUDSIndex(DedupeIndex *dedupeIndex, bool saveFlag) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + spin_lock(&index->stateLock); + index->suspended = true; + IndexState indexState = index->indexState; + spin_unlock(&index->stateLock); + if (indexState != IS_CLOSED) { + int result = udsSuspendIndexSession(index->indexSession, saveFlag); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Error suspending dedupe index"); + } + } +} + +/*****************************************************************************/ +static void resumeUDSIndex(DedupeIndex *dedupeIndex) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + int result = udsResumeIndexSession(index->indexSession); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Error resuming dedupe index"); + } + spin_lock(&index->stateLock); + index->suspended = false; + launchDedupeStateChange(index); + spin_unlock(&index->stateLock); +} + +/*****************************************************************************/ + +/*****************************************************************************/ +static void dumpUDSIndex(DedupeIndex *dedupeIndex, bool showQueue) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + spin_lock(&index->stateLock); + const char *state = indexStateToString(index, index->indexState); + const char *target = (index->changing + ? 
indexStateToString(index, index->indexTarget) + : NULL); + spin_unlock(&index->stateLock); + logInfo("UDS index: state: %s", state); + if (target != NULL) { + logInfo("UDS index: changing to state: %s", target); + } + if (showQueue) { + dumpWorkQueue(index->udsQueue); + } +} + +/*****************************************************************************/ +static void finishUDSIndex(DedupeIndex *dedupeIndex) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + setTargetState(index, IS_CLOSED, false, false, false); + udsDestroyIndexSession(index->indexSession); + finishWorkQueue(index->udsQueue); +} + +/*****************************************************************************/ +static void freeUDSIndex(DedupeIndex *dedupeIndex) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + freeWorkQueue(&index->udsQueue); + spin_lock_bh(&index->pendingLock); + if (index->startedTimer) { + del_timer_sync(&index->pendingTimer); + } + spin_unlock_bh(&index->pendingLock); + kobject_put(&index->dedupeObject); +} + +/*****************************************************************************/ +static const char *getUDSStateName(DedupeIndex *dedupeIndex) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + spin_lock(&index->stateLock); + const char *state = indexStateToString(index, index->indexState); + spin_unlock(&index->stateLock); + return state; +} + +/*****************************************************************************/ +static void getUDSStatistics(DedupeIndex *dedupeIndex, IndexStatistics *stats) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + spin_lock(&index->stateLock); + IndexState indexState = index->indexState; + stats->maxDedupeQueries = index->maximum; + spin_unlock(&index->stateLock); + stats->currDedupeQueries = atomic_read(&index->active); + if (indexState == IS_OPENED) { + UdsIndexStats indexStats; + int result = udsGetIndexStats(index->indexSession, &indexStats); + if (result == UDS_SUCCESS) { + stats->entriesIndexed = indexStats.entriesIndexed; + } else { + logErrorWithStringError(result, "Error reading index stats"); + } + UdsContextStats contextStats; + result = udsGetIndexSessionStats(index->indexSession, &contextStats); + if (result == UDS_SUCCESS) { + stats->postsFound = contextStats.postsFound; + stats->postsNotFound = contextStats.postsNotFound; + stats->queriesFound = contextStats.queriesFound; + stats->queriesNotFound = contextStats.queriesNotFound; + stats->updatesFound = contextStats.updatesFound; + stats->updatesNotFound = contextStats.updatesNotFound; + } else { + logErrorWithStringError(result, "Error reading context stats"); + } + } +} + + +/*****************************************************************************/ +static int processMessage(DedupeIndex *dedupeIndex, const char *name) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + if (strcasecmp(name, "index-close") == 0) { + setTargetState(index, IS_CLOSED, false, false, false); + return 0; + } else if (strcasecmp(name, "index-create") == 0) { + setTargetState(index, IS_OPENED, false, false, true); + return 0; + } else if (strcasecmp(name, "index-disable") == 0) { + setTargetState(index, IS_OPENED, true, false, false); + return 0; + } else if (strcasecmp(name, "index-enable") == 0) { + setTargetState(index, IS_OPENED, true, true, false); + return 0; + } + return -EINVAL; +} + +/*****************************************************************************/ +static void udsPost(DataKVIO *dataKVIO) +{ + 
enqueueIndexOperation(dataKVIO, UDS_POST); +} + +/*****************************************************************************/ +static void udsQuery(DataKVIO *dataKVIO) +{ + enqueueIndexOperation(dataKVIO, UDS_QUERY); +} + +/*****************************************************************************/ +static void startUDSIndex(DedupeIndex *dedupeIndex, bool createFlag) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + setTargetState(index, IS_OPENED, true, true, createFlag); +} + +/*****************************************************************************/ +static void stopUDSIndex(DedupeIndex *dedupeIndex) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + setTargetState(index, IS_CLOSED, false, false, false); +} + +/*****************************************************************************/ +static void udsUpdate(DataKVIO *dataKVIO) +{ + enqueueIndexOperation(dataKVIO, UDS_UPDATE); +} + +/*****************************************************************************/ +static void dedupeKobjRelease(struct kobject *kobj) +{ + UDSIndex *index = container_of(kobj, UDSIndex, dedupeObject); + udsFreeConfiguration(index->configuration); + FREE(index->indexName); + FREE(index); +} + +/*****************************************************************************/ +static ssize_t dedupeStatusShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + UDSAttribute *ua = container_of(attr, UDSAttribute, attr); + UDSIndex *index = container_of(kobj, UDSIndex, dedupeObject); + if (ua->showString != NULL) { + return sprintf(buf, "%s\n", ua->showString(&index->common)); + } else { + return -EINVAL; + } +} + +/*****************************************************************************/ +static ssize_t dedupeStatusStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + return -EINVAL; +} + +/*****************************************************************************/ + +static struct sysfs_ops dedupeSysfsOps = { + .show = dedupeStatusShow, + .store = dedupeStatusStore, +}; + +static UDSAttribute dedupeStatusAttribute = { + .attr = {.name = "status", .mode = 0444, }, + .showString = getUDSStateName, +}; + +static struct attribute *dedupeAttributes[] = { + &dedupeStatusAttribute.attr, + NULL, +}; + +static struct kobj_type dedupeKobjType = { + .release = dedupeKobjRelease, + .sysfs_ops = &dedupeSysfsOps, + .default_attrs = dedupeAttributes, +}; + +/*****************************************************************************/ +static void startUDSQueue(void *ptr) +{ + /* + * Allow the UDS dedupe worker thread to do memory allocations. It will + * only do allocations during the UDS calls that open or close an index, + * but those allocations can safely sleep while reserving a large amount + * of memory. We could use an allocationsAllowed boolean (like the base + * threads do), but it would be an unnecessary embellishment. 
+ */ + UDSIndex *index = ptr; + registerAllocatingThread(&index->allocatingThread, NULL); +} + +/*****************************************************************************/ +static void finishUDSQueue(void *ptr) +{ + unregisterAllocatingThread(); +} + +/*****************************************************************************/ +int makeUDSIndex(KernelLayer *layer, DedupeIndex **indexPtr) +{ + UDSIndex *index; + int result = ALLOCATE(1, UDSIndex, "UDS index data", &index); + if (result != UDS_SUCCESS) { + return result; + } + + result = allocSprintf("index name", &index->indexName, + "dev=%s offset=4096 size=%llu", + layer->deviceConfig->parentDeviceName, + getIndexRegionSize(layer->geometry) * VDO_BLOCK_SIZE); + if (result != UDS_SUCCESS) { + logError("Creating index name failed (%d)", result); + FREE(index); + return result; + } + + index->udsParams = (struct uds_parameters) UDS_PARAMETERS_INITIALIZER; + indexConfigToUdsParameters(&layer->geometry.indexConfig, &index->udsParams); + result = indexConfigToUdsConfiguration(&layer->geometry.indexConfig, + &index->configuration); + if (result != VDO_SUCCESS) { + FREE(index->indexName); + FREE(index); + return result; + } + udsConfigurationSetNonce(index->configuration, + (UdsNonce) layer->geometry.nonce); + + result = udsCreateIndexSession(&index->indexSession); + if (result != UDS_SUCCESS) { + udsFreeConfiguration(index->configuration); + FREE(index->indexName); + FREE(index); + return result; + } + + static const KvdoWorkQueueType udsQueueType = { + .start = startUDSQueue, + .finish = finishUDSQueue, + .actionTable = { + { .name = "uds_action", .code = UDS_Q_ACTION, .priority = 0 }, + }, + }; + result = makeWorkQueue(layer->threadNamePrefix, "dedupeQ", + &layer->wqDirectory, layer, index, &udsQueueType, 1, + &index->udsQueue); + if (result != VDO_SUCCESS) { + logError("UDS index queue initialization failed (%d)", result); + udsDestroyIndexSession(index->indexSession); + udsFreeConfiguration(index->configuration); + FREE(index->indexName); + FREE(index); + return result; + } + + kobject_init(&index->dedupeObject, &dedupeKobjType); + result = kobject_add(&index->dedupeObject, &layer->kobj, "dedupe"); + if (result != VDO_SUCCESS) { + freeWorkQueue(&index->udsQueue); + udsDestroyIndexSession(index->indexSession); + udsFreeConfiguration(index->configuration); + FREE(index->indexName); + FREE(index); + return result; + } + + index->common.dump = dumpUDSIndex; + index->common.free = freeUDSIndex; + index->common.getDedupeStateName = getUDSStateName; + index->common.getStatistics = getUDSStatistics; + index->common.message = processMessage; + index->common.post = udsPost; + index->common.query = udsQuery; + index->common.resume = resumeUDSIndex; + index->common.start = startUDSIndex; + index->common.stop = stopUDSIndex; + index->common.suspend = suspendUDSIndex; + index->common.finish = finishUDSIndex; + index->common.update = udsUpdate; + + INIT_LIST_HEAD(&index->pendingHead); + spin_lock_init(&index->pendingLock); + spin_lock_init(&index->stateLock); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) + timer_setup(&index->pendingTimer, timeoutIndexOperations, 0); +#else + setup_timer(&index->pendingTimer, timeoutIndexOperations, + (unsigned long) index); +#endif + + *indexPtr = &index->common; + return VDO_SUCCESS; +} diff --git a/vdo/kernel/udsIndex.h b/vdo/kernel/udsIndex.h new file mode 100644 index 0000000..19a7470 --- /dev/null +++ b/vdo/kernel/udsIndex.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/udsIndex.h#1 $ + */ + +#ifndef UDS_INDEX_H +#define UDS_INDEX_H + +#include "dedupeIndex.h" + +/** + * Make a UDS index + * + * @param layer the kernel layer + * @param indexPtr dedupe index returned here + * + * @return VDO_SUCCESS or an error code + **/ +int makeUDSIndex(KernelLayer *layer, DedupeIndex **indexPtr) + __attribute__ ((__warn_unused_result__)); + +#endif /* UDS_INDEX_H */ diff --git a/vdo/kernel/vdoCommon.h b/vdo/kernel/vdoCommon.h new file mode 100644 index 0000000..c83e066 --- /dev/null +++ b/vdo/kernel/vdoCommon.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/vdoCommon.h#1 $ + */ + +#ifndef VDO_COMMON_H +#define VDO_COMMON_H + +enum { + // Whether the bio acknowledgement queue is used for acks of reads. + USE_BIO_ACK_QUEUE_FOR_READ = 0, +}; + +#endif /* VDO_COMMON_H */ diff --git a/vdo/kernel/vdoStringUtils.c b/vdo/kernel/vdoStringUtils.c new file mode 100644 index 0000000..d12580c --- /dev/null +++ b/vdo/kernel/vdoStringUtils.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/vdoStringUtils.c#1 $ + */ + +#include "vdoStringUtils.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "stringUtils.h" + +#include "statusCodes.h" + +/**********************************************************************/ +char *vAppendToBuffer(char *buffer, + char *bufEnd, + const char *fmt, + va_list args) +{ + size_t n = vsnprintf(buffer, bufEnd - buffer, fmt, args); + if (n >= (size_t) (bufEnd - buffer)) { + buffer = bufEnd; + } else { + buffer += n; + } + return buffer; +} + +/**********************************************************************/ +char *appendToBuffer(char *buffer, char *bufEnd, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + char *pos = vAppendToBuffer(buffer, bufEnd, fmt, ap); + va_end(ap); + return pos; +} + +/**********************************************************************/ +void freeStringArray(char **stringArray) +{ + for (unsigned int offset = 0; stringArray[offset] != NULL; offset++) { + FREE(stringArray[offset]); + } + FREE(stringArray); +} + +/**********************************************************************/ +int splitString(const char *string, char separator, char ***substringArrayPtr) +{ + unsigned int substringCount = 1; + for (const char *s = string; *s != 0; s++) { + if (*s == separator) { + substringCount++; + } + } + + char **substrings; + int result = ALLOCATE(substringCount + 1, char *, "string-splitting array", + &substrings); + if (result != UDS_SUCCESS) { + return result; + } + unsigned int currentSubstring = 0; + for (const char *s = string; *s != 0; s++) { + if (*s == separator) { + ptrdiff_t length = s - string; + result = ALLOCATE(length + 1, char, "split string", + &substrings[currentSubstring]); + if (result != UDS_SUCCESS) { + freeStringArray(substrings); + return result; + } + // Trailing NUL is already in place after allocation; deal with + // the zero or more non-NUL bytes in the string. + if (length > 0) { + memcpy(substrings[currentSubstring], string, length); + } + string = s + 1; + currentSubstring++; + BUG_ON(currentSubstring >= substringCount); + } + } + // Process final string, with no trailing separator. 
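+  // (For example, splitting "a,,b" on ',' has, at this point, copied
+  // "a" and "" in the loop above; string now points at "b", and the
+  // final copy below yields the array { "a", "", "b", NULL }.)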
+ BUG_ON(currentSubstring != (substringCount - 1)); + ptrdiff_t length = strlen(string); + result = ALLOCATE(length + 1, char, "split string", + &substrings[currentSubstring]); + if (result != UDS_SUCCESS) { + freeStringArray(substrings); + return result; + } + memcpy(substrings[currentSubstring], string, length); + currentSubstring++; + // substrings[currentSubstring] is NULL already + *substringArrayPtr = substrings; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int joinStrings(char **substringArray, + size_t arrayLength, + char separator, + char **stringPtr) +{ + size_t stringLength = 0; + for (size_t i = 0; (i < arrayLength) && (substringArray[i] != NULL); i++) { + stringLength += strlen(substringArray[i]) + 1; + } + + char *output; + int result = ALLOCATE(stringLength, char, __func__, &output); + if (result != VDO_SUCCESS) { + return result; + } + + char *currentPosition = &output[0]; + for (size_t i = 0; (i < arrayLength) && (substringArray[i] != NULL); i++) { + currentPosition = appendToBuffer(currentPosition, output + stringLength, + "%s", substringArray[i]); + *currentPosition = separator; + currentPosition++; + } + + // We output one too many separators; replace the last with a zero byte. + if (currentPosition != output) { + *(currentPosition - 1) = '\0'; + } + + *stringPtr = output; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int stringToUInt(const char *input, unsigned int *valuePtr) +{ + unsigned long longValue; + int result = kstrtoul(input, 10, &longValue); + if (result != 0) { + return result; + } + + if (longValue > UINT_MAX) { + return -ERANGE; + } + + *valuePtr = longValue; + return UDS_SUCCESS; +} diff --git a/vdo/kernel/vdoStringUtils.h b/vdo/kernel/vdoStringUtils.h new file mode 100644 index 0000000..067ed9e --- /dev/null +++ b/vdo/kernel/vdoStringUtils.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/vdoStringUtils.h#1 $ + */ + +#ifndef VDO_STRING_UTILS_H +#define VDO_STRING_UTILS_H + +#include +#include + +/** + * Helper to append a string to a buffer. + * + * @param buffer the place at which to append the string + * @param bufEnd pointer to the end of the buffer + * @param fmt a printf format string + * + * @return the updated buffer position after the append + * + * if insufficient space is available, the contents are silently truncated + **/ +char *appendToBuffer(char *buffer, char *bufEnd, const char *fmt, ...); + +/** + * Variable-arglist helper to append a string to a buffer. + * If insufficient space is available, the contents are silently truncated. 
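+ *
+ * A typical pattern (illustrative only; buf, name, and count are
+ * arbitrary caller variables) chains calls through the returned
+ * position, usually via the appendToBuffer() wrapper above, so that
+ * truncation can never overrun the buffer:
+ *
+ *   char buf[64];
+ *   char *end = buf + sizeof(buf);
+ *   char *pos = buf;
+ *   pos = appendToBuffer(pos, end, "%s:", name);
+ *   pos = appendToBuffer(pos, end, " %u items", count);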
+ * + * @param buffer the place at which to append the string + * @param bufEnd pointer to the end of the buffer + * @param fmt a printf format string + * @param args printf arguments + * + * @return the updated buffer position after the append + **/ +char *vAppendToBuffer(char *buffer, + char *bufEnd, + const char *fmt, + va_list args); + +/** + * Split the input string into substrings, separated at occurrences of + * the indicated character, returning a null-terminated list of string + * pointers. + * + * The string pointers and the pointer array itself should both be + * freed with FREE() when no longer needed. This can be done with + * freeStringArray (below) if the pointers in the array are not + * changed. Since the array and copied strings are allocated by this + * function, it may only be used in contexts where allocation is + * permitted. + * + * Empty substrings are not ignored; that is, returned substrings may + * be empty strings if the separator occurs twice in a row. + * + * @param [in] string The input string to be broken apart + * @param [in] separator The separator character + * @param [out] substringArrayPtr The NULL-terminated substring array + * + * @return UDS_SUCCESS or -ENOMEM + **/ +int splitString(const char *string, char separator, char ***substringArrayPtr) + __attribute__((warn_unused_result)); + +/** + * Join the input substrings into one string, joined with the indicated + * character, returning a string. + * + * @param [in] substringArray The NULL-terminated substring array + * @param [in] arrayLength A bound on the number of valid elements + * in substringArray, in case it is not + * NULL-terminated. + * @param [in] separator The separator character + * @param [out] stringPtr A pointer to hold the joined string + * + * @return VDO_SUCCESS or an error + **/ +int joinStrings(char **substringArray, + size_t arrayLength, + char separator, + char **stringPtr) + __attribute__((warn_unused_result)); + +/** + * Free a list of non-NULL string pointers, and then the list itself. + * + * @param stringArray The string list + **/ +void freeStringArray(char **stringArray); + +/** + * Parse a string as an "unsigned int" value, yielding the value. + * On overflow, -ERANGE is returned. On invalid number, -EINVAL is + * returned. + * + * @param [in] input The string to be processed + * @param [out] valuePtr The value of the number read + * + * @return UDS_SUCCESS or -EINVAL or -ERANGE. + **/ +int stringToUInt(const char *input, unsigned int *valuePtr) + __attribute__((warn_unused_result)); + +#endif /* VDO_STRING_UTILS_H */ diff --git a/vdo/kernel/verify.c b/vdo/kernel/verify.c new file mode 100644 index 0000000..672ac91 --- /dev/null +++ b/vdo/kernel/verify.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/verify.c#3 $ + */ + +#include "verify.h" + +#include "logger.h" + +#include "dataKVIO.h" +#include "numeric.h" + +/** + * Compare blocks of memory for equality. + * + * This assumes the blocks are likely to be large; it's not well + * optimized for comparing just a few bytes. This is desirable + * because the Linux kernel memcmp() routine on x86 is not well + * optimized for large blocks, and the performance penalty turns out + * to be significant if you're doing lots of 4KB comparisons. + * + * @param pointerArgument1 first data block + * @param pointerArgument2 second data block + * @param length length of the data block + * + * @return true iff the two blocks are equal + **/ +__attribute__((warn_unused_result)) +static bool memoryEqual(void *pointerArgument1, + void *pointerArgument2, + size_t length) +{ + byte *pointer1 = pointerArgument1; + byte *pointer2 = pointerArgument2; + while (length >= sizeof(uint64_t)) { + /* + * GET_UNALIGNED is just for paranoia. (1) On x86_64 it is + * treated the same as an aligned access. (2) In this use case, + * one or both of the inputs will almost(?) always be aligned. + */ + if (GET_UNALIGNED(uint64_t, pointer1) + != GET_UNALIGNED(uint64_t, pointer2)) { + return false; + } + pointer1 += sizeof(uint64_t); + pointer2 += sizeof(uint64_t); + length -= sizeof(uint64_t); + } + while (length > 0) { + if (*pointer1 != *pointer2) { + return false; + } + pointer1++; + pointer2++; + length--; + } + return true; +} + +/** + * Verify the Albireo-provided deduplication advice, and invoke a + * callback once the answer is available. + * + * After we've compared the stored data with the data to be written, + * or after we've failed to be able to do so, the stored VIO callback + * is queued to be run in the main (kvdoReqQ) thread. + * + * If the advice turns out to be stale and the deduplication session + * is still active, submit a correction. (Currently the correction + * must be sent before the callback can be invoked, if the dedupe + * session is still live.) + * + * @param item The workitem from the queue + **/ +static void verifyDuplicationWork(KvdoWorkItem *item) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(item); + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION("$F;j=dedupe;cb=verify")); + + if (likely(memoryEqual(dataKVIO->dataBlock, dataKVIO->readBlock.data, + VDO_BLOCK_SIZE))) { + // Leave dataKVIO->dataVIO.isDuplicate set to true. + } else { + dataKVIO->dataVIO.isDuplicate = false; + } + + kvdoEnqueueDataVIOCallback(dataKVIO); +} + +/** + * Verify the Albireo-provided deduplication advice, and invoke a + * callback once the answer is available. + * + * @param dataKVIO The DataKVIO that we are looking to dedupe. 
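+ *
+ * If the read fails, the advice is simply dropped (isDuplicate is
+ * cleared) and the DataVIO continues as a non-duplicate rather than
+ * reporting an error.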
+ **/ +static void verifyReadBlockCallback(DataKVIO *dataKVIO) +{ + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + int err = dataKVIO->readBlock.status; + if (unlikely(err != 0)) { + logDebug("%s: err %d", __func__, err); + dataKVIO->dataVIO.isDuplicate = false; + kvdoEnqueueDataVIOCallback(dataKVIO); + return; + } + + launchDataKVIOOnCPUQueue(dataKVIO, verifyDuplicationWork, NULL, + CPU_Q_ACTION_COMPRESS_BLOCK); +} + +/**********************************************************************/ +void kvdoVerifyDuplication(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(dataVIO->isDuplicate, "advice to verify must be valid"); + ASSERT_LOG_ONLY(dataVIO->duplicate.state != MAPPING_STATE_UNMAPPED, + "advice to verify must not be a discard"); + ASSERT_LOG_ONLY(dataVIO->duplicate.pbn != ZERO_BLOCK, + "advice to verify must not point to the zero block"); + ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, + "zeroed block should not have advice to verify"); + + TraceLocation location + = THIS_LOCATION("verifyDuplication;dup=update(verify);io=verify"); + dataVIOAddTraceRecord(dataVIO, location); + kvdoReadBlock(dataVIO, dataVIO->duplicate.pbn, dataVIO->duplicate.state, + BIO_Q_ACTION_VERIFY, verifyReadBlockCallback); +} + +/**********************************************************************/ +bool kvdoCompareDataVIOs(DataVIO *first, DataVIO *second) +{ + dataVIOAddTraceRecord(second, THIS_LOCATION(NULL)); + DataKVIO *a = dataVIOAsDataKVIO(first); + DataKVIO *b = dataVIOAsDataKVIO(second); + return memoryEqual(a->dataBlock, b->dataBlock, VDO_BLOCK_SIZE); +} diff --git a/vdo/kernel/verify.h b/vdo/kernel/verify.h new file mode 100644 index 0000000..5b03dd7 --- /dev/null +++ b/vdo/kernel/verify.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/verify.h#1 $ + */ + +#include "kernelLayer.h" + +/** + * Verify the Albireo-provided deduplication advice, and invoke a callback once + * the answer is available. This is done through a call to kvdoReadBlock() + * which will eventually call back to verifyDuplication() once the block is + * read and possibly uncompressed. + * + * @param dataVIO The DataVIO with advice filled in. + **/ +void kvdoVerifyDuplication(DataVIO *dataVIO); + +/** + * Implements DataVIOComparator. + * + * @param first The first DataVIO to compare + * @param second The second DataVIO to compare + * + * @return true if the contents of the two DataVIOs are the same + **/ +bool kvdoCompareDataVIOs(DataVIO *first, DataVIO *second) + __attribute__((warn_unused_result)); diff --git a/vdo/kernel/workItemStats.c b/vdo/kernel/workItemStats.c new file mode 100644 index 0000000..2027cd8 --- /dev/null +++ b/vdo/kernel/workItemStats.c @@ -0,0 +1,357 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workItemStats.c#4 $ + */ + +#include "workItemStats.h" + +#include "atomic.h" +#include "logger.h" + +/** + * Scan the work queue stats table for the provided work function and + * priority value. If it's not found, see if an empty slot is + * available. + * + * @param table The work queue's function table + * @param work The function we want to record stats for + * @param priority The priority of the work item + * + * @return The index of the slot to use (matching or empty), or + * NUM_WORK_QUEUE_ITEM_STATS if the table is full of + * non-matching entries. + **/ +static inline unsigned int scanStatTable(const KvdoWorkFunctionTable *table, + KvdoWorkFunction work, + unsigned int priority) +{ + unsigned int i; + /* + * See comments in getStatTableIndex regarding order of memory + * accesses. Work function first, then a barrier, then priority. + */ + for (i = 0; i < NUM_WORK_QUEUE_ITEM_STATS; i++) { + if (table->functions[i] == NULL) { + return i; + } else if (table->functions[i] == work) { + smp_rmb(); + if (table->priorities[i] == priority) { + return i; + } + } + } + return NUM_WORK_QUEUE_ITEM_STATS; +} + +/** + * Scan the work queue stats table for the provided work function and + * priority value. Assign an empty slot if necessary. + * + * @param stats The stats structure + * @param work The function we want to record stats for + * @param priority The priority of the work item + * + * @return The index of the matching slot, or NUM_WORK_QUEUE_ITEM_STATS + * if the table is full of non-matching entries. + **/ +static unsigned int getStatTableIndex(KvdoWorkItemStats *stats, + KvdoWorkFunction work, + unsigned int priority) +{ + KvdoWorkFunctionTable *functionTable = &stats->functionTable; + + unsigned int index = scanStatTable(functionTable, work, priority); + if (unlikely(index == NUM_WORK_QUEUE_ITEM_STATS) + || likely(functionTable->functions[index] != NULL)) { + return index; + } + + unsigned long flags = 0; + // The delayed-work-item processing uses queue->lock in some cases, + // and one case may call into this function, so we can't reuse + // queue->lock here. + spin_lock_irqsave(&functionTable->lock, flags); + // Recheck now that we've got the lock... + index = scanStatTable(functionTable, work, priority); + if ((index == NUM_WORK_QUEUE_ITEM_STATS) + || (functionTable->functions[index] != NULL)) { + spin_unlock_irqrestore(&functionTable->lock, flags); + return index; + } + + /* + * An uninitialized priority is indistinguishable from a zero + * priority. So store the priority first, and enforce the ordering, + * so that a non-null work function pointer indicates we've finished + * filling in the value. 
(And, to make this work, we have to read + * the work function first and priority second, when comparing.) + */ + functionTable->priorities[index] = priority; + smp_wmb(); + functionTable->functions[index] = work; + spin_unlock_irqrestore(&functionTable->lock, flags); + return index; +} + +/** + * Get counters on work items, identified by index into the internal + * array. + * + * @param [in] stats The collected statistics + * @param [in] index The index + * @param [out] enqueuedPtr The total work items enqueued + * @param [out] processedPtr The number of work items processed + * @param [out] pendingPtr The number of work items still pending + **/ +static void getWorkItemCountsByItem(const KvdoWorkItemStats *stats, + unsigned int index, + uint64_t *enqueuedPtr, + uint64_t *processedPtr, + unsigned int *pendingPtr) +{ + uint64_t enqueued = atomic64_read(&stats->enqueued[index]); + uint64_t processed = stats->times[index].count; + unsigned int pending; + if (enqueued < processed) { + // Probably just out of sync. + pending = 1; + } else { + pending = enqueued - processed; + // Pedantic paranoia: Check for overflow of the 32-bit "pending". + if ((pending + processed) < enqueued) { + pending = UINT_MAX; + } + } + *enqueuedPtr = enqueued; + *processedPtr = processed; + *pendingPtr = pending; +} + +/** + * Get counters on work items not covered by any index value. + * + * @param [in] stats The collected statistics + * @param [out] enqueuedPtr The total work items enqueued + * @param [out] processedPtr The number of work items processed + **/ +static void getOtherWorkItemCounts(const KvdoWorkItemStats *stats, + uint64_t *enqueuedPtr, + uint64_t *processedPtr) +{ + unsigned int pending; + getWorkItemCountsByItem(stats, NUM_WORK_QUEUE_ITEM_STATS, + enqueuedPtr, processedPtr, &pending); +} + +/** + * Get timing stats on work items, identified by index into the + * internal array. + * + * @param [in] stats The collected statistics + * @param [in] index The index into the array + * @param [out] min The minimum execution time + * @param [out] mean The mean execution time + * @param [out] max The maximum execution time + **/ +static void getWorkItemTimesByItem(const KvdoWorkItemStats *stats, + unsigned int index, + uint64_t *min, + uint64_t *mean, + uint64_t *max) +{ + *min = stats->times[index].min; + *mean = getSampleAverage(&stats->times[index]); + *max = stats->times[index].max; +} + +/**********************************************************************/ +void updateWorkItemStatsForEnqueue(KvdoWorkItemStats *stats, + KvdoWorkItem *item, + int priority) +{ + item->statTableIndex = getStatTableIndex(stats, item->statsFunction, + priority); + atomic64_add(1, &stats->enqueued[item->statTableIndex]); +} + +/**********************************************************************/ +char *getFunctionName(void *pointer, char *buffer, size_t bufferLength) +{ + if (pointer == NULL) { + /* + * Format "%ps" logs a null pointer as "(null)" with a bunch of + * leading spaces. We sometimes use this when logging lots of + * data; don't be so verbose. + */ + strncpy(buffer, "-", bufferLength); + } else { + /* + * Use a non-const array instead of a string literal below to + * defeat gcc's format checking, which doesn't understand that + * "%ps" actually does support a precision spec in Linux kernel + * code. 
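+     * ("%ps" prints the symbol name for a kernel pointer; the "%.*ps"
+     * form used here bounds the printed name to bufferLength - 1
+     * characters.)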
+ */ + static char truncatedFunctionNameFormatString[] = "%.*ps"; + snprintf(buffer, bufferLength, + truncatedFunctionNameFormatString, + bufferLength - 1, + pointer); + + char *space = strchr(buffer, ' '); + if (space != NULL) { + *space = '\0'; + } + } + + return buffer; +} + +/**********************************************************************/ +size_t formatWorkItemStats(const KvdoWorkItemStats *stats, + char *buffer, + size_t length) +{ + const KvdoWorkFunctionTable *functionIDs = &stats->functionTable; + size_t currentOffset = 0; + + uint64_t enqueued, processed; + int i; + for (i = 0; i < NUM_WORK_QUEUE_ITEM_STATS; i++) { + if (functionIDs->functions[i] == NULL) { + break; + } + if (atomic64_read(&stats->enqueued[i]) == 0) { + continue; + } + /* + * The reporting of all of "pending", "enqueued" and "processed" + * here seems redundant, but "pending" is limited to 0 in the case + * where "processed" exceeds "enqueued", either through current + * activity and a lack of synchronization when fetching stats, or + * a coding bug. This report is intended largely for debugging, so + * we'll go ahead and print the not-necessarily-redundant values. + */ + unsigned int pending; + getWorkItemCountsByItem(stats, i, &enqueued, &processed, &pending); + + // Format: fn prio enq proc timeo [ min max mean ] + if (ENABLE_PER_FUNCTION_TIMING_STATS) { + uint64_t min, mean, max; + getWorkItemTimesByItem(stats, i, &min, &mean, &max); + currentOffset += snprintf(buffer + currentOffset, + length - currentOffset, + "%-36ps %d %10llu %10" PRIu64 + " %10llu %10llu %10" PRIu64 + "\n", + functionIDs->functions[i], + functionIDs->priorities[i], + enqueued, processed, + min, max, mean); + } else { + currentOffset += snprintf(buffer + currentOffset, + length - currentOffset, + "%-36ps %d %10llu %10" PRIu64 + "\n", + functionIDs->functions[i], + functionIDs->priorities[i], + enqueued, processed); + } + if (currentOffset >= length) { + break; + } + } + if ((i == NUM_WORK_QUEUE_ITEM_STATS) && (currentOffset < length)) { + uint64_t enqueued, processed; + getOtherWorkItemCounts(stats, &enqueued, &processed); + if (enqueued > 0) { + currentOffset += snprintf(buffer + currentOffset, + length - currentOffset, + "%-36s %d %10llu %10" PRIu64 + "\n", + "OTHER", 0, + enqueued, processed); + } + } + return currentOffset; +} + +/**********************************************************************/ +void logWorkItemStats(const KvdoWorkItemStats *stats) +{ + uint64_t totalEnqueued = 0; + uint64_t totalProcessed = 0; + + const KvdoWorkFunctionTable *functionIDs = &stats->functionTable; + + int i; + for (i = 0; i < NUM_WORK_QUEUE_ITEM_STATS; i++) { + if (functionIDs->functions[i] == NULL) { + break; + } + if (atomic64_read(&stats->enqueued[i]) == 0) { + continue; + } + /* + * The reporting of all of "pending", "enqueued" and "processed" + * here seems redundant, but "pending" is limited to 0 in the case + * where "processed" exceeds "enqueued", either through current + * activity and a lack of synchronization when fetching stats, or + * a coding bug. This report is intended largely for debugging, so + * we'll go ahead and print the not-necessarily-redundant values. 
+ */ + uint64_t enqueued, processed; + unsigned int pending; + getWorkItemCountsByItem(stats, i, &enqueued, &processed, &pending); + totalEnqueued += enqueued; + totalProcessed += processed; + + static char work[256]; // arbitrary size + getFunctionName(functionIDs->functions[i], work, sizeof(work)); + + if (ENABLE_PER_FUNCTION_TIMING_STATS) { + uint64_t min, mean, max; + getWorkItemTimesByItem(stats, i, &min, &mean, &max); + logInfo(" priority %d: %u pending" + " %llu enqueued %llu processed" + " %s" + " times %llu/%llu/%lluns", + functionIDs->priorities[i], + pending, enqueued, processed, work, + min, mean, max); + } else { + logInfo(" priority %d: %u pending" + " %llu enqueued %llu processed" + " %s", + functionIDs->priorities[i], + pending, enqueued, processed, work); + } + } + if (i == NUM_WORK_QUEUE_ITEM_STATS) { + uint64_t enqueued, processed; + getOtherWorkItemCounts(stats, &enqueued, &processed); + if (enqueued > 0) { + totalEnqueued += enqueued; + totalProcessed += processed; + logInfo(" ... others: %llu enqueued %llu processed", + enqueued, processed); + } + } + logInfo(" total: %llu enqueued %llu processed", + totalEnqueued, totalProcessed); +} diff --git a/vdo/kernel/workItemStats.h b/vdo/kernel/workItemStats.h new file mode 100644 index 0000000..0898f3b --- /dev/null +++ b/vdo/kernel/workItemStats.h @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workItemStats.h#2 $ + */ + +#ifndef WORK_ITEM_STATS_H +#define WORK_ITEM_STATS_H + +#include "timeUtils.h" + +#include "workQueue.h" + +enum { + // Whether to enable tracking of per-work-function run-time stats. + ENABLE_PER_FUNCTION_TIMING_STATS = 0, + // How many work function/priority pairs to track call stats for + NUM_WORK_QUEUE_ITEM_STATS = 18, +}; + +typedef struct simpleStats { + uint64_t count; + uint64_t sum; + uint64_t min; + uint64_t max; +} SimpleStats; + +/* + * We track numbers of work items handled (and optionally the + * wall-clock time to run the work functions), broken down by + * individual work functions (or alternate functions that the caller + * wants recorded, like the VIO completion callback function if we're + * just enqueueing a work function that invokes that indirectly) and + * priority. + * + * The first part of this structure manages the function/priority + * pairs, and is read frequently but updated rarely (once for each + * pair, plus possibly spin lock contention). + * + * The second part holds counters, and is updated often; different + * parts are updated by various threads as described below. The last + * element of each array, index NUM_WORK_QUEUE_ITEM_STATS, is updated + * only if we have filled the arrays and can't add the current work + * function/priority. 
See how the statTableIndex field is set in + * workItemStats.c. + * + * All fields may additionally be read when reporting statistics + * (including optionally reporting stats when the worker thread shuts + * down), but that's rare and shouldn't significantly affect cache + * contention issues. + * + * There is no "pending" count per work function here. For reporting + * statistics, it can be approximated by looking at the other fields. + * Do not rely on them being precise and synchronized, though. + */ +typedef struct kvdoWorkItemStatsFunctionTable { + /* + * The spin lock is used to protect .functions and .priorities + * during updates. All three are modified by producers (enqueueing + * threads) but only rarely. The .functions and .priorities arrays + * are read by producers very frequently. + */ + spinlock_t lock; + KvdoWorkFunction functions[NUM_WORK_QUEUE_ITEM_STATS]; + uint8_t priorities[NUM_WORK_QUEUE_ITEM_STATS]; +} KvdoWorkFunctionTable; + +typedef struct kvdoWorkItemStats { + /* + * Table of functions and priorities, for determining the index to + * use into the counter arrays below. + * + * This table is read by producers (usually multiple entries) for + * every work item enqueued, and when reporting stats. It is updated + * by producers, and only the first time a new (work-function, + * priority) combination is seen. + */ + KvdoWorkFunctionTable functionTable; + // Skip to (somewhere on) the next cache line + char pad[CACHE_LINE_BYTES - sizeof(atomic64_t)]; + /* + * The .enqueued field is updated by producers only, once per work + * item processed; __sync operations are used to update these + * values. + */ + atomic64_t enqueued[NUM_WORK_QUEUE_ITEM_STATS + 1]; + // Skip to (somewhere on) the next cache line + char pad2[CACHE_LINE_BYTES - sizeof(atomic64_t)]; + /* + * These values are updated only by the consumer (worker thread). We + * overload the .times[].count field as a count of items processed, + * so if we're not doing the optional processing-time tracking + * (controlled via an option in workQueue.c), we need to explicitly + * update the count. + * + * Since only one thread can ever update these values, no + * synchronization is used. + */ + SimpleStats times[NUM_WORK_QUEUE_ITEM_STATS + 1]; +} KvdoWorkItemStats; + +/** + * Initialize a statistics structure for tracking sample + * values. Assumes the storage was already zeroed out at allocation + * time. + * + * @param stats The statistics structure + **/ +static inline void initSimpleStats(SimpleStats *stats) +{ + // Assume other fields are initialized to zero at allocation. + stats->min = UINT64_MAX; +} + +/** + * Update the statistics being tracked for a new sample value. + * + * @param stats The statistics structure + * @param value The new value to be folded in + **/ +static inline void addSample(SimpleStats *stats, uint64_t value) +{ + stats->count++; + stats->sum += value; + if (stats->min > value) { + stats->min = value; + } + if (stats->max < value) { + stats->max = value; + } +} + +/** + * Return the average of the samples collected. + * + * @param stats The statistics structure + * + * @return The average sample value + **/ +static inline uint64_t getSampleAverage(const SimpleStats *stats) +{ + uint64_t slop = stats->count / 2; + return (stats->sum + slop) / stats->count; +} + +/** + * Update all work queue statistics (work-item and otherwise) after + * enqueueing a work item. 
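+ *
+ * As an illustrative sketch only (the real callers go through the
+ * workQueueStats wrappers in workQueue.c), the hooks in this header are
+ * meant to bracket a work item's life cycle roughly like this:
+ *
+ *   updateWorkItemStatsForEnqueue(stats, item, priority);  // producer side
+ *   ...
+ *   updateWorkItemStatsForDequeue(stats, item);            // worker thread
+ *   unsigned int index = item->statTableIndex;
+ *   uint64_t start = recordStartTime(index);
+ *   item->work(item);               // item may be freed by its work function
+ *   updateWorkItemStatsForWorkTime(stats, index, start);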
+ * + * @param stats The statistics structure + * @param item The work item enqueued + * @param priority The work item's priority + **/ +void updateWorkItemStatsForEnqueue(KvdoWorkItemStats *stats, + KvdoWorkItem *item, + int priority); + +/** + * Update all work queue statistics (work-item and otherwise) after enqueueing + * a work item. + * + * This is a very lightweight function (after optimizing away conditionals and + * no-ops) and is called for every work item processed, hence the inline + * definition. + * + * This function requires that recordStartTime and + * updateWorkItemStatsForWorkTime below both get called as well; in some cases + * counters may be updated in updateWorkItemStatsForWorkTime rather than here. + * + * @param stats The statistics structure + * @param item The work item enqueued + **/ +static inline void updateWorkItemStatsForDequeue(KvdoWorkItemStats *stats, + KvdoWorkItem *item) +{ + // The times[].count field is overloaded as a count of items + // processed. + if (!ENABLE_PER_FUNCTION_TIMING_STATS) { + stats->times[item->statTableIndex].count++; + } else { + // In this case, updateWorkItemStatsForWorkTime will bump the counter. + } +} + +/** + * Record the starting time for processing a work item, if timing + * stats are enabled and if we haven't run out of room for recording + * stats in the table. + * + * @param index The work item's index into the internal array + * + * @return The current time, or zero + **/ +static inline uint64_t recordStartTime(unsigned int index) +{ + return (ENABLE_PER_FUNCTION_TIMING_STATS ? currentTime(CLOCK_MONOTONIC) : 0); +} + +/** + * Update the work queue statistics with the wall-clock time for + * processing a work item, if timing stats are enabled and if we + * haven't run out of room for recording stats in the table. + * + * @param stats The statistics structure + * @param index The work item's index into the internal array + * @param startTime The start time as reported by recordStartTime + **/ +static inline void updateWorkItemStatsForWorkTime(KvdoWorkItemStats *stats, + unsigned int index, + uint64_t startTime) +{ + if (ENABLE_PER_FUNCTION_TIMING_STATS) { + uint64_t endTime = currentTime(CLOCK_MONOTONIC); + addSample(&stats->times[index], endTime - startTime); + } +} + +/** + * Convert the pointer into a string representation, using a function + * name if available. + * + * @param pointer The pointer to be converted + * @param buffer The output buffer + * @param bufferLength The size of the output buffer + **/ +char *getFunctionName(void *pointer, char *buffer, size_t bufferLength); + +/** + * Dump statistics broken down by work function and priority into the + * kernel log. + * + * @param stats The statistics structure + **/ +void logWorkItemStats(const KvdoWorkItemStats *stats); + +/** + * Format counters for per-work-function stats for reporting via /sys. + * + * @param [in] stats The statistics structure + * @param [out] buffer The output buffer + * @param [in] length The size of the output buffer + * + * @return The size of the string actually written + **/ +size_t formatWorkItemStats(const KvdoWorkItemStats *stats, + char *buffer, + size_t length); + +#endif // WORK_ITEM_STATS_H diff --git a/vdo/kernel/workQueue.c b/vdo/kernel/workQueue.c new file mode 100644 index 0000000..8be3285 --- /dev/null +++ b/vdo/kernel/workQueue.c @@ -0,0 +1,1152 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueue.c#11 $ + */ + +#include "workQueue.h" + +#include +#include +#include + +#include "atomic.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "stringUtils.h" + +#include "numeric.h" +#include "workItemStats.h" +#include "workQueueHandle.h" +#include "workQueueInternals.h" +#include "workQueueStats.h" +#include "workQueueSysfs.h" + +enum { + // Time between work queue heartbeats in usec. The default kernel + // configurations generally have 1ms or 4ms tick rates, so let's make this a + // multiple for accuracy. + FUNNEL_HEARTBEAT_INTERVAL = 4000, + + // Time to wait for a work queue to flush remaining items during shutdown. + // Specified in milliseconds. + FUNNEL_FINISH_SLEEP = 5000, +}; + +static struct mutex queueDataLock; +static SimpleWorkQueue queueData; + +static void freeSimpleWorkQueue(SimpleWorkQueue *queue); +static void finishSimpleWorkQueue(SimpleWorkQueue *queue); + +// work item lists (used for delayed work items) + +/**********************************************************************/ +static void initializeWorkItemList(KvdoWorkItemList *list) +{ + list->tail = NULL; +} + +/**********************************************************************/ +static void addToWorkItemList(KvdoWorkItemList *list, KvdoWorkItem *item) +{ + if (list->tail == NULL) { + item->next = item; + } else { + KvdoWorkItem *head = list->tail->next; + list->tail->next = item; + item->next = head; + } + list->tail = item; +} + +/**********************************************************************/ +static bool isWorkItemListEmpty(KvdoWorkItemList *list) +{ + return list->tail == NULL; +} + +/**********************************************************************/ +static KvdoWorkItem *workItemListPoll(KvdoWorkItemList *list) +{ + KvdoWorkItem *tail = list->tail; + if (tail == NULL) { + return NULL; + } + // Extract and return head of list. + KvdoWorkItem *head = tail->next; + // Only one entry? + if (head == tail) { + list->tail = NULL; + } else { + tail->next = head->next; + } + head->next = NULL; + return head; +} + +/**********************************************************************/ +static KvdoWorkItem *workItemListPeek(KvdoWorkItemList *list) +{ + KvdoWorkItem *tail = list->tail; + return tail ? tail->next : NULL; +} + +// Finding the SimpleWorkQueue to actually operate on. + +/** + * Pick the next subordinate service queue in rotation. + * + * This doesn't need to be 100% precise in distributing work items around, so + * playing loose with concurrent field modifications isn't going to hurt us. + * (Avoiding the atomic ops may help us a bit in performance, but we'll still + * have contention over the fields.) 
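+ *
+ * For example, with numServiceQueues == 3 a single producer sees the
+ * unsynchronized rotor select indices 0, 1, 2, 0, ...; concurrent producers
+ * may occasionally repeat or skip an index, which is acceptable here.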
+ * + * @param queue The round-robin-type work queue + * + * @return A subordinate work queue + **/ +static inline SimpleWorkQueue *nextServiceQueue(RoundRobinWorkQueue *queue) +{ + unsigned int index = (queue->serviceQueueRotor++ % queue->numServiceQueues); + return queue->serviceQueues[index]; +} + +/** + * Find a simple work queue on which to operate. + * + * If the argument is already a simple work queue, use it. If it's a + * round-robin work queue, pick the next subordinate service queue and use it. + * + * @param queue a work queue (round-robin or simple) + * + * @return a simple work queue + **/ +static inline SimpleWorkQueue *pickSimpleQueue(KvdoWorkQueue *queue) +{ + return (queue->roundRobinMode + ? nextServiceQueue(asRoundRobinWorkQueue(queue)) + : asSimpleWorkQueue(queue)); +} + +// Processing normal work items. + +/** + * Scan the work queue's work item lists, and dequeue and return the next + * waiting work item, if any. + * + * We scan the funnel queues from highest priority to lowest, once; there is + * therefore a race condition where a high-priority work item can be enqueued + * followed by a lower-priority one, and we'll grab the latter (but we'll catch + * the high-priority item on the next call). If strict enforcement of + * priorities becomes necessary, this function will need fixing. + * + * @param queue the work queue + * + * @return a work item pointer, or NULL + **/ +static KvdoWorkItem *pollForWorkItem(SimpleWorkQueue *queue) +{ + KvdoWorkItem *item = NULL; + for (int i = READ_ONCE(queue->numPriorityLists) - 1; i >= 0; i--) { + FunnelQueueEntry *link = funnelQueuePoll(queue->priorityLists[i]); + if (link != NULL) { + item = container_of(link, KvdoWorkItem, workQueueEntryLink); + break; + } + } + + return item; +} + +/** + * Add a work item into the queue, and inform the caller of any additional + * processing necessary. + * + * If the worker thread may not be awake, true is returned, and the caller + * should attempt a wakeup. + * + * @param queue The work queue + * @param item The work item to add + * + * @return true iff the caller should wake the worker thread + **/ +__attribute__((warn_unused_result)) +static bool enqueueWorkQueueItem(SimpleWorkQueue *queue, KvdoWorkItem *item) +{ + ASSERT_LOG_ONLY(item->myQueue == NULL, + "item %" PRIptr " (fn %" PRIptr "/%" PRIptr + ") to enqueue (%" PRIptr + ") is not already queued (%" PRIptr ")", + item, item->work, item->statsFunction, queue, + item->myQueue); + if (ASSERT(item->action < WORK_QUEUE_ACTION_COUNT, + "action is in range for queue") != VDO_SUCCESS) { + item->action = 0; + } + unsigned int priority = READ_ONCE(queue->priorityMap[item->action]); + + // Update statistics. + updateStatsForEnqueue(&queue->stats, item, priority); + + item->myQueue = &queue->common; + + // Funnel queue handles the synchronization for the put. + funnelQueuePut(queue->priorityLists[priority], &item->workQueueEntryLink); + + /* + * Due to how funnel-queue synchronization is handled (just atomic + * operations), the simplest safe implementation here would be to wake-up any + * waiting threads after enqueueing each item. Even if the funnel queue is + * not empty at the time of adding an item to the queue, the consumer thread + * may not see this since it is not guaranteed to have the same view of the + * queue as a producer thread. + * + * However, the above is wasteful so instead we attempt to minimize the + * number of thread wakeups. This is normally unsafe due to the above + * consumer-producer synchronization constraints. 
To correct this a timeout + * mechanism is used to wake the thread periodically to handle the occasional + * race condition that triggers and results in this thread not being woken + * properly. + * + * In most cases, the above timeout will not occur prior to some other work + * item being added after the queue is set to idle state, so thread wakeups + * will generally be triggered much faster than this interval. The timeout + * provides protection against the cases where more work items are either not + * added or are added too infrequently. + * + * This is also why we can get away with the normally-unsafe optimization for + * the common case by checking queue->idle first without synchronization. The + * race condition exists, but another work item getting enqueued can wake us + * up, and if we don't get that either, we still have the timeout to fall + * back on. + * + * Developed and tuned for some x86 boxes; untested whether this is any + * better or worse for other platforms, with or without the explicit memory + * barrier. + */ + smp_mb(); + return ((atomic_read(&queue->idle) == 1) + && (atomic_cmpxchg(&queue->idle, 1, 0) == 1)); +} + +/** + * Compute an approximate indication of the number of pending work items. + * + * No synchronization is used, so it's guaranteed to be correct only if there + * is no activity. + * + * @param queue The work queue to examine + * + * @return the estimate of the number of pending work items + **/ +static unsigned int getPendingCount(SimpleWorkQueue *queue) +{ + KvdoWorkItemStats *stats = &queue->stats.workItemStats; + long long pending = 0; + for (int i = 0; i < NUM_WORK_QUEUE_ITEM_STATS + 1; i++) { + pending += atomic64_read(&stats->enqueued[i]); + pending -= stats->times[i].count; + } + if (pending < 0) { + /* + * If we fetched numbers that were changing, we can get negative results. + * Just return an indication that there's some activity. + */ + pending = 1; + } + return pending; +} + +/** + * Run any start hook that may be defined for the work queue. + * + * @param queue The work queue + **/ +static void runStartHook(SimpleWorkQueue *queue) +{ + if (queue->type->start != NULL) { + queue->type->start(queue->private); + } +} + +/** + * Run any finish hook that may be defined for the work queue. + * + * @param queue The work queue + **/ +static void runFinishHook(SimpleWorkQueue *queue) +{ + if (queue->type->finish != NULL) { + queue->type->finish(queue->private); + } +} + +/** + * If the work queue has a suspend hook, invoke it, and when it finishes, check + * again for any pending work items. + * + * We assume a check for pending work items has just been done and turned up + * empty; so, if no suspend hook exists, we can just return NULL without doing + * another check. + * + * @param [in] queue The work queue preparing to suspend + * + * @return the newly found work item, if any + **/ +static KvdoWorkItem *runSuspendHook(SimpleWorkQueue *queue) +{ + if (queue->type->suspend == NULL) { + return NULL; + } + + queue->type->suspend(queue->private); + return pollForWorkItem(queue); +} + +/** + * Check whether a work queue has delayed work items pending. 
+ * + * @param queue The work queue + * + * @return true iff delayed work items are pending + **/ +static bool hasDelayedWorkItems(SimpleWorkQueue *queue) +{ + bool result; + unsigned long flags; + spin_lock_irqsave(&queue->lock, flags); + result = !isWorkItemListEmpty(&queue->delayedItems); + spin_unlock_irqrestore(&queue->lock, flags); + return result; +} + +/** + * Wait for the next work item to process, or until kthread_should_stop + * indicates that it's time for us to shut down. + * + * If kthread_should_stop says it's time to stop but we have pending work + * items, return a work item. + * + * Update statistics relating to scheduler interactions. + * + * @param [in] queue The work queue to wait on + * @param [in] timeoutInterval How long to wait each iteration + * + * @return the next work item, or NULL to indicate shutdown is requested + **/ +static KvdoWorkItem *waitForNextWorkItem(SimpleWorkQueue *queue, + TimeoutJiffies timeoutInterval) +{ + KvdoWorkItem *item = runSuspendHook(queue); + if (item != NULL) { + return item; + } + + DEFINE_WAIT(wait); + while (true) { + atomic64_set(&queue->firstWakeup, 0); + prepare_to_wait(&queue->waitingWorkerThreads, &wait, TASK_INTERRUPTIBLE); + /* + * Don't set the idle flag until a wakeup will not be lost. + * + * Force synchronization between setting the idle flag and checking the + * funnel queue; the producer side will do them in the reverse order. + * (There's still a race condition we've chosen to allow, because we've got + * a timeout below that unwedges us if we hit it, but this may narrow the + * window a little.) + */ + atomic_set(&queue->idle, 1); + memoryFence(); // store-load barrier between "idle" and funnel queue + + item = pollForWorkItem(queue); + if (item != NULL) { + break; + } + + /* + * We need to check for thread-stop after setting TASK_INTERRUPTIBLE state + * up above. Otherwise, schedule() will put the thread to sleep and might + * miss a wakeup from kthread_stop() call in finishWorkQueue(). + * + * If there are delayed work items, we need to wait for them to + * get run. Then, when we check kthread_should_stop again, we'll + * finally exit. + */ + if (kthread_should_stop() && !hasDelayedWorkItems(queue)) { + /* + * Recheck once again in case we *just* converted a delayed work item to + * a regular enqueued work item. + * + * It's important that processDelayedWorkItems holds the spin lock until + * it finishes enqueueing the work item to run. + * + * Funnel queues aren't synchronized between producers and consumer. + * Normally a producer interrupted mid-update can hide a later producer's + * entry until the first completes. This would be a problem, except that + * when kthread_stop is called, we should already have ceased adding new + * work items and have waited for all the regular work items to finish; + * (recurring) delayed work items should be the only exception. + * + * Worker thread shutdown would be simpler if even the delayed work items + * were required to be completed and not re-queued before shutting down a + * work queue. + */ + item = pollForWorkItem(queue); + break; + } + + /* + * We don't need to update the wait count atomically since this is the only + * place it is modified and there is only one thread involved. + */ + queue->stats.waits++; + uint64_t timeBeforeSchedule = currentTime(CLOCK_MONOTONIC); + atomic64_add(timeBeforeSchedule - queue->mostRecentWakeup, + &queue->stats.runTime); + // Wake up often, to address the missed-wakeup race. 
+ schedule_timeout(timeoutInterval); + queue->mostRecentWakeup = currentTime(CLOCK_MONOTONIC); + uint64_t callDurationNS = queue->mostRecentWakeup - timeBeforeSchedule; + enterHistogramSample(queue->stats.scheduleTimeHistogram, + callDurationNS / 1000); + + /* + * Check again before resetting firstWakeup for more accurate + * stats. (It's still racy, which can't be fixed without requiring + * tighter synchronization between producer and consumer sides.) + */ + item = pollForWorkItem(queue); + if (item != NULL) { + break; + } + } + + if (item != NULL) { + uint64_t firstWakeup = atomic64_read(&queue->firstWakeup); + /* + * We sometimes register negative wakeup latencies without this fencing. + * Whether it's forcing full serialization between the read of firstWakeup + * and the "rdtsc" that might be used depending on the clock source that + * helps, or some extra nanoseconds of delay covering for high-resolution + * clocks not being quite in sync between CPUs, is not yet clear. + */ + loadFence(); + if (firstWakeup != 0) { + enterHistogramSample(queue->stats.wakeupLatencyHistogram, + (currentTime(CLOCK_MONOTONIC) - firstWakeup) / 1000); + enterHistogramSample(queue->stats.wakeupQueueLengthHistogram, + getPendingCount(queue)); + } + } + finish_wait(&queue->waitingWorkerThreads, &wait); + atomic_set(&queue->idle, 0); + + return item; +} + +/** + * Get the next work item to process, possibly waiting for one, unless + * kthread_should_stop indicates that it's time for us to shut down. + * + * If kthread_should_stop says it's time to stop but we have pending work + * items, return a work item. + * + * @param [in] queue The work queue to wait on + * @param [in] timeoutInterval How long to wait each iteration + * + * @return the next work item, or NULL to indicate shutdown is requested + **/ +static KvdoWorkItem *getNextWorkItem(SimpleWorkQueue *queue, + TimeoutJiffies timeoutInterval) +{ + KvdoWorkItem *item = pollForWorkItem(queue); + if (item != NULL) { + return item; + } + return waitForNextWorkItem(queue, timeoutInterval); +} + +/** + * Execute a work item from a work queue, and do associated bookkeeping. + * + * @param [in] queue the work queue the item is from + * @param [in] item the work item to run + **/ +static void processWorkItem(SimpleWorkQueue *queue, + KvdoWorkItem *item) +{ + if (ASSERT(item->myQueue == &queue->common, + "item %" PRIptr " from queue %" PRIptr + " marked as being in this queue (%" PRIptr ")", + item, queue, item->myQueue) == UDS_SUCCESS) { + updateStatsForDequeue(&queue->stats, item); + item->myQueue = NULL; + } + + // Save the index, so we can use it after the work function. + unsigned int index = item->statTableIndex; + uint64_t workStartTime = recordStartTime(index); + item->work(item); + // We just surrendered control of the work item; no more access. + item = NULL; + updateWorkItemStatsForWorkTime(&queue->stats.workItemStats, index, + workStartTime); + + /* + * Be friendly to a CPU that has other work to do, if the kernel has told us + * to. This speeds up some performance tests; that "other work" might include + * other VDO threads. + * + * N.B.: We compute the pending count info here without any synchronization, + * but it's for stats reporting only, so being imprecise isn't too big a + * deal, as long as reads and writes are atomic operations. + */ + if (need_resched()) { + uint64_t timeBeforeReschedule = currentTime(CLOCK_MONOTONIC); + // Record the queue length we have *before* rescheduling. 
+ unsigned int queueLen = getPendingCount(queue); + cond_resched(); + uint64_t timeAfterReschedule = currentTime(CLOCK_MONOTONIC); + + enterHistogramSample(queue->stats.rescheduleQueueLengthHistogram, + queueLen); + uint64_t runTimeNS = timeBeforeReschedule - queue->mostRecentWakeup; + enterHistogramSample(queue->stats.runTimeBeforeRescheduleHistogram, + runTimeNS / 1000); + atomic64_add(runTimeNS, &queue->stats.runTime); + uint64_t callTimeNS = timeAfterReschedule - timeBeforeReschedule; + enterHistogramSample(queue->stats.rescheduleTimeHistogram, + callTimeNS / 1000); + atomic64_add(callTimeNS, &queue->stats.rescheduleTime); + queue->mostRecentWakeup = timeAfterReschedule; + } +} + +/** + * Main loop of the work queue worker thread. + * + * Waits for work items and runs them, until told to stop. + * + * @param queue The work queue to run + **/ +static void serviceWorkQueue(SimpleWorkQueue *queue) +{ + TimeoutJiffies timeoutInterval = + maxLong(2, usecs_to_jiffies(FUNNEL_HEARTBEAT_INTERVAL + 1) - 1); + + runStartHook(queue); + + while (true) { + KvdoWorkItem *item = getNextWorkItem(queue, timeoutInterval); + if (item == NULL) { + // No work items but kthread_should_stop was triggered. + break; + } + // Process the work item + processWorkItem(queue, item); + } + + runFinishHook(queue); +} + +/** + * Initialize per-thread data for a new worker thread and run the work queue. + * Called in a new thread created by kthread_run(). + * + * @param ptr A pointer to the KvdoWorkQueue to run. + * + * @return 0 (indicating success to kthread_run()) + **/ +static int workQueueRunner(void *ptr) +{ + SimpleWorkQueue *queue = ptr; + kobject_get(&queue->common.kobj); + + WorkQueueStackHandle queueHandle; + initializeWorkQueueStackHandle(&queueHandle, queue); + queue->stats.startTime = queue->mostRecentWakeup = currentTime(CLOCK_MONOTONIC); + unsigned long flags; + spin_lock_irqsave(&queue->lock, flags); + queue->started = true; + spin_unlock_irqrestore(&queue->lock, flags); + wake_up(&queue->startWaiters); + serviceWorkQueue(queue); + + // Zero out handle structure for safety. + memset(&queueHandle, 0, sizeof(queueHandle)); + + kobject_put(&queue->common.kobj); + return 0; +} + +// Preparing work items + +/**********************************************************************/ +void setupWorkItem(KvdoWorkItem *item, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + ASSERT_LOG_ONLY(item->myQueue == NULL, + "setupWorkItem not called on enqueued work item"); + item->work = work; + item->statsFunction = ((statsFunction == NULL) ? work : statsFunction); + item->statTableIndex = 0; + item->action = action; + item->myQueue = NULL; + item->executionTime = 0; + item->next = NULL; +} + +// Thread management + +/**********************************************************************/ +static inline void wakeWorkerThread(SimpleWorkQueue *queue) +{ + smp_mb(); + atomic64_cmpxchg(&queue->firstWakeup, 0, currentTime(CLOCK_MONOTONIC)); + // Despite the name, there's a maximum of one thread in this list. + wake_up(&queue->waitingWorkerThreads); +} + +// Delayed work items + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) +/** + * Timer function invoked when a delayed work item is ready to run. + * + * @param timer The timer which has just finished + **/ +static void processDelayedWorkItems(struct timer_list *timer) +#else +/** + * Timer function invoked when a delayed work item is ready to run. 
+ * + * @param data The queue pointer, as an unsigned long + **/ +static void processDelayedWorkItems(unsigned long data) +#endif +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) + SimpleWorkQueue *queue = from_timer(queue, timer, delayedItemsTimer); +#else + SimpleWorkQueue *queue = (SimpleWorkQueue *) data; +#endif + Jiffies nextExecutionTime = 0; + bool reschedule = false; + bool needsWakeup = false; + + unsigned long flags; + spin_lock_irqsave(&queue->lock, flags); + while (!isWorkItemListEmpty(&queue->delayedItems)) { + KvdoWorkItem *item = workItemListPeek(&queue->delayedItems); + if (item->executionTime > jiffies) { + nextExecutionTime = item->executionTime; + reschedule = true; + break; + } + workItemListPoll(&queue->delayedItems); + item->executionTime = 0; // not actually looked at... + item->myQueue = NULL; + needsWakeup |= enqueueWorkQueueItem(queue, item); + } + spin_unlock_irqrestore(&queue->lock, flags); + if (reschedule) { + mod_timer(&queue->delayedItemsTimer, nextExecutionTime); + } + if (needsWakeup) { + wakeWorkerThread(queue); + } +} + +// Creation & teardown + +/**********************************************************************/ +static bool queueStarted(SimpleWorkQueue *queue) +{ + unsigned long flags; + spin_lock_irqsave(&queue->lock, flags); + bool started = queue->started; + spin_unlock_irqrestore(&queue->lock, flags); + return started; +} + +/** + * Create a simple work queue with a worker thread. + * + * @param [in] threadNamePrefix The per-device prefix to use in thread names + * @param [in] name The queue name + * @param [in] parentKobject The parent sysfs node + * @param [in] owner The kernel layer owning the work queue + * @param [in] private Private data of the queue for use by work + * items or other queue-specific functions + * @param [in] type The work queue type defining the lifecycle + * functions, queue actions, priorities, and + * timeout behavior + * @param [out] queuePtr Where to store the queue handle + * + * @return VDO_SUCCESS or an error code + **/ +static int makeSimpleWorkQueue(const char *threadNamePrefix, + const char *name, + struct kobject *parentKobject, + KernelLayer *owner, + void *private, + const KvdoWorkQueueType *type, + SimpleWorkQueue **queuePtr) +{ + SimpleWorkQueue *queue; + int result = ALLOCATE(1, SimpleWorkQueue, "simple work queue", &queue); + if (result != UDS_SUCCESS) { + return result; + } + + queue->type = type; + queue->private = private; + queue->common.owner = owner; + + unsigned int numPriorityLists = 1; + for (int i = 0; i < WORK_QUEUE_ACTION_COUNT; i++) { + const KvdoWorkQueueAction *action = &queue->type->actionTable[i]; + if (action->name == NULL) { + break; + } + unsigned int code = action->code; + unsigned int priority = action->priority; + + result = ASSERT(code < WORK_QUEUE_ACTION_COUNT, + "invalid action code %u in work queue initialization", + code); + if (result != VDO_SUCCESS) { + FREE(queue); + return result; + } + result = ASSERT(priority < WORK_QUEUE_PRIORITY_COUNT, + "invalid action priority %u in work queue initialization", + priority); + if (result != VDO_SUCCESS) { + FREE(queue); + return result; + } + queue->priorityMap[code] = priority; + if (numPriorityLists <= priority) { + numPriorityLists = priority + 1; + } + } + + result = duplicateString(name, "queue name", &queue->common.name); + if (result != VDO_SUCCESS) { + FREE(queue); + return -ENOMEM; + } + + init_waitqueue_head(&queue->waitingWorkerThreads); + init_waitqueue_head(&queue->startWaiters); + spin_lock_init(&queue->lock); + + 
initializeWorkItemList(&queue->delayedItems); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) + timer_setup(&queue->delayedItemsTimer, processDelayedWorkItems, 0); +#else + setup_timer(&queue->delayedItemsTimer, processDelayedWorkItems, + (unsigned long) queue); +#endif + + kobject_init(&queue->common.kobj, &simpleWorkQueueKobjType); + result = kobject_add(&queue->common.kobj, parentKobject, queue->common.name); + if (result != 0) { + logError("Cannot add sysfs node: %d", result); + freeSimpleWorkQueue(queue); + return result; + } + queue->numPriorityLists = numPriorityLists; + for (int i = 0; i < WORK_QUEUE_PRIORITY_COUNT; i++) { + result = makeFunnelQueue(&queue->priorityLists[i]); + if (result != UDS_SUCCESS) { + freeSimpleWorkQueue(queue); + return result; + } + } + result = initializeWorkQueueStats(&queue->stats, &queue->common.kobj); + if (result != 0) { + logError("Cannot initialize statistics tracking: %d", result); + freeSimpleWorkQueue(queue); + return result; + } + + queue->started = false; + struct task_struct *thread = NULL; + thread = kthread_run(workQueueRunner, queue, "%s:%s", threadNamePrefix, + queue->common.name); + + if (IS_ERR(thread)) { + freeSimpleWorkQueue(queue); + return (int) PTR_ERR(thread); + } + queue->thread = thread; + atomic_set(&queue->threadID, thread->pid); + /* + * If we don't wait to ensure the thread is running VDO code, a + * quick kthread_stop (due to errors elsewhere) could cause it to + * never get as far as running VDO, skipping the cleanup code. + * + * Eventually we should just make that path safe too, and then we + * won't need this synchronization. + */ + wait_event(queue->startWaiters, queueStarted(queue) == true); + *queuePtr = queue; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int makeWorkQueue(const char *threadNamePrefix, + const char *name, + struct kobject *parentKobject, + KernelLayer *owner, + void *private, + const KvdoWorkQueueType *type, + unsigned int threadCount, + KvdoWorkQueue **queuePtr) +{ + if (threadCount == 1) { + SimpleWorkQueue *simpleQueue; + int result = makeSimpleWorkQueue(threadNamePrefix, name, parentKobject, + owner, private, type, &simpleQueue); + if (result == VDO_SUCCESS) { + *queuePtr = &simpleQueue->common; + } + return result; + } + + RoundRobinWorkQueue *queue; + int result = ALLOCATE(1, RoundRobinWorkQueue, "round-robin work queue", + &queue); + if (result != UDS_SUCCESS) { + return result; + } + + result = ALLOCATE(threadCount, SimpleWorkQueue *, "subordinate work queues", + &queue->serviceQueues); + if (result != UDS_SUCCESS) { + FREE(queue); + return result; + } + + queue->numServiceQueues = threadCount; + queue->common.roundRobinMode = true; + queue->common.owner = owner; + + result = duplicateString(name, "queue name", &queue->common.name); + if (result != VDO_SUCCESS) { + FREE(queue->serviceQueues); + FREE(queue); + return -ENOMEM; + } + + kobject_init(&queue->common.kobj, &roundRobinWorkQueueKobjType); + result = kobject_add(&queue->common.kobj, parentKobject, queue->common.name); + if (result != 0) { + logError("Cannot add sysfs node: %d", result); + finishWorkQueue(&queue->common); + kobject_put(&queue->common.kobj); + return result; + } + + *queuePtr = &queue->common; + + char threadName[TASK_COMM_LEN]; + for (unsigned int i = 0; i < threadCount; i++) { + snprintf(threadName, sizeof(threadName), "%s%u", name, i); + result = makeSimpleWorkQueue(threadNamePrefix, threadName, + &queue->common.kobj, owner, private, type, + 
&queue->serviceQueues[i]); + if (result != VDO_SUCCESS) { + queue->numServiceQueues = i; + // Destroy previously created subordinates. + finishWorkQueue(*queuePtr); + freeWorkQueue(queuePtr); + return result; + } + queue->serviceQueues[i]->parentQueue = *queuePtr; + } + + return VDO_SUCCESS; +} + +/** + * Shut down a simple work queue's worker thread. + * + * @param queue The work queue to shut down + **/ +static void finishSimpleWorkQueue(SimpleWorkQueue *queue) +{ + // Tell the worker thread to shut down. + if (queue->thread != NULL) { + atomic_set(&queue->threadID, 0); + // Waits for thread to exit. + kthread_stop(queue->thread); + } + + queue->thread = NULL; +} + +/** + * Shut down a round-robin work queue's service queues. + * + * @param queue The work queue to shut down + **/ +static void finishRoundRobinWorkQueue(RoundRobinWorkQueue *queue) +{ + SimpleWorkQueue **queueTable = queue->serviceQueues; + unsigned int count = queue->numServiceQueues; + + for (unsigned int i = 0; i < count; i++) { + finishSimpleWorkQueue(queueTable[i]); + } +} + +/**********************************************************************/ +void finishWorkQueue(KvdoWorkQueue *queue) +{ + if (queue->roundRobinMode) { + finishRoundRobinWorkQueue(asRoundRobinWorkQueue(queue)); + } else { + finishSimpleWorkQueue(asSimpleWorkQueue(queue)); + } +} + +/** + * Tear down a simple work queue, and decrement the kobject reference + * count on it. + * + * @param queue The work queue + **/ +static void freeSimpleWorkQueue(SimpleWorkQueue *queue) +{ + for (unsigned int i = 0; i < WORK_QUEUE_PRIORITY_COUNT; i++) { + freeFunnelQueue(queue->priorityLists[i]); + } + cleanupWorkQueueStats(&queue->stats); + kobject_put(&queue->common.kobj); +} + +/** + * Tear down a round-robin work queue and its service queues, and + * decrement the kobject reference count on it. + * + * @param queue The work queue + **/ +static void freeRoundRobinWorkQueue(RoundRobinWorkQueue *queue) +{ + SimpleWorkQueue **queueTable = queue->serviceQueues; + unsigned int count = queue->numServiceQueues; + + queue->serviceQueues = NULL; + for (unsigned int i = 0; i < count; i++) { + freeSimpleWorkQueue(queueTable[i]); + } + FREE(queueTable); + kobject_put(&queue->common.kobj); +} + +/**********************************************************************/ +void freeWorkQueue(KvdoWorkQueue **queuePtr) +{ + KvdoWorkQueue *queue = *queuePtr; + if (queue == NULL) { + return; + } + *queuePtr = NULL; + + finishWorkQueue(queue); + + if (queue->roundRobinMode) { + freeRoundRobinWorkQueue(asRoundRobinWorkQueue(queue)); + } else { + freeSimpleWorkQueue(asSimpleWorkQueue(queue)); + } +} + +// Debugging dumps + +/**********************************************************************/ +static void dumpSimpleWorkQueue(SimpleWorkQueue *queue) +{ + mutex_lock(&queueDataLock); + // Take a snapshot to reduce inconsistency in logged numbers. 
+ queueData = *queue; + const char *threadStatus; + + char taskStateReport = '-'; + if (queueData.thread != NULL) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,14,0) + taskStateReport = task_state_to_char(queue->thread); +#else + unsigned int taskState = queue->thread->state & TASK_REPORT; + taskState &= 0x1ff; + unsigned int taskStateIndex; + if (taskState != 0) { + taskStateIndex = __ffs(taskState)+1; + BUG_ON(taskStateIndex >= sizeof(TASK_STATE_TO_CHAR_STR)); + } else { + taskStateIndex = 0; + } + taskStateReport = TASK_STATE_TO_CHAR_STR[taskStateIndex]; +#endif + } + + if (queueData.thread == NULL) { + threadStatus = "no threads"; + } else if (atomic_read(&queueData.idle)) { + threadStatus = "idle"; + } else { + threadStatus = "running"; + } + logInfo("workQ %" PRIptr " (%s) %u entries %llu waits, %s (%c)", + &queue->common, + queueData.common.name, + getPendingCount(&queueData), + queueData.stats.waits, + threadStatus, + taskStateReport); + + logWorkItemStats(&queueData.stats.workItemStats); + logWorkQueueStats(queue); + + mutex_unlock(&queueDataLock); + + // ->lock spin lock status? + // ->waitingWorkerThreads wait queue status? anyone waiting? +} + +/**********************************************************************/ +void dumpWorkQueue(KvdoWorkQueue *queue) +{ + if (queue->roundRobinMode) { + RoundRobinWorkQueue *roundRobinQueue = asRoundRobinWorkQueue(queue); + for (unsigned int i = 0; i < roundRobinQueue->numServiceQueues; i++) { + dumpSimpleWorkQueue(roundRobinQueue->serviceQueues[i]); + } + } else { + dumpSimpleWorkQueue(asSimpleWorkQueue(queue)); + } +} + +/**********************************************************************/ +void dumpWorkItemToBuffer(KvdoWorkItem *item, char *buffer, size_t length) +{ + size_t currentLength + = snprintf(buffer, length, "%.*s/", TASK_COMM_LEN, + item->myQueue == NULL ? "-" : item->myQueue->name); + if (currentLength < length) { + getFunctionName(item->statsFunction, buffer + currentLength, + length - currentLength); + } +} + +// Work submission + +/**********************************************************************/ +void enqueueWorkQueue(KvdoWorkQueue *kvdoWorkQueue, KvdoWorkItem *item) +{ + SimpleWorkQueue *queue = pickSimpleQueue(kvdoWorkQueue); + + item->executionTime = 0; + + if (enqueueWorkQueueItem(queue, item)) { + wakeWorkerThread(queue); + } +} + +/**********************************************************************/ +void enqueueWorkQueueDelayed(KvdoWorkQueue *kvdoWorkQueue, + KvdoWorkItem *item, + Jiffies executionTime) +{ + if (executionTime <= jiffies) { + enqueueWorkQueue(kvdoWorkQueue, item); + return; + } + + SimpleWorkQueue *queue = pickSimpleQueue(kvdoWorkQueue); + bool rescheduleTimer = false; + unsigned long flags; + + item->executionTime = executionTime; + + // Lock if the work item is delayed. All delayed items are handled via a + // single linked list. + spin_lock_irqsave(&queue->lock, flags); + + if (isWorkItemListEmpty(&queue->delayedItems)) { + rescheduleTimer = true; + } + /* + * XXX We should keep the list sorted, but at the moment the list won't + * grow above a single entry anyway. 
+ */ + item->myQueue = &queue->common; + addToWorkItemList(&queue->delayedItems, item); + + spin_unlock_irqrestore(&queue->lock, flags); + + if (rescheduleTimer) { + mod_timer(&queue->delayedItemsTimer, executionTime); + } +} + +// Misc + + +/**********************************************************************/ +KvdoWorkQueue *getCurrentWorkQueue(void) +{ + SimpleWorkQueue *queue = getCurrentThreadWorkQueue(); + return (queue == NULL) ? NULL : &queue->common; +} + +/**********************************************************************/ +KernelLayer *getWorkQueueOwner(KvdoWorkQueue *queue) +{ + return queue->owner; +} + +/**********************************************************************/ +void *getWorkQueuePrivateData(void) +{ + SimpleWorkQueue *queue = getCurrentThreadWorkQueue(); + return (queue != NULL) ? queue->private : NULL; +} + +/**********************************************************************/ +void setWorkQueuePrivateData(void *newData) +{ + SimpleWorkQueue *queue = getCurrentThreadWorkQueue(); + BUG_ON(queue == NULL); + queue->private = newData; +} + +/**********************************************************************/ +void initWorkQueueOnce(void) +{ + // We can't use DEFINE_MUTEX because it's not compatible with c99 mode. + mutex_init(&queueDataLock); + initWorkQueueStackHandleOnce(); +} diff --git a/vdo/kernel/workQueue.h b/vdo/kernel/workQueue.h new file mode 100644 index 0000000..4043295 --- /dev/null +++ b/vdo/kernel/workQueue.h @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueue.h#2 $ + */ + +#ifndef ALBIREO_WORK_QUEUE_H +#define ALBIREO_WORK_QUEUE_H + +#include +#include /* for TASK_COMM_LEN */ + +#include "kernelTypes.h" +#include "util/funnelQueue.h" + +enum { + MAX_QUEUE_NAME_LEN = TASK_COMM_LEN, + /** Maximum number of action definitions per work queue type */ + WORK_QUEUE_ACTION_COUNT = 8, + /** Number of priority values available */ + WORK_QUEUE_PRIORITY_COUNT = 4, +}; + +struct kvdoWorkItem { + /** Entry link for lock-free work queue */ + FunnelQueueEntry workQueueEntryLink; + /** Function to be called */ + KvdoWorkFunction work; + /** Optional alternate function for display in queue stats */ + void *statsFunction; + /** An index into the statistics table; filled in by workQueueStats code */ + unsigned int statTableIndex; + /** + * The action code given to setupWorkItem, from which a priority will be + * determined. + **/ + unsigned int action; + /** The work queue in which the item is enqueued, or NULL if not enqueued. */ + KvdoWorkQueue *myQueue; + /** + * Time at which to execute in jiffies for a delayed work item, or zero to + * queue for execution ASAP. 
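+   * (Set by enqueueWorkQueueDelayed; enqueueWorkQueue resets it to zero.)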
+ **/ + Jiffies executionTime; + /** List management for delayed or expired work items */ + KvdoWorkItem *next; + /** Time of enqueueing, in ns, for recording queue (waiting) time stats */ + uint64_t enqueueTime; +}; + +/** + * Table entries defining an action. + * + * Actions are intended to distinguish general classes of activity for + * prioritization purposes, but not necessarily to indicate specific work + * functions. They are indicated to setupWorkItem numerically, using an + * enumerator defined per kind of work queue -- bio submission work queue + * actions use BioQAction, cpu actions use CPUQAction, etc. For example, for + * the CPU work queues, data compression can be prioritized separately from + * final cleanup processing of a KVIO or from dedupe verification; base code + * threads prioritize all VIO callback invocation the same, but separate from + * sync or heartbeat operations. The bio acknowledgement work queue, on the + * other hand, only does one thing, so it only defines one action code. + * + * Action codes values must be small integers, 0 through + * WORK_QUEUE_ACTION_COUNT-1, and should not be duplicated for a queue type. + * + * A table of KvdoWorkQueueAction entries embedded in KvdoWorkQueueType + * specifies the name, code, and priority for each type of action in the work + * queue. The table can have at most WORK_QUEUE_ACTION_COUNT entries, but a + * NULL name indicates an earlier end to the table. + * + * Priorities may be specified as values from 0 through + * WORK_QUEUE_PRIORITY_COUNT-1, higher values indicating higher priority. + * Priorities are just strong suggestions; it's possible for a lower-priority + * work item scheduled right after a high-priority one to be run first, if the + * worker thread happens to be scanning its queues at just the wrong moment, + * but the high-priority item will be picked up next. + * + * Internally, the priorities in this table are used to initialize another + * table in the constructed work queue object, and in internal builds, + * device-mapper messages can be sent to change the priority for an action, + * identified by name, in a running VDO device. Doing so does not affect the + * priorities for other devices, or for future VDO device creation. + **/ +typedef struct kvdoWorkQueueAction { + /** Name of the action */ + char *name; + + /** The action code (per-type enum) */ + unsigned int code; + + /** The initial priority for this action */ + unsigned int priority; +} KvdoWorkQueueAction; + +typedef void (*KvdoWorkQueueFunction)(void *); + +/** + * Static attributes of a work queue that are fixed at compile time + * for a given call site. (Attributes that may be computed at run time + * are passed as separate arguments.) + **/ +typedef struct kvdoWorkQueueType { + /** A function to call in the new thread before servicing requests */ + KvdoWorkQueueFunction start; + + /** A function to call in the new thread when shutting down */ + KvdoWorkQueueFunction finish; + + /** A function to call in the new thread after running out of work */ + KvdoWorkQueueFunction suspend; + + /** Table of actions for this work queue */ + KvdoWorkQueueAction actionTable[WORK_QUEUE_ACTION_COUNT]; +} KvdoWorkQueueType; + +/** + * Create a work queue. + * + * If multiple threads are requested, work items will be distributed to them in + * round-robin fashion. 
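+ *
+ * As a purely illustrative sketch, a caller might set up a two-thread queue
+ * and feed it a work item as follows; the action table contents and the
+ * parentKobject, layer, item, and doSomeWork names are invented for this
+ * example and do not come from this header:
+ *
+ *   static const KvdoWorkQueueType cpuQueueType = {
+ *     .actionTable = {
+ *       { .name = "cpu_normal", .code = 0, .priority = 1 },
+ *     },
+ *   };
+ *
+ *   KvdoWorkQueue *cpuQueue;
+ *   int result = makeWorkQueue("vdo0", "cpuQ", parentKobject, layer, NULL,
+ *                              &cpuQueueType, 2, &cpuQueue);
+ *   if (result == VDO_SUCCESS) {
+ *     // item must start out all-zero; see setupWorkItem's documentation.
+ *     setupWorkItem(&item, doSomeWork, NULL, 0);
+ *     enqueueWorkQueue(cpuQueue, &item);
+ *   }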
+ * + * @param [in] threadNamePrefix The per-device prefix to use in thread names + * @param [in] name The queue name + * @param [in] parentKobject The parent sysfs node + * @param [in] owner The kernel layer owning the work queue + * @param [in] private Private data of the queue for use by work + * items or other queue-specific functions + * @param [in] type The work queue type defining the lifecycle + * functions, queue actions, priorities, and + * timeout behavior + * @param [in] threadCount Number of service threads to set up + * @param [out] queuePtr Where to store the queue handle + * + * @return VDO_SUCCESS or an error code + **/ +int makeWorkQueue(const char *threadNamePrefix, + const char *name, + struct kobject *parentKobject, + KernelLayer *owner, + void *private, + const KvdoWorkQueueType *type, + unsigned int threadCount, + KvdoWorkQueue **queuePtr); + +/** + * Set up the fields of a work queue item. + * + * Before the first setup call (setupWorkItem or setupWorkItemWithTimeout), the + * work item must have been initialized to all-zero. Resetting a + * previously-used work item does not require another memset. + * + * The action code is typically defined in a work-queue-type-specific + * enumeration; see the description of KvdoWorkQueueAction. + * + * @param item The work item to initialize + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, for determination of priority + **/ +void setupWorkItem(KvdoWorkItem *item, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action); + +/** + * Add a work item to a work queue. + * + * If the work item has a timeout that has already passed, the timeout + * handler function may be invoked at this time. + * + * @param queue The queue handle + * @param item The work item to be processed + **/ +void enqueueWorkQueue(KvdoWorkQueue *queue, KvdoWorkItem *item); + +/** + * Add a work item to a work queue, to be run at a later point in time. + * + * Currently delayed work items are used only in a very limited fashion -- at + * most one at a time for any of the work queue types that use them -- and some + * shortcuts have been taken that assume that that's the case. Multiple delayed + * work items should work, but they will execute in the order they were + * enqueued. + * + * @param queue The queue handle + * @param item The work item to be processed + * @param executionTime When to run the work item (jiffies) + **/ +void enqueueWorkQueueDelayed(KvdoWorkQueue *queue, + KvdoWorkItem *item, + Jiffies executionTime); + +/** + * Shut down a work queue's worker thread. + * + * Alerts the worker thread that it should shut down, and then waits + * for it to do so. + * + * There should not be any new enqueueing of work items done once this + * function is called. Any pending delayed work items will be + * processed, as scheduled, before the worker thread shuts down, but + * they must not re-queue themselves to run again. + * + * @param queue The work queue to shut down + **/ +void finishWorkQueue(KvdoWorkQueue *queue); + +/** + * Free a work queue and null out the reference to it. + * + * @param queuePtr Where the queue handle is found + **/ +void freeWorkQueue(KvdoWorkQueue **queuePtr); + +/** + * Print work queue state and statistics to the kernel log. + * + * @param queue The work queue to examine + **/ +void dumpWorkQueue(KvdoWorkQueue *queue); + +/** + * Write to the buffer some info about the work item, for logging. 
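+ * The result is the owning queue's name (or "-" if the item is not
+ * enqueued), followed by a slash and the name of the item's stats function.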
+ * Since the common use case is dumping info about a lot of work items + * to syslog all at once, the format favors brevity over readability. + * + * @param item The work item + * @param buffer The message buffer to fill in + * @param length The length of the message buffer + **/ +void dumpWorkItemToBuffer(KvdoWorkItem *item, char *buffer, size_t length); + + +/** + * Initialize work queue internals at module load time. + **/ +void initWorkQueueOnce(void); + +/** + * Checks whether two work items have the same action codes + * + * @param item1 The first item + * @param item2 The second item + * + * @return TRUE if the actions are the same, FALSE otherwise + */ +static inline bool areWorkItemActionsEqual(KvdoWorkItem *item1, + KvdoWorkItem *item2) +{ + return item1->action == item2->action; +} + +/** + * Returns the private data for the current thread's work queue. + * + * @return The private data pointer, or NULL if none or if the current + * thread is not a work queue thread. + **/ +void *getWorkQueuePrivateData(void); + +/** + * Updates the private data pointer for the current thread's work queue. + * + * @param newData The new private data pointer + **/ +void setWorkQueuePrivateData(void *newData); + +/** + * Returns the work queue pointer for the current thread, if any. + * + * @return The work queue pointer or NULL + **/ +KvdoWorkQueue *getCurrentWorkQueue(void); + +/** + * Returns the kernel layer that owns the work queue. + * + * @param queue The work queue + * + * @return The owner pointer supplied at work queue creation + **/ +KernelLayer *getWorkQueueOwner(KvdoWorkQueue *queue); + +#endif /* ALBIREO_WORK_QUEUE_H */ diff --git a/vdo/kernel/workQueueHandle.c b/vdo/kernel/workQueueHandle.c new file mode 100644 index 0000000..65b3e02 --- /dev/null +++ b/vdo/kernel/workQueueHandle.c @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueHandle.c#2 $ + */ + +#include "workQueueHandle.h" + +WorkQueueStackHandleGlobals workQueueStackHandleGlobals; + +/**********************************************************************/ +void initializeWorkQueueStackHandle(WorkQueueStackHandle *handle, + SimpleWorkQueue *queue) +{ + handle->nonce = workQueueStackHandleGlobals.nonce; + handle->queue = queue; + + long offset = (char *) handle - (char *) task_stack_page(current); + spin_lock(&workQueueStackHandleGlobals.offsetLock); + if (workQueueStackHandleGlobals.offset == 0) { + workQueueStackHandleGlobals.offset = offset; + spin_unlock(&workQueueStackHandleGlobals.offsetLock); + } else { + long foundOffset = workQueueStackHandleGlobals.offset; + spin_unlock(&workQueueStackHandleGlobals.offsetLock); + BUG_ON(foundOffset != offset); + } +} + +/**********************************************************************/ +void initWorkQueueStackHandleOnce(void) +{ + spin_lock_init(&workQueueStackHandleGlobals.offsetLock); + workQueueStackHandleGlobals.nonce = currentTime(CLOCK_MONOTONIC); +} diff --git a/vdo/kernel/workQueueHandle.h b/vdo/kernel/workQueueHandle.h new file mode 100644 index 0000000..e72ce42 --- /dev/null +++ b/vdo/kernel/workQueueHandle.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueHandle.h#1 $ + */ + +#ifndef WORK_QUEUE_HANDLE_H +#define WORK_QUEUE_HANDLE_H + +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,11,0) +#include +#else +#include +#endif + +#include "workQueueInternals.h" + +/* + * Layout of a special structure stored at a consistent place on the + * stack in work queue threads. + */ +typedef struct workQueueStackHandle { + unsigned long nonce; + SimpleWorkQueue *queue; +} WorkQueueStackHandle; + +typedef struct workQueueStackHandleGlobals { + /* + * Location in the stack, relative to the task structure which is + * contained in the same memory allocation. + */ + long offset; + /* + * A lock is used to guard against multiple updaters, but once an + * update is done, the offset variable will be read-only. + */ + spinlock_t offsetLock; + /* + * A nonce chosen differently each time the module is loaded, used + * as a marker so we can check that the current thread really is a + * work queue thread. Set at module initialization time, before any + * work queues are created. + */ + unsigned long nonce; +} WorkQueueStackHandleGlobals; + +extern WorkQueueStackHandleGlobals workQueueStackHandleGlobals; + +/** + * Initialize a stack handle associated with a work queue. 
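+ *
+ * A worker thread is expected to use this roughly the way workQueueRunner in
+ * workQueue.c does; as an illustrative sketch:
+ *
+ *   static int workerThread(void *ptr)
+ *   {
+ *     SimpleWorkQueue *queue = ptr;
+ *     WorkQueueStackHandle handle;
+ *     initializeWorkQueueStackHandle(&handle, queue);
+ *     // ... service the queue; getCurrentThreadWorkQueue() now finds it ...
+ *     memset(&handle, 0, sizeof(handle)); // scrub before the stack is reused
+ *     return 0;
+ *   }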
+ * + * @param [out] handle The handle to be initialized + * @param [in] queue The work queue pointer + **/ +void initializeWorkQueueStackHandle(WorkQueueStackHandle *handle, + SimpleWorkQueue *queue); + +/** + * Return the work queue pointer recorded at initialization time in + * the work-queue stack handle initialized on the stack of the current + * thread, if any. + * + * @return the work queue pointer, or NULL + **/ +static inline SimpleWorkQueue *getCurrentThreadWorkQueue(void) +{ + WorkQueueStackHandle *handle + = (WorkQueueStackHandle *)(task_stack_page(current) + + workQueueStackHandleGlobals.offset); + if (likely(handle->nonce == workQueueStackHandleGlobals.nonce)) { + return handle->queue; + } else { + return NULL; + } +} + +/** + * Initialize the global state used by the work-queue stack-handle + * code. + **/ +void initWorkQueueStackHandleOnce(void); + +#endif // WORK_QUEUE_HANDLE_H diff --git a/vdo/kernel/workQueueInternals.h b/vdo/kernel/workQueueInternals.h new file mode 100644 index 0000000..fc7a2a3 --- /dev/null +++ b/vdo/kernel/workQueueInternals.h @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueInternals.h#4 $ + */ + +#ifndef WORK_QUEUE_INTERNALS_H +#define WORK_QUEUE_INTERNALS_H + +#include +#include +#include +#include +#include + +#include "workItemStats.h" +#include "workQueueStats.h" + +typedef struct kvdoWorkItemList { + KvdoWorkItem *tail; +} KvdoWorkItemList; + +/** + * Work queue definition. + * + * There are two types of work queues: simple, with one worker thread, and + * round-robin, which uses a group of the former to do the work, and assigns + * work to them in -- you guessed it -- round-robin fashion. Externally, both + * are represented via the same common sub-structure, though there's actually + * not a great deal of overlap between the two types internally. + **/ +struct kvdoWorkQueue { + /** Name of just the work queue (e.g., "cpuQ12") */ + char *name; + /** + * Whether this is a round-robin work queue or a simple (one-thread) + * work queue. + **/ + bool roundRobinMode; + /** A handle to a sysfs tree for reporting stats and other info */ + struct kobject kobj; + /** The kernel layer owning this work queue */ + KernelLayer *owner; +}; + +typedef struct simpleWorkQueue SimpleWorkQueue; +typedef struct roundRobinWorkQueue RoundRobinWorkQueue; + +struct simpleWorkQueue { + /** Common work queue bits */ + KvdoWorkQueue common; + /** A copy of .thread->pid, for safety in the sysfs support */ + atomic_t threadID; + /** + * Number of priorities actually used, so we don't keep re-checking unused + * funnel queues. + **/ + unsigned int numPriorityLists; + /** + * Map from action codes to priorities. 
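+ *
+ * (Presumably consulted by the enqueue path in workQueue.c, roughly as
+ * priorityMap[item->action], to choose which of the funnel queues in
+ * priorityLists below receives the item.)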
+ * + * This mapping can be changed at run time in internal builds, for tuning + * purposes. + **/ + uint8_t priorityMap[WORK_QUEUE_ACTION_COUNT]; + /** The funnel queues */ + FunnelQueue *priorityLists[WORK_QUEUE_PRIORITY_COUNT]; + /** The kernel thread */ + struct task_struct *thread; + /** Life cycle functions, etc */ + const KvdoWorkQueueType *type; + /** Opaque private data pointer, defined by higher level code */ + void *private; + /** In a subordinate work queue, a link back to the round-robin parent */ + KvdoWorkQueue *parentQueue; + /** Padding for cache line separation */ + char pad[CACHE_LINE_BYTES - sizeof(KvdoWorkQueue *)]; + /** Lock protecting delayedItems, priorityMap, numPriorityLists, started */ + spinlock_t lock; + /** Any worker threads (zero or one) waiting for new work to do */ + wait_queue_head_t waitingWorkerThreads; + /** + * Hack to reduce wakeup calls if the worker thread is running. See comments + * in workQueue.c. + * + * There is a lot of redundancy with "firstWakeup", though, and the pair + * should be re-examined. + **/ + atomic_t idle; + /** Wait list for synchronization during worker thread startup */ + wait_queue_head_t startWaiters; + /** Worker thread status (boolean) */ + bool started; + + /** List of delayed work items; usually only one, if any */ + KvdoWorkItemList delayedItems; + /** + * Timer for pulling delayed work items off their list and submitting them to + * run. + * + * If the spinlock "lock" above is not held, this timer is scheduled (or + * currently firing and the callback about to acquire the lock) iff + * delayedItems is nonempty. + **/ + struct timer_list delayedItemsTimer; + + /** + * Timestamp (ns) from the submitting thread that decided to wake us up; also + * used as a flag to indicate whether a wakeup is needed. + * + * Written by submitting threads with atomic64_cmpxchg, and by the worker + * thread setting to 0. + * + * If the value is 0, the worker is probably asleep; the submitting thread + * stores a non-zero value and becomes responsible for calling wake_up on the + * worker thread. If the value is non-zero, either the worker is running or + * another thread has the responsibility for issuing the wakeup. + * + * The "sleep" mode has periodic wakeups and the worker thread may happen to + * wake up while a work item is being enqueued. If that happens, the wakeup + * may be unneeded but will be attempted anyway. + * + * So the return value from cmpxchg(firstWakeup,0,nonzero) can always be + * done, and will tell the submitting thread whether to issue the wakeup or + * not; cmpxchg is atomic, so no other synchronization is needed. + * + * A timestamp is used rather than, say, 1, so that the worker thread can + * record stats on how long it takes to actually get the worker thread + * running. + * + * There is some redundancy between this and "idle" above. 
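+ *
+ * As an illustrative sketch only (the real submit and worker-thread code
+ * lives in workQueue.c, not in this hunk), a submitting thread would do
+ * roughly:
+ *
+ *   if (atomic64_cmpxchg(&queue->firstWakeup, 0,
+ *                        currentTime(CLOCK_MONOTONIC)) == 0) {
+ *     wake_up(&queue->waitingWorkerThreads);
+ *   }
+ *
+ * while the worker, once running, reads the timestamp for the wakeup
+ * latency histogram and stores 0 to re-arm the mechanism.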
+ **/ + atomic64_t firstWakeup; + /** Padding for cache line separation */ + char pad2[CACHE_LINE_BYTES - sizeof(atomic64_t)]; + /** Scheduling and work-function statistics */ + KvdoWorkQueueStats stats; + /** Last time (ns) the scheduler actually woke us up */ + uint64_t mostRecentWakeup; +}; + +struct roundRobinWorkQueue { + /** Common work queue bits */ + KvdoWorkQueue common; + /** Simple work queues, for actually getting stuff done */ + SimpleWorkQueue **serviceQueues; + /** Number of subordinate work queues */ + unsigned int numServiceQueues; + /** Padding for cache line separation */ + char pad[CACHE_LINE_BYTES - sizeof(unsigned int)]; + /** + * Rotor used for dispatching across subordinate service queues. + * + * Used and updated by submitting threads. (Not atomically or with locking, + * because we don't really care about it being precise, only about getting a + * roughly even spread; if an increment is missed here and there, it's not a + * problem.) + **/ + unsigned int serviceQueueRotor; +}; + +static inline SimpleWorkQueue *asSimpleWorkQueue(KvdoWorkQueue *queue) +{ + return ((queue == NULL) + ? NULL + : container_of(queue, SimpleWorkQueue, common)); +} + +static inline const SimpleWorkQueue * +asConstSimpleWorkQueue(const KvdoWorkQueue *queue) +{ + return ((queue == NULL) + ? NULL + : container_of(queue, SimpleWorkQueue, common)); +} + +static inline RoundRobinWorkQueue *asRoundRobinWorkQueue(KvdoWorkQueue *queue) +{ + return ((queue == NULL) + ? NULL + : container_of(queue, RoundRobinWorkQueue, common)); +} + +#endif // WORK_QUEUE_INTERNALS_H diff --git a/vdo/kernel/workQueueStats.c b/vdo/kernel/workQueueStats.c new file mode 100644 index 0000000..d5a38ae --- /dev/null +++ b/vdo/kernel/workQueueStats.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueStats.c#6 $ + */ + +#include "workQueueStats.h" + +#include "atomic.h" +#include "logger.h" +#include "workItemStats.h" +#include "workQueueInternals.h" + +/**********************************************************************/ +int initializeWorkQueueStats(KvdoWorkQueueStats *stats, + struct kobject *queueKObject) +{ + spin_lock_init(&stats->workItemStats.functionTable.lock); + if (ENABLE_PER_FUNCTION_TIMING_STATS) { + for (int i = 0; i < NUM_WORK_QUEUE_ITEM_STATS + 1; i++) { + initSimpleStats(&stats->workItemStats.times[i]); + } + } + + stats->queueTimeHistogram + = makeLogarithmicHistogram(queueKObject, "queue_time", + "Queue Time", "work items", "wait time", + "microseconds", 9); + if (stats->queueTimeHistogram == NULL) { + return -ENOMEM; + } + + stats->rescheduleQueueLengthHistogram + = makeLogarithmicHistogram(queueKObject, "reschedule_queue_length", + "Reschedule Queue Length", "calls", + "queued work items", NULL, 4); + if (stats->rescheduleQueueLengthHistogram == NULL) { + return -ENOMEM; + } + + stats->rescheduleTimeHistogram + = makeLogarithmicHistogram(queueKObject, "reschedule_time", + "Reschedule Time", "calls", + "sleep interval", "microseconds", 9); + if (stats->rescheduleTimeHistogram == NULL) { + return -ENOMEM; + } + + stats->runTimeBeforeRescheduleHistogram + = makeLogarithmicHistogram(queueKObject, "run_time_before_reschedule", + "Run Time Before Reschedule", + "calls", "run time", "microseconds", 9); + if (stats->runTimeBeforeRescheduleHistogram == NULL) { + return -ENOMEM; + } + + stats->scheduleTimeHistogram + = makeLogarithmicHistogram(queueKObject, "schedule_time", + "Schedule Time", + "calls", "sleep interval", "microseconds", 9); + if (stats->scheduleTimeHistogram == NULL) { + return -ENOMEM; + } + + stats->wakeupLatencyHistogram + = makeLogarithmicHistogram(queueKObject, "wakeup_latency", + "Wakeup Latency", + "wakeups", "latency", "microseconds", 9); + if (stats->wakeupLatencyHistogram == NULL) { + return -ENOMEM; + } + + stats->wakeupQueueLengthHistogram + = makeLogarithmicHistogram(queueKObject, "wakeup_queue_length", + "Wakeup Queue Length", "wakeups", + "queued work items", NULL, 4); + if (stats->wakeupQueueLengthHistogram == NULL) { + return -ENOMEM; + } + + return 0; +} + +/**********************************************************************/ +void cleanupWorkQueueStats(KvdoWorkQueueStats *stats) +{ + freeHistogram(&stats->queueTimeHistogram); + freeHistogram(&stats->rescheduleQueueLengthHistogram); + freeHistogram(&stats->rescheduleTimeHistogram); + freeHistogram(&stats->runTimeBeforeRescheduleHistogram); + freeHistogram(&stats->scheduleTimeHistogram); + freeHistogram(&stats->wakeupLatencyHistogram); + freeHistogram(&stats->wakeupQueueLengthHistogram); +} + +/**********************************************************************/ +static uint64_t getTotalProcessed(const SimpleWorkQueue *queue) +{ + uint64_t totalProcessed = 0; + for (int i = 0; i < NUM_WORK_QUEUE_ITEM_STATS + 1; i++) { + totalProcessed += queue->stats.workItemStats.times[i].count; + } + return totalProcessed; +} + +/**********************************************************************/ +void logWorkQueueStats(const SimpleWorkQueue *queue) +{ + uint64_t runtimeNS = 0; + if (queue->thread != NULL) { + runtimeNS += queue->thread->se.sum_exec_runtime; + } + + unsigned long nsPerWorkItem = 0; + uint64_t totalProcessed = getTotalProcessed(queue); + if (totalProcessed > 0) { + nsPerWorkItem = runtimeNS / totalProcessed; 
+ } + unsigned long runtimeMS = runtimeNS / 1000; + logInfo("workQ %" PRIptr " (%s) thread cpu usage %lu.%06lus, %" PRIu64 + " tasks, %lu.%03luus/task", + queue, + queue->common.name, + runtimeMS / 1000000, runtimeMS % 1000000, + totalProcessed, + nsPerWorkItem / 1000, nsPerWorkItem % 1000); +} + +/**********************************************************************/ +ssize_t formatRunTimeStats(const KvdoWorkQueueStats *stats, char *buffer) +{ + // Get snapshots of all three at approximately the same time. + uint64_t startTime = stats->startTime; + uint64_t runTime = atomic64_read(&stats->runTime); + uint64_t rescheduleTime = atomic64_read(&stats->rescheduleTime); + loadFence(); // rdtsc barrier + uint64_t now = currentTime(CLOCK_MONOTONIC); + uint64_t lifetime = now - startTime; + + return sprintf(buffer, + "%llu %llu %llu\n", + lifetime, runTime, rescheduleTime); +} diff --git a/vdo/kernel/workQueueStats.h b/vdo/kernel/workQueueStats.h new file mode 100644 index 0000000..914f5f4 --- /dev/null +++ b/vdo/kernel/workQueueStats.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueStats.h#2 $ + */ + +#ifndef WORK_QUEUE_STATS_H +#define WORK_QUEUE_STATS_H + +#include "workQueue.h" + +#include "timeUtils.h" + +#include "histogram.h" +#include "workItemStats.h" + +// Defined in workQueueInternals.h after inclusion of workQueueStats.h. +struct simpleWorkQueue; + +/* + * Tracking statistics. + * + * Cache line contention issues: + * + * In workItemStats, there are read-only fields accessed mostly by + * work submitters, then fields updated by the work submitters (for + * which there will be contention), then fields rarely if ever updated + * (more than two cache lines' worth), then fields updated only by the + * worker thread. The trailing fields here are updated only by the + * worker thread. + */ +typedef struct kvdoWorkQueueStats { + // Per-work-function counters and optional nanosecond timing data + KvdoWorkItemStats workItemStats; + // How often we go to sleep waiting for work + uint64_t waits; + + // Run time data, for monitoring utilization levels. + + // Thread start time, from which we can compute lifetime thus far. + uint64_t startTime; + /* + * Time the thread has not been blocked waiting for a new work item, + * nor in cond_resched(). This will include time the thread has been + * blocked by some kernel function invoked by the work functions + * (e.g., waiting for socket buffer space). + * + * This is not redundant with runTimeBeforeRescheduleHistogram, as + * the latter doesn't count run time not followed by a cond_resched + * call. + */ + atomic64_t runTime; + // Time the thread has been suspended via cond_resched(). + // (Duplicates data hidden within rescheduleTimeHistogram.) 
+ atomic64_t rescheduleTime; + + // Histogram of the queue times of work items (microseconds) + Histogram *queueTimeHistogram; + // How busy we are when cond_resched is called + Histogram *rescheduleQueueLengthHistogram; + // Histogram of the time cond_resched makes us sleep for (microseconds) + Histogram *rescheduleTimeHistogram; + // Histogram of the run time between cond_resched calls (microseconds) + Histogram *runTimeBeforeRescheduleHistogram; + // Histogram of the time schedule_timeout lets us sleep for (microseconds) + Histogram *scheduleTimeHistogram; + // How long from thread wakeup call to thread actually running (microseconds) + Histogram *wakeupLatencyHistogram; + // How much work is pending by the time we start running + Histogram *wakeupQueueLengthHistogram; +} KvdoWorkQueueStats; + +/** + * Initialize the work queue's statistics tracking. + * + * @param stats The statistics structure + * @param queueKObject The sysfs directory kobject for the work queue + * + * @return 0 or a kernel error code + **/ +int initializeWorkQueueStats(KvdoWorkQueueStats *stats, + struct kobject *queueKObject) + __attribute__((warn_unused_result)); + +/** + * Tear down any allocated storage or objects for statistics tracking. + * + * @param stats The statistics structure + **/ +void cleanupWorkQueueStats(KvdoWorkQueueStats *stats); + +/** + * Update the work queue statistics tracking to note the enqueueing of + * a work item. + * + * @param stats The statistics structure + * @param item The work item being enqueued + * @param priority The priority of the work item + **/ +static inline void updateStatsForEnqueue(KvdoWorkQueueStats *stats, + KvdoWorkItem *item, + int priority) +{ + updateWorkItemStatsForEnqueue(&stats->workItemStats, item, priority); + item->enqueueTime = currentTime(CLOCK_MONOTONIC); +} + +/** + * Update the work queue statistics tracking to note the dequeueing of + * a work item. + * + * @param stats The statistics structure + * @param item The work item being enqueued + **/ +static inline void updateStatsForDequeue(KvdoWorkQueueStats *stats, + KvdoWorkItem *item) +{ + updateWorkItemStatsForDequeue(&stats->workItemStats, item); + enterHistogramSample(stats->queueTimeHistogram, + (currentTime(CLOCK_MONOTONIC) - item->enqueueTime) / 1000); + item->enqueueTime = 0; +} + +/** + * Write the work queue's accumulated statistics to the kernel log. + * + * The queue pointer is needed so that its address and name can be + * logged along with the statistics. + * + * @param queue The work queue + **/ +void logWorkQueueStats(const struct simpleWorkQueue *queue); + +/** + * Format the thread lifetime, run time, and suspend time into a + * supplied buffer for reporting via sysfs. + * + * @param [in] stats The stats structure containing the run-time info + * @param [out] buffer The buffer in which to report the info + **/ +ssize_t formatRunTimeStats(const KvdoWorkQueueStats *stats, char *buffer); + +#endif // WORK_QUEUE_STATS_H diff --git a/vdo/kernel/workQueueSysfs.c b/vdo/kernel/workQueueSysfs.c new file mode 100644 index 0000000..f9dd9cb --- /dev/null +++ b/vdo/kernel/workQueueSysfs.c @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueSysfs.c#1 $
+ */
+
+#include "workQueueSysfs.h"
+
+#include <linux/kobject.h>
+
+#include "logger.h"
+#include "memoryAlloc.h"
+
+#include "workQueueInternals.h"
+
+typedef struct workQueueAttribute {
+  struct attribute attr;
+  ssize_t (*show)(const KvdoWorkQueue *queue, char *buf);
+  ssize_t (*store)(KvdoWorkQueue *queue, const char *buf, size_t length);
+} WorkQueueAttribute;
+
+/**********************************************************************/
+static ssize_t nameShow(const KvdoWorkQueue *queue, char *buf)
+{
+  return sprintf(buf, "%s\n", queue->name);
+}
+
+/**********************************************************************/
+static ssize_t pidShow(const KvdoWorkQueue *queue, char *buf)
+{
+  return sprintf(buf, "%ld\n",
+                 (long) atomic_read(&asConstSimpleWorkQueue(queue)->threadID));
+}
+
+/**********************************************************************/
+static ssize_t timesShow(const KvdoWorkQueue *queue, char *buf)
+{
+  return formatRunTimeStats(&asConstSimpleWorkQueue(queue)->stats, buf);
+}
+
+/**********************************************************************/
+static ssize_t typeShow(const KvdoWorkQueue *queue, char *buf)
+{
+  strcpy(buf, queue->roundRobinMode ? "round-robin\n" : "simple\n");
+  return strlen(buf);
+}
+
+/**********************************************************************/
+static ssize_t workFunctionsShow(const KvdoWorkQueue *queue, char *buf)
+{
+  const SimpleWorkQueue *simpleQueue = asConstSimpleWorkQueue(queue);
+  return formatWorkItemStats(&simpleQueue->stats.workItemStats, buf,
+                             PAGE_SIZE);
+}
+
+/**********************************************************************/
+static WorkQueueAttribute nameAttr = {
+  .attr = { .name = "name", .mode = 0444, },
+  .show = nameShow,
+};
+
+/**********************************************************************/
+static WorkQueueAttribute pidAttr = {
+  .attr = { .name = "pid", .mode = 0444, },
+  .show = pidShow,
+};
+
+/**********************************************************************/
+static WorkQueueAttribute timesAttr = {
+  .attr = { .name = "times", .mode = 0444 },
+  .show = timesShow,
+};
+
+/**********************************************************************/
+static WorkQueueAttribute typeAttr = {
+  .attr = { .name = "type", .mode = 0444, },
+  .show = typeShow,
+};
+
+/**********************************************************************/
+static WorkQueueAttribute workFunctionsAttr = {
+  .attr = { .name = "work_functions", .mode = 0444, },
+  .show = workFunctionsShow,
+};
+
+/**********************************************************************/
+static struct attribute *simpleWorkQueueAttrs[] = {
+  &nameAttr.attr,
+  &pidAttr.attr,
+  &timesAttr.attr,
+  &typeAttr.attr,
+  &workFunctionsAttr.attr,
+  NULL,
+};
+
+/**********************************************************************/
+static struct attribute *roundRobinWorkQueueAttrs[] = {
+  &nameAttr.attr,
+  &typeAttr.attr,
+  NULL,
+};
+
+/**********************************************************************/
+static ssize_t workQueueAttrShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + WorkQueueAttribute *wqAttr = container_of(attr, WorkQueueAttribute, attr); + if (wqAttr->show == NULL) { + return -EINVAL; + } + KvdoWorkQueue *queue = container_of(kobj, KvdoWorkQueue, kobj); + return wqAttr->show(queue, buf); +} + +/**********************************************************************/ +static ssize_t workQueueAttrStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + WorkQueueAttribute *wqAttr = container_of(attr, WorkQueueAttribute, attr); + if (wqAttr->store == NULL) { + return -EINVAL; + } + KvdoWorkQueue *queue = container_of(kobj, KvdoWorkQueue, kobj); + return wqAttr->store(queue, buf, length); +} + +/**********************************************************************/ +static struct sysfs_ops workQueueSysfsOps = { + .show = workQueueAttrShow, + .store = workQueueAttrStore, +}; + +/**********************************************************************/ +static void workQueueRelease(struct kobject *kobj) +{ + KvdoWorkQueue *queue = container_of(kobj, KvdoWorkQueue, kobj); + FREE(queue->name); + if (queue->roundRobinMode) { + FREE(asRoundRobinWorkQueue(queue)); + } else { + FREE(asSimpleWorkQueue(queue)); + } +} + +/**********************************************************************/ +struct kobj_type simpleWorkQueueKobjType = { + .default_attrs = simpleWorkQueueAttrs, + .release = workQueueRelease, + .sysfs_ops = &workQueueSysfsOps, +}; + +/**********************************************************************/ +struct kobj_type roundRobinWorkQueueKobjType = { + .default_attrs = roundRobinWorkQueueAttrs, + .release = workQueueRelease, + .sysfs_ops = &workQueueSysfsOps, +}; diff --git a/vdo/kernel/workQueueSysfs.h b/vdo/kernel/workQueueSysfs.h new file mode 100644 index 0000000..41f6af5 --- /dev/null +++ b/vdo/kernel/workQueueSysfs.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueSysfs.h#1 $ + */ + +#ifndef WORK_QUEUE_SYSFS_H +#define WORK_QUEUE_SYSFS_H + +#include + +extern struct kobj_type roundRobinWorkQueueKobjType; +extern struct kobj_type simpleWorkQueueKobjType; + +#endif // WORK_QUEUE_SYSFS_H
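The kobj_type definitions above only take effect once a queue's embedded kobject is registered with sysfs; that registration happens elsewhere in the module (workQueue.c, not part of this patch). The sketch below is a minimal illustration of how such a registration could look, not the module's actual code: the function name registerWorkQueueSysfsNode and the parentKobject argument are assumptions made here for the example.

/*
 * Illustrative sketch only: hooking a simple work queue's kobject up to
 * simpleWorkQueueKobjType so that the "name", "pid", "times", "type", and
 * "work_functions" attributes from workQueueSysfs.c appear in sysfs under
 * the (hypothetical) parent directory.
 */
#include <linux/kobject.h>

#include "workQueueInternals.h"
#include "workQueueSysfs.h"

static int registerWorkQueueSysfsNode(SimpleWorkQueue *queue,
                                      struct kobject  *parentKobject)
{
  kobject_init(&queue->common.kobj, &simpleWorkQueueKobjType);
  int result = kobject_add(&queue->common.kobj, parentKobject, "%s",
                           queue->common.name);
  if (result != 0) {
    // Dropping the reference on failure also triggers workQueueRelease(),
    // which frees the queue's name and the queue structure itself.
    kobject_put(&queue->common.kobj);
  }
  return result;
}

Once registered, reading the "times" attribute invokes timesShow() and thus formatRunTimeStats(), while reading "work_functions" reports the per-work-function counters via formatWorkItemStats().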