diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt new file mode 100644 index 0000000..2719005 --- /dev/null +++ b/CONTRIBUTORS.txt @@ -0,0 +1,98 @@ +The Red Hat VDO Team: + Principal Engineer/Lead Architect: + J. corwin Coburn + + Primary Authors: + Joseph Chapman + Sweet Tea Dorminy + *Thomas Jaskiewicz + Bruce Johnston + Susan McGhee + Ken Raeburn + Michael Sclafani + Matthew Sakai + Joseph Shimkus + John Wiele + + Support, Testing, Documentation, and other things too numerous to mention: + Chung Chung : + Bryan Gurney + *Simon J. Hernandez + Jakub Krysl + Marek Suchanek + + Project Management & Technical Direction: + Jered Floyd + Louis Imershein + Dennis Keefe + Andrew Walsh + + *former team members + +Other Contributors: + Ji-Hyeon Gim : + Updates for FC26/Kernel 4.13 + Vojtech Trefny + Getting correct size of partitions + Achilles Gaikwad + Bash completion for the vdo and vdostats commands + Jin-young Kwon + Adding vdo --version command, and documentation fixes + +VDO was originally created at Permabit Technology Corporation, and was +subsequently acquired and open-sourced by Red Hat. + +Former Members of the Permabit VDO Team: + Engineers: + Mark Amidon + David Buckle + Jacky Chu + Joel Hoff + Dimitri Kountourogianni + Alexis Layton + Michael Lee + Rich Macchi + Dave Paniriti + Karl Ramm + Hooman Vassef + Assar Westurlund + + Support, Testing, Documentation, etc. + Carl Alexander + Mike Chu + Mark Iskra + Farid Jahanmir + Francesca Koulikov + Erik Lattimore + Jennifer Levine + Randy Long + Steve Looby + Uche Onyekwuluje + Catherine Powell + Jeff Pozz + Sarmad Sada + John Schmidt + Omri Schwarz + Jay Splaine + John Welle + Mary-Anne Wolf + Devon Yablonski + Robert Zupko + + Interns: + Ari Entlich + Lori Monteleone + + Project Management & Technical Direction: + Michael Fortson + +Other Past Permabit Contributors (for early work on the index): + James Clough + Dave Golombek + Albert Lin + Edwin Olson + Dave Pinkney + Rich Brennan + +And Very Special Thanks To: + Norman Margolis, who started the whole thing diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..7d5393a --- /dev/null +++ b/COPYING @@ -0,0 +1,278 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. 
+ + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. 
You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) 
+ +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4084615 --- /dev/null +++ b/Makefile @@ -0,0 +1,2 @@ +obj-y += uds/ +obj-y += vdo/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..528277d --- /dev/null +++ b/README.md @@ -0,0 +1,125 @@ +# kvdo + +A pair of kernel modules which provide pools of deduplicated and/or compressed +block storage. + +## Background + +VDO (which includes [kvdo](https://github.com/dm-vdo/kvdo) and +[vdo](https://github.com/dm-vdo/vdo)) is software that provides inline +block-level deduplication, compression, and thin provisioning capabilities for +primary storage. VDO installs within the Linux device mapper framework, where +it takes ownership of existing physical block devices and remaps these to new, +higher-level block devices with data-efficiency capabilities. + +Deduplication is a technique for reducing the consumption of storage resources +by eliminating multiple copies of duplicate blocks. Compression takes the +individual unique blocks and shrinks them with coding algorithms; these reduced +blocks are then efficiently packed together into physical blocks. Thin +provisioning manages the mapping from LBAs presented by VDO to where the data +has actually been stored, and also eliminates any blocks of all zeroes. + +With deduplication, instead of writing the same data more than once each +duplicate block is detected and recorded as a reference to the original +block. VDO maintains a mapping from logical block addresses (used by the +storage layer above VDO) to physical block addresses (used by the storage layer +under VDO). After deduplication, multiple logical block addresses may be mapped +to the same physical block address; these are called shared blocks and are +reference-counted by the software. + +With VDO's compression, multiple blocks (or shared blocks) are compressed with +the fast LZ4 algorithm, and binned together where possible so that multiple +compressed blocks fit within a 4 KB block on the underlying storage. Mapping +from LBA is to a physical block address and index within it for the desired +compressed data. All compressed blocks are individually reference counted for +correctness. + +Block sharing and block compression are invisible to applications using the +storage, which read and write blocks as they would if VDO were not +present. When a shared block is overwritten, a new physical block is allocated +for storing the new block data to ensure that other logical block addresses +that are mapped to the shared physical block are not modified. + +This public source release of VDO includes two kernel modules, and a set of +userspace tools for managing them. The "kvdo" module implements fine-grained +storage virtualization, thin provisioning, block sharing, and compression; the +"uds" module provides memory-efficient duplicate identification. 
The userspace +tools include a pair of python scripts, "vdo" for creating and managing VDO +volumes, and "vdostats" for extracting statistics from those volumes. + +## Documentation + +- [RHEL8 VDO Documentation](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html/deduplicating_and_compressing_storage/index) +- [RHEL7 VDO Integration Guide](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/storage_administration_guide/vdo-integration) +- [RHEL7 VDO Evaluation Guide](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/storage_administration_guide/vdo-evaluation) + +## Releases + +Each branch on this project is intended to work with a specific release of +Enterprise Linux (Red Hat Enterprise Linux, CentOS, etc.). We try to maintain +compatibility with active Fedora releases, but some modifications may be +required. + +Version | Intended Enterprise Linux Release | Supported With Modifications +------- | --------------------------------- | ------------------------------- +6.1.x.x | EL7 (3.10.0-*.el7) | +6.2.x.x | EL8 (4.18.0-*.el8) | Fedora 28, Fedora 29, Fedora 30, Rawhide +* Pre-built versions with the required modifications for the referenced Fedora + releases can be found + [here](https://copr.fedorainfracloud.org/coprs/rhawalsh/dm-vdo) and can be + used by running `dnf copr enable rhawalsh/dm-vdo`. + +## Status + +VDO was originally developed by Permabit Technology Corp. as a proprietary set +of kernel modules and userspace tools. This software and technology has been +acquired by Red Hat, has been relicensed under the GPL (v2 or later), and this +repository begins the process of preparing for integration with the upstream +kernel. + +While this software has been relicensed there are a number of issues that must +still be addressed to be ready for upstream. These include: + +- Conformance with kernel coding standards +- Use of existing EXPORT_SYMBOL_GPL kernel interfaces where appropriate +- Refactoring of primitives (e.g. cryptographic) to appropriate kernel + subsystems +- Support for non-x86-64 platforms +- Refactoring of platform layer abstractions and other changes requested by + upstream maintainers + +We expect addressing these issues to take some time. In the meanwhile, this +project allows interested parties to begin using VDO immediately. The +technology itself is thoroughly tested, mature, and in production use since +2014 in its previous proprietary form. + +## Building + +In order to build the kernel modules, invoke the following command +from the top directory of this tree: + + make -C /usr/src/kernels/`uname -r` M=`pwd` + +* Patched sources that work with the most recent upstream kernels can be found + [here](https://github.com/rhawalsh/kvdo). + +## Communication channels + +Community feedback, participation and patches are welcome to the +vdo-devel@redhat.com mailing list -- subscribe +[here](https://www.redhat.com/mailman/listinfo/vdo-devel). + +## Contributing + +This project is currently a stepping stone towards integration with the Linux +kernel. As such, contributions are welcome via a process similar to that for +Linux kernel development. Patches should be submitted to the +vdo-devel@redhat.com mailing list, where they will be considered for +inclusion. This project does not accept pull requests. + +## Licensing + +[GPL v2.0 or later](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html). 
+All contributions retain ownership by their original author, but must also +be licensed under the GPL 2.0 or later to be merged. + diff --git a/TODO b/TODO new file mode 100644 index 0000000..d2d5cf5 --- /dev/null +++ b/TODO @@ -0,0 +1,6 @@ +- Conform to kernel coding standards +- Use existing EXPORT_SYMBOL_GPL kernel interfaces where appropriate +- Refactor primitives (e.g. cryptographic) to appropriate kernel subsystems +- Support non-x86-64 platforms +- Refactor platform layer abstractions and other changes requested by upstream + maintainers diff --git a/kvdo.spec b/kvdo.spec new file mode 100644 index 0000000..e340b2c --- /dev/null +++ b/kvdo.spec @@ -0,0 +1,89 @@ +%define spec_release 1 +%define kmod_name kvdo +%define kmod_driver_version 6.2.4.26 +%define kmod_rpm_release %{spec_release} +%define kmod_kernel_version 3.10.0-693.el7 + +# Disable the scanning for a debug package +%global debug_package %{nil} + +Source0: kmod-%{kmod_name}-%{kmod_driver_version}.tgz + +Name: kmod-kvdo +Version: %{kmod_driver_version} +Release: %{kmod_rpm_release}%{?dist} +Summary: Kernel Modules for Virtual Data Optimizer +License: GPLv2+ +URL: http://github.com/dm-vdo/kvdo +BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) +Requires: dkms +Requires: kernel-devel >= %{kmod_kernel_version} +Requires: make +ExclusiveArch: x86_64 +ExcludeArch: s390 +ExcludeArch: s390x +ExcludeArch: ppc +ExcludeArch: ppc64 +ExcludeArch: ppc64le +ExcludeArch: aarch64 +ExcludeArch: i686 + +%description +Virtual Data Optimizer (VDO) is a device mapper target that delivers +block-level deduplication, compression, and thin provisioning. + +This package provides the kernel modules for VDO. + +%post +set -x +/usr/sbin/dkms --rpm_safe_upgrade add -m %{kmod_name} -v %{version}-%{kmod_driver_version} +/usr/sbin/dkms --rpm_safe_upgrade build -m %{kmod_name} -v %{version}-%{kmod_driver_version} +/usr/sbin/dkms --rpm_safe_upgrade install -m %{kmod_name} -v %{version}-%{kmod_driver_version} + +%preun +# Check whether kvdo or uds is loaded, and if so attempt to remove it. A +# failure here means there is still something using the module, which should be +# cleared up before attempting to remove again. +for module in kvdo uds; do + if grep -q "^${module}" /proc/modules; then + modprobe -r ${module} + fi +done +/usr/sbin/dkms --rpm_safe_upgrade remove -m %{kmod_name} -v %{version}-%{kmod_driver_version} --all || : + +%prep +%setup -n kmod-%{kmod_name}-%{kmod_driver_version} + +%build +# Nothing doing here, as we're going to build on whatever kernel we end up +# running inside. 
+ +%install +mkdir -p $RPM_BUILD_ROOT/%{_usr}/src/%{kmod_name}-%{version}-%{kmod_driver_version} +cp -r * $RPM_BUILD_ROOT/%{_usr}/src/%{kmod_name}-%{version}-%{kmod_driver_version}/ +cat > $RPM_BUILD_ROOT/%{_usr}/src/%{kmod_name}-%{version}-%{kmod_driver_version}/dkms.conf < - 6.2.4.26-1 +HASH(0x5645fb62bab0) \ No newline at end of file diff --git a/uds/Makefile b/uds/Makefile new file mode 100644 index 0000000..5afc64a --- /dev/null +++ b/uds/Makefile @@ -0,0 +1,21 @@ +UDS_VERSION = 8.0.2.4 + +SOURCES = $(notdir $(wildcard $(src)/*.c)) murmur/MurmurHash3.c +SOURCES += $(addprefix util/,$(notdir $(wildcard $(src)/util/*.c))) +OBJECTS = $(SOURCES:%.c=%.o) +INCLUDES = -I$(src) + +EXTRA_CFLAGS = -std=gnu99 \ + -fno-builtin-memset \ + -Werror \ + -Wframe-larger-than=400 \ + -Wno-declaration-after-statement \ + -DUDS_VERSION=\"$(UDS_VERSION)\" \ + $(INCLUDES) + +CFLAGS_REMOVE_deltaIndex.o = -std=gnu99 +CFLAGS_REMOVE_masterIndex005.o = -std=gnu99 + +obj-m += uds.o + +uds-objs = $(OBJECTS) diff --git a/uds/atomicDefs.h b/uds/atomicDefs.h new file mode 100644 index 0000000..0c82bca --- /dev/null +++ b/uds/atomicDefs.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/atomicDefs.h#2 $ + */ + +#ifndef LINUX_KERNEL_ATOMIC_DEFS_H +#define LINUX_KERNEL_ATOMIC_DEFS_H + +#include + +#endif /* LINUX_KERNEL_ATOMIC_DEFS_H */ diff --git a/uds/bits.c b/uds/bits.c new file mode 100644 index 0000000..eea4912 --- /dev/null +++ b/uds/bits.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/bits.c#1 $ + */ + +#include "bits.h" + +#include "compiler.h" + +/** + * This is the largest field size supported by getBigField & setBigField. + * Any field that is larger is not guaranteed to fit in a single, byte + * aligned uint64_t. 
+ **/ +enum { MAX_BIG_FIELD_BITS = (sizeof(uint64_t) - 1) * CHAR_BIT + 1 }; + +/** + * Get a big bit field from a bit stream + * + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE uint64_t getBigField(const byte *memory, + uint64_t offset, + int size) +{ + const void *addr = memory + offset / CHAR_BIT; + return (getUInt64LE(addr) >> (offset % CHAR_BIT)) & ((1UL << size) - 1); +} + +/** + * Set a big bit field in a bit stream + * + * @param value The value to put into the field + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE void setBigField(uint64_t value, byte *memory, uint64_t offset, + int size) +{ + void *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + uint64_t data = getUInt64LE(addr); + data &= ~(((1UL << size) - 1) << shift); + data |= value << shift; + storeUInt64LE(addr, data); +} + +/***********************************************************************/ +void getBytes(const byte *memory, uint64_t offset, byte *destination, int size) +{ + const byte *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + while (--size >= 0) { + *destination++ = getUInt16LE(addr++) >> shift; + } +} + +/***********************************************************************/ +void setBytes(byte *memory, uint64_t offset, const byte *source, int size) +{ + byte *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + uint16_t mask = ~((uint16_t) 0xFF << shift); + while (--size >= 0) { + uint16_t data = (getUInt16LE(addr) & mask) | (*source++ << shift); + storeUInt16LE(addr++, data); + } +} + +/***********************************************************************/ +void moveBits(const byte *sMemory, uint64_t source, byte *dMemory, + uint64_t destination, int size) +{ + enum { UINT32_BIT = sizeof(uint32_t) * CHAR_BIT }; + if (size > MAX_BIG_FIELD_BITS) { + if (source > destination) { + // This is a large move from a higher to a lower address. We move + // the lower addressed bits first. Start by moving one field that + // ends on a destination int boundary + int count + = MAX_BIG_FIELD_BITS - (destination + MAX_BIG_FIELD_BITS) % UINT32_BIT; + uint64_t field = getBigField(sMemory, source, count); + setBigField(field, dMemory, destination, count); + source += count; + destination += count; + size -= count; + // Now do the main loop to copy 32 bit chunks that are int-aligned + // at the destination. + int offset = source % UINT32_BIT; + const byte *src = sMemory + (source - offset) / CHAR_BIT; + byte *dest = dMemory + destination / CHAR_BIT; + while (size > MAX_BIG_FIELD_BITS) { + storeUInt32LE(dest, getUInt64LE(src) >> offset); + src += sizeof(uint32_t); + dest += sizeof(uint32_t); + source += UINT32_BIT; + destination += UINT32_BIT; + size -= UINT32_BIT; + } + } else { + // This is a large move from a lower to a higher address. We move + // the higher addressed bits first. 
Start by moving one field that + // begins on a destination int boundary + int count = (destination + size) % UINT32_BIT; + if (count > 0) { + size -= count; + uint64_t field = getBigField(sMemory, source + size, count); + setBigField(field, dMemory, destination + size, count); + } + // Now do the main loop to copy 32 bit chunks that are int-aligned + // at the destination. + int offset = (source + size) % UINT32_BIT; + const byte *src = sMemory + (source + size - offset) / CHAR_BIT; + byte *dest = dMemory + (destination + size) / CHAR_BIT; + while (size > MAX_BIG_FIELD_BITS) { + src -= sizeof(uint32_t); + dest -= sizeof(uint32_t); + size -= UINT32_BIT; + storeUInt32LE(dest, getUInt64LE(src) >> offset); + } + } + } + // Finish up by doing the last chunk, which can have any arbitrary alignment + if (size > 0) { + uint64_t field = getBigField(sMemory, source, size); + setBigField(field, dMemory, destination, size); + } +} + +/***********************************************************************/ +bool sameBits(const byte *mem1, uint64_t offset1, const byte *mem2, + uint64_t offset2, int size) +{ + while (size >= MAX_FIELD_BITS) { + unsigned int field1 = getField(mem1, offset1, MAX_FIELD_BITS); + unsigned int field2 = getField(mem2, offset2, MAX_FIELD_BITS); + if (field1 != field2) return false; + offset1 += MAX_FIELD_BITS; + offset2 += MAX_FIELD_BITS; + size -= MAX_FIELD_BITS; + } + if (size > 0) { + unsigned int field1 = getField(mem1, offset1, size); + unsigned int field2 = getField(mem2, offset2, size); + if (field1 != field2) return false; + } + return true; +} diff --git a/uds/bits.h b/uds/bits.h new file mode 100644 index 0000000..2c2d4ea --- /dev/null +++ b/uds/bits.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/bits.h#1 $ + */ + +#ifndef BITS_H +#define BITS_H 1 + +#include "compiler.h" +#include "numeric.h" +#include "typeDefs.h" + +/* + * These bit stream and bit field utility routines are used for the + * non-byte aligned delta indices. + * + * Bits and bytes are numbered in little endian order. For example: Within + * a byte, bit 0 is the least significant bit (0x1), and bit 7 is the most + * significant bit (0x80). Within a bit stream, bit 7 is the most + * signficant bit of byte 0, and bit 8 is the least significant bit of byte + * 1. Within a byte array, a byte's number corresponds to it's index in + * the array. + * + * The implementation assumes that the native machine is little endian, and + * that performance is very important. These assumptions match our current + * operating environment. + */ + +/** + * This is the largest field size supported by getField & setField. Any + * field that is larger is not guaranteed to fit in a single, byte aligned + * uint32_t. 
+ **/ +enum { MAX_FIELD_BITS = (sizeof(uint32_t) - 1) * CHAR_BIT + 1 }; + +/** + * This is the number of guard bytes needed at the end of the memory byte + * array when using the bit utilities. 3 bytes are needed when getField & + * setField access a field, because they will access some "extra" bytes + * past the end of the field. And 7 bytes are needed when getBigField & + * setBigField access a big field, for the same reason. Note that moveBits + * calls getBigField & setBigField. 7 is rewritten to make it clear how it + * is derived. + **/ +enum { POST_FIELD_GUARD_BYTES = sizeof(uint64_t) - 1 }; + +/** + * Get a bit field from a bit stream + * + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE unsigned int getField(const byte *memory, uint64_t offset, + int size) +{ + const void *addr = memory + offset / CHAR_BIT; + return (getUInt32LE(addr) >> (offset % CHAR_BIT)) & ((1 << size) - 1); +} + +/** + * Set a bit field in a bit stream + * + * @param value The value to put into the field + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE void setField(unsigned int value, byte *memory, uint64_t offset, + int size) +{ + void *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + uint32_t data = getUInt32LE(addr); + data &= ~(((1 << size) - 1) << shift); + data |= value << shift; + storeUInt32LE(addr, data); +} + +/** + * Set a bit field in a bit stream to all ones + * + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE void setOne(byte *memory, uint64_t offset, int size) +{ + if (size > 0) { + byte *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + int count = size + shift > CHAR_BIT ? CHAR_BIT - shift : size; + *addr++ |= ((1 << count) - 1) << shift; + for (size -= count; size > CHAR_BIT; size -= CHAR_BIT) { + *addr++ = 0xFF; + } + if (size) { + *addr |= ~(0xFF << size); + } + } +} + +/** + * Set a bit field in a bit stream to all zeros + * + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE void setZero(byte *memory, uint64_t offset, int size) +{ + if (size > 0) { + byte *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + int count = size + shift > CHAR_BIT ? CHAR_BIT - shift : size; + *addr++ &= ~(((1 << count) - 1) << shift); + for (size -= count; size > CHAR_BIT; size -= CHAR_BIT) { + *addr++ = 0; + } + if (size) { + *addr &= 0xFF << size; + } + } +} + +/** + * Get a byte stream from a bit stream, reading a whole number of bytes + * from an arbitrary bit boundary. 
+ * + * @param memory The base memory byte address for the bit stream + * @param offset The bit offset of the start of the bit stream + * @param destination Where to store the bytes + * @param size The number of bytes + **/ +void getBytes(const byte *memory, uint64_t offset, byte *destination, int size); + +/** + * Store a byte stream into a bit stream, writing a whole number of bytes + * to an arbitrary bit boundary. + * + * @param memory The base memory byte address for the bit stream + * @param offset The bit offset of the start of the bit stream + * @param source Where to read the bytes + * @param size The number of bytes + **/ +void setBytes(byte *memory, uint64_t offset, const byte *source, int size); + +/** + * Move bits from one field to another. When the fields overlap, behave as + * if we first move all the bits from the source to a temporary value, and + * then move all the bits from the temporary value to the destination. + * + * @param sMemory The base source memory byte address + * @param source Bit offset into memory for the source start + * @param dMemory The base destination memory byte address + * @param destination Bit offset into memory for the destination start + * @param size The number of bits in the field + **/ +void moveBits(const byte *sMemory, uint64_t source, byte *dMemory, + uint64_t destination, int size); + +/** + * Compare bits from one field to another, testing for sameness + * + * @param mem1 The base memory byte address (first field) + * @param offset1 Bit offset into the memory for the start (first field) + * @param mem2 The base memory byte address (second field) + * @param offset2 Bit offset into the memory for the start (second field) + * @param size The number of bits in the field + * + * @return true if fields are the same, false if different + **/ +bool sameBits(const byte *mem1, uint64_t offset1, const byte *mem2, + uint64_t offset2, int size) + __attribute__((warn_unused_result)); + +#endif /* BITS_H */ diff --git a/uds/buffer.c b/uds/buffer.c new file mode 100644 index 0000000..2bf6d20 --- /dev/null +++ b/uds/buffer.c @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/buffer.c#3 $ + */ + +#include "buffer.h" + +#include "bufferPrivate.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "permassert.h" +#include "typeDefs.h" + +/**********************************************************************/ +int wrapBuffer(byte *bytes, + size_t length, + size_t contentLength, + Buffer **bufferPtr) +{ + int result = ASSERT((contentLength <= length), + "content length, %zu, fits in buffer size, %zu", + length, contentLength); + Buffer *buffer; + result = ALLOCATE(1, Buffer, "buffer", &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + buffer->data = bytes; + buffer->start = 0; + buffer->end = contentLength; + buffer->length = length; + buffer->wrapped = true; + + *bufferPtr = buffer; + return UDS_SUCCESS; +} + +/***********************************************************************/ +int makeBuffer(size_t size, Buffer **newBuffer) +{ + byte *data; + int result = ALLOCATE(size, byte, "buffer data", &data); + if (result != UDS_SUCCESS) { + return result; + } + + Buffer *buffer; + result = wrapBuffer(data, size, 0, &buffer); + if (result != UDS_SUCCESS) { + FREE(data); + return result; + } + + buffer->wrapped = false; + *newBuffer = buffer; + return UDS_SUCCESS; +} + +/***********************************************************************/ +void freeBuffer(Buffer **pBuffer) +{ + Buffer *buffer = *pBuffer; + *pBuffer = NULL; + if (buffer == NULL) { + return; + } + if (!buffer->wrapped) { + FREE(buffer->data); + } + FREE(buffer); +} + +/**********************************************************************/ +size_t bufferLength(Buffer *buffer) +{ + return buffer->length; +} + +/**********************************************************************/ +size_t contentLength(Buffer *buffer) +{ + return buffer->end - buffer->start; +} + +/**********************************************************************/ +size_t uncompactedAmount(Buffer *buffer) +{ + return buffer->start; +} + +/**********************************************************************/ +size_t availableSpace(Buffer *buffer) +{ + return buffer->length - buffer->end; +} + +/**********************************************************************/ +size_t bufferUsed(Buffer *buffer) +{ + return buffer->end; +} + +/***********************************************************************/ +int growBuffer(Buffer *buffer, size_t length) +{ + if (buffer == NULL) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot resize NULL buffer"); + } + + if (buffer->wrapped) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot resize wrapped buffer"); + } + if (buffer->end > length) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot shrink buffer"); + } + + byte *data; + int result = reallocateMemory(buffer->data, buffer->length, length, + "buffer data", &data); + if (result != UDS_SUCCESS) { + return result; + } + + buffer->data = data; + buffer->length = length; + return UDS_SUCCESS; +} + +/***********************************************************************/ +bool ensureAvailableSpace(Buffer *buffer, size_t bytes) +{ + if (availableSpace(buffer) >= bytes) { + return true; + } + compactBuffer(buffer); + return (availableSpace(buffer) >= bytes); +} + +/***********************************************************************/ +void clearBuffer(Buffer *buffer) +{ + buffer->start = 0; + buffer->end = buffer->length; +} + 
+/***********************************************************************/ +void compactBuffer(Buffer *buffer) +{ + if ((buffer->start == 0) || (buffer->end == 0)) { + return; + } + size_t bytesToMove = buffer->end - buffer->start; + memmove(buffer->data, buffer->data + buffer->start, bytesToMove); + buffer->start = 0; + buffer->end = bytesToMove; +} + +/**********************************************************************/ +int resetBufferEnd(Buffer *buffer, size_t end) +{ + if (end > buffer->length) { + return UDS_BUFFER_ERROR; + } + buffer->end = end; + if (buffer->start > buffer->end) { + buffer->start = buffer->end; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int skipForward(Buffer *buffer, size_t bytesToSkip) +{ + if (contentLength(buffer) < bytesToSkip) { + return UDS_BUFFER_ERROR; + } + + buffer->start += bytesToSkip; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int rewindBuffer(Buffer *buffer, size_t bytesToRewind) +{ + if (buffer->start < bytesToRewind) { + return UDS_BUFFER_ERROR; + } + + buffer->start -= bytesToRewind; + return UDS_SUCCESS; +} + +/**********************************************************************/ +bool hasSameBytes(Buffer *buffer, const byte *data, size_t length) +{ + return ((contentLength(buffer) >= length) + && (memcmp(buffer->data + buffer->start, data, length) == 0)); +} + +/**********************************************************************/ +bool equalBuffers(Buffer *buffer1, Buffer *buffer2) +{ + return hasSameBytes(buffer1, buffer2->data + buffer2->start, + contentLength(buffer2)); +} + +/**********************************************************************/ +int getByte(Buffer *buffer, byte *bytePtr) +{ + if (contentLength(buffer) < sizeof(byte)) { + return UDS_BUFFER_ERROR; + } + + *bytePtr = buffer->data[buffer->start++]; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int peekByte(Buffer *buffer, size_t offset, byte *bytePtr) +{ + if (contentLength(buffer) < (offset + sizeof(byte))) { + return UDS_BUFFER_ERROR; + } + + *bytePtr = buffer->data[buffer->start + offset]; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putByte(Buffer *buffer, byte b) +{ + if (!ensureAvailableSpace(buffer, sizeof(byte))) { + return UDS_BUFFER_ERROR; + } + + buffer->data[buffer->end++] = b; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getBytesFromBuffer(Buffer *buffer, size_t length, void *destination) +{ + if (contentLength(buffer) < length) { + return UDS_BUFFER_ERROR; + } + + memcpy(destination, buffer->data + buffer->start, length); + buffer->start += length; + return UDS_SUCCESS; +} + +/**********************************************************************/ +byte *getBufferContents(Buffer *buffer) +{ + return buffer->data + buffer->start; +} + +/**********************************************************************/ +int copyBytes(Buffer *buffer, size_t length, byte **destinationPtr) +{ + byte *destination; + int result = ALLOCATE(length, byte, "copyBytes() buffer", + &destination); + if (result != UDS_SUCCESS) { + return result; + } + + result = getBytesFromBuffer(buffer, length, destination); + if (result != UDS_SUCCESS) { + FREE(destination); + } else { + *destinationPtr = destination; + } + return result; +} + 
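/*
 * Editor's illustration (not part of the original buffer.c): appending to
 * a buffer that is also being drained.  putBytes() calls
 * ensureAvailableSpace(), which compacts the buffer (reclaiming bytes the
 * reader has already consumed past the start pointer) before deciding
 * whether the new data fits, so a writer never needs to call
 * compactBuffer() directly.
 */
static int __attribute__((unused)) appendChunkSketch(Buffer     *buffer,
                                                     const byte *chunk,
                                                     size_t      length)
{
  int result = putBytes(buffer, length, chunk);  /* may compact internally */
  if (result != UDS_SUCCESS) {
    /* Even after compaction there was not enough room for 'length' bytes. */
    return result;
  }
  /* contentLength(buffer) has grown by 'length'; uncompactedAmount(buffer)
   * drops to zero whenever a compaction happened. */
  return UDS_SUCCESS;
}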
+/**********************************************************************/ +int putBytes(Buffer *buffer, size_t length, const void *source) +{ + if (!ensureAvailableSpace(buffer, length)) { + return UDS_BUFFER_ERROR; + } + memcpy(buffer->data + buffer->end, source, length); + buffer->end += length; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putBuffer(Buffer *target, Buffer *source, size_t length) +{ + if (contentLength(source) < length) { + return UDS_BUFFER_ERROR; + } + + int result = putBytes(target, length, getBufferContents(source)); + if (result != UDS_SUCCESS) { + return result; + } + + source->start += length; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int zeroBytes(Buffer *buffer, size_t length) +{ + if (!ensureAvailableSpace(buffer, length)) { + return UDS_BUFFER_ERROR; + } + memset(buffer->data + buffer->end, 0, length); + buffer->end += length; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getBoolean(Buffer *buffer, bool *b) +{ + byte by; + int result = getByte(buffer, &by); + if (result == UDS_SUCCESS) { + *b = (by == 1); + } + return result; +} + +/**********************************************************************/ +int putBoolean(Buffer *buffer, bool b) +{ + return putByte(buffer, (byte) (b ? 1 : 0)); +} + +/**********************************************************************/ +int getUInt16BEFromBuffer(Buffer *buffer, uint16_t *ui) +{ + if (contentLength(buffer) < sizeof(uint16_t)) { + return UDS_BUFFER_ERROR; + } + + decodeUInt16BE(buffer->data, &buffer->start, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt16BEIntoBuffer(Buffer *buffer, uint16_t ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint16_t))) { + return UDS_BUFFER_ERROR; + } + + encodeUInt16BE(buffer->data, &buffer->end, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt32BEFromBuffer(Buffer *buffer, uint32_t *ui) +{ + if (contentLength(buffer) < sizeof(uint32_t)) { + return UDS_BUFFER_ERROR; + } + + decodeUInt32BE(buffer->data, &buffer->start, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt32BEIntoBuffer(Buffer *buffer, uint32_t ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint32_t))) { + return UDS_BUFFER_ERROR; + } + + encodeUInt32BE(buffer->data, &buffer->end, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt32BEsFromBuffer(Buffer *buffer, size_t count, uint32_t *ui) +{ + if (contentLength(buffer) < (sizeof(uint32_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + decodeUInt32BE(buffer->data, &buffer->start, ui + i); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt32BEsIntoBuffer(Buffer *buffer, size_t count, const uint32_t *ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint32_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + encodeUInt32BE(buffer->data, &buffer->end, ui[i]); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt64BEsFromBuffer(Buffer *buffer, size_t count, uint64_t *ui) +{ + if 
(contentLength(buffer) < (sizeof(uint64_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + decodeUInt64BE(buffer->data, &buffer->start, ui + i); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt64BEsIntoBuffer(Buffer *buffer, size_t count, const uint64_t *ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint64_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + encodeUInt64BE(buffer->data, &buffer->end, ui[i]); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt16LEFromBuffer(Buffer *buffer, uint16_t *ui) +{ + if (contentLength(buffer) < sizeof(uint16_t)) { + return UDS_BUFFER_ERROR; + } + + decodeUInt16LE(buffer->data, &buffer->start, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt16LEIntoBuffer(Buffer *buffer, uint16_t ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint16_t))) { + return UDS_BUFFER_ERROR; + } + + encodeUInt16LE(buffer->data, &buffer->end, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt16LEsFromBuffer(Buffer *buffer, size_t count, uint16_t *ui) +{ + if (contentLength(buffer) < (sizeof(uint16_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + decodeUInt16LE(buffer->data, &buffer->start, ui + i); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt16LEsIntoBuffer(Buffer *buffer, size_t count, const uint16_t *ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint16_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + encodeUInt16LE(buffer->data, &buffer->end, ui[i]); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getInt32LEFromBuffer(Buffer *buffer, int32_t *i) +{ + if (contentLength(buffer) < sizeof(int32_t)) { + return UDS_BUFFER_ERROR; + } + + decodeInt32LE(buffer->data, &buffer->start, i); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt32LEFromBuffer(Buffer *buffer, uint32_t *ui) +{ + if (contentLength(buffer) < sizeof(uint32_t)) { + return UDS_BUFFER_ERROR; + } + + decodeUInt32LE(buffer->data, &buffer->start, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt32LEIntoBuffer(Buffer *buffer, uint32_t ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint32_t))) { + return UDS_BUFFER_ERROR; + } + + encodeUInt32LE(buffer->data, &buffer->end, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putInt64LEIntoBuffer(Buffer *buffer, int64_t i) +{ + if (!ensureAvailableSpace(buffer, sizeof(int64_t))) { + return UDS_BUFFER_ERROR; + } + + encodeInt64LE(buffer->data, &buffer->end, i); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt64LEFromBuffer(Buffer *buffer, uint64_t *ui) +{ + if (contentLength(buffer) < sizeof(uint64_t)) { + return UDS_BUFFER_ERROR; + } + + decodeUInt64LE(buffer->data, &buffer->start, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int 
putUInt64LEIntoBuffer(Buffer *buffer, uint64_t ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint64_t))) { + return UDS_BUFFER_ERROR; + } + + encodeUInt64LE(buffer->data, &buffer->end, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt64LEsFromBuffer(Buffer *buffer, size_t count, uint64_t *ui) +{ + if (contentLength(buffer) < (sizeof(uint64_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + decodeUInt64LE(buffer->data, &buffer->start, ui + i); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt64LEsIntoBuffer(Buffer *buffer, size_t count, const uint64_t *ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint64_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + encodeUInt64LE(buffer->data, &buffer->end, ui[i]); + } + return UDS_SUCCESS; +} + diff --git a/uds/buffer.h b/uds/buffer.h new file mode 100644 index 0000000..22df042 --- /dev/null +++ b/uds/buffer.h @@ -0,0 +1,614 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/buffer.h#2 $ + */ + +#ifndef BUFFER_H +#define BUFFER_H + +#include "common.h" + +typedef struct buffer Buffer; + +/** + * Create a buffer which wraps an existing byte array. + * + * @param bytes The bytes to wrap + * @param length The length of the buffer + * @param contentLength The length of the current contents of the buffer + * @param bufferPtr A pointer to hold the buffer + * + * @return UDS_SUCCESS or an error code + **/ +int wrapBuffer(byte *bytes, + size_t length, + size_t contentLength, + Buffer **bufferPtr) + __attribute__((warn_unused_result)); + +/** + * Create a new buffer and allocate its memory. + * + * @param length The length of the buffer + * @param bufferPtr A pointer to hold the buffer + * + * @return UDS_SUCCESS or an error code + **/ +int makeBuffer(size_t length, Buffer **bufferPtr) + __attribute__((warn_unused_result)); + +/** + * Release a buffer and, if not wrapped, free its memory. + * + * @param pBuffer Pointer to the buffer to release + **/ +void freeBuffer(Buffer **pBuffer); + +/** + * Grow a non-wrapped buffer. + * + * @param buffer The buffer to resize + * @param length The new length of the buffer + * + * @return UDS_SUCCESS or an error code + **/ +int growBuffer(Buffer *buffer, size_t length) + __attribute__((warn_unused_result)); + +/** + * Ensure that a buffer has a given amount of space available, compacting the + * buffer if necessary. 
+ * + * @param buffer The buffer + * @param bytes The number of available bytes desired + * + * @return true if the requested number of bytes are now available + **/ +bool ensureAvailableSpace(Buffer *buffer, size_t bytes) + __attribute__((warn_unused_result)); + +/** + * Clear the buffer. The start position is set to zero and the end position + * is set to the buffer length. + **/ +void clearBuffer(Buffer *buffer); + +/** + * Eliminate buffer contents which have been extracted. This function copies + * any data between the start and end pointers to the beginning of the buffer, + * moves the start pointer to the beginning, and the end pointer to the end + * of the copied data. + * + * @param buffer The buffer to compact + **/ +void compactBuffer(Buffer *buffer); + +/** + * Skip forward the specified number of bytes in a buffer (advance the + * start pointer). + * + * @param buffer The buffer + * @param bytesToSkip The number of bytes to skip + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer is not long + * enough to skip forward the requested number of bytes + **/ +int skipForward(Buffer *buffer, size_t bytesToSkip) + __attribute__((warn_unused_result)); + +/** + * Rewind the specified number of bytes in a buffer (back up the start + * pointer). + * + * @param buffer The buffer + * @param bytesToRewind The number of bytes to rewind + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer is not long + * enough to rewind backward the requested number of bytes + **/ +int rewindBuffer(Buffer *buffer, size_t bytesToRewind) + __attribute__((warn_unused_result)); + +/** + * Return the length of the buffer. + * + * @param buffer the buffer + * + * @return the buffer length + **/ +size_t bufferLength(Buffer *buffer); + +/** + * Compute the amount of data current in the buffer. + * + * @param buffer The buffer to examine + * + * @return The number of bytes between the start and end pointers of the buffer + **/ +size_t contentLength(Buffer *buffer); + +/** + * Compute the amount of available space in this buffer. + * + * @param buffer The buffer to examine + * + * @return The number of bytes between the end pointer and the end of the buffer + **/ +size_t availableSpace(Buffer *buffer); + +/** + * Amount of buffer that has already been processed. + * + * @param buffer the buffer to examine + * + * @return The number of bytes between the beginning of the buffer and the + * start pointer. + **/ +size_t uncompactedAmount(Buffer *buffer); + +/** + * Return the amount of the buffer that is currently utilized. + * + * @param buffer the buffer to examine + * + * @return The number of bytes between the beginning of the buffer and + * the end pointer. + **/ +size_t bufferUsed(Buffer *buffer); + +/** + * Reset the end of buffer to a different position. + * + * @param buffer the buffer + * @param end the new end of the buffer + * + * @return UDS_SUCCESS unless the end is larger than can fit + **/ +int resetBufferEnd(Buffer *buffer, size_t end) + __attribute__((warn_unused_result)); + +/** + * Check whether the start of the content of a buffer matches a specified + * array of bytes. + * + * @param buffer The buffer to check + * @param data The desired data + * @param length The length of the desired data + * + * @return true if the first length bytes of the buffer's + * contents match data + **/ +bool hasSameBytes(Buffer *buffer, const byte *data, size_t length) + __attribute__((warn_unused_result)); + +/** + * Check whether two buffers have the same contents. 
+ * + * @param buffer1 The first buffer + * @param buffer2 The second buffer + * + * @return true if the contents of the two buffers are the + * same + **/ +bool equalBuffers(Buffer *buffer1, Buffer *buffer2); + +/** + * Get a single byte from a buffer and advance the start pointer. + * + * @param buffer The buffer + * @param bytePtr A pointer to hold the byte + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are no bytes to + * retrieve + **/ +int getByte(Buffer *buffer, byte *bytePtr) __attribute__((warn_unused_result)); + +/** + * Get a single byte from a buffer without advancing the start pointer. + * + * @param buffer The buffer + * @param offset The offset past the start pointer of the desired byte + * @param bytePtr A pointer to hold the byte + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the offset is past the end + * of the buffer + **/ +int peekByte(Buffer *buffer, size_t offset, byte *bytePtr) + __attribute__((warn_unused_result)); + +/** + * Put a single byte into a buffer and advance the end pointer. + * + * @param buffer The buffer + * @param b The byte to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is no space in the buffer + **/ +int putByte(Buffer *buffer, byte b) __attribute__((warn_unused_result)); + +/** + * Get bytes out of a buffer and advance the start of the buffer past the + * copied data. + * + * @param buffer The buffer from which to copy + * @param length The number of bytes to copy + * @param destination A pointer to hold the data + * + * @return UDS_SUCCESS or an error code + **/ +int getBytesFromBuffer(Buffer *buffer, size_t length, void *destination) + __attribute__((warn_unused_result)); + +/** + * Get a pointer to the current contents of the buffer. This will be a pointer + * to the actual memory managed by the buffer. It is the caller's responsibility + * to ensure that the buffer is not modified while this pointer is in use. + * + * @param buffer The buffer from which to get the contents + * + * @return a pointer to the current contents of the buffer + **/ +byte *getBufferContents(Buffer *buffer); + +/** + * Copy bytes out of a buffer and advance the start of the buffer past the + * copied data. Memory will be allocated to hold the copy. + * + * @param buffer The buffer from which to copy + * @param length The number of bytes to copy + * @param destinationPtr A pointer to hold the copied data + * + * @return UDS_SUCCESS or an error code + **/ +int copyBytes(Buffer *buffer, size_t length, byte **destinationPtr) + __attribute__((warn_unused_result)); + +/** + * Copy bytes into a buffer and advance the end of the buffer past the + * copied data. + * + * @param buffer The buffer to copy into + * @param length The length of the data to copy + * @param source The data to copy + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer does not have + * length bytes available + **/ +int putBytes(Buffer *buffer, size_t length, const void *source) + __attribute__((warn_unused_result)); + +/** + * Copy the contents of a source buffer into the target buffer. Advances the + * start of the source buffer and the end of the target buffer past the copied + * data. 
+ * + * @param target The buffer to receive the copy of the data + * @param source The buffer containing the data to copy + * @param length The length of the data to copy + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the target buffer does not have + * length bytes available or if the source buffer does not have length + * bytes of content + **/ +int putBuffer(Buffer *target, Buffer *source, size_t length) + __attribute__((warn_unused_result)); + +/** + * Zero bytes in a buffer starting at the start pointer, and advance the + * end of the buffer past the zeros. + * + * @param buffer The buffer to zero + * @param length The number of bytes to zero + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer does not have + * length bytes available + **/ +int zeroBytes(Buffer *buffer, size_t length) + __attribute__((warn_unused_result)); + +/** + * Get a boolean value from a buffer and advance the start pointer. + * + * @param buffer The buffer + * @param b A pointer to hold the boolean value + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data + * in the buffer + **/ +int getBoolean(Buffer *buffer, bool *b) __attribute__((warn_unused_result)); + +/** + * Put a boolean value into a buffer and advance the end pointer. + * + * @param buffer The buffer + * @param b The boolean to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is no space in the buffer + **/ +int putBoolean(Buffer *buffer, bool b) __attribute__((warn_unused_result)); + +/** + * Get a 2 byte, big endian encoded integer from a buffer and advance the + * start pointer past it. + * + * @param buffer The buffer + * @param ui A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 + * bytes available + **/ +int getUInt16BEFromBuffer(Buffer *buffer, uint16_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a 2 byte, big endian encoded integer into a buffer and advance the + * end pointer past it. + * + * @param buffer The buffer + * @param ui The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 + * bytes available + **/ +int putUInt16BEIntoBuffer(Buffer *buffer, uint16_t ui) + __attribute__((warn_unused_result)); + +/** + * Get a 4 byte, big endian encoded integer from a buffer and advance the + * start pointer past it. + * + * @param buffer The buffer + * @param ui A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 + * bytes available + **/ +int getUInt32BEFromBuffer(Buffer *buffer, uint32_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a 4 byte, big endian encoded integer into a buffer and advance the + * end pointer past it. + * + * @param buffer The buffer + * @param ui The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 + * bytes available + **/ +int putUInt32BEIntoBuffer(Buffer *buffer, uint32_t ui) + __attribute__((warn_unused_result)); + +/** + * Get a series of 4 byte, big endian encoded integer from a buffer and + * advance the start pointer past them. 
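+ * The decoded values are stored in host byte order.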
+ * + * @param buffer The buffer + * @param count The number of integers to get + * @param ui A pointer to hold the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data + * in the buffer + **/ +int getUInt32BEsFromBuffer(Buffer *buffer, size_t count, uint32_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a series of 4 byte, big endian encoded integers into a buffer and + * advance the end pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to put + * @param ui A pointer to the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space + * in the buffer + **/ +int putUInt32BEsIntoBuffer(Buffer *buffer, size_t count, const uint32_t *ui) + __attribute__((warn_unused_result)); + +/** + * Get a series of 8 byte, big endian encoded integer from a buffer and + * advance the start pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to get + * @param ui A pointer to hold the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data + * in the buffer + **/ +int getUInt64BEsFromBuffer(Buffer *buffer, size_t count, uint64_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a series of 8 byte, big endian encoded integers into a buffer and + * advance the end pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to put + * @param ui A pointer to the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space + * in the buffer + **/ +int putUInt64BEsIntoBuffer(Buffer *buffer, size_t count, const uint64_t *ui) + __attribute__((warn_unused_result)); + +/** + * Get a 2 byte, little endian encoded integer from a buffer and + * advance the start pointer past it. + * + * @param buffer The buffer + * @param ui A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 + * bytes available + **/ +int getUInt16LEFromBuffer(Buffer *buffer, uint16_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a 2 byte, little endian encoded integer into a buffer and advance the + * end pointer past it. + * + * @param buffer The buffer + * @param ui The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 + * bytes available + **/ +int putUInt16LEIntoBuffer(Buffer *buffer, uint16_t ui) + __attribute__((warn_unused_result)); + +/** + * Get a series of 2 byte, little endian encoded integer from a buffer + * and advance the start pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to get + * @param ui A pointer to hold the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data + * in the buffer + **/ +int getUInt16LEsFromBuffer(Buffer *buffer, size_t count, uint16_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a series of 2 byte, little endian encoded integers into a + * buffer and advance the end pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to put + * @param ui A pointer to the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space + * in the buffer + **/ +int putUInt16LEsIntoBuffer(Buffer *buffer, size_t count, const uint16_t *ui) + __attribute__((warn_unused_result)); + +/** + * Get a 4 byte, little endian encoded integer from a buffer and advance the + * start pointer past it. 
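+ * This is the signed counterpart of getUInt32LEFromBuffer().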
+ * + * @param buffer The buffer + * @param i A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 + * bytes available + **/ +int getInt32LEFromBuffer(Buffer *buffer, int32_t *i) + __attribute__((warn_unused_result)); + +/** + * Get a 4 byte, little endian encoded integer from a buffer and advance the + * start pointer past it. + * + * @param buffer The buffer + * @param ui A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 + * bytes available + **/ +int getUInt32LEFromBuffer(Buffer *buffer, uint32_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a 4 byte, little endian encoded integer into a buffer and advance the + * end pointer past it. + * + * @param buffer The buffer + * @param ui The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 + * bytes available + **/ +int putUInt32LEIntoBuffer(Buffer *buffer, uint32_t ui) + __attribute__((warn_unused_result)); + +/** + * Get an 8 byte, little endian encoded, unsigned integer from a + * buffer and advance the start pointer past it. + * + * @param buffer The buffer + * @param ui A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 8 + * bytes available + **/ +int getUInt64LEFromBuffer(Buffer *buffer, uint64_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put an 8 byte, little endian encoded signed integer into a buffer + * and advance the end pointer past it. + * + * @param buffer The buffer + * @param i The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 8 + * bytes available + **/ +int putInt64LEIntoBuffer(Buffer *buffer, int64_t i) + __attribute__((warn_unused_result)); + + /** + * Put an 8 byte, little endian encoded integer into a buffer and advance the + * end pointer past it. + * + * @param buffer The buffer + * @param ui The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 8 + * bytes available + **/ +int putUInt64LEIntoBuffer(Buffer *buffer, uint64_t ui) + __attribute__((warn_unused_result)); + +/** + * Get a series of 8 byte, little endian encoded integer from a buffer + * and advance the start pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to get + * @param ui A pointer to hold the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data + * in the buffer + **/ +int getUInt64LEsFromBuffer(Buffer *buffer, size_t count, uint64_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a series of 8 byte, little endian encoded integers into a buffer and + * advance the end pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to put + * @param ui A pointer to the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space + * in the buffer + **/ +int putUInt64LEsIntoBuffer(Buffer *buffer, size_t count, const uint64_t *ui) + __attribute__((warn_unused_result)); + +#endif /* BUFFER_H */ diff --git a/uds/bufferPrivate.h b/uds/bufferPrivate.h new file mode 100644 index 0000000..8a0f46a --- /dev/null +++ b/uds/bufferPrivate.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/bufferPrivate.h#1 $ + */ + +#ifndef BUFFER_PRIVATE_H +#define BUFFER_PRIVATE_H + +#include "common.h" + +struct buffer { + size_t start; + size_t end; + size_t length; + byte *data; + bool wrapped; +}; + +#endif /* BUFFER_PRIVATE_H */ diff --git a/uds/bufferedReader.c b/uds/bufferedReader.c new file mode 100644 index 0000000..b67d33d --- /dev/null +++ b/uds/bufferedReader.c @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/bufferedReader.c#5 $ + */ + +#include "bufferedReader.h" + +#include "compiler.h" +#include "ioFactory.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" + +#ifndef __KERNEL__ +/* + * Define sector_t. The kernel really wants us to use it. The code becomes + * ugly if we need to #ifdef every usage of sector_t. Note that the use of #define + * means that even if a user mode include typedefs sector_t, it will not affect + * this module. 
+ */ +#define sector_t uint64_t +#endif + +struct bufferedReader { +#ifdef __KERNEL__ + // IOFactory owning the block device + IOFactory *br_factory; + // The dm_bufio_client to read from + struct dm_bufio_client *br_client; + // The current dm_buffer + struct dm_buffer *br_buffer; + // The number of blocks that can be read from + sector_t br_limit; + // Number of the current block + sector_t br_blockNumber; +#else + // Region to read from + IORegion *br_region; + // Number of the current block + uint64_t br_blockNumber; +#endif + // Start of the buffer + byte *br_start; + // End of the data read from the buffer + byte *br_pointer; +}; + +#ifdef __KERNEL__ +/*****************************************************************************/ +static void readAhead(BufferedReader *br, sector_t blockNumber) +{ + if (blockNumber < br->br_limit) { + enum { MAX_READ_AHEAD = 4 }; + size_t readAhead = minSizeT(MAX_READ_AHEAD, br->br_limit - blockNumber); + dm_bufio_prefetch(br->br_client, blockNumber, readAhead); + } +} +#endif + +/*****************************************************************************/ +#ifdef __KERNEL__ +int makeBufferedReader(IOFactory *factory, + struct dm_bufio_client *client, + sector_t blockLimit, + BufferedReader **readerPtr) +{ + BufferedReader *reader = NULL; + int result = ALLOCATE(1, BufferedReader, "buffered reader", &reader); + if (result != UDS_SUCCESS) { + return result; + } + + *reader = (BufferedReader) { + .br_factory = factory, + .br_client = client, + .br_buffer = NULL, + .br_limit = blockLimit, + .br_blockNumber = 0, + .br_start = NULL, + .br_pointer = NULL, + }; + + readAhead(reader,0); + getIOFactory(factory); + *readerPtr = reader; + return UDS_SUCCESS; +} +#else +int makeBufferedReader(IORegion *region, BufferedReader **readerPtr) +{ + byte *data; + int result = ALLOCATE_IO_ALIGNED(UDS_BLOCK_SIZE, byte, + "buffer writer buffer", &data); + if (result != UDS_SUCCESS) { + return result; + } + + BufferedReader *reader = NULL; + result = ALLOCATE(1, BufferedReader, "buffered reader", &reader); + if (result != UDS_SUCCESS) { + FREE(data); + return result; + } + + *reader = (BufferedReader) { + .br_region = region, + .br_blockNumber = 0, + .br_start = data, + .br_pointer = NULL, + }; + + getIORegion(region); + *readerPtr = reader; + return UDS_SUCCESS; +} +#endif + +/*****************************************************************************/ +void freeBufferedReader(BufferedReader *br) +{ + if (br == NULL) { + return; + } +#ifdef __KERNEL__ + if (br->br_buffer != NULL) { + dm_bufio_release(br->br_buffer); + } + dm_bufio_client_destroy(br->br_client); + putIOFactory(br->br_factory); +#else + putIORegion(br->br_region); + FREE(br->br_start); +#endif + FREE(br); +} + +/*****************************************************************************/ +static int positionReader(BufferedReader *br, + sector_t blockNumber, + off_t offset) +{ + if ((br->br_pointer == NULL) || (blockNumber != br->br_blockNumber)) { +#ifdef __KERNEL__ + if (blockNumber >= br->br_limit) { + return UDS_OUT_OF_RANGE; + } + if (br->br_buffer != NULL) { + dm_bufio_release(br->br_buffer); + br->br_buffer = NULL; + } + struct dm_buffer *buffer = NULL; + void *data = dm_bufio_read(br->br_client, blockNumber, &buffer); + if (IS_ERR(data)) { + return -PTR_ERR(data); + } + br->br_buffer = buffer; + br->br_start = data; + if (blockNumber == br->br_blockNumber + 1) { + readAhead(br, blockNumber + 1); + } +#else + int result = readFromRegion(br->br_region, blockNumber * UDS_BLOCK_SIZE, + br->br_start, 
UDS_BLOCK_SIZE, NULL); + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "%s got readFromRegion error", + __func__); + return result; + } +#endif + } + br->br_blockNumber = blockNumber; + br->br_pointer = br->br_start + offset; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static size_t bytesRemainingInReadBuffer(BufferedReader *br) +{ + return (br->br_pointer == NULL + ? 0 + : br->br_start + UDS_BLOCK_SIZE - br->br_pointer); +} + +/*****************************************************************************/ +int readFromBufferedReader(BufferedReader *br, void *data, size_t length) +{ + byte *dp = data; + int result = UDS_SUCCESS; + while (length > 0) { + if (bytesRemainingInReadBuffer(br) == 0) { + sector_t blockNumber = br->br_blockNumber; + if (br->br_pointer != NULL) { + ++blockNumber; + } + result = positionReader(br, blockNumber, 0); + if (result != UDS_SUCCESS) { + break; + } + } + + size_t avail = bytesRemainingInReadBuffer(br); + size_t chunk = minSizeT(length, avail); + memcpy(dp, br->br_pointer, chunk); + length -= chunk; + dp += chunk; + br->br_pointer += chunk; + } + + if (((result == UDS_OUT_OF_RANGE) || (result == UDS_END_OF_FILE)) + && (dp - (byte *) data > 0)) { + result = UDS_SHORT_READ; + } + return result; +} + +/*****************************************************************************/ +int verifyBufferedData(BufferedReader *br, + const void *value, + size_t length) +{ + const byte *vp = value; + sector_t startingBlockNumber = br->br_blockNumber; + int startingOffset = br->br_pointer - br->br_start; + while (length > 0) { + if (bytesRemainingInReadBuffer(br) == 0) { + sector_t blockNumber = br->br_blockNumber; + if (br->br_pointer != NULL) { + ++blockNumber; + } + int result = positionReader(br, blockNumber, 0); + if (result != UDS_SUCCESS) { + positionReader(br, startingBlockNumber, startingOffset); + return UDS_CORRUPT_FILE; + } + } + + size_t avail = bytesRemainingInReadBuffer(br); + size_t chunk = minSizeT(length, avail); + if (memcmp(vp, br->br_pointer, chunk) != 0) { + positionReader(br, startingBlockNumber, startingOffset); + return UDS_CORRUPT_FILE; + } + length -= chunk; + vp += chunk; + br->br_pointer += chunk; + } + + return UDS_SUCCESS; +} diff --git a/uds/bufferedReader.h b/uds/bufferedReader.h new file mode 100644 index 0000000..4da8119 --- /dev/null +++ b/uds/bufferedReader.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/bufferedReader.h#3 $ + */ + +#ifndef BUFFERED_READER_H +#define BUFFERED_READER_H 1 + +#include "common.h" + +#ifdef __KERNEL__ +struct dm_bufio_client; +struct ioFactory; +#else +struct ioRegion; +#endif + +/** + * The buffered reader allows efficient IO for IORegions, which may be + * file- or block-based. The internal buffer always reads aligned data + * from the underlying region. + **/ +typedef struct bufferedReader BufferedReader; + +#ifdef __KERNEL__ +/** + * Make a new buffered reader. + * + * @param factory The IOFactory creating the buffered reader. + * @param client The dm_bufio_client to read from. + * @param blockLimit The number of blocks that may be read. + * @param readerPtr The pointer to hold the newly allocated buffered reader + * + * @return UDS_SUCCESS or error code. + **/ +int makeBufferedReader(struct ioFactory *factory, + struct dm_bufio_client *client, + sector_t blockLimit, + BufferedReader **readerPtr) + __attribute__((warn_unused_result)); +#else +/** + * Make a new buffered reader. + * + * @param region An IORegion to read from. + * @param readerPtr The pointer to hold the newly allocated buffered reader. + * + * @return UDS_SUCCESS or error code. + **/ +int makeBufferedReader(struct ioRegion *region, BufferedReader **readerPtr) + __attribute__((warn_unused_result)); +#endif + +/** + * Free a buffered reader. + * + * @param reader The buffered reader + **/ +void freeBufferedReader(BufferedReader *reader); + +/** + * Retrieve data from a buffered reader, reading from the region when needed. + * + * @param reader The buffered reader + * @param data The buffer to read data into + * @param length The length of the data to read + * + * @return UDS_SUCCESS or an error code. + **/ +int readFromBufferedReader(BufferedReader *reader, void *data, size_t length) + __attribute__((warn_unused_result)); + +/** + * Verify that the data currently in the buffer matches the required value. + * + * @param reader The buffered reader. + * @param value The value that must match the buffer contents. + * @param length The length of the value that must match. + * + * @return UDS_SUCCESS or an error code, specifically UDS_CORRUPT_FILE + * if the required value fails to match. + * + * @note If the value matches, the matching contents are consumed. However, + * if the match fails, any buffer contents are left as is. + **/ +int verifyBufferedData(BufferedReader *reader, + const void *value, + size_t length) + __attribute__((warn_unused_result)); + +#endif // BUFFERED_READER_H diff --git a/uds/bufferedWriter.c b/uds/bufferedWriter.c new file mode 100644 index 0000000..abfb9cf --- /dev/null +++ b/uds/bufferedWriter.c @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/bufferedWriter.c#6 $ + */ + +#include "bufferedWriter.h" + +#include "compiler.h" +#include "errors.h" +#include "ioFactory.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" + + +struct bufferedWriter { +#ifdef __KERNEL__ + // IOFactory owning the block device + IOFactory *bw_factory; + // The dm_bufio_client to write to + struct dm_bufio_client *bw_client; + // The current dm_buffer + struct dm_buffer *bw_buffer; + // The number of blocks that can be written to + sector_t bw_limit; + // Number of the current block + sector_t bw_blockNumber; +#else + // Region to write to + IORegion *bw_region; + // Number of the current block + uint64_t bw_blockNumber; +#endif + // Start of the buffer + byte *bw_start; + // End of the data written to the buffer + byte *bw_pointer; + // Error code + int bw_error; + // Have writes been done? + bool bw_used; +}; + +#ifdef __KERNEL__ +/*****************************************************************************/ +__attribute__((warn_unused_result)) +int prepareNextBuffer(BufferedWriter *bw) +{ + if (bw->bw_blockNumber >= bw->bw_limit) { + bw->bw_error = UDS_OUT_OF_RANGE; + return UDS_OUT_OF_RANGE; + } + + struct dm_buffer *buffer = NULL; + void *data = dm_bufio_new(bw->bw_client, bw->bw_blockNumber, &buffer); + if (IS_ERR(data)) { + bw->bw_error = -PTR_ERR(data); + return bw->bw_error; + } + bw->bw_buffer = buffer; + bw->bw_start = data; + bw->bw_pointer = data; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int flushPreviousBuffer(BufferedWriter *bw) +{ + if (bw->bw_buffer != NULL) { + if (bw->bw_error == UDS_SUCCESS) { + size_t avail = spaceRemainingInWriteBuffer(bw); + if (avail > 0) { + memset(bw->bw_pointer, 0, avail); + } + dm_bufio_mark_buffer_dirty(bw->bw_buffer); + } + dm_bufio_release(bw->bw_buffer); + bw->bw_buffer = NULL; + bw->bw_start = NULL; + bw->bw_pointer = NULL; + bw->bw_blockNumber++; + } + return bw->bw_error; +} +#endif + +/*****************************************************************************/ +#ifdef __KERNEL__ +int makeBufferedWriter(IOFactory *factory, + struct dm_bufio_client *client, + sector_t blockLimit, + BufferedWriter **writerPtr) +{ + BufferedWriter *writer; + int result = ALLOCATE(1, BufferedWriter, "buffered writer", &writer); + if (result != UDS_SUCCESS) { + return result; + } + + *writer = (BufferedWriter) { + .bw_factory = factory, + .bw_client = client, + .bw_buffer = NULL, + .bw_limit = blockLimit, + .bw_start = NULL, + .bw_pointer = NULL, + .bw_blockNumber = 0, + .bw_error = UDS_SUCCESS, + .bw_used = false, + }; + + getIOFactory(factory); + *writerPtr = writer; + return UDS_SUCCESS; +} +#else +int makeBufferedWriter(IORegion *region, BufferedWriter **writerPtr) +{ + byte *data; + int result = ALLOCATE_IO_ALIGNED(UDS_BLOCK_SIZE, byte, + "buffer writer buffer", &data); + if (result != UDS_SUCCESS) { + return result; + } + + BufferedWriter *writer; + result = ALLOCATE(1, BufferedWriter, "buffered writer", &writer); + if (result != UDS_SUCCESS) { + FREE(data); + return result; + } + + *writer = (BufferedWriter) { + .bw_region = region, + .bw_start = data, + .bw_pointer = data, + .bw_blockNumber = 0, + .bw_error = UDS_SUCCESS, + .bw_used = false, + }; + + getIORegion(region); + *writerPtr = writer; + return UDS_SUCCESS; +} +#endif + +/*****************************************************************************/ +void freeBufferedWriter(BufferedWriter *bw) +{ + if (bw == NULL) { + 
return; + } +#ifdef __KERNEL__ + flushPreviousBuffer(bw); + int result = -dm_bufio_write_dirty_buffers(bw->bw_client); +#else + int result = syncRegionContents(bw->bw_region); +#endif + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "%s cannot sync storage", __func__); + } +#ifdef __KERNEL__ + dm_bufio_client_destroy(bw->bw_client); + putIOFactory(bw->bw_factory); +#else + putIORegion(bw->bw_region); + FREE(bw->bw_start); +#endif + FREE(bw); +} + +/*****************************************************************************/ +static INLINE size_t spaceUsedInBuffer(BufferedWriter *bw) +{ + return bw->bw_pointer - bw->bw_start; +} + +/*****************************************************************************/ +size_t spaceRemainingInWriteBuffer(BufferedWriter *bw) +{ + return UDS_BLOCK_SIZE - spaceUsedInBuffer(bw); +} + +/*****************************************************************************/ +int writeToBufferedWriter(BufferedWriter *bw, const void *data, size_t len) +{ + if (bw->bw_error != UDS_SUCCESS) { + return bw->bw_error; + } + + const byte *dp = data; + int result = UDS_SUCCESS; + while ((len > 0) && (result == UDS_SUCCESS)) { +#ifdef __KERNEL__ + if (bw->bw_buffer == NULL) { + result = prepareNextBuffer(bw); + continue; + } +#endif + + size_t avail = spaceRemainingInWriteBuffer(bw); + size_t chunk = minSizeT(len, avail); + memcpy(bw->bw_pointer, dp, chunk); + len -= chunk; + dp += chunk; + bw->bw_pointer += chunk; + + if (spaceRemainingInWriteBuffer(bw) == 0) { + result = flushBufferedWriter(bw); + } + } + + bw->bw_used = true; + return result; +} + +/*****************************************************************************/ +int writeZerosToBufferedWriter(BufferedWriter *bw, size_t len) +{ + if (bw->bw_error != UDS_SUCCESS) { + return bw->bw_error; + } + + int result = UDS_SUCCESS; + while ((len > 0) && (result == UDS_SUCCESS)) { +#ifdef __KERNEL__ + if (bw->bw_buffer == NULL) { + result = prepareNextBuffer(bw); + continue; + } +#endif + + size_t avail = spaceRemainingInWriteBuffer(bw); + size_t chunk = minSizeT(len, avail); + memset(bw->bw_pointer, 0, chunk); + len -= chunk; + bw->bw_pointer += chunk; + + if (spaceRemainingInWriteBuffer(bw) == 0) { + result = flushBufferedWriter(bw); + } + } + + bw->bw_used = true; + return result; +} + +/*****************************************************************************/ +int flushBufferedWriter(BufferedWriter *bw) +{ + if (bw->bw_error != UDS_SUCCESS) { + return bw->bw_error; + } + +#ifdef __KERNEL__ + return flushPreviousBuffer(bw); +#else + size_t n = spaceUsedInBuffer(bw); + if (n > 0) { + int result = writeToRegion(bw->bw_region, + bw->bw_blockNumber * UDS_BLOCK_SIZE, + bw->bw_start, UDS_BLOCK_SIZE, n); + if (result != UDS_SUCCESS) { + return bw->bw_error = result; + } else { + bw->bw_pointer = bw->bw_start; + bw->bw_blockNumber++; + } + } + return UDS_SUCCESS; +#endif +} + +/*****************************************************************************/ +bool wasBufferedWriterUsed(const BufferedWriter *bw) +{ + return bw->bw_used; +} + +/*****************************************************************************/ +void noteBufferedWriterUsed(BufferedWriter *bw) +{ + bw->bw_used = true; +} diff --git a/uds/bufferedWriter.h b/uds/bufferedWriter.h new file mode 100644 index 0000000..8774b5b --- /dev/null +++ b/uds/bufferedWriter.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/bufferedWriter.h#5 $ + */ + +#ifndef BUFFERED_WRITER_H +#define BUFFERED_WRITER_H 1 + +#include "common.h" + +#ifdef __KERNEL__ +struct dm_bufio_client; +struct ioFactory; +#else +struct ioRegion; +#endif + +typedef struct bufferedWriter BufferedWriter; + +#ifdef __KERNEL__ +/** + * Make a new buffered writer. + * + * @param factory The IOFactory creating the buffered writer + * @param client The dm_bufio_client to write to. + * @param blockLimit The number of blocks that may be written to. + * @param writerPtr The new buffered writer goes here. + * + * @return UDS_SUCCESS or an error code. + **/ +int makeBufferedWriter(struct ioFactory *factory, + struct dm_bufio_client *client, + sector_t blockLimit, + BufferedWriter **writerPtr) + __attribute__((warn_unused_result)); +#else +/** + * Make a new buffered writer. + * + * @param region The IOregion to write to. + * @param writerPtr The new buffered writer goes here. + * + * @return UDS_SUCCESS or an error code. + **/ +int makeBufferedWriter(struct ioRegion *region, BufferedWriter **writerPtr) + __attribute__((warn_unused_result)); +#endif + +/** + * Free a buffered writer, without flushing. + * + * @param [in] buffer The buffered writer object. + **/ +void freeBufferedWriter(BufferedWriter *buffer); + +/** + * Append data to buffer, writing as needed. + * + * @param buffer The buffered writer object. + * @param data The data to write. + * @param len The length of the data written. + * + * @return UDS_SUCCESS or an error code. + * The error may reflect previous attempts to write + * or flush the buffer. Once a write or flush error + * occurs it is sticky. + **/ +int writeToBufferedWriter(BufferedWriter *buffer, const void *data, size_t len) + __attribute__((warn_unused_result)); + +/** + * Zero data in the buffer, writing as needed. + * + * @param buffer The buffered writer object. + * @param len The number of zero bytes to write. + * + * @return UDS_SUCCESS or an error code. + * The error may reflect previous attempts to write + * or flush the buffer. Once a write or flush error + * occurs it is sticky. + **/ +int writeZerosToBufferedWriter(BufferedWriter *bw, size_t len) + __attribute__((warn_unused_result)); + + +/** + * Flush any partial data from the buffer. + * + * @param buffer The buffered writer object. + * + * @return UDS_SUCCESS or an error code. + * The error may reflect previous attempts to write + * or flush the buffer. Once a write or flush error + * occurs it is sticky. + **/ +int flushBufferedWriter(BufferedWriter *buffer) + __attribute__((warn_unused_result)); + +/** + * Return the size of the remaining space in the buffer (for testing) + * + * @param [in] buffer The buffered writer object. + * + * @return The number of available bytes in the buffer. 
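+ * A full buffer is flushed automatically by writeToBufferedWriter() and + * writeZerosToBufferedWriter(), so this is primarily of interest to tests.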
+ **/ +size_t spaceRemainingInWriteBuffer(BufferedWriter *buffer) + __attribute__((warn_unused_result)); + +/** + * Return whether the buffer was ever written to. + * + * @param buffer The buffered writer object. + * + * @return True if at least one call to writeToBufferedWriter + * was made. + **/ +bool wasBufferedWriterUsed(const BufferedWriter *buffer) + __attribute__((warn_unused_result)); + +/** + * Note the buffer has been used. + * + * @param buffer The buffered writer object. + **/ +void noteBufferedWriterUsed(BufferedWriter *buffer); + +#endif // BUFFERED_WRITER_H diff --git a/uds/cacheCounters.c b/uds/cacheCounters.c new file mode 100644 index 0000000..8bf7ad4 --- /dev/null +++ b/uds/cacheCounters.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/cacheCounters.c#1 $ + */ + +#include "cacheCounters.h" + +#include "atomicDefs.h" +#include "compiler.h" +#include "errors.h" +#include "permassert.h" +#include "stringUtils.h" +#include "uds.h" + +/**********************************************************************/ +void incrementCacheCounter(CacheCounters *counters, + int probeType, + CacheResultKind kind) +{ + CacheProbeType basicProbeType = probeType & ~CACHE_PROBE_IGNORE_FAILURE; + int result = ASSERT(basicProbeType <= CACHE_PROBE_RECORD_RETRY, + "invalid cache probe type %#x", probeType); + if (result != UDS_SUCCESS) { + return; + } + result = ASSERT(kind <= CACHE_RESULT_QUEUED, + "invalid cache probe result type %#x", kind); + if (result != UDS_SUCCESS) { + return; + } + + if (((probeType & CACHE_PROBE_IGNORE_FAILURE) != 0) + && (kind != CACHE_RESULT_HIT)) { + return; + } + + CacheCountsByKind *kindCounts; + switch (basicProbeType) { + case CACHE_PROBE_INDEX_FIRST: + kindCounts = &counters->firstTime.indexPage; + break; + case CACHE_PROBE_RECORD_FIRST: + kindCounts = &counters->firstTime.recordPage; + break; + case CACHE_PROBE_INDEX_RETRY: + kindCounts = &counters->retried.indexPage; + break; + case CACHE_PROBE_RECORD_RETRY: + kindCounts = &counters->retried.recordPage; + break; + default: + // Never used but the compiler hasn't figured that out. + return; + } + + uint64_t *myCounter; + switch (kind) { + case CACHE_RESULT_MISS: + myCounter = &kindCounts->misses; + break; + case CACHE_RESULT_QUEUED: + myCounter = &kindCounts->queued; + break; + case CACHE_RESULT_HIT: + myCounter = &kindCounts->hits; + break; + default: + // Never used but the compiler hasn't figured that out. + return; + } + // XXX Vile case makes many assumptions. Counters should be declared atomic. 
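+ // As the XXX above notes, this cast relies on the plain uint64_t counters + // being usable as atomic64_t; only the increment itself is atomic.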
+ atomic64_inc((atomic64_t *) myCounter); +} diff --git a/uds/cacheCounters.h b/uds/cacheCounters.h new file mode 100644 index 0000000..9029453 --- /dev/null +++ b/uds/cacheCounters.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/cacheCounters.h#1 $ + */ + +#ifndef CACHE_COUNTERS_H +#define CACHE_COUNTERS_H + +#include "typeDefs.h" + +/** + * Basic counts of hits and misses for a given type of cache probe. + **/ +typedef struct cacheCountsByKind { + /** Number of hits */ + uint64_t hits; + /** Number of misses */ + uint64_t misses; + /** Number of probes for data already queued for read */ + uint64_t queued; +} CacheCountsByKind; + +/** + * The various types of cache probes we care about. + **/ +typedef enum cacheProbeType { + /** First attempt to look up an index page, for a given request. */ + CACHE_PROBE_INDEX_FIRST = 0, + /** First attempt to look up a record page, for a given request. */ + CACHE_PROBE_RECORD_FIRST, + /** Second or later attempt to look up an index page, for a given request. */ + CACHE_PROBE_INDEX_RETRY, + /** Second or later attempt to look up a record page, for a given request. */ + CACHE_PROBE_RECORD_RETRY +} CacheProbeType; + +enum { + /** Flag bit to indicate that failures shouldn't be recorded. */ + CACHE_PROBE_IGNORE_FAILURE = 128 +}; + +/** + * Result-type counts for both kinds of data pages in the page cache. + **/ +typedef struct cacheCountsByPageType { + /** Hit/miss counts for index pages. */ + CacheCountsByKind indexPage; + /** Hit/miss counts for record pages. */ + CacheCountsByKind recordPage; +} CacheCountsByPageType; + +/** + * All the counters used for an entry cache. + **/ +typedef struct cacheCounters { + // counters for the page cache + /** Hit/miss counts for the first attempt per request */ + CacheCountsByPageType firstTime; + /** Hit/miss counts when a second (or later) attempt is needed */ + CacheCountsByPageType retried; + + /** Number of cache entry invalidations due to single-entry eviction */ + uint64_t evictions; + /** Number of cache entry invalidations due to chapter expiration */ + uint64_t expirations; + + // counters for the sparse chapter index cache + /** Hit/miss counts for the sparse cache chapter probes */ + CacheCountsByKind sparseChapters; + /** Hit/miss counts for the sparse cache name searches */ + CacheCountsByKind sparseSearches; +} CacheCounters; + +/** + * Success/failure assessment of cache probe result. 
+ **/ +typedef enum cacheResultKind { + /** The requested entry was found in the cache */ + CACHE_RESULT_HIT, + /** The requested entry was not found in the cache */ + CACHE_RESULT_MISS, + /** The requested entry wasn't found in the cache but is queued for read */ + CACHE_RESULT_QUEUED +} CacheResultKind; + +/** + * Increment one of the cache counters. + * + * @param counters pointer to the counters + * @param probeType type of access done + * @param kind result of probe + **/ +void incrementCacheCounter(CacheCounters *counters, + int probeType, + CacheResultKind kind); + +#endif /* CACHE_COUNTERS_H */ diff --git a/uds/cachedChapterIndex.c b/uds/cachedChapterIndex.c new file mode 100644 index 0000000..ae0a22c --- /dev/null +++ b/uds/cachedChapterIndex.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/cachedChapterIndex.c#3 $ + */ + +#include "cachedChapterIndex.h" + +#include "memoryAlloc.h" + +/**********************************************************************/ +int initializeCachedChapterIndex(CachedChapterIndex *chapter, + const Geometry *geometry) +{ + chapter->virtualChapter = UINT64_MAX; + chapter->indexPagesCount = geometry->indexPagesPerChapter; + + int result = ALLOCATE(chapter->indexPagesCount, DeltaIndexPage, __func__, + &chapter->indexPages); + if (result != UDS_SUCCESS) { + return result; + } + + result = ALLOCATE(chapter->indexPagesCount, struct volume_page, + "sparse index VolumePages", &chapter->volumePages); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int i; + for (i = 0; i < chapter->indexPagesCount; i++) { + result = initializeVolumePage(geometry, &chapter->volumePages[i]); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +void destroyCachedChapterIndex(CachedChapterIndex *chapter) +{ + if (chapter->volumePages != NULL) { + unsigned int i; + for (i = 0; i < chapter->indexPagesCount; i++) { + destroyVolumePage(&chapter->volumePages[i]); + } + } + FREE(chapter->indexPages); + FREE(chapter->volumePages); +} + +/**********************************************************************/ +int cacheChapterIndex(CachedChapterIndex *chapter, + uint64_t virtualChapter, + const Volume *volume) +{ + // Mark the cached chapter as unused in case the update fails midway. + chapter->virtualChapter = UINT64_MAX; + + // Read all the page data and initialize the entire DeltaIndexPage array. + // (It's not safe for the zone threads to do it lazily--they'll race.) + int result = readChapterIndexFromVolume(volume, virtualChapter, + chapter->volumePages, + chapter->indexPages); + if (result != UDS_SUCCESS) { + return result; + } + + // Reset all chapter counter values to zero. 
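+ // The counters report activity since the chapter was cached, so they must + // start over for the incoming chapter.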
+ chapter->counters.searchHits = 0; + chapter->counters.searchMisses = 0; + chapter->counters.consecutiveMisses = 0; + + // Mark the entry as valid--it's now in the cache. + chapter->virtualChapter = virtualChapter; + chapter->skipSearch = false; + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int searchCachedChapterIndex(CachedChapterIndex *chapter, + const Geometry *geometry, + const IndexPageMap *indexPageMap, + const UdsChunkName *name, + int *recordPagePtr) +{ + // Find the indexPageNumber in the chapter that would have the chunk name. + unsigned int physicalChapter + = mapToPhysicalChapter(geometry, chapter->virtualChapter); + unsigned int indexPageNumber; + int result = findIndexPageNumber(indexPageMap, name, physicalChapter, + &indexPageNumber); + if (result != UDS_SUCCESS) { + return result; + } + + return searchChapterIndexPage(&chapter->indexPages[indexPageNumber], + geometry, name, recordPagePtr); +} diff --git a/uds/cachedChapterIndex.h b/uds/cachedChapterIndex.h new file mode 100644 index 0000000..f759d5d --- /dev/null +++ b/uds/cachedChapterIndex.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/cachedChapterIndex.h#3 $ + */ + +#ifndef CACHED_CHAPTER_INDEX_H +#define CACHED_CHAPTER_INDEX_H + +#include "chapterIndex.h" +#include "common.h" +#include "compiler.h" +#include "cpu.h" +#include "geometry.h" +#include "indexPageMap.h" +#include "typeDefs.h" +#include "volume.h" +#include "volumeStore.h" + +/** + * These counters are essentially fields of the CachedChapterIndex, but are + * segregated into this structure because they are frequently modified. They + * are grouped and aligned to keep them on different cache lines from the + * chapter fields that are accessed far more often than they are updated. + **/ +struct __attribute__((aligned(CACHE_LINE_BYTES))) cachedIndexCounters { + /** the total number of search hits since this chapter was cached */ + uint64_t searchHits; + + /** the total number of search misses since this chapter was cached */ + uint64_t searchMisses; + + /** the number of consecutive search misses since the last cache hit */ + uint64_t consecutiveMisses; +}; +typedef struct cachedIndexCounters CachedIndexCounters; + +/** + * CachedChapterIndex is the structure for a cache entry, representing a + * single cached chapter index in the sparse chapter index cache. + **/ +struct __attribute__((aligned(CACHE_LINE_BYTES))) cachedChapterIndex { + /* + * The virtual chapter number of the cached chapter index. UINT64_MAX means + * this cache entry is unused. Must only be modified in the critical section + * in updateSparseCache(). 
+ */ + uint64_t virtualChapter; + + /* The number of index pages in a chapter */ + unsigned int indexPagesCount; + + /* + * This flag is mutable between cache updates, but it rarely changes and + * is frequently accessed, so it groups with the immutable fields. + * + * If set, skip the chapter when searching the entire cache. This flag is + * just a performance optimization. If we do not see a recent change to it, + * it will be corrected when we pass through a memory barrier while getting + * the next request from the queue. So we may do one extra search of the + * chapter index, or miss one deduplication opportunity. + */ + bool skipSearch; + + // These pointers are immutable during the life of the cache. The contents + // of the arrays change when the cache entry is replaced. + + /* pointer to a cache-aligned array of ChapterIndexPages */ + DeltaIndexPage *indexPages; + + /* pointer to an array of VolumePages containing the index pages */ + struct volume_page *volumePages; + + // The cache-aligned counters change often and are placed at the end of the + // structure to prevent false sharing with the more stable fields above. + + /* counter values updated by the thread servicing zone zero */ + CachedIndexCounters counters; +}; +typedef struct cachedChapterIndex CachedChapterIndex; + +/** + * Initialize a CachedChapterIndex, allocating the memory for the array of + * ChapterIndexPages and the raw index page data. The chapter index will be + * marked as unused (virtualChapter == UINT64_MAX). + * + * @param chapter the chapter index cache entry to initialize + * @param geometry the geometry governing the volume + **/ +int initializeCachedChapterIndex(CachedChapterIndex *chapter, + const Geometry *geometry) + __attribute__((warn_unused_result)); + +/** + * Destroy a CachedChapterIndex, freeing the memory allocated for the + * ChapterIndexPages and raw index page data. + * + * @param chapter the chapter index cache entry to destroy + **/ +void destroyCachedChapterIndex(CachedChapterIndex *chapter); + +/** + * Assign a new value to the skipSearch flag of a cached chapter index. + * + * @param chapter the chapter index cache entry to modify + * @param skipSearch the new value of the skipSearch flag + **/ +static INLINE void setSkipSearch(CachedChapterIndex *chapter, bool skipSearch) +{ + // Explicitly check if the field is set so we don't keep dirtying the memory + // cache line on continued search hits. + if (READ_ONCE(chapter->skipSearch) != skipSearch) { + WRITE_ONCE(chapter->skipSearch, skipSearch); + } +} + +/** + * Check if a cached sparse chapter index should be skipped over in the search + * for a chunk name. Filters out unused, invalid, disabled, and irrelevant + * cache entries. + * + * @param zone the zone doing the check + * @param chapter the cache entry search candidate + * @param virtualChapter the virtualChapter containing a hook, or UINT64_MAX + * if searching the whole cache for a non-hook + * + * @return true if the provided chapter index should be skipped + **/ +static INLINE bool shouldSkipChapterIndex(const IndexZone *zone, + const CachedChapterIndex *chapter, + uint64_t virtualChapter) +{ + // Don't search unused entries (contents undefined) or invalid entries + // (the chapter is no longer the zone's view of the volume). 
+ if ((chapter->virtualChapter == UINT64_MAX) + || (chapter->virtualChapter < zone->oldestVirtualChapter)) { + return true; + } + + if (virtualChapter != UINT64_MAX) { + // If the caller specified a virtual chapter, only search the cache + // entry containing that chapter. + return (virtualChapter != chapter->virtualChapter); + } else { + // When searching the entire cache, save time by skipping over chapters + // that have had too many consecutive misses. + return READ_ONCE(chapter->skipSearch); + } +} + +/** + * Cache a chapter index, reading all the index pages from the volume and + * initializing the array of ChapterIndexPages in the cache entry to represent + * them. The virtualChapter field of the cache entry will be set to UINT64_MAX + * if there is any error since the remaining mutable fields will be in an + * undefined state. + * + * @param chapter the chapter index cache entry to replace + * @param virtualChapter the virtual chapter number of the index to read + * @param volume the volume containing the chapter index + * + * @return UDS_SUCCESS or an error code + **/ +int cacheChapterIndex(CachedChapterIndex *chapter, + uint64_t virtualChapter, + const Volume *volume) + __attribute__((warn_unused_result)); + +/** + * Search a single cached sparse chapter index for a chunk name, returning the + * record page number that may contain the name. + * + * @param [in] chapter the cache entry for the chapter to search + * @param [in] geometry the geometry governing the volume + * @param [in] indexPageMap the index page number map for the volume + * @param [in] name the chunk name to search for + * @param [out] recordPagePtr the record page number of a match, else + * NO_CHAPTER_INDEX_ENTRY if nothing matched + * + * @return UDS_SUCCESS or an error code + **/ +int searchCachedChapterIndex(CachedChapterIndex *chapter, + const Geometry *geometry, + const IndexPageMap *indexPageMap, + const UdsChunkName *name, + int *recordPagePtr) + __attribute__((warn_unused_result)); + +#endif /* CACHED_CHAPTER_INDEX_H */ diff --git a/uds/chapterIndex.c b/uds/chapterIndex.c new file mode 100644 index 0000000..5653a41 --- /dev/null +++ b/uds/chapterIndex.c @@ -0,0 +1,305 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/chapterIndex.c#5 $ + */ + +#include "chapterIndex.h" + +#include "compiler.h" +#include "errors.h" +#include "hashUtils.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "uds.h" + + +/**********************************************************************/ +int makeOpenChapterIndex(OpenChapterIndex **openChapterIndex, + const Geometry *geometry, + bool chapterIndexHeaderNativeEndian, + uint64_t volumeNonce) +{ + + int result = ALLOCATE(1, OpenChapterIndex, "open chapter index", + openChapterIndex); + if (result != UDS_SUCCESS) { + return result; + } + + // The delta index will rebalance delta lists when memory gets tight, so + // give the chapter index one extra page. + size_t memorySize + = (geometry->indexPagesPerChapter + 1) * geometry->bytesPerPage; + (*openChapterIndex)->geometry = geometry; + (*openChapterIndex)->volumeNonce = volumeNonce; + (*openChapterIndex)->headerNativeEndian = chapterIndexHeaderNativeEndian, + result = initializeDeltaIndex(&(*openChapterIndex)->deltaIndex, 1, + geometry->deltaListsPerChapter, + geometry->chapterMeanDelta, + geometry->chapterPayloadBits, memorySize); + if (result != UDS_SUCCESS) { + FREE(*openChapterIndex); + *openChapterIndex = NULL; + } + return result; +} + +/**********************************************************************/ +void freeOpenChapterIndex(OpenChapterIndex *openChapterIndex) +{ + if (openChapterIndex == NULL) { + return; + } + + + uninitializeDeltaIndex(&openChapterIndex->deltaIndex); + FREE(openChapterIndex); +} + +/**********************************************************************/ +void emptyOpenChapterIndex(OpenChapterIndex *openChapterIndex, + uint64_t virtualChapterNumber) +{ + emptyDeltaIndex(&openChapterIndex->deltaIndex); + openChapterIndex->virtualChapterNumber = virtualChapterNumber; +} + +/** + * Check whether a delta list entry reflects a successful search for a given + * address. + * + * @param entry the delta list entry from the search + * @param address the address of the desired entry + * + * @return true iff the address was found + **/ +static INLINE bool wasEntryFound(const DeltaIndexEntry *entry, + unsigned int address) +{ + return (!entry->atEnd && (entry->key == address)); +} + +/**********************************************************************/ +int putOpenChapterIndexRecord(OpenChapterIndex *openChapterIndex, + const UdsChunkName *name, + unsigned int pageNumber) +{ + const Geometry *geometry = openChapterIndex->geometry; + int result + = ASSERT_WITH_ERROR_CODE(pageNumber < geometry->recordPagesPerChapter, + UDS_INVALID_ARGUMENT, + "Page number within chapter (%u) exceeds" + " the maximum value %u", + pageNumber, geometry->recordPagesPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + + DeltaIndexEntry entry; + unsigned int address = hashToChapterDeltaAddress(name, geometry); + result = getDeltaIndexEntry(&openChapterIndex->deltaIndex, + hashToChapterDeltaList(name, geometry), + address, name->name, false, &entry); + if (result != UDS_SUCCESS) { + return result; + } + bool found = wasEntryFound(&entry, address); + result = ASSERT_WITH_ERROR_CODE(!(found && entry.isCollision), + UDS_BAD_STATE, + "Chunk appears more than once in chapter %" + PRIu64, + openChapterIndex->virtualChapterNumber); + if (result != UDS_SUCCESS) { + return result; + } + return putDeltaIndexEntry(&entry, address, pageNumber, + (found ? 
name->name : NULL)); +} + +/**********************************************************************/ +int packOpenChapterIndexPage(OpenChapterIndex *openChapterIndex, + byte *memory, + unsigned int firstList, + bool lastPage, + unsigned int *numLists) +{ + DeltaIndex *deltaIndex = &openChapterIndex->deltaIndex; + const Geometry *geometry = openChapterIndex->geometry; + unsigned int removals = 0; + for (;;) { + int result = packDeltaIndexPage(deltaIndex, openChapterIndex->volumeNonce, + openChapterIndex->headerNativeEndian, + memory, geometry->bytesPerPage, + openChapterIndex->virtualChapterNumber, + firstList, numLists); + if (result != UDS_SUCCESS) { + return result; + } + if ((firstList + *numLists) == geometry->deltaListsPerChapter) { + // All lists are packed + break; + } else if (*numLists == 0) { + // The next delta list does not fit on a page. This delta list will + // be removed. + } else if (lastPage) { + /* + * This is the last page and there are lists left unpacked, but all of + * the remaining lists must fit on the page. Find a list that contains + * entries and remove the entire list. Try the first list that does not + * fit. If it is empty, we will select the last list that already fits + * and has any entries. + */ + } else { + // This page is done + break; + } + if (removals == 0) { + DeltaIndexStats stats; + getDeltaIndexStats(deltaIndex, &stats); + logWarning("The chapter index for chapter %" PRIu64 + " contains %ld entries with %ld collisions", + openChapterIndex->virtualChapterNumber, + stats.recordCount, stats.collisionCount); + } + DeltaIndexEntry entry; + int listNumber = *numLists; + do { + if (listNumber < 0) { + return UDS_OVERFLOW; + } + result = startDeltaIndexSearch(deltaIndex, firstList + listNumber--, + 0, false, &entry); + if (result != UDS_SUCCESS) { + return result; + } + result = nextDeltaIndexEntry(&entry); + if (result != UDS_SUCCESS) { + return result; + } + } while (entry.atEnd); + do { + result = removeDeltaIndexEntry(&entry); + if (result != UDS_SUCCESS) { + return result; + } + removals++; + } while (!entry.atEnd); + } + if (removals > 0) { + logWarning("To avoid chapter index page overflow in chapter %" PRIu64 + ", %u entries were removed from the chapter index", + openChapterIndex->virtualChapterNumber, removals); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getOpenChapterIndexSize(OpenChapterIndex *openChapterIndex) +{ + DeltaIndexStats stats; + getDeltaIndexStats(&openChapterIndex->deltaIndex, &stats); + return stats.recordCount; +} + +/**********************************************************************/ +size_t getOpenChapterIndexMemoryAllocated(OpenChapterIndex *openChapterIndex) +{ + DeltaIndexStats stats; + getDeltaIndexStats(&openChapterIndex->deltaIndex, &stats); + return stats.memoryAllocated + sizeof(OpenChapterIndex); +} + +/**********************************************************************/ +int initializeChapterIndexPage(DeltaIndexPage *chapterIndexPage, + const Geometry *geometry, + byte *indexPage, + uint64_t volumeNonce) +{ + return initializeDeltaIndexPage(chapterIndexPage, volumeNonce, + geometry->chapterMeanDelta, + geometry->chapterPayloadBits, + indexPage, geometry->bytesPerPage); +} + +/**********************************************************************/ +int validateChapterIndexPage(const DeltaIndexPage *chapterIndexPage, + const Geometry *geometry) +{ + const DeltaIndex *deltaIndex = &chapterIndexPage->deltaIndex; + unsigned int first = 
chapterIndexPage->lowestListNumber; + unsigned int last = chapterIndexPage->highestListNumber; + // We walk every delta list from start to finish. + unsigned int listNumber; + for (listNumber = first; listNumber <= last; listNumber++) { + DeltaIndexEntry entry; + int result = startDeltaIndexSearch(deltaIndex, listNumber - first, 0, true, + &entry); + if (result != UDS_SUCCESS) { + return result; + } + for (;;) { + result = nextDeltaIndexEntry(&entry); + if (result != UDS_SUCCESS) { + if (result == UDS_CORRUPT_DATA) { + // A random bit stream is highly likely to arrive here when we go + // past the end of the delta list + return UDS_CORRUPT_COMPONENT; + } + return result; + } + if (entry.atEnd) { + break; + } + // Also make sure that the record page field contains a plausible value + if (getDeltaEntryValue(&entry) >= geometry->recordPagesPerChapter) { + // Do not log this as an error. It happens in normal operation when + // we are doing a rebuild but haven't written the entire volume once. + return UDS_CORRUPT_COMPONENT; + } + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int searchChapterIndexPage(DeltaIndexPage *chapterIndexPage, + const Geometry *geometry, + const UdsChunkName *name, + int *recordPagePtr) +{ + DeltaIndex *deltaIndex = &chapterIndexPage->deltaIndex; + unsigned int address = hashToChapterDeltaAddress(name, geometry); + unsigned int deltaListNumber = hashToChapterDeltaList(name, geometry); + unsigned int subListNumber + = deltaListNumber - chapterIndexPage->lowestListNumber;; + DeltaIndexEntry entry; + int result = getDeltaIndexEntry(deltaIndex, subListNumber, address, + name->name, true, &entry); + if (result != UDS_SUCCESS) { + return result; + } + + if (wasEntryFound(&entry, address)) { + *recordPagePtr = getDeltaEntryValue(&entry); + } else { + *recordPagePtr = NO_CHAPTER_INDEX_ENTRY; + } + return UDS_SUCCESS; +} diff --git a/uds/chapterIndex.h b/uds/chapterIndex.h new file mode 100644 index 0000000..4dd425b --- /dev/null +++ b/uds/chapterIndex.h @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/chapterIndex.h#4 $ + */ + +#ifndef CHAPTER_INDEX_H +#define CHAPTER_INDEX_H 1 + +#include "deltaIndex.h" +#include "geometry.h" + +enum { + // The value returned as the record page number when an entry is not found + // in the chapter index. + NO_CHAPTER_INDEX_ENTRY = -1 +}; + +typedef struct openChapterIndex { + const Geometry *geometry; + DeltaIndex deltaIndex; + uint64_t virtualChapterNumber; + bool headerNativeEndian; + uint64_t volumeNonce; +} OpenChapterIndex; + + +/** + * Make a new open chapter index. 
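+ *
+ * A rough lifecycle sketch, using only functions declared in this header
+ * (illustrative variable names, error checking omitted; the chapter writer
+ * owns the real sequence):
+ *
+ *   OpenChapterIndex *oci;
+ *   makeOpenChapterIndex(&oci, geometry, true, volumeNonce);
+ *   emptyOpenChapterIndex(oci, virtualChapterNumber);
+ *   putOpenChapterIndexRecord(oci, &chunkName, recordPageNumber);
+ *   unsigned int listCount;
+ *   packOpenChapterIndexPage(oci, pageBuffer, 0, false, &listCount);
+ *   freeOpenChapterIndex(oci);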
+ * + * @param openChapterIndex Location to hold new open chapter index pointer + * @param geometry The geometry + * @param chapterIndexHeaderNativeEndian chapter index header format + * @param volumeNonce The volume nonce. + * + * @return error code or UDS_SUCCESS + **/ +int makeOpenChapterIndex(OpenChapterIndex **openChapterIndex, + const Geometry *geometry, + bool chapterIndexHeaderNativeEndian, + uint64_t volumeNonce) + __attribute__((warn_unused_result)); + +/** + * Terminate and clean up an open chapter index. + * + * @param openChapterIndex The open chapter index to terminate + **/ +void freeOpenChapterIndex(OpenChapterIndex *openChapterIndex); + +/** + * Empty an open chapter index, and prepare it for writing a new virtual + * chapter. + * + * @param openChapterIndex The open chapter index to empty + * @param virtualChapterNumber The virtual chapter number + **/ +void emptyOpenChapterIndex(OpenChapterIndex *openChapterIndex, + uint64_t virtualChapterNumber); + +/** + * Create a new record in an open chapter index, associating a chunk name with + * the number of the record page containing the metadata for the chunk. + * + * @param openChapterIndex The open chapter index + * @param name The chunk name + * @param pageNumber The number of the record page containing the name + * + * @return UDS_SUCCESS or an error code + **/ +int putOpenChapterIndexRecord(OpenChapterIndex *openChapterIndex, + const UdsChunkName *name, + unsigned int pageNumber) + __attribute__((warn_unused_result)); + +/** + * Pack a section of an open chapter index into a chapter index page. A + * range of delta lists (starting with a specified list index) is copied + * from the open chapter index into a memory page. The number of lists + * copied onto the page is returned to the caller. + * + * @param openChapterIndex The open chapter index + * @param memory The memory page to use + * @param firstList The first delta list number to be copied + * @param lastPage If true, this is the last page of the chapter + * index and all the remaining lists must be packed + * onto this page + * @param numLists The number of delta lists that were copied + * + * @return error code or UDS_SUCCESS. On UDS_SUCCESS, the numLists + * argument contains the number of lists copied. + **/ +int packOpenChapterIndexPage(OpenChapterIndex *openChapterIndex, + byte *memory, + unsigned int firstList, + bool lastPage, + unsigned int *numLists) + __attribute__((warn_unused_result)); + +/** + * Get the number of records in an open chapter index. + * + * @param openChapterIndex The open chapter index + * + * @return The number of records + **/ +int getOpenChapterIndexSize(OpenChapterIndex *openChapterIndex) + __attribute__((warn_unused_result)); + +/** + * Get the number of bytes allocated for the open chapter index. + * + * @param openChapterIndex The open chapter index + * + * @return the number of bytes allocated + **/ +size_t getOpenChapterIndexMemoryAllocated(OpenChapterIndex *openChapterIndex); + +/** + * Make a new chapter index page, initializing it with the data from the + * given buffer. 
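+ *
+ * A rough read-path sketch (illustrative names, error checking omitted):
+ *
+ *   DeltaIndexPage page;
+ *   initializeChapterIndexPage(&page, geometry, rawIndexPage, volumeNonce);
+ *   int recordPage;
+ *   searchChapterIndexPage(&page, geometry, &chunkName, &recordPage);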
+ * + * @param chapterIndexPage The new chapter index page + * @param geometry The geometry + * @param indexPage The memory page to use + * @param volumeNonce If non-zero, the volume nonce to verify + * + * @return UDS_SUCCESS or an error code + **/ +int initializeChapterIndexPage(DeltaIndexPage *chapterIndexPage, + const Geometry *geometry, + byte *indexPage, + uint64_t volumeNonce) + __attribute__((warn_unused_result)); + +/** + * Validate a chapter index page. This is called at rebuild time to ensure + * that the volume file contains a coherent chapter index. + * + * @param chapterIndexPage The chapter index page + * @param geometry The geometry of the volume + * + * @return The result code: + * UDS_SUCCESS for a good chapter index page + * UDS_CORRUPT_COMPONENT if the chapter index code detects invalid data + * UDS_CORRUPT_DATA if there is a problem in a delta list bit stream + * UDS_BAD_STATE if the code follows an invalid code path + **/ +int validateChapterIndexPage(const DeltaIndexPage *chapterIndexPage, + const Geometry *geometry) + __attribute__((warn_unused_result)); + +/** + * Search a chapter index page for a chunk name, returning the record page + * number that may contain the name. + * + * @param [in] chapterIndexPage The chapter index page + * @param [in] geometry The geometry of the volume + * @param [in] name The chunk name + * @param [out] recordPagePtr The record page number + * or NO_CHAPTER_INDEX_ENTRY if not found + * + * @return UDS_SUCCESS or an error code + **/ +int searchChapterIndexPage(DeltaIndexPage *chapterIndexPage, + const Geometry *geometry, + const UdsChunkName *name, + int *recordPagePtr) + __attribute__((warn_unused_result)); + +#endif /* CHAPTER_INDEX_H */ diff --git a/uds/chapterWriter.c b/uds/chapterWriter.c new file mode 100644 index 0000000..3a926ab --- /dev/null +++ b/uds/chapterWriter.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/chapterWriter.c#2 $
+ */
+
+#include "chapterWriter.h"
+
+#include "errors.h"
+#include "index.h"
+#include "indexCheckpoint.h"
+#include "indexComponent.h"
+#include "logger.h"
+#include "memoryAlloc.h"
+#include "openChapter.h"
+#include "threads.h"
+
+
+struct chapterWriter {
+  /* The index to which we belong */
+  Index *index;
+  /* The thread to do the writing */
+  Thread thread;
+  /* lock protecting the following fields */
+  Mutex mutex;
+  /* condition signalled on state changes */
+  CondVar cond;
+  /* Set to true to stop the thread */
+  bool stop;
+  /* The result from the most recent write */
+  int result;
+  /* The number of bytes allocated by the chapter writer */
+  size_t memoryAllocated;
+  /* The number of zones which have submitted a chapter for writing */
+  unsigned int zonesToWrite;
+  /* Open chapter index used by closeOpenChapter() */
+  OpenChapterIndex *openChapterIndex;
+  /* Collated records used by closeOpenChapter() */
+  UdsChunkRecord *collatedRecords;
+  /* The chapters to write (one per zone) */
+  OpenChapterZone *chapters[];
+};
+
+/**
+ * This is the driver function for the writer thread. It loops until
+ * terminated, waiting for a chapter to be provided to close.
+ **/
+static void closeChapters(void *arg)
+{
+  ChapterWriter *writer = arg;
+  logDebug("chapter writer starting");
+  lockMutex(&writer->mutex);
+  for (;;) {
+    while (writer->zonesToWrite < writer->index->zoneCount) {
+      if (writer->stop && (writer->zonesToWrite == 0)) {
+        // We've been told to stop, and all of the zones are in the same
+        // open chapter, so we can exit now.
+        unlockMutex(&writer->mutex);
+        logDebug("chapter writer stopping");
+        return;
+      }
+      waitCond(&writer->cond, &writer->mutex);
+    }
+
+    /*
+     * Release the lock while closing a chapter. We probably don't need to do
+     * this, but it seems safer in principle. It's OK to access the chapter
+     * and chapterNumber fields without the lock since those aren't allowed to
+     * change until we're done.
+     */
+    unlockMutex(&writer->mutex);
+
+    if (writer->index->hasSavedOpenChapter) {
+      writer->index->hasSavedOpenChapter = false;
+      /*
+       * Remove the saved open chapter as that chapter is about to be written
+       * to the volume. This matters the first time we close the open chapter
+       * after loading from a clean shutdown, or after doing a clean save.
+ */ + IndexComponent *oc = findIndexComponent(writer->index->state, + &OPEN_CHAPTER_INFO); + int result = discardIndexComponent(oc); + if (result == UDS_SUCCESS) { + logDebug("Discarding saved open chapter"); + } + } + + int result = closeOpenChapter(writer->chapters, + writer->index->zoneCount, + writer->index->volume, + writer->openChapterIndex, + writer->collatedRecords, + writer->index->newestVirtualChapter); + + if (result == UDS_SUCCESS) { + result = processChapterWriterCheckpointSaves(writer->index); + } + + + lockMutex(&writer->mutex); + // Note that the index is totally finished with the writing chapter + advanceActiveChapters(writer->index); + writer->result = result; + writer->zonesToWrite = 0; + broadcastCond(&writer->cond); + } +} + +/**********************************************************************/ +int makeChapterWriter(Index *index, + const struct index_version *indexVersion, + ChapterWriter **writerPtr) +{ + size_t collatedRecordsSize + = (sizeof(UdsChunkRecord) + * (1 + index->volume->geometry->recordsPerChapter)); + ChapterWriter *writer; + int result = ALLOCATE_EXTENDED(ChapterWriter, + index->zoneCount, OpenChapterZone *, + "Chapter Writer", &writer); + if (result != UDS_SUCCESS) { + return result; + } + writer->index = index; + + result = initMutex(&writer->mutex); + if (result != UDS_SUCCESS) { + FREE(writer); + return result; + } + result = initCond(&writer->cond); + if (result != UDS_SUCCESS) { + destroyMutex(&writer->mutex); + FREE(writer); + return result; + } + + // Now that we have the mutex+cond, it is safe to call freeChapterWriter. + result = allocateCacheAligned(collatedRecordsSize, "collated records", + &writer->collatedRecords); + if (result != UDS_SUCCESS) { + freeChapterWriter(writer); + return makeUnrecoverable(result); + } + result = makeOpenChapterIndex(&writer->openChapterIndex, + index->volume->geometry, + indexVersion->chapterIndexHeaderNativeEndian, + index->volume->nonce); + if (result != UDS_SUCCESS) { + freeChapterWriter(writer); + return makeUnrecoverable(result); + } + + size_t openChapterIndexMemoryAllocated + = getOpenChapterIndexMemoryAllocated(writer->openChapterIndex); + writer->memoryAllocated = (sizeof(ChapterWriter) + + index->zoneCount * sizeof(OpenChapterZone *) + + collatedRecordsSize + + openChapterIndexMemoryAllocated); + + // We're initialized, so now it's safe to start the writer thread. 
+ result = createThread(closeChapters, writer, "writer", &writer->thread); + if (result != UDS_SUCCESS) { + freeChapterWriter(writer); + return makeUnrecoverable(result); + } + + *writerPtr = writer; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeChapterWriter(ChapterWriter *writer) +{ + if (writer == NULL) { + return; + } + + int result __attribute__((unused)) = stopChapterWriter(writer); + destroyMutex(&writer->mutex); + destroyCond(&writer->cond); + freeOpenChapterIndex(writer->openChapterIndex); + FREE(writer->collatedRecords); + FREE(writer); +} + +/**********************************************************************/ +unsigned int startClosingChapter(ChapterWriter *writer, + unsigned int zoneNumber, + OpenChapterZone *chapter) +{ + lockMutex(&writer->mutex); + unsigned int finishedZones = ++writer->zonesToWrite; + writer->chapters[zoneNumber] = chapter; + broadcastCond(&writer->cond); + unlockMutex(&writer->mutex); + + return finishedZones; +} + +/**********************************************************************/ +int finishPreviousChapter(ChapterWriter *writer, uint64_t currentChapterNumber) +{ + int result; + lockMutex(&writer->mutex); + while (writer->index->newestVirtualChapter < currentChapterNumber) { + waitCond(&writer->cond, &writer->mutex); + } + result = writer->result; + unlockMutex(&writer->mutex); + + if (result != UDS_SUCCESS) { + return logUnrecoverable(result, "Writing of previous open chapter failed"); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +void waitForIdleChapterWriter(ChapterWriter *writer) +{ + lockMutex(&writer->mutex); + while (writer->zonesToWrite > 0) { + // The chapter writer is probably writing a chapter. If it is not, it will + // soon wake up and write a chapter. + waitCond(&writer->cond, &writer->mutex); + } + unlockMutex(&writer->mutex); +} + +/**********************************************************************/ +int stopChapterWriter(ChapterWriter *writer) +{ + Thread writerThread = 0; + + lockMutex(&writer->mutex); + if (writer->thread != 0) { + writerThread = writer->thread; + writer->thread = 0; + writer->stop = true; + broadcastCond(&writer->cond); + } + int result = writer->result; + unlockMutex(&writer->mutex); + + if (writerThread != 0) { + joinThreads(writerThread); + } + + if (result != UDS_SUCCESS) { + return logUnrecoverable(result, "Writing of previous open chapter failed"); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +size_t getChapterWriterMemoryAllocated(ChapterWriter *writer) +{ + return writer->memoryAllocated; +} diff --git a/uds/chapterWriter.h b/uds/chapterWriter.h new file mode 100644 index 0000000..85c1f42 --- /dev/null +++ b/uds/chapterWriter.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/chapterWriter.h#2 $
+ */
+
+#ifndef CHAPTER_WRITER_H
+#define CHAPTER_WRITER_H
+
+#include "atomicDefs.h"
+#include "indexVersion.h"
+#include "openChapterZone.h"
+
+typedef struct chapterWriter ChapterWriter;
+
+// This opaque declaration breaks the dependency loop with index.h
+struct index;
+
+
+/**
+ * Create a chapter writer and start its thread.
+ *
+ * @param index the index containing the chapters to be written
+ * @param indexVersion the index version parameters
+ * @param writerPtr pointer to hold the new writer
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int makeChapterWriter(struct index *index,
+                      const struct index_version *indexVersion,
+                      ChapterWriter **writerPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Free a chapter writer, waiting for its thread to finish.
+ *
+ * @param writer the chapter writer to destroy
+ **/
+void freeChapterWriter(ChapterWriter *writer);
+
+/**
+ * Asynchronously close and write a chapter by passing it to the writer
+ * thread. Writing won't start until all zones have submitted a chapter.
+ *
+ * @param writer the chapter writer
+ * @param zoneNumber the number of the zone submitting a chapter
+ * @param chapter the chapter to write
+ *
+ * @return The number of zones which have submitted the current chapter
+ **/
+unsigned int startClosingChapter(ChapterWriter *writer,
+                                 unsigned int zoneNumber,
+                                 OpenChapterZone *chapter)
+  __attribute__((warn_unused_result));
+
+/**
+ * Wait for the chapter writer thread to finish closing the chapter previous
+ * to the one specified.
+ *
+ * @param writer the chapter writer
+ * @param currentChapterNumber the current chapter number
+ *
+ * @return UDS_SUCCESS or an error code from the most recent write
+ *         request
+ **/
+int finishPreviousChapter(ChapterWriter *writer, uint64_t currentChapterNumber)
+  __attribute__((warn_unused_result));
+
+
+/**
+ * Wait for the chapter writer thread to finish all writes to storage.
+ *
+ * @param writer the chapter writer
+ **/
+void waitForIdleChapterWriter(ChapterWriter *writer);
+
+/**
+ * Stop the chapter writer and wait for it to finish.
+ *
+ * @param writer the chapter writer to stop
+ *
+ * @return UDS_SUCCESS or an error code from the most recent write
+ *         request
+ **/
+int stopChapterWriter(ChapterWriter *writer)
+  __attribute__((warn_unused_result));
+
+/**
+ * Get the number of bytes allocated for the chapter writer.
+ *
+ * @param writer the chapter writer
+ *
+ * @return the number of bytes allocated
+ **/
+size_t getChapterWriterMemoryAllocated(ChapterWriter *writer);
+
+#endif /* CHAPTER_WRITER_H */
diff --git a/uds/common.h b/uds/common.h
new file mode 100644
index 0000000..bea27e5
--- /dev/null
+++ b/uds/common.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/common.h#1 $
+ */
+
+#ifndef COMMON_H
+#define COMMON_H
+
+#include "stringUtils.h"
+#include "typeDefs.h"
+#include "uds.h"
+#include "uds-block.h"
+
+enum {
+  KILOBYTE = 1024,
+  MEGABYTE = KILOBYTE * KILOBYTE,
+  GIGABYTE = KILOBYTE * MEGABYTE
+};
+
+typedef struct udsChunkData UdsChunkData;
+
+typedef struct {
+  UdsChunkName name;
+  UdsChunkData data;
+} UdsChunkRecord;
+
+#endif /* COMMON_H */
diff --git a/uds/compiler.h b/uds/compiler.h
new file mode 100644
index 0000000..cd57590
--- /dev/null
+++ b/uds/compiler.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/compiler.h#1 $
+ */
+
+#ifndef COMMON_COMPILER_H
+#define COMMON_COMPILER_H
+
+#include "compilerDefs.h"
+
+// Count the elements in a static array while attempting to catch some type
+// errors. (See http://stackoverflow.com/a/1598827 for an explanation.)
+#define COUNT_OF(x) ((sizeof(x) / sizeof(0[x])) \
+                     / ((size_t) (!(sizeof(x) % sizeof(0[x])))))
+
+#define const_container_of(ptr, type, member)                       \
+  __extension__ ({                                                  \
+      const __typeof__(((type *)0)->member) *__mptr = (ptr);        \
+      (const type *)((const char *)__mptr - offsetof(type,member)); \
+    })
+
+// The "inline" keyword alone takes effect only when the optimization level
+// is high enough. Define INLINE to force gcc to "always inline".
+#define INLINE __attribute__((always_inline)) inline
+
+#endif /* COMMON_COMPILER_H */
diff --git a/uds/compilerDefs.h b/uds/compilerDefs.h
new file mode 100644
index 0000000..cc81ce2
--- /dev/null
+++ b/uds/compilerDefs.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/kernelLinux/uds/compilerDefs.h#1 $
+ */
+
+#ifndef LINUX_KERNEL_COMPILER_DEFS_H
+#define LINUX_KERNEL_COMPILER_DEFS_H
+
+#include <linux/compiler.h>
+
+#define __STRING(x) #x
+
+#endif /* LINUX_KERNEL_COMPILER_DEFS_H */
diff --git a/uds/config.c b/uds/config.c
new file mode 100644
index 0000000..a953da3
--- /dev/null
+++ b/uds/config.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/config.c#2 $
+ */
+
+#include "config.h"
+
+#include "logger.h"
+#include "memoryAlloc.h"
+#include "stringUtils.h"
+
+/**********************************************************************/
+void freeIndexLocation(IndexLocation *loc)
+{
+  if (loc == NULL) {
+    return;
+  }
+
+  FREE(loc->host);
+  FREE(loc->port);
+  FREE(loc->directory);
+}
+
+/**********************************************************************/
+bool areUdsConfigurationsEqual(UdsConfiguration a, UdsConfiguration b)
+{
+  bool result = true;
+  if (a->recordPagesPerChapter != b->recordPagesPerChapter) {
+    logError("Record pages per chapter (%u) does not match (%u)",
+             a->recordPagesPerChapter, b->recordPagesPerChapter);
+    result = false;
+  }
+  if (a->chaptersPerVolume != b->chaptersPerVolume) {
+    logError("Chapter count (%u) does not match (%u)",
+             a->chaptersPerVolume, b->chaptersPerVolume);
+    result = false;
+  }
+  if (a->sparseChaptersPerVolume != b->sparseChaptersPerVolume) {
+    logError("Sparse chapter count (%u) does not match (%u)",
+             a->sparseChaptersPerVolume, b->sparseChaptersPerVolume);
+    result = false;
+  }
+  if (a->cacheChapters != b->cacheChapters) {
+    logError("Cache size (%u) does not match (%u)",
+             a->cacheChapters, b->cacheChapters);
+    result = false;
+  }
+  if (a->masterIndexMeanDelta != b->masterIndexMeanDelta) {
+    logError("Master index mean delta (%u) does not match (%u)",
+             a->masterIndexMeanDelta, b->masterIndexMeanDelta);
+    result = false;
+  }
+  if (a->bytesPerPage != b->bytesPerPage) {
+    logError("Bytes per page value (%u) does not match (%u)",
+             a->bytesPerPage, b->bytesPerPage);
+    result = false;
+  }
+  if (a->sparseSampleRate != b->sparseSampleRate) {
+    logError("Sparse sample rate (%u) does not match (%u)",
+             a->sparseSampleRate, b->sparseSampleRate);
+    result = false;
+  }
+  if (a->nonce != b->nonce) {
+    logError("Nonce (%llu) does not match (%llu)",
+             a->nonce, b->nonce);
+    result = false;
+  }
+  return result;
+}
+
+/**********************************************************************/
+void logUdsConfiguration(UdsConfiguration conf)
+{
+  logDebug("Configuration:");
+  logDebug(" Record pages per chapter: %10u", conf->recordPagesPerChapter);
+  logDebug(" Chapters per volume: %10u", conf->chaptersPerVolume);
+  logDebug(" Sparse chapters per volume: %10u", conf->sparseChaptersPerVolume);
+  logDebug(" Cache size 
(chapters): %10u", conf->cacheChapters); + logDebug(" Master index mean delta: %10u", conf->masterIndexMeanDelta); + logDebug(" Bytes per page: %10u", conf->bytesPerPage); + logDebug(" Sparse sample rate: %10u", conf->sparseSampleRate); + logDebug(" Nonce: %llu", conf->nonce); +} diff --git a/uds/config.h b/uds/config.h new file mode 100644 index 0000000..f31efab --- /dev/null +++ b/uds/config.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/config.h#2 $ + */ + +#ifndef CONFIG_H +#define CONFIG_H + +#include "bufferedReader.h" +#include "bufferedWriter.h" +#include "geometry.h" +#include "uds.h" + +enum { + DEFAULT_MASTER_INDEX_MEAN_DELTA = 4096, + DEFAULT_CACHE_CHAPTERS = 7, + DEFAULT_SPARSE_SAMPLE_RATE = 0 +}; + +/** + * Data that are used for configuring a new index. + **/ +struct udsConfiguration { + /** Smaller (16), Small (64) or large (256) indices */ + unsigned int recordPagesPerChapter; + /** Total number of chapters per volume */ + unsigned int chaptersPerVolume; + /** Number of sparse chapters per volume */ + unsigned int sparseChaptersPerVolume; + /** Size of the page cache, in chapters */ + unsigned int cacheChapters; + /** Frequency with which to checkpoint */ + // XXX the checkpointFrequency is not used - it is now a runtime parameter + unsigned int checkpointFrequency; + /** The master index mean delta to use */ + unsigned int masterIndexMeanDelta; + /** Size of a page, used for both record pages and index pages */ + unsigned int bytesPerPage; + /** Sampling rate for sparse indexing */ + unsigned int sparseSampleRate; + /** Index Owner's nonce */ + UdsNonce nonce; +}; + +/** + * Data that are used for a 6.01 index. + **/ +struct udsConfiguration6_01 { + /** Smaller (16), Small (64) or large (256) indices */ + unsigned int recordPagesPerChapter; + /** Total number of chapters per volume */ + unsigned int chaptersPerVolume; + /** Number of sparse chapters per volume */ + unsigned int sparseChaptersPerVolume; + /** Size of the page cache, in chapters */ + unsigned int cacheChapters; + /** Frequency with which to checkpoint */ + unsigned int checkpointFrequency; + /** The master index mean delta to use */ + unsigned int masterIndexMeanDelta; + /** Size of a page, used for both record pages and index pages */ + unsigned int bytesPerPage; + /** Sampling rate for sparse indexing */ + unsigned int sparseSampleRate; +}; + +typedef struct indexLocation { + char *host; + char *port; + char *directory; +} IndexLocation; + +/** + * A set of configuration parameters for the indexer. + **/ +typedef struct configuration Configuration; + +/** + * Construct a new indexer configuration. 
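+ *
+ * A rough usage sketch (the UdsConfiguration would normally come from the
+ * public configuration calls in uds.h; error checking omitted):
+ *
+ *   Configuration *config = NULL;
+ *   int result = makeConfiguration(udsConf, &config);
+ *   if (result == UDS_SUCCESS) {
+ *     ...
+ *     freeConfiguration(config);
+ *   }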
+ * + * @param conf UdsConfiguration to use + * @param configPtr The new index configuration + * + * @return UDS_SUCCESS or an error code + **/ +int makeConfiguration(UdsConfiguration conf, + Configuration **configPtr) + __attribute__((warn_unused_result)); + +/** + * Clean up the configuration struct. + **/ +void freeConfiguration(Configuration *config); + +/** + * Read the index configuration from stable storage. + * + * @param reader A buffered reader. + * @param config The index configuration to overwrite. + * + * @return UDS_SUCCESS or an error code. + **/ +int readConfigContents(BufferedReader *reader, + UdsConfiguration config) + __attribute__((warn_unused_result)); + +/** + * Write the index configuration information to stable storage. + * + * @param writer A buffered writer. + * @param config The index configuration. + * + * @return UDS_SUCCESS or an error code. + **/ +int writeConfigContents(BufferedWriter *writer, + UdsConfiguration config) + __attribute__((warn_unused_result)); + +/** + * Free the memory used by an IndexLocation. + * + * @param loc index location to free + **/ +void freeIndexLocation(IndexLocation *loc); + +/** + * Compare two configurations for equality. + * + * @param a The first configuration to compare + * @param b The second configuration to compare + * + * @return true iff they are equal + **/ +bool areUdsConfigurationsEqual(UdsConfiguration a, UdsConfiguration b) + __attribute__((warn_unused_result)); + +/** + * Log a user configuration. + * + * @param conf The configuration + **/ +void logUdsConfiguration(UdsConfiguration conf); + +#endif /* CONFIG_H */ diff --git a/uds/cpu.h b/uds/cpu.h new file mode 100644 index 0000000..9314985 --- /dev/null +++ b/uds/cpu.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/cpu.h#1 $ + */ + +#ifndef CPU_H +#define CPU_H + +#include "compiler.h" +#include "typeDefs.h" + +/** + * The number of bytes in a CPU cache line. In the future, we'll probably need + * to move this to a processor-specific file or discover it at compilation + * time (or runtime, if sufficiently heterogeneous), but this will do for now. + * (Must be a \#define since enums are not proper compile-time constants.) + **/ +#ifdef __PPC__ +// N.B.: Some PPC processors have smaller cache lines. +#define CACHE_LINE_BYTES 128 +#elif defined(__s390x__) +#define CACHE_LINE_BYTES 256 +#elif defined(__x86_64__) || defined(__aarch64__) +#define CACHE_LINE_BYTES 64 +#else +#error "unknown cache line size" +#endif + +/** + * Minimize cache-miss latency by moving data into a CPU cache before it is + * accessed. 
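+ *
+ * As a rough illustration, a scan loop might warm the line for an entry a
+ * few iterations ahead of the one it is about to read (illustrative names):
+ *
+ *   prefetchAddress(&entries[i + 4], false);
+ *   processEntry(&entries[i]);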
+ *
+ * @param address the address to fetch (may be invalid)
+ * @param forWrite must be constant at compile time--false if
+ *                 for reading, true if for writing
+ **/
+static INLINE void prefetchAddress(const void *address, bool forWrite)
+{
+  // forWrite won't be a constant if we are compiled with optimization
+  // turned off, in which case prefetching really doesn't matter.
+  if (__builtin_constant_p(forWrite)) {
+    __builtin_prefetch(address, forWrite);
+  }
+}
+
+/**
+ * Minimize cache-miss latency by moving a range of addresses into a
+ * CPU cache before they are accessed.
+ *
+ * @param start the starting address to fetch (may be invalid)
+ * @param size the number of bytes in the address range
+ * @param forWrite must be constant at compile time--false if
+ *                 for reading, true if for writing
+ **/
+static INLINE void prefetchRange(const void *start,
+                                 unsigned int size,
+                                 bool forWrite)
+{
+  // Count the number of cache lines to fetch, allowing for the address range
+  // to span an extra cache line boundary due to address alignment.
+  const char *address = (const char *) start;
+  unsigned int offset = ((uintptr_t) address % CACHE_LINE_BYTES);
+  size += offset;
+
+  unsigned int cacheLines = (1 + (size / CACHE_LINE_BYTES));
+  while (cacheLines-- > 0) {
+    prefetchAddress(address, forWrite);
+    address += CACHE_LINE_BYTES;
+  }
+}
+
+#endif /* CPU_H */
diff --git a/uds/deltaIndex.c b/uds/deltaIndex.c
new file mode 100644
index 0000000..0c43e9b
--- /dev/null
+++ b/uds/deltaIndex.c
@@ -0,0 +1,1707 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/deltaIndex.c#7 $
+ */
+#include "deltaIndex.h"
+
+#include "bits.h"
+#include "buffer.h"
+#include "compiler.h"
+#include "cpu.h"
+#include "errors.h"
+#include "logger.h"
+#include "memoryAlloc.h"
+#include "permassert.h"
+#include "stringUtils.h"
+#include "typeDefs.h"
+#include "uds.h"
+#include "zone.h"
+
+/*
+ * A delta index is a key-value store, where each entry maps an address
+ * (the key) to a payload (the value). The entries are sorted by address,
+ * and only the delta between successive addresses is stored in the entry.
+ * The addresses are assumed to be uniformly distributed, and the deltas are
+ * therefore exponentially distributed.
+ *
+ * The entries could be stored in a single DeltaList, but for efficiency we
+ * use multiple DeltaLists. These lists are stored in a single chunk of
+ * memory managed by the DeltaMemory module. The DeltaMemory module can
+ * move the data around in memory, so we never keep any byte pointers into
+ * DeltaList memory. We only keep offsets into the memory.
+ *
+ * The delta lists are stored as bit streams. These bit streams are stored
+ * in little endian order, and all offsets into DeltaMemory are bit
+ * offsets.
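+ *
+ * As a small illustration of the delta encoding itself: a delta list
+ * holding the addresses 1000, 1010, and 1017 stores (in effect) the deltas
+ * 1000, 10, and 7, so the cost of an entry depends on the gap from its
+ * predecessor rather than on the absolute address.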
+ * + * All entries are stored as a fixed length payload (the value) followed by a + * variable length key (the delta). Always strictly in little endian order. + * + * A collision entry is used when two block names have the same delta list + * address. A collision entry is encoded with DELTA==0, and has 256 + * extension bits containing the full block name. + * + * There is a special exception to be noted. The DELTA==0 encoding usually + * indicates a collision with the preceding entry. But for the first entry + * in any delta list there is no preceding entry, so the DELTA==0 encoding + * at the beginning of a delta list indicates a normal entry. + * + * The Huffman code is driven by 3 parameters: + * + * MINBITS This is the number of bits in the smallest code + * + * BASE This is the number of values coded using a code of length MINBITS + * + * INCR This is the number of values coded by using one additional bit. + * + * These parameters are related by: + * + * BASE + INCR == 1 << MINBITS + * + * When we create an index, we need to know the mean delta. From the mean + * delta, we compute these three parameters. The math for the Huffman code + * of an exponential distribution says that we compute: + * + * INCR = log(2) * MEAN_DELTA + * + * Then we find the smallest MINBITS so that + * + * 1 << MINBITS > INCR + * + * And then: + * + * BASE = (1 << MINBITS) - INCR + * + * Now we need a code such that + * + * - The first BASE values code using MINBITS bits + * - The next INCR values code using MINBITS+1 bits. + * - The next INCR values code using MINBITS+2 bits. + * - The next INCR values code using MINBITS+3 bits. + * - (and so on). + * + * ENCODE(DELTA): + * + * if (DELTA < BASE) { + * put DELTA in MINBITS bits; + * } else { + * T1 = (DELTA - BASE) % INCR + BASE; + * T2 = (DELTA - BASE) / INCR; + * put T1 in MINBITS bits; + * put 0 in T2 bits; + * put 1 in 1 bit; + * } + * + * DECODE(BIT_STREAM): + * + * T1 = next MINBITS bits of stream; + * if (T1 < BASE) { + * DELTA = T1; + * } else { + * Scan bits in the stream until reading a 1, + * setting T2 to the number of 0 bits read; + * DELTA = T2 * INCR + T1; + * } + * + * The bit field utilities that we use on the delta lists assume that it is + * possible to read a few bytes beyond the end of the bit field. So we + * make sure to allocates some extra bytes at the end of memory containing + * the delta lists. Look for POST_FIELD_GUARD_BYTES to find the code + * related to this. + * + * And note that the decode bit stream code includes a step that skips over + * 0 bits until the first 1 bit is found. A corrupted delta list could + * cause this step to run off the end of the delta list memory. As an + * extra protection against this happening, the guard bytes at the end + * should be set to all ones. + */ + +/** + * Constants and structures for the saved delta index. "DI" is for + * deltaIndex, and -##### is a number to increment when the format of the + * data changes. + **/ +enum { MAGIC_SIZE = 8 }; +static const char MAGIC_DI_START[] = "DI-00002"; + +struct di_header { + char magic[MAGIC_SIZE]; // MAGIC_DI_START + uint32_t zoneNumber; + uint32_t numZones; + uint32_t firstList; + uint32_t numLists; + uint64_t recordCount; + uint64_t collisionCount; +}; + +//********************************************************************** +// Methods for dealing with mutable delta list headers +//********************************************************************** + +/** + * Move the start of the delta list bit stream without moving the end. 
+ * + * @param deltaList The delta list header + * @param increment The change in the start of the delta list + **/ +static INLINE void moveDeltaListStart(DeltaList *deltaList, int increment) +{ + deltaList->startOffset += increment; + deltaList->size -= increment; +} + +/** + * Move the end of the delta list bit stream without moving the start. + * + * @param deltaList The delta list header + * @param increment The change in the end of the delta list + **/ +static INLINE void moveDeltaListEnd(DeltaList *deltaList, int increment) +{ + deltaList->size += increment; +} + +//********************************************************************** +// Methods for dealing with immutable delta list headers packed +//********************************************************************** + +// Header data used for immutable delta index pages. These data are +// followed by the delta list offset table. +typedef struct __attribute__((packed)) deltaPageHeader { + uint64_t nonce; // Externally-defined nonce + uint64_t virtualChapterNumber; // The virtual chapter number + uint16_t firstList; // Index of the first delta list on the page + uint16_t numLists; // Number of delta lists on the page +} DeltaPageHeader; + +// Immutable delta lists are packed into pages containing a header that +// encodes the delta list information into 19 bits per list (64KB bit offset) + +enum { IMMUTABLE_HEADER_SIZE = 19 }; + +/** + * Get the bit offset to the immutable delta list header + * + * @param listNumber The delta list number + * + * @return the offset of immutable delta list header + **/ +static INLINE unsigned int getImmutableHeaderOffset(unsigned int listNumber) +{ + return (sizeof(DeltaPageHeader) * CHAR_BIT + + listNumber * IMMUTABLE_HEADER_SIZE); +} + +/** + * Get the bit offset to the start of the immutable delta list bit stream + * + * @param memory The memory page containing the delta lists + * @param listNumber The delta list number + * + * @return the start of the delta list + **/ +static INLINE unsigned int getImmutableStart(const byte *memory, + unsigned int listNumber) +{ + return getField(memory, getImmutableHeaderOffset(listNumber), + IMMUTABLE_HEADER_SIZE); +} + +/** + * Set the bit offset to the start of the immutable delta list bit stream + * + * @param memory The memory page containing the delta lists + * @param listNumber The delta list number + * @param startOffset The start of the delta list + **/ +static INLINE void setImmutableStart(byte *memory, unsigned int listNumber, + unsigned int startOffset) +{ + setField(startOffset, memory, getImmutableHeaderOffset(listNumber), + IMMUTABLE_HEADER_SIZE); +} + +//********************************************************************** +// Methods for dealing with Delta List Entries +//********************************************************************** + +/** + * Decode a delta index entry delta value. The DeltaIndexEntry basically + * describes the previous list entry, and has had its offset field changed to + * point to the subsequent entry. We decode the bit stream and update the + * DeltaListEntry to describe the entry. 
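+ *
+ * As a rough worked example, assume a mean delta of 4096, for which the
+ * math in the file comment gives approximately minBits = 12,
+ * incrKeys = 2839, and minKeys = 1257. Reading t1 = 2161 from the first 12
+ * key bits (>= minKeys), then one 0 bit before the terminating 1 bit,
+ * yields delta = 1 * 2839 + 2161 = 5000.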
+ * + * @param deltaEntry The delta index entry + **/ +static INLINE void decodeDelta(DeltaIndexEntry *deltaEntry) +{ + const DeltaMemory *deltaZone = deltaEntry->deltaZone; + const byte *memory = deltaZone->memory; + uint64_t deltaOffset + = getDeltaEntryOffset(deltaEntry) + deltaEntry->valueBits; + const byte *addr = memory + deltaOffset / CHAR_BIT; + int offset = deltaOffset % CHAR_BIT; + uint32_t data = getUInt32LE(addr) >> offset; + addr += sizeof(uint32_t); + int keyBits = deltaZone->minBits; + unsigned int delta = data & ((1 << keyBits) - 1); + if (delta >= deltaZone->minKeys) { + data >>= keyBits; + if (data == 0) { + keyBits = sizeof(uint32_t) * CHAR_BIT - offset; + while ((data = getUInt32LE(addr)) == 0) { + addr += sizeof(uint32_t); + keyBits += sizeof(uint32_t) * CHAR_BIT; + } + } + keyBits += ffs(data); + delta += (keyBits - deltaZone->minBits - 1) * deltaZone->incrKeys; + } + deltaEntry->delta = delta; + deltaEntry->key += delta; + + // Check for a collision, a delta of zero not at the start of the list. + if (unlikely((delta == 0) && (deltaEntry->offset > 0))) { + deltaEntry->isCollision = true; + // The small duplication of this math in the two arms of this if statement + // makes a tiny but measurable difference in performance. + deltaEntry->entryBits = deltaEntry->valueBits + keyBits + COLLISION_BITS; + } else { + deltaEntry->isCollision = false; + deltaEntry->entryBits = deltaEntry->valueBits + keyBits; + } +} + +/** + * Delete bits from a delta list at the offset of the specified delta index + * entry. + * + * @param deltaEntry The delta index entry + * @param size The number of bits to delete + **/ +static void deleteBits(const DeltaIndexEntry *deltaEntry, int size) +{ + DeltaList *deltaList = deltaEntry->deltaList; + byte *memory = deltaEntry->deltaZone->memory; + // Compute how many bits are retained before and after the deleted bits + uint32_t totalSize = getDeltaListSize(deltaList); + uint32_t beforeSize = deltaEntry->offset; + uint32_t afterSize = totalSize - deltaEntry->offset - size; + + // Determine whether to add to the available space either before or after + // the delta list. We prefer to move the least amount of data. If it is + // exactly the same, try to add to the smaller amount of free space. + bool beforeFlag; + if (beforeSize < afterSize) { + beforeFlag = true; + } else if (afterSize < beforeSize) { + beforeFlag = false; + } else { + uint64_t freeBefore + = getDeltaListStart(&deltaList[0]) - getDeltaListEnd(&deltaList[-1]); + uint64_t freeAfter + = getDeltaListStart(&deltaList[1]) - getDeltaListEnd(&deltaList[ 0]); + beforeFlag = freeBefore < freeAfter; + } + + uint64_t source, destination; + uint32_t count; + if (beforeFlag) { + source = getDeltaListStart(deltaList); + destination = source + size; + moveDeltaListStart(deltaList, size); + count = beforeSize; + } else { + moveDeltaListEnd(deltaList, -size); + destination = getDeltaListStart(deltaList) + deltaEntry->offset; + source = destination + size; + count = afterSize; + } + moveBits(memory, source, memory, destination, count); +} + +/** + * Get the offset of the collision field in a DeltaIndexEntry + * + * @param entry The delta index record + * + * @return the offset of the start of the collision name + **/ +static INLINE uint64_t getCollisionOffset(const DeltaIndexEntry *entry) +{ + return (getDeltaEntryOffset(entry) + entry->entryBits - COLLISION_BITS); +} + +/** + * Encode a delta index entry delta. 
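+ *
+ * Continuing the illustration above (meanDelta = 4096, so roughly
+ * minBits = 12, incrKeys = 2839, minKeys = 1257): a delta of 800 is below
+ * minKeys and is written directly in 12 bits, while a delta of 5000 is
+ * written as t1 = (5000 - 1257) % 2839 + 1257 = 2161 in 12 bits, followed
+ * by t2 = 1 zero bit and a terminating 1 bit, for 14 key bits in total.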
+ * + * @param deltaEntry The delta index entry + **/ +static void encodeDelta(const DeltaIndexEntry *deltaEntry) +{ + const DeltaMemory *deltaZone = deltaEntry->deltaZone; + byte *memory = deltaZone->memory; + uint64_t offset = getDeltaEntryOffset(deltaEntry) + deltaEntry->valueBits; + if (deltaEntry->delta < deltaZone->minKeys) { + setField(deltaEntry->delta, memory, offset, deltaZone->minBits); + return; + } + unsigned int temp = deltaEntry->delta - deltaZone->minKeys; + unsigned int t1 = (temp % deltaZone->incrKeys) + deltaZone->minKeys; + unsigned int t2 = temp / deltaZone->incrKeys; + setField(t1, memory, offset, deltaZone->minBits); + setZero(memory, offset + deltaZone->minBits, t2); + setOne(memory, offset + deltaZone->minBits + t2, 1); +} + +/** + * Encode a delta index entry. + * + * @param deltaEntry The delta index entry + * @param value The value associated with the entry + * @param name For collision entries, the 256 bit full name. + **/ +static void encodeEntry(const DeltaIndexEntry *deltaEntry, unsigned int value, + const byte *name) +{ + byte *memory = deltaEntry->deltaZone->memory; + uint64_t offset = getDeltaEntryOffset(deltaEntry); + setField(value, memory, offset, deltaEntry->valueBits); + encodeDelta(deltaEntry); + if (name != NULL) { + setBytes(memory, getCollisionOffset(deltaEntry), name, COLLISION_BYTES); + } +} + +/** + * Insert bits into a delta list at the offset of the specified delta index + * entry. + * + * @param deltaEntry The delta index entry + * @param size The number of bits to insert + * + * @return UDS_SUCCESS or an error code + **/ +static int insertBits(DeltaIndexEntry *deltaEntry, int size) +{ + DeltaMemory *deltaZone = deltaEntry->deltaZone; + DeltaList *deltaList = deltaEntry->deltaList; + // Compute how many bits are in use before and after the inserted bits + uint32_t totalSize = getDeltaListSize(deltaList); + uint32_t beforeSize = deltaEntry->offset; + uint32_t afterSize = totalSize - deltaEntry->offset; + if ((unsigned int) (totalSize + size) > UINT16_MAX) { + deltaEntry->listOverflow = true; + deltaZone->overflowCount++; + return UDS_OVERFLOW; + } + + // Compute how many bits are available before and after the delta list + uint64_t freeBefore + = getDeltaListStart(&deltaList[0]) - getDeltaListEnd(&deltaList[-1]); + uint64_t freeAfter + = getDeltaListStart(&deltaList[1]) - getDeltaListEnd(&deltaList[ 0]); + + bool beforeFlag; + if (((unsigned int) size <= freeBefore) + && ((unsigned int) size <= freeAfter)) { + // We have enough space to use either before or after the list. Prefer + // to move the least amount of data. If it is exactly the same, try to + // take from the larger amount of free space. + if (beforeSize < afterSize) { + beforeFlag = true; + } else if (afterSize < beforeSize) { + beforeFlag = false; + } else { + beforeFlag = freeBefore > freeAfter; + } + } else if ((unsigned int) size <= freeBefore) { + // There is space before but not after + beforeFlag = true; + } else if ((unsigned int) size <= freeAfter) { + // There is space after but not before + beforeFlag = false; + } else { + // Neither of the surrounding spaces is large enough for this request, + // Extend and/or rebalance the delta list memory choosing to move the + // least amount of data. 
+ unsigned int growingIndex = deltaEntry->listNumber + 1; + beforeFlag = beforeSize < afterSize; + if (!beforeFlag) { + growingIndex++; + } + int result = extendDeltaMemory(deltaZone, growingIndex, + (size + CHAR_BIT - 1) / CHAR_BIT, true); + if (result != UDS_SUCCESS) { + return result; + } + } + + uint64_t source, destination; + uint32_t count; + if (beforeFlag) { + source = getDeltaListStart(deltaList); + destination = source - size; + moveDeltaListStart(deltaList, -size); + count = beforeSize; + } else { + moveDeltaListEnd(deltaList, size); + source = getDeltaListStart(deltaList) + deltaEntry->offset; + destination = source + size; + count = afterSize; + } + byte *memory = deltaZone->memory; + moveBits(memory, source, memory, destination, count); + return UDS_SUCCESS; +} + +/** + * Get the amount of memory to allocate for each zone + * + * @param numZones The number of zones in the index + * @param memorySize The number of bytes in memory for the index + * + * @return the number of bytes to allocate for a single zone + **/ +static INLINE size_t getZoneMemorySize(unsigned int numZones, + size_t memorySize) +{ + size_t zoneSize = memorySize / numZones; + // Round the size up so that each zone is a multiple of 64K in size. + enum { ALLOC_BOUNDARY = 64 * KILOBYTE }; + return (zoneSize + ALLOC_BOUNDARY - 1) & -ALLOC_BOUNDARY; +} + +/** + * Validate delta index parameters + * + * @param meanDelta The mean delta value + * @param numPayloadBits The number of bits in the payload or value + **/ +static bool invalidParameters(unsigned int meanDelta, + unsigned int numPayloadBits) +{ + const unsigned int minDelta = 10; + const unsigned int maxDelta = 1 << MAX_FIELD_BITS; + if ((meanDelta < minDelta) || (meanDelta > maxDelta)) { + logWarning("error initializing delta index: " + "meanDelta (%u) is not in the range %u to %u", + meanDelta, minDelta, maxDelta); + return true; + } + if (numPayloadBits > MAX_FIELD_BITS) { + logWarning("error initializing delta index: Too many payload bits (%u)", + numPayloadBits); + return true; + } + return false; +} + +/** + * Set a delta index entry to be a collision + * + * @param deltaEntry The delta index entry + **/ +static void setCollision(DeltaIndexEntry *deltaEntry) +{ + deltaEntry->isCollision = true; + deltaEntry->entryBits += COLLISION_BITS; +} + +/** + * Set the delta in a delta index entry. 
+ * + * @param deltaEntry The delta index entry + * @param delta The new delta + **/ +static void setDelta(DeltaIndexEntry *deltaEntry, unsigned int delta) +{ + const DeltaMemory *deltaZone = deltaEntry->deltaZone; + deltaEntry->delta = delta; + int keyBits = (deltaZone->minBits + + ((deltaZone->incrKeys - deltaZone->minKeys + delta) + / deltaZone->incrKeys)); + deltaEntry->entryBits = deltaEntry->valueBits + keyBits; +} + +//********************************************************************** +// External functions declared in deltaIndex.h +//********************************************************************** + +int initializeDeltaIndex(DeltaIndex *deltaIndex, unsigned int numZones, + unsigned int numLists, unsigned int meanDelta, + unsigned int numPayloadBits, size_t memorySize) +{ + size_t memSize = getZoneMemorySize(numZones, memorySize); + if (invalidParameters(meanDelta, numPayloadBits)) { + return UDS_INVALID_ARGUMENT; + } + + int result = ALLOCATE(numZones, DeltaMemory, "Delta Index Zones", + &deltaIndex->deltaZones); + if (result != UDS_SUCCESS) { + return result; + } + + deltaIndex->numZones = numZones; + deltaIndex->numLists = numLists; + deltaIndex->listsPerZone = (numLists + numZones - 1) / numZones; + deltaIndex->isMutable = true; + deltaIndex->tag = 'm'; + + unsigned int z; + for (z = 0; z < numZones; z++) { + unsigned int firstListInZone = z * deltaIndex->listsPerZone; + unsigned int numListsInZone = deltaIndex->listsPerZone; + if (z == numZones - 1) { + /* + * The last zone gets fewer lists if numZones doesn't evenly divide + * numLists. We'll have an underflow if the assertion below doesn't + * hold. (And it turns out that the assertion is equivalent to + * numZones <= 1 + (numLists / numZones) + (numLists % numZones) + * in the case that numZones doesn't evenly divide numlists. + * If numLists >= numZones * numZones, then the above inequality + * will always hold.) + */ + if (deltaIndex->numLists <= firstListInZone) { + uninitializeDeltaIndex(deltaIndex); + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "%u delta-lists not enough for %u zones", + numLists, numZones); + } + numListsInZone = deltaIndex->numLists - firstListInZone; + } + int result = initializeDeltaMemory(&deltaIndex->deltaZones[z], memSize, + firstListInZone, numListsInZone, + meanDelta, numPayloadBits); + if (result != UDS_SUCCESS) { + uninitializeDeltaIndex(deltaIndex); + return result; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +static bool verifyDeltaIndexPage(uint64_t nonce, + uint16_t numLists, + uint64_t expectedNonce, + byte *memory, + size_t memSize) +{ + // Verify the nonce. A mismatch here happens in normal operation when we are + // doing a rebuild but haven't written the entire volume once. + if (nonce != expectedNonce) { + return false; + } + + // Verify that the number of delta lists can fit in the page. + if (numLists > + (memSize - sizeof(DeltaPageHeader)) * CHAR_BIT / IMMUTABLE_HEADER_SIZE) { + return false; + } + + // Verify that the first delta list is immediately after the last delta list + // header. + if (getImmutableStart(memory, 0) != getImmutableHeaderOffset(numLists + 1)) { + return false; + } + + // Verify that the lists are in the correct order. + unsigned int i; + for (i = 0; i < numLists; i++) { + if (getImmutableStart(memory, i) > getImmutableStart(memory, i + 1)) { + return false; + } + } + + // Verify that the last list ends on the page, and that there is room for the + // post-field guard bits. 
+ if (getImmutableStart(memory, numLists) + > (memSize - POST_FIELD_GUARD_BYTES) * CHAR_BIT) { + return false; + } + + // Verify that the guard bytes are correctly set to all ones. + for (i = 0; i < POST_FIELD_GUARD_BYTES; i++) { + byte guardByte = memory[memSize - POST_FIELD_GUARD_BYTES + i]; + if (guardByte != (byte) ~0) { + return false; + } + } + + // All verifications passed. + return true; +} + +/**********************************************************************/ +int initializeDeltaIndexPage(DeltaIndexPage *deltaIndexPage, + uint64_t expectedNonce, + unsigned int meanDelta, + unsigned int numPayloadBits, + byte *memory, + size_t memSize) +{ + const DeltaPageHeader *header = (const DeltaPageHeader *) memory; + + if (invalidParameters(meanDelta, numPayloadBits)) { + return UDS_INVALID_ARGUMENT; + } + + // First assume that the header is little endian + uint64_t nonce = getUInt64LE((const byte *) &header->nonce); + uint64_t vcn = getUInt64LE((const byte *) &header->virtualChapterNumber); + uint16_t firstList = getUInt16LE((const byte *) &header->firstList); + uint16_t numLists = getUInt16LE((const byte *) &header->numLists); + if (!verifyDeltaIndexPage(nonce, numLists, expectedNonce, memory, memSize)) { + // That failed, so try big endian + nonce = getUInt64BE((const byte *) &header->nonce); + vcn = getUInt64BE((const byte *) &header->virtualChapterNumber); + firstList = getUInt16BE((const byte *) &header->firstList); + numLists = getUInt16BE((const byte *) &header->numLists); + if (!verifyDeltaIndexPage(nonce, numLists, expectedNonce, memory, + memSize)) { + // Also failed. Do not log this as an error. It happens in normal + // operation when we are doing a rebuild but haven't written the entire + // volume once. + return UDS_CORRUPT_COMPONENT; + } + } + + deltaIndexPage->deltaIndex.deltaZones = &deltaIndexPage->deltaMemory; + deltaIndexPage->deltaIndex.numZones = 1; + deltaIndexPage->deltaIndex.numLists = numLists; + deltaIndexPage->deltaIndex.listsPerZone = numLists; + deltaIndexPage->deltaIndex.isMutable = false; + deltaIndexPage->deltaIndex.tag = 'p'; + deltaIndexPage->virtualChapterNumber = vcn; + deltaIndexPage->lowestListNumber = firstList; + deltaIndexPage->highestListNumber = firstList + numLists - 1; + + initializeDeltaMemoryPage(&deltaIndexPage->deltaMemory, (byte *) memory, + memSize, numLists, meanDelta, numPayloadBits); + return UDS_SUCCESS; +} + +/**********************************************************************/ +void uninitializeDeltaIndex(DeltaIndex *deltaIndex) +{ + if (deltaIndex != NULL) { + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + uninitializeDeltaMemory(&deltaIndex->deltaZones[z]); + } + FREE(deltaIndex->deltaZones); + memset(deltaIndex, 0, sizeof(DeltaIndex)); + } +} + +/**********************************************************************/ +void emptyDeltaIndex(const DeltaIndex *deltaIndex) +{ + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + emptyDeltaLists(&deltaIndex->deltaZones[z]); + } +} + +/**********************************************************************/ +void emptyDeltaIndexZone(const DeltaIndex *deltaIndex, unsigned int zoneNumber) +{ + emptyDeltaLists(&deltaIndex->deltaZones[zoneNumber]); +} + +/**********************************************************************/ +int packDeltaIndexPage(const DeltaIndex *deltaIndex, + uint64_t headerNonce, + bool headerNativeEndian, + byte *memory, + size_t memSize, + uint64_t virtualChapterNumber, + unsigned int firstList, + unsigned int *numLists) +{ + if 
(!deltaIndex->isMutable) {
+    return logErrorWithStringError(UDS_BAD_STATE,
+                                   "Cannot pack an immutable index");
+  }
+  if (deltaIndex->numZones != 1) {
+    return logErrorWithStringError(UDS_BAD_STATE,
+                                   "Cannot pack a delta index page when the"
+                                   " index has %u zones",
+                                   deltaIndex->numZones);
+  }
+  if (firstList > deltaIndex->numLists) {
+    return logErrorWithStringError(UDS_BAD_STATE,
+                                   "Cannot pack a delta index page when the"
+                                   " first list (%u) is larger than the number"
+                                   " of lists (%u)",
+                                   firstList, deltaIndex->numLists);
+  }
+
+  const DeltaMemory *deltaZone = &deltaIndex->deltaZones[0];
+  DeltaList *deltaLists = &deltaZone->deltaLists[firstList + 1];
+  unsigned int maxLists = deltaIndex->numLists - firstList;
+
+  // Compute how many lists will fit on the page
+  int numBits = memSize * CHAR_BIT;
+  // Subtract the size of the fixed header and 1 delta list offset
+  numBits -= getImmutableHeaderOffset(1);
+  // Subtract the guard bytes of memory, which allow us to freely read a
+  // short distance past the end of any byte we are interested in.
+  numBits -= POST_FIELD_GUARD_BYTES * CHAR_BIT;
+  if (numBits < IMMUTABLE_HEADER_SIZE) {
+    // This page is too small to contain even one empty delta list
+    return logErrorWithStringError(UDS_OVERFLOW,
+                                   "Chapter Index Page of %zu bytes is too"
+                                   " small",
+                                   memSize);
+  }
+
+  unsigned int nLists = 0;
+  while (nLists < maxLists) {
+    // Each list requires 1 delta list offset and the list data
+    int bits = IMMUTABLE_HEADER_SIZE + getDeltaListSize(&deltaLists[nLists]);
+    if (bits > numBits) {
+      break;
+    }
+    nLists++;
+    numBits -= bits;
+  }
+  *numLists = nLists;
+
+  // Construct the page header
+  DeltaPageHeader *header = (DeltaPageHeader *) memory;
+  if (headerNativeEndian) {
+    header->nonce = headerNonce;
+    header->virtualChapterNumber = virtualChapterNumber;
+    header->firstList = firstList;
+    header->numLists = nLists;
+  } else {
+    storeUInt64LE((byte *) &header->nonce, headerNonce);
+    storeUInt64LE((byte *) &header->virtualChapterNumber,
+                  virtualChapterNumber);
+    storeUInt16LE((byte *) &header->firstList, firstList);
+    storeUInt16LE((byte *) &header->numLists, nLists);
+  }
+
+  // Construct the delta list offset table, making sure that the memory
+  // page is large enough.
+  unsigned int offset = getImmutableHeaderOffset(nLists + 1);
+  setImmutableStart(memory, 0, offset);
+  unsigned int i;
+  for (i = 0; i < nLists; i++) {
+    offset += getDeltaListSize(&deltaLists[i]);
+    setImmutableStart(memory, i + 1, offset);
+  }
+
+  // Copy the delta list data onto the memory page
+  for (i = 0; i < nLists; i++) {
+    DeltaList *deltaList = &deltaLists[i];
+    moveBits(deltaZone->memory, getDeltaListStart(deltaList), memory,
+             getImmutableStart(memory, i), getDeltaListSize(deltaList));
+  }
+
+  // Set all the bits in the guard bytes. Do not use the bit field
+  // utilities.
+ memset(memory + memSize - POST_FIELD_GUARD_BYTES, ~0, + POST_FIELD_GUARD_BYTES); + return UDS_SUCCESS; +} + + +/**********************************************************************/ +void setDeltaIndexTag(DeltaIndex *deltaIndex, byte tag) +{ + deltaIndex->tag = tag; + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + deltaIndex->deltaZones[z].tag = tag; + } +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeDeltaIndexHeader(Buffer *buffer, struct di_header *header) +{ + int result = getBytesFromBuffer(buffer, MAGIC_SIZE, &header->magic); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->zoneNumber); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->numZones); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->firstList); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->numLists); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &header->recordCount); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &header->collisionCount); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer) - contentLength(buffer), + bufferLength(buffer)); + return result; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int readDeltaIndexHeader(BufferedReader *reader, + struct di_header *header) +{ + Buffer *buffer; + + int result = makeBuffer(sizeof(*header), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logWarningWithStringError(result, + "failed to read delta index header"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = decodeDeltaIndexHeader(buffer, header); + freeBuffer(&buffer); + return result; +} + +/**********************************************************************/ +int startRestoringDeltaIndex(const DeltaIndex *deltaIndex, + BufferedReader **bufferedReaders, + int numReaders) +{ + if (!deltaIndex->isMutable) { + return logErrorWithStringError(UDS_BAD_STATE, + "Cannot restore to an immutable index"); + } + if (numReaders <= 0) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "No delta index files"); + } + + unsigned int numZones = numReaders; + if (numZones > MAX_ZONES) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "zone count %u must not exceed MAX_ZONES", + numZones); + } + + unsigned long recordCount = 0; + unsigned long collisionCount = 0; + unsigned int firstList[MAX_ZONES]; + unsigned int numLists[MAX_ZONES]; + BufferedReader *reader[MAX_ZONES]; + bool zoneFlags[MAX_ZONES] = { false, }; + + // Read the header from each file, and make sure we have a matching set + unsigned int z; + for (z = 0; z < numZones; z++) { + struct di_header header; + int result = readDeltaIndexHeader(bufferedReaders[z], &header); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to read delta index header"); + } + if 
(memcmp(header.magic, MAGIC_DI_START, MAGIC_SIZE) != 0) {
+      return logWarningWithStringError(UDS_CORRUPT_COMPONENT,
+                                       "delta index file has bad magic"
+                                       " number");
+    }
+    if (numZones != header.numZones) {
+      return logWarningWithStringError(UDS_CORRUPT_COMPONENT,
+                                       "delta index files contain mismatched"
+                                       " zone counts (%u,%u)",
+                                       numZones, header.numZones);
+    }
+    if (header.zoneNumber >= numZones) {
+      return logWarningWithStringError(UDS_CORRUPT_COMPONENT,
+                                       "delta index file contains zone %u of"
+                                       " %u zones",
+                                       header.zoneNumber, numZones);
+    }
+    if (zoneFlags[header.zoneNumber]) {
+      return logWarningWithStringError(UDS_CORRUPT_COMPONENT,
+                                       "delta index files contain zone %u"
+                                       " twice",
+                                       header.zoneNumber);
+    }
+    reader[header.zoneNumber] = bufferedReaders[z];
+    firstList[header.zoneNumber] = header.firstList;
+    numLists[header.zoneNumber] = header.numLists;
+    zoneFlags[header.zoneNumber] = true;
+    recordCount += header.recordCount;
+    collisionCount += header.collisionCount;
+  }
+  unsigned int listNext = 0;
+  for (z = 0; z < numZones; z++) {
+    if (firstList[z] != listNext) {
+      return logWarningWithStringError(UDS_CORRUPT_COMPONENT,
+                                       "delta index file for zone %u starts"
+                                       " with list %u instead of list %u",
+                                       z, firstList[z], listNext);
+    }
+    listNext += numLists[z];
+  }
+  if (listNext != deltaIndex->numLists) {
+    return logWarningWithStringError(UDS_CORRUPT_COMPONENT,
+                                     "delta index files contain %u delta lists"
+                                     " instead of %u delta lists",
+                                     listNext, deltaIndex->numLists);
+  }
+  if (collisionCount > recordCount) {
+    return logWarningWithStringError(UDS_CORRUPT_COMPONENT,
+                                     "delta index files contain %ld collisions"
+                                     " and %ld records",
+                                     collisionCount, recordCount);
+  }
+
+  emptyDeltaIndex(deltaIndex);
+  deltaIndex->deltaZones[0].recordCount = recordCount;
+  deltaIndex->deltaZones[0].collisionCount = collisionCount;
+
+  // Read the delta list sizes from the files, and distribute each of them
+  // to the proper zone
+  for (z = 0; z < numZones; z++) {
+    unsigned int i;
+    for (i = 0; i < numLists[z]; i++) {
+      byte deltaListSizeData[sizeof(uint16_t)];
+      int result = readFromBufferedReader(reader[z], deltaListSizeData,
+                                          sizeof(deltaListSizeData));
+      if (result != UDS_SUCCESS) {
+        return logWarningWithStringError(result,
+                                         "failed to read delta index size");
+      }
+      uint16_t deltaListSize = getUInt16LE(deltaListSizeData);
+      unsigned int listNumber = firstList[z] + i;
+      unsigned int zoneNumber = getDeltaIndexZone(deltaIndex, listNumber);
+      const DeltaMemory *deltaZone = &deltaIndex->deltaZones[zoneNumber];
+      listNumber -= deltaZone->firstList;
+      deltaZone->deltaLists[listNumber + 1].size = deltaListSize;
+    }
+  }
+
+  // Prepare each zone to start receiving the delta list data
+  for (z = 0; z < deltaIndex->numZones; z++) {
+    int result = startRestoringDeltaMemory(&deltaIndex->deltaZones[z]);
+    if (result != UDS_SUCCESS) {
+      return result;
+    }
+  }
+  return UDS_SUCCESS;
+}
+
+/**********************************************************************/
+bool isRestoringDeltaIndexDone(const DeltaIndex *deltaIndex)
+{
+  unsigned int z;
+  for (z = 0; z < deltaIndex->numZones; z++) {
+    if (!areDeltaMemoryTransfersDone(&deltaIndex->deltaZones[z])) {
+      return false;
+    }
+  }
+  return true;
+}
+
+/**********************************************************************/
+int restoreDeltaListToDeltaIndex(const DeltaIndex *deltaIndex,
+                                 const DeltaListSaveInfo *dlsi,
+                                 const byte data[DELTA_LIST_MAX_BYTE_COUNT])
+{
+  // Make sure the data are intended for this delta list.
Do not + // log an error, as this may be valid data for another delta index. + if (dlsi->tag != deltaIndex->tag) { + return UDS_CORRUPT_COMPONENT; + } + + if (dlsi->index >= deltaIndex->numLists) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "invalid delta list number %u of %u", + dlsi->index, deltaIndex->numLists); + } + + unsigned int zoneNumber = getDeltaIndexZone(deltaIndex, dlsi->index); + return restoreDeltaList(&deltaIndex->deltaZones[zoneNumber], dlsi, data); +} + +/**********************************************************************/ +void abortRestoringDeltaIndex(const DeltaIndex *deltaIndex) +{ + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + abortRestoringDeltaMemory(&deltaIndex->deltaZones[z]); + } +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int encodeDeltaIndexHeader(Buffer *buffer, struct di_header *header) +{ + int result = putBytes(buffer, MAGIC_SIZE, MAGIC_DI_START); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->zoneNumber); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->numZones); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->firstList); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->numLists); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->recordCount); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->collisionCount); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(*header), + "%zu bytes encoded of %zu expected", + contentLength(buffer), sizeof(*header)); + + return result; +} + +/**********************************************************************/ +int startSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter) +{ + DeltaMemory *deltaZone = &deltaIndex->deltaZones[zoneNumber]; + struct di_header header; + memcpy(header.magic, MAGIC_DI_START, MAGIC_SIZE); + header.zoneNumber = zoneNumber; + header.numZones = deltaIndex->numZones; + header.firstList = deltaZone->firstList; + header.numLists = deltaZone->numLists; + header.recordCount = deltaZone->recordCount; + header.collisionCount = deltaZone->collisionCount; + + Buffer *buffer; + int result = makeBuffer(sizeof(struct di_header), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = encodeDeltaIndexHeader(buffer, &header); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(bufferedWriter, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write delta index header"); + } + + unsigned int i; + for (i = 0; i < deltaZone->numLists; i++) { + uint16_t deltaListSize = getDeltaListSize(&deltaZone->deltaLists[i + 1]); + byte data[2]; + storeUInt16LE(data, deltaListSize); + result = writeToBufferedWriter(bufferedWriter, data, sizeof(data)); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write delta list size"); + } + } + + startSavingDeltaMemory(deltaZone, bufferedWriter); + return UDS_SUCCESS; +} + 
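+/*
+ * Editorial note (not part of the original documentation): saving is split
+ * across startSavingDeltaIndex() above and the isSavingDeltaIndexDone(),
+ * finishSavingDeltaIndex(), and abortSavingDeltaIndex() functions below.
+ * Assuming the caller has already opened one BufferedWriter per zone, a
+ * rough sketch of driving a single zone might look like this (error
+ * handling abbreviated; deltaIndex, zoneNumber, and writer are placeholder
+ * names for caller state):
+ *
+ *   int result = startSavingDeltaIndex(deltaIndex, zoneNumber, writer);
+ *   if (result == UDS_SUCCESS) {
+ *     while (!isSavingDeltaIndexDone(deltaIndex, zoneNumber)) {
+ *       // wait for asynchronous delta list transfers to complete
+ *     }
+ *     result = finishSavingDeltaIndex(deltaIndex, zoneNumber);
+ *   }
+ *   if (result != UDS_SUCCESS) {
+ *     (void) abortSavingDeltaIndex(deltaIndex, zoneNumber);
+ *   }
+ */
+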
+/**********************************************************************/ +bool isSavingDeltaIndexDone(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + return areDeltaMemoryTransfersDone(&deltaIndex->deltaZones[zoneNumber]); +} + +/**********************************************************************/ +int finishSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + return finishSavingDeltaMemory(&deltaIndex->deltaZones[zoneNumber]); +} + +/**********************************************************************/ +int abortSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + abortSavingDeltaMemory(&deltaIndex->deltaZones[zoneNumber]); + return UDS_SUCCESS; +} + +/**********************************************************************/ +size_t computeDeltaIndexSaveBytes(unsigned int numLists, size_t memorySize) +{ + // The exact amount of memory used depends upon the number of zones. + // Compute the maximum potential memory size. + size_t maxMemSize = memorySize; + unsigned int numZones; + for (numZones = 1; numZones <= MAX_ZONES; numZones++) { + size_t memSize = getZoneMemorySize(numZones, memorySize); + if (memSize > maxMemSize) { + maxMemSize = memSize; + } + } + // Saving a delta index requires a header ... + return (sizeof(struct di_header) + // ... plus a DeltaListSaveInfo per delta list + // plus an extra byte per delta list ... + + numLists * (sizeof(DeltaListSaveInfo) + 1) + // ... plus the delta list memory + + maxMemSize); +} + +/**********************************************************************/ +int validateDeltaIndex(const DeltaIndex *deltaIndex) +{ + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + int result = validateDeltaLists(&deltaIndex->deltaZones[z]); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int assertNotAtEnd(const DeltaIndexEntry *deltaEntry, int errorCode) +{ + return ASSERT_WITH_ERROR_CODE(!deltaEntry->atEnd, errorCode, + "operation is invalid because the list entry " + "is at the end of the delta list"); +} + +/**********************************************************************/ +static void prefetchDeltaList(const DeltaMemory *deltaZone, + const DeltaList *deltaList) +{ + const byte *memory = deltaZone->memory; + const byte *addr = &memory[getDeltaListStart(deltaList) / CHAR_BIT]; + unsigned int size = getDeltaListSize(deltaList) / CHAR_BIT; + prefetchRange(addr, size, false); +} + +/**********************************************************************/ +int startDeltaIndexSearch(const DeltaIndex *deltaIndex, + unsigned int listNumber, unsigned int key, + bool readOnly, DeltaIndexEntry *deltaEntry) +{ + int result + = ASSERT_WITH_ERROR_CODE((listNumber < deltaIndex->numLists), + UDS_CORRUPT_DATA, + "Delta list number (%u) is out of range (%u)", + listNumber, deltaIndex->numLists); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int zoneNumber = getDeltaIndexZone(deltaIndex, listNumber); + DeltaMemory *deltaZone = &deltaIndex->deltaZones[zoneNumber]; + listNumber -= deltaZone->firstList; + result = ASSERT_WITH_ERROR_CODE((listNumber < deltaZone->numLists), + UDS_CORRUPT_DATA, + "Delta list number (%u)" + " is out of range (%u) for zone (%u)", + listNumber, deltaZone->numLists, zoneNumber); + if (result != UDS_SUCCESS) { + return result; + } + + DeltaList *deltaList; + if (deltaIndex->isMutable) { + deltaList = 
&deltaZone->deltaLists[listNumber + 1]; + if (!readOnly) { + // Here is the lazy writing of the index for a checkpoint + lazyFlushDeltaList(deltaZone, listNumber); + } + } else { + // Translate the immutable delta list header into a temporary full + // delta list header + deltaList = &deltaEntry->tempDeltaList; + deltaList->startOffset = getImmutableStart(deltaZone->memory, listNumber); + unsigned int endOffset = getImmutableStart(deltaZone->memory, + listNumber + 1); + deltaList->size = endOffset - deltaList->startOffset; + deltaList->saveKey = 0; + deltaList->saveOffset = 0; + } + + if (key > deltaList->saveKey) { + deltaEntry->key = deltaList->saveKey; + deltaEntry->offset = deltaList->saveOffset; + } else { + deltaEntry->key = 0; + deltaEntry->offset = 0; + if (key == 0) { + // This usually means we're about to walk the entire delta list, so get + // all of it into the CPU cache. + prefetchDeltaList(deltaZone, deltaList); + } + } + + deltaEntry->atEnd = false; + deltaEntry->deltaZone = deltaZone; + deltaEntry->deltaList = deltaList; + deltaEntry->entryBits = 0; + deltaEntry->isCollision = false; + deltaEntry->listNumber = listNumber; + deltaEntry->listOverflow = false; + deltaEntry->valueBits = deltaZone->valueBits; + return UDS_SUCCESS; +} + +/**********************************************************************/ +__attribute__((__noinline__)) +int nextDeltaIndexEntry(DeltaIndexEntry *deltaEntry) +{ + int result = assertNotAtEnd(deltaEntry, UDS_BAD_STATE); + if (result != UDS_SUCCESS) { + return result; + } + + const DeltaList *deltaList = deltaEntry->deltaList; + deltaEntry->offset += deltaEntry->entryBits; + unsigned int size = getDeltaListSize(deltaList); + if (unlikely(deltaEntry->offset >= size)) { + deltaEntry->atEnd = true; + deltaEntry->delta = 0; + deltaEntry->isCollision = false; + return ASSERT_WITH_ERROR_CODE((deltaEntry->offset == size), + UDS_CORRUPT_DATA, + "next offset past end of delta list"); + } + + decodeDelta(deltaEntry); + + unsigned int nextOffset = deltaEntry->offset + deltaEntry->entryBits; + if (nextOffset > size) { + // This is not an assertion because validateChapterIndexPage() wants to + // handle this error. 
+ logWarning("Decoded past the end of the delta list"); + return UDS_CORRUPT_DATA; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int rememberDeltaIndexOffset(const DeltaIndexEntry *deltaEntry) +{ + int result = ASSERT(!deltaEntry->isCollision, "entry is not a collision"); + if (result != UDS_SUCCESS) { + return result; + } + + DeltaList *deltaList = deltaEntry->deltaList; + deltaList->saveKey = deltaEntry->key - deltaEntry->delta; + deltaList->saveOffset = deltaEntry->offset; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getDeltaIndexEntry(const DeltaIndex *deltaIndex, unsigned int listNumber, + unsigned int key, const byte *name, bool readOnly, + DeltaIndexEntry *deltaEntry) +{ + int result = startDeltaIndexSearch(deltaIndex, listNumber, key, readOnly, + deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + do { + result = nextDeltaIndexEntry(deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + } while (!deltaEntry->atEnd && (key > deltaEntry->key)); + + result = rememberDeltaIndexOffset(deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + + if (!deltaEntry->atEnd && (key == deltaEntry->key)) { + DeltaIndexEntry collisionEntry; + collisionEntry = *deltaEntry; + for (;;) { + result = nextDeltaIndexEntry(&collisionEntry); + if (result != UDS_SUCCESS) { + return result; + } + if (collisionEntry.atEnd || !collisionEntry.isCollision) { + break; + } + byte collisionName[COLLISION_BYTES]; + getBytes(deltaEntry->deltaZone->memory, + getCollisionOffset(&collisionEntry), collisionName, + COLLISION_BYTES); + if (memcmp(collisionName, name, COLLISION_BYTES) == 0) { + *deltaEntry = collisionEntry; + break; + } + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getDeltaEntryCollision(const DeltaIndexEntry *deltaEntry, byte *name) +{ + int result = assertNotAtEnd(deltaEntry, UDS_BAD_STATE); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_WITH_ERROR_CODE(deltaEntry->isCollision, UDS_BAD_STATE, + "Cannot get full block name from a" + " non-collision delta index entry"); + if (result != UDS_SUCCESS) { + return result; + } + + getBytes(deltaEntry->deltaZone->memory, getCollisionOffset(deltaEntry), + name, COLLISION_BYTES); + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int assertMutableEntry(const DeltaIndexEntry *deltaEntry) +{ + return ASSERT_WITH_ERROR_CODE(deltaEntry->deltaList + != &deltaEntry->tempDeltaList, + UDS_BAD_STATE, + "delta index is mutable"); +} + +/**********************************************************************/ +int setDeltaEntryValue(const DeltaIndexEntry *deltaEntry, unsigned int value) +{ + int result = assertMutableEntry(deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + result = assertNotAtEnd(deltaEntry, UDS_BAD_STATE); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT_WITH_ERROR_CODE(((value & ((1 << deltaEntry->valueBits) - 1)) + == value), UDS_INVALID_ARGUMENT, + "Value (%u) being set in a delta index is " + "too large (must fit in %u bits)", + value, deltaEntry->valueBits); + if (result != UDS_SUCCESS) { + return result; + } + + setField(value, deltaEntry->deltaZone->memory, + getDeltaEntryOffset(deltaEntry), deltaEntry->valueBits); + return UDS_SUCCESS; +} + +/**********************************************************************/ 
+int putDeltaIndexEntry(DeltaIndexEntry *deltaEntry, unsigned int key,
+                       unsigned int value, const byte *name)
+{
+  int result = assertMutableEntry(deltaEntry);
+  if (result != UDS_SUCCESS) {
+    return result;
+  }
+  if (deltaEntry->isCollision) {
+    /*
+     * The caller wants us to insert a collision entry onto a collision
+     * entry. This happens when we find a collision and attempt to add the
+     * name again to the index. This is normally a fatal error unless we
+     * are replaying a closed chapter while we are rebuilding a master
+     * index.
+     */
+    return UDS_DUPLICATE_NAME;
+  }
+
+  if (deltaEntry->offset < deltaEntry->deltaList->saveOffset) {
+    // The saved entry offset is after the new entry and will no longer be
+    // valid, so replace it with the insertion point.
+    result = rememberDeltaIndexOffset(deltaEntry);
+    if (result != UDS_SUCCESS) {
+      return result;
+    }
+  }
+
+  if (name != NULL) {
+    // We are inserting a collision entry which is placed after this entry
+    result = assertNotAtEnd(deltaEntry, UDS_BAD_STATE);
+    if (result != UDS_SUCCESS) {
+      return result;
+    }
+    result = ASSERT((key == deltaEntry->key),
+                    "incorrect key for collision entry");
+    if (result != UDS_SUCCESS) {
+      return result;
+    }
+
+    deltaEntry->offset += deltaEntry->entryBits;
+    setDelta(deltaEntry, 0);
+    setCollision(deltaEntry);
+    result = insertBits(deltaEntry, deltaEntry->entryBits);
+  } else if (deltaEntry->atEnd) {
+    // We are inserting a new entry at the end of the delta list
+    result = ASSERT((key >= deltaEntry->key), "key past end of list");
+    if (result != UDS_SUCCESS) {
+      return result;
+    }
+
+    setDelta(deltaEntry, key - deltaEntry->key);
+    deltaEntry->key = key;
+    deltaEntry->atEnd = false;
+    result = insertBits(deltaEntry, deltaEntry->entryBits);
+  } else {
+    // We are inserting a new entry which requires the delta in the
+    // following entry to be updated.
+    result = ASSERT((key < deltaEntry->key), "key precedes following entry");
+    if (result != UDS_SUCCESS) {
+      return result;
+    }
+    result = ASSERT((key >= deltaEntry->key - deltaEntry->delta),
+                    "key affects the following entry's delta");
+    if (result != UDS_SUCCESS) {
+      return result;
+    }
+
+    int oldEntrySize = deltaEntry->entryBits;
+    DeltaIndexEntry nextEntry = *deltaEntry;
+    unsigned int nextValue = getDeltaEntryValue(&nextEntry);
+    setDelta(deltaEntry, key - (deltaEntry->key - deltaEntry->delta));
+    deltaEntry->key = key;
+    setDelta(&nextEntry, nextEntry.key - key);
+    nextEntry.offset += deltaEntry->entryBits;
+    // The 2 new entries are always bigger than the 1 entry we are replacing
+    int additionalSize
+      = deltaEntry->entryBits + nextEntry.entryBits - oldEntrySize;
+    result = insertBits(deltaEntry, additionalSize);
+    if (result != UDS_SUCCESS) {
+      return result;
+    }
+    encodeEntry(&nextEntry, nextValue, NULL);
+  }
+  if (result != UDS_SUCCESS) {
+    return result;
+  }
+  encodeEntry(deltaEntry, value, name);
+
+  DeltaMemory *deltaZone = deltaEntry->deltaZone;
+  deltaZone->recordCount++;
+  deltaZone->collisionCount += deltaEntry->isCollision ?
1 : 0; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int removeDeltaIndexEntry(DeltaIndexEntry *deltaEntry) +{ + int result = assertMutableEntry(deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + + DeltaIndexEntry nextEntry = *deltaEntry; + result = nextDeltaIndexEntry(&nextEntry); + if (result != UDS_SUCCESS) { + return result; + } + + DeltaMemory *deltaZone = deltaEntry->deltaZone; + + if (deltaEntry->isCollision) { + // This is a collision entry, so just remove it + deleteBits(deltaEntry, deltaEntry->entryBits); + nextEntry.offset = deltaEntry->offset; + deltaZone->collisionCount -= 1; + } else if (nextEntry.atEnd) { + // This entry is at the end of the list, so just remove it + deleteBits(deltaEntry, deltaEntry->entryBits); + nextEntry.key -= deltaEntry->delta; + nextEntry.offset = deltaEntry->offset; + } else { + // The delta in the next entry needs to be updated. + unsigned int nextValue = getDeltaEntryValue(&nextEntry); + int oldSize = deltaEntry->entryBits + nextEntry.entryBits; + if (nextEntry.isCollision) { + // The next record is a collision. It needs to be rewritten as a + // non-collision with a larger delta. + nextEntry.isCollision = false; + deltaZone->collisionCount -= 1; + } + setDelta(&nextEntry, deltaEntry->delta + nextEntry.delta); + nextEntry.offset = deltaEntry->offset; + // The 1 new entry is always smaller than the 2 entries we are replacing + deleteBits(deltaEntry, oldSize - nextEntry.entryBits); + encodeEntry(&nextEntry, nextValue, NULL); + } + deltaZone->recordCount--; + deltaZone->discardCount++; + *deltaEntry = nextEntry; + + DeltaList *deltaList = deltaEntry->deltaList; + if (deltaEntry->offset < deltaList->saveOffset) { + // The saved entry offset is after the entry we just removed and it + // will no longer be valid. We must force the next search to start at + // the beginning. 
+ deltaList->saveKey = 0; + deltaList->saveOffset = 0; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +unsigned int getDeltaIndexZoneFirstList(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + return deltaIndex->deltaZones[zoneNumber].firstList; +} + +/**********************************************************************/ +unsigned int getDeltaIndexZoneNumLists(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + return deltaIndex->deltaZones[zoneNumber].numLists; +} + +/**********************************************************************/ +uint64_t getDeltaIndexZoneDlistBitsUsed(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + uint64_t bitCount = 0; + const DeltaMemory *deltaZone = &deltaIndex->deltaZones[zoneNumber]; + unsigned int i; + for (i = 0; i < deltaZone->numLists; i++) { + bitCount += getDeltaListSize(&deltaZone->deltaLists[i + 1]); + } + return bitCount; +} + +/**********************************************************************/ +uint64_t getDeltaIndexDlistBitsUsed(const DeltaIndex *deltaIndex) +{ + uint64_t bitCount = 0; + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + bitCount += getDeltaIndexZoneDlistBitsUsed(deltaIndex, z); + } + return bitCount; +} + +/**********************************************************************/ +uint64_t getDeltaIndexDlistBitsAllocated(const DeltaIndex *deltaIndex) +{ + uint64_t byteCount = 0; + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + const DeltaMemory *deltaZone = &deltaIndex->deltaZones[z]; + byteCount += deltaZone->size; + } + return byteCount * CHAR_BIT; +} + +/**********************************************************************/ +void getDeltaIndexStats(const DeltaIndex *deltaIndex, DeltaIndexStats *stats) +{ + memset(stats, 0, sizeof(DeltaIndexStats)); + stats->memoryAllocated = deltaIndex->numZones * sizeof(DeltaMemory); + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + const DeltaMemory *deltaZone = &deltaIndex->deltaZones[z]; + stats->memoryAllocated += getDeltaMemoryAllocated(deltaZone); + stats->rebalanceTime += deltaZone->rebalanceTime; + stats->rebalanceCount += deltaZone->rebalanceCount; + stats->recordCount += deltaZone->recordCount; + stats->collisionCount += deltaZone->collisionCount; + stats->discardCount += deltaZone->discardCount; + stats->overflowCount += deltaZone->overflowCount; + stats->numLists += deltaZone->numLists; + } +} + +/**********************************************************************/ +unsigned int getDeltaIndexPageCount(unsigned int numEntries, + unsigned int numLists, + unsigned int meanDelta, + unsigned int numPayloadBits, + size_t bytesPerPage) +{ + // Compute the number of bits needed for all the entries + size_t bitsPerIndex + = getDeltaMemorySize(numEntries, meanDelta, numPayloadBits); + // Compute the number of bits needed for a single delta list + unsigned int bitsPerDeltaList = bitsPerIndex / numLists; + // Adjust the bits per index, adding the immutable delta list headers + bitsPerIndex += numLists * IMMUTABLE_HEADER_SIZE; + // Compute the number of usable bits on an immutable index page + unsigned int bitsPerPage + = (bytesPerPage - sizeof(DeltaPageHeader)) * CHAR_BIT; + // Adjust the bits per page, taking away one immutable delta list header + // and one delta list representing internal fragmentation + bitsPerPage -= IMMUTABLE_HEADER_SIZE + bitsPerDeltaList; + // Now compute the number of pages needed + return (bitsPerIndex + 
bitsPerPage - 1) / bitsPerPage; +} + +/**********************************************************************/ +void logDeltaIndexEntry(DeltaIndexEntry *deltaEntry) +{ + logRatelimit(logInfo, "List 0x%X Key 0x%X Offset 0x%X%s%s ListSize 0x%X%s", + deltaEntry->listNumber, deltaEntry->key, deltaEntry->offset, + deltaEntry->atEnd ? " end" : "", + deltaEntry->isCollision ? " collision" : "", + getDeltaListSize(deltaEntry->deltaList), + deltaEntry->listOverflow ? " overflow" : ""); + deltaEntry->listOverflow = false; +} diff --git a/uds/deltaIndex.h b/uds/deltaIndex.h new file mode 100644 index 0000000..af2d762 --- /dev/null +++ b/uds/deltaIndex.h @@ -0,0 +1,595 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/deltaIndex.h#4 $ + */ + +#ifndef DELTAINDEX_H +#define DELTAINDEX_H 1 + +#include "compiler.h" +#include "deltaMemory.h" + +enum { + // the number of extra bytes and bits needed to store a collision entry + COLLISION_BYTES = UDS_CHUNK_NAME_SIZE, + COLLISION_BITS = COLLISION_BYTES * CHAR_BIT +}; + +typedef struct deltaIndex { + DeltaMemory *deltaZones; // The zones + unsigned int numZones; // The number of zones + unsigned int numLists; // The number of delta lists + unsigned int listsPerZone; // Lists per zone (last zone can be smaller) + bool isMutable; // True if this index is mutable + byte tag; // Tag belonging to this delta index +} DeltaIndex; + +/* + * A DeltaIndexPage describes a single page of a chapter index. The deltaIndex + * field allows the page to be treated as an immutable DeltaIndex. We use the + * deltaMemory field to treat the chapter index page as a single zone index, + * and without the need to do an additional memory allocation. + */ + +typedef struct deltaIndexPage { + DeltaIndex deltaIndex; + // These values are loaded from the DeltaPageHeader + unsigned int lowestListNumber; + unsigned int highestListNumber; + uint64_t virtualChapterNumber; + // This structure describes the single zone of a delta index page. + DeltaMemory deltaMemory; +} DeltaIndexPage; + +/* + * Notes on the DeltaIndexEntries: + * + * The fields documented as "public" can be read by any code that uses a + * DeltaIndex. The fields documented as "private" carry information + * between DeltaIndex method calls and should not be used outside the + * DeltaIndex module. + * + * (1) The DeltaIndexEntry is used like an iterator when searching a delta + * list. + * + * (2) And it is also the result of a successful search and can be used to + * refer to the element found by the search. + * + * (3) And it is also the result of an unsuccessful search and can be used + * to refer to the insertion point for a new record. 
+ *
+ * (4) If atEnd==true, the DeltaIndexEntry can only be used as the insertion
+ *     point for a new record at the end of the list.
+ *
+ * (5) If atEnd==false and isCollision==true, the DeltaIndexEntry fields
+ *     refer to a collision entry in the list, and the DeltaIndexEntry can
+ *     be used as a reference to this entry.
+ *
+ * (6) If atEnd==false and isCollision==false, the DeltaIndexEntry fields
+ *     refer to a non-collision entry in the list. Such DeltaIndexEntries
+ *     can be used as a reference to a found entry, or an insertion point
+ *     for a non-collision entry before this entry, or an insertion point
+ *     for a collision entry that collides with this entry.
+ */
+
+typedef struct deltaIndexEntry {
+  // Public fields
+  unsigned int key;          // The key for this entry
+  bool atEnd;                // We are after the last entry in the list
+  bool isCollision;          // This record is a collision
+  // Private fields (but DeltaIndex_t1 cheats and looks at them)
+  bool listOverflow;         // This delta list overflowed
+  unsigned short valueBits;  // The number of bits used for the value
+  unsigned short entryBits;  // The number of bits used for the entire entry
+  DeltaMemory *deltaZone;    // The delta index zone
+  DeltaList *deltaList;      // The delta list containing the entry
+  unsigned int listNumber;   // The delta list number
+  uint32_t offset;           // Bit offset of this entry within the list
+  unsigned int delta;        // The delta between this and previous entry
+  DeltaList tempDeltaList;   // Temporary delta list for immutable indices
+} DeltaIndexEntry;
+
+typedef struct {
+  size_t memoryAllocated;  // Number of bytes allocated
+  RelTime rebalanceTime;   // The time spent rebalancing
+  int rebalanceCount;      // Number of memory rebalances
+  long recordCount;        // The number of records in the index
+  long collisionCount;     // The number of collision records
+  long discardCount;       // The number of records removed
+  long overflowCount;      // The number of UDS_OVERFLOWs detected
+  unsigned int numLists;   // The number of delta lists
+} DeltaIndexStats;
+
+/**
+ * Initialize a delta index.
+ *
+ * @param deltaIndex      The delta index to initialize
+ * @param numZones        The number of zones in the index
+ * @param numLists        The number of delta lists in the index
+ * @param meanDelta       The mean delta value
+ * @param numPayloadBits  The number of bits in the payload or value
+ * @param memorySize      The number of bytes in memory for the index
+ *
+ * @return error code or UDS_SUCCESS
+ **/
+int initializeDeltaIndex(DeltaIndex *deltaIndex, unsigned int numZones,
+                         unsigned int numLists, unsigned int meanDelta,
+                         unsigned int numPayloadBits, size_t memorySize)
+  __attribute__((warn_unused_result));
+
+/**
+ * Initialize an immutable delta index page.
+ *
+ * @param deltaIndexPage  The delta index page to initialize
+ * @param expectedNonce   If non-zero, the expected nonce.
+ * @param meanDelta       The mean delta value
+ * @param numPayloadBits  The number of bits in the payload or value
+ * @param memory          The memory page
+ * @param memSize         The size of the memory page
+ *
+ * @return error code or UDS_SUCCESS
+ **/
+int initializeDeltaIndexPage(DeltaIndexPage *deltaIndexPage,
+                             uint64_t expectedNonce,
+                             unsigned int meanDelta,
+                             unsigned int numPayloadBits,
+                             byte *memory,
+                             size_t memSize)
+  __attribute__((warn_unused_result));
+
+/**
+ * Uninitialize a delta index.
+ *
+ * @param deltaIndex  The delta index to uninitialize
+ **/
+void uninitializeDeltaIndex(DeltaIndex *deltaIndex);
+
+/**
+ * Empty the delta index.
+ *
+ * @param deltaIndex  The delta index being emptied.
+ **/ +void emptyDeltaIndex(const DeltaIndex *deltaIndex); + +/** + * Empty a zone of the delta index. + * + * @param deltaIndex The delta index + * @param zoneNumber The zone being emptied + **/ +void emptyDeltaIndexZone(const DeltaIndex *deltaIndex, + unsigned int zoneNumber); + +/** + * Pack delta lists from a mutable delta index into an immutable delta index + * page. A range of delta lists (starting with a specified list index) is + * copied from the mutable delta index into a memory page used in the immutable + * index. The number of lists copied onto the page is returned to the caller. + * + * @param deltaIndex The delta index being converted + * @param headerNonce The header nonce to store + * @param headerNativeEndian If true, write native endian header + * @param memory The memory page to use + * @param memSize The size of the memory page + * @param virtualChapterNumber The virtual chapter number + * @param firstList The first delta list number to be copied + * @param numLists The number of delta lists that were copied + * + * @return error code or UDS_SUCCESS. On UDS_SUCCESS, the numLists + * argument contains the number of lists copied. + **/ +int packDeltaIndexPage(const DeltaIndex *deltaIndex, + uint64_t headerNonce, + bool headerNativeEndian, + byte *memory, + size_t memSize, + uint64_t virtualChapterNumber, + unsigned int firstList, + unsigned int *numLists) + __attribute__((warn_unused_result)); + + +/** + * Set the tag value used when saving and/or restoring a delta index. + * + * @param deltaIndex The delta index + * @param tag The tag value + **/ +void setDeltaIndexTag(DeltaIndex *deltaIndex, byte tag); + +/** + * Start restoring a delta index from an input stream. + * + * @param deltaIndex The delta index to read into + * @param bufferedReaders The buffered readers to read the delta index from + * @param numReaders The number of buffered readers + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int startRestoringDeltaIndex(const DeltaIndex *deltaIndex, + BufferedReader **bufferedReaders, int numReaders) + __attribute__((warn_unused_result)); + +/** + * Have all the data been read while restoring a delta index from an + * input stream? + * + * @param deltaIndex The delta index + * + * @return true if all the data are read + **/ +bool isRestoringDeltaIndexDone(const DeltaIndex *deltaIndex); + +/** + * Restore a saved delta list + * + * @param deltaIndex The delta index + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * + * @return error code or UDS_SUCCESS + **/ +int restoreDeltaListToDeltaIndex(const DeltaIndex *deltaIndex, + const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) + __attribute__((warn_unused_result)); + +/** + * Abort restoring a delta index from an input stream. + * + * @param deltaIndex The delta index + **/ +void abortRestoringDeltaIndex(const DeltaIndex *deltaIndex); + +/** + * Start saving a delta index zone to a buffered output stream. + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * @param bufferedWriter The index state component being written + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int startSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter) + __attribute__((warn_unused_result)); + +/** + * Have all the data been written while saving a delta index zone to an + * output stream? 
If the answer is yes, it is still necessary to call + * finishSavingDeltaIndex(), which will return quickly. + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return true if all the data are written + **/ +bool isSavingDeltaIndexDone(const DeltaIndex *deltaIndex, + unsigned int zoneNumber); + +/** + * Finish saving a delta index zone to an output stream. Force the writing + * of all of the remaining data. If an error occurred asynchronously + * during the save operation, it will be returned here. + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int finishSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Abort saving a delta index zone to an output stream. If an error + * occurred asynchronously during the save operation, it will be dropped. + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int abortSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Compute the number of bytes required to save a delta index + * + * @param numLists The number of delta lists in the index + * @param memorySize The number of bytes in memory for the index + * + * @return numBytes The number of bytes required to save the master index + **/ +size_t computeDeltaIndexSaveBytes(unsigned int numLists, size_t memorySize) + __attribute__((warn_unused_result)); + +/** + * Validate the delta index + * + * @param deltaIndex The delta index + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int validateDeltaIndex(const DeltaIndex *deltaIndex) + __attribute__((warn_unused_result)); + +/** + * Prepare to search for an entry in the specified delta list. + * + *

This is always the first routine to be called when dealing with delta + * index entries. It is always followed by calls to nextDeltaIndexEntry to + * iterate through a delta list. The fields of the DeltaIndexEntry argument + * will be set up for iteration, but will not contain an entry from the list. + * + * @param deltaIndex The delta index to search + * @param listNumber The delta list number + * @param key First delta list key that the caller is interested in + * @param readOnly True if this is a read-only operation + * @param iterator The index entry being used to search through the list + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int startDeltaIndexSearch(const DeltaIndex *deltaIndex, + unsigned int listNumber, unsigned int key, + bool readOnly, DeltaIndexEntry *iterator) + __attribute__((warn_unused_result)); + +/** + * Find the next entry in the specified delta list + * + * @param deltaEntry Info about an entry, which is updated to describe the + * following entry + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int nextDeltaIndexEntry(DeltaIndexEntry *deltaEntry) + __attribute__((warn_unused_result)); + +/** + * Remember the position of a delta index entry, so that we can use it when + * starting the next search. + * + * @param deltaEntry Info about an entry found during a search. This should + * be the first entry that matches the key exactly (i.e. + * not a collision entry), or the first entry with a key + * greater than the entry sought for. + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int rememberDeltaIndexOffset(const DeltaIndexEntry *deltaEntry) + __attribute__((warn_unused_result)); + +/** + * Find the delta index entry, or the insertion point for a delta index + * entry. + * + * @param deltaIndex The delta index to search + * @param listNumber The delta list number + * @param key The key field being looked for + * @param name The 256 bit full name + * @param readOnly True if this is a read-only index search + * @param deltaEntry Updated to describe the entry being looked for + * + * @return UDS_SUCCESS or an error code + **/ +int getDeltaIndexEntry(const DeltaIndex *deltaIndex, unsigned int listNumber, + unsigned int key, const byte *name, bool readOnly, + DeltaIndexEntry *deltaEntry) + __attribute__((warn_unused_result)); + +/** + * Get the full name from a collision DeltaIndexEntry + * + * @param deltaEntry The delta index record + * @param name The 256 bit full name + * + * @return UDS_SUCCESS or an error code + **/ +int getDeltaEntryCollision(const DeltaIndexEntry *deltaEntry, byte *name) + __attribute__((warn_unused_result)); + +/** + * Get the bit offset into delta memory of a delta index entry. + * + * @param deltaEntry The delta index entry + * + * @return the bit offset into delta memory + **/ +static INLINE uint64_t getDeltaEntryOffset(const DeltaIndexEntry *deltaEntry) +{ + return getDeltaListStart(deltaEntry->deltaList) + deltaEntry->offset; +} + +/** + * Get the number of bits used to encode the entry key (the delta). + * + * @param entry The delta index record + * + * @return the number of bits used to encode the key + **/ +static INLINE unsigned int getDeltaEntryKeyBits(const DeltaIndexEntry *entry) +{ + /* + * Derive keyBits by subtracting the sizes of the other two fields from the + * total. We don't actually use this for encoding/decoding, so it doesn't + * need to be super-fast. We save time where it matters by not storing it. 
+ */ + return (entry->entryBits - entry->valueBits + - (entry->isCollision ? COLLISION_BITS : 0)); +} + +/** + * Get the value field of the DeltaIndexEntry + * + * @param deltaEntry The delta index record + * + * @return the value + **/ +static INLINE unsigned int getDeltaEntryValue(const DeltaIndexEntry *deltaEntry) +{ + return getField(deltaEntry->deltaZone->memory, + getDeltaEntryOffset(deltaEntry), deltaEntry->valueBits); +} + +/** + * Set the value field of the DeltaIndexEntry + * + * @param deltaEntry The delta index record + * @param value The new value + * + * @return UDS_SUCCESS or an error code + **/ +int setDeltaEntryValue(const DeltaIndexEntry *deltaEntry, unsigned int value) + __attribute__((warn_unused_result)); + +/** + * Create a new entry in the delta index + * + * @param deltaEntry The delta index entry that indicates the insertion point + * for the new record. For a collision entry, this is the + * non-collision entry that the new entry collides with. + * For a non-collision entry, this new entry is inserted + * before the specified entry. + * @param key The key field + * @param value The value field + * @param name For collision entries, the 256 bit full name; + * Otherwise null + * + * @return UDS_SUCCESS or an error code + **/ +int putDeltaIndexEntry(DeltaIndexEntry *deltaEntry, unsigned int key, + unsigned int value, const byte *name) + __attribute__((warn_unused_result)); + +/** + * Remove an existing delta index entry, and advance to the next entry in + * the delta list. + * + * @param deltaEntry On call the delta index record to remove. After + * returning, the following entry in the delta list. + * + * @return UDS_SUCCESS or an error code + **/ +int removeDeltaIndexEntry(DeltaIndexEntry *deltaEntry) + __attribute__((warn_unused_result)); + +/** + * Map a delta list number to a delta zone number + * + * @param deltaIndex The delta index + * @param listNumber The delta list number + * + * @return the zone number containing the delta list + **/ +static INLINE unsigned int getDeltaIndexZone(const DeltaIndex *deltaIndex, + unsigned int listNumber) +{ + return listNumber / deltaIndex->listsPerZone; +} + +/** + * Get the first delta list number in a zone + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return the first delta list index in the zone + **/ +unsigned int getDeltaIndexZoneFirstList(const DeltaIndex *deltaIndex, + unsigned int zoneNumber); + +/** + * Get the number of delta lists in a zone + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return the number of delta lists in the zone + **/ +unsigned int getDeltaIndexZoneNumLists(const DeltaIndex *deltaIndex, + unsigned int zoneNumber); + +/** + * Get the number of bytes used for master index entries in a zone + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return The number of bits in use + **/ +uint64_t getDeltaIndexZoneDlistBitsUsed(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Get the number of bytes used for master index entries. + * + * @param deltaIndex The delta index + * + * @return The number of bits in use + **/ +uint64_t getDeltaIndexDlistBitsUsed(const DeltaIndex *deltaIndex) + __attribute__((warn_unused_result)); + +/** + * Get the number of bytes allocated for master index entries. 
+ * + * @param deltaIndex The delta index + * + * @return The number of bits allocated + **/ +uint64_t getDeltaIndexDlistBitsAllocated(const DeltaIndex *deltaIndex) + __attribute__((warn_unused_result)); + +/** + * Get the delta index statistics. + * + * @param deltaIndex The delta index + * @param stats The statistics + **/ +void getDeltaIndexStats(const DeltaIndex *deltaIndex, DeltaIndexStats *stats); + +/** + * Get the number of pages needed for an immutable delta index. + * + * @param numEntries The number of entries in the index + * @param numLists The number of delta lists + * @param meanDelta The mean delta value + * @param numPayloadBits The number of bits in the payload or value + * @param bytesPerPage The number of bytes in a page + * + * @return the number of pages needed for the index + **/ +unsigned int getDeltaIndexPageCount(unsigned int numEntries, + unsigned int numLists, + unsigned int meanDelta, + unsigned int numPayloadBits, + size_t bytesPerPage); + +/** + * Log a delta index entry, and any error conditions related to the entry. + * + * @param deltaEntry The delta index entry. + **/ +void logDeltaIndexEntry(DeltaIndexEntry *deltaEntry); + +#endif /* DELTAINDEX_H */ diff --git a/uds/deltaMemory.c b/uds/deltaMemory.c new file mode 100644 index 0000000..2b30714 --- /dev/null +++ b/uds/deltaMemory.c @@ -0,0 +1,720 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/deltaMemory.c#3 $ + */ +#include "deltaMemory.h" + +#include "bits.h" +#include "buffer.h" +#include "compiler.h" +#include "errors.h" +#include "hashUtils.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "timeUtils.h" +#include "typeDefs.h" +#include "uds.h" + +/* + * The DeltaMemory structure manages the memory that stores delta lists. + * + * The "mutable" form of DeltaMemory is used for the master index and for + * an open chapter index. The "immutable" form of DeltaMemory is used for + * regular chapter indices. 
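+ *
+ * A mutable DeltaMemory allocates and owns its deltaLists, tempOffsets,
+ * and flags arrays; an immutable DeltaMemory wraps a single packed page
+ * and leaves those pointers NULL (see isMutable() in deltaMemory.h).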
+ */ + +// This is the number of guard bits that are needed in the tail guard list +enum { GUARD_BITS = POST_FIELD_GUARD_BYTES * CHAR_BIT }; + +/** + * Get the offset of the first byte that a delta list bit stream resides in + * + * @param deltaList The delta list + * + * @return the number byte offset + **/ +static INLINE uint64_t getDeltaListByteStart(const DeltaList *deltaList) +{ + return getDeltaListStart(deltaList) / CHAR_BIT; +} + +/** + * Get the actual number of bytes that a delta list bit stream resides in + * + * @param deltaList The delta list + * + * @return the number of bytes + **/ +static INLINE uint16_t getDeltaListByteSize(const DeltaList *deltaList) +{ + uint16_t startBitOffset = getDeltaListStart(deltaList) % CHAR_BIT; + uint16_t bitSize = getDeltaListSize(deltaList); + return ((unsigned int) startBitOffset + bitSize + CHAR_BIT - 1) / CHAR_BIT; +} + +/** + * Get the number of bytes in the delta lists headers. + * + * @param numLists The number of delta lists + * + * @return the number of bytes in the delta lists headers + **/ +static INLINE size_t getSizeOfDeltaLists(unsigned int numLists) +{ + return (numLists + 2) * sizeof(DeltaList); +} + +/** + * Get the size of the flags array (in bytes) + * + * @param numLists The number of delta lists + * + * @return the number of bytes for an array that has one bit per delta + * list, plus the necessary guard bytes. + **/ +static INLINE size_t getSizeOfFlags(unsigned int numLists) +{ + return (numLists + CHAR_BIT - 1) / CHAR_BIT + POST_FIELD_GUARD_BYTES; +} + +/** + * Get the number of bytes of scratch memory for the delta lists. + * + * @param numLists The number of delta lists + * + * @return the number of bytes of scratch memory for the delta lists + **/ +static INLINE size_t getSizeOfTempOffsets(unsigned int numLists) +{ + return (numLists + 2) * sizeof(uint64_t); +} + +/**********************************************************************/ + +/** + * Clear the transfers flags. + * + * @param deltaMemory The delta memory + **/ +static void clearTransferFlags(DeltaMemory *deltaMemory) +{ + memset(deltaMemory->flags, 0, getSizeOfFlags(deltaMemory->numLists)); + deltaMemory->numTransfers = 0; + deltaMemory->transferStatus = UDS_SUCCESS; +} + +/**********************************************************************/ + +/** + * Set the transfer flags for delta lists that are not empty, and count how + * many there are. + * + * @param deltaMemory The delta memory + **/ +static void flagNonEmptyDeltaLists(DeltaMemory *deltaMemory) +{ + clearTransferFlags(deltaMemory); + unsigned int i; + for (i = 0; i < deltaMemory->numLists; i++) { + if (getDeltaListSize(&deltaMemory->deltaLists[i + 1]) > 0) { + setOne(deltaMemory->flags, i, 1); + deltaMemory->numTransfers++; + } + } +} + +/**********************************************************************/ +void emptyDeltaLists(DeltaMemory *deltaMemory) +{ + // Zero all the delta list headers + DeltaList *deltaLists = deltaMemory->deltaLists; + memset(deltaLists, 0, getSizeOfDeltaLists(deltaMemory->numLists)); + + /* + * Initialize delta lists to be empty. We keep 2 extra delta list + * descriptors, one before the first real entry and one after so that we + * don't need to bounds check the array access when calculating + * preceeding and following gap sizes. + * + * Because the delta list headers were zeroed, the head guard list is + * already at offset zero and size zero. 
+ * + * The end guard list contains guard bytes so that the bit field + * utilities can safely read past the end of any byte we are interested + * in. + */ + uint64_t numBits = (uint64_t) deltaMemory->size * CHAR_BIT; + deltaLists[deltaMemory->numLists + 1].startOffset = numBits - GUARD_BITS; + deltaLists[deltaMemory->numLists + 1].size = GUARD_BITS; + + // Set all the bits in the end guard list. Do not use the bit field + // utilities. + memset(deltaMemory->memory + deltaMemory->size - POST_FIELD_GUARD_BYTES, + ~0, POST_FIELD_GUARD_BYTES); + + // Evenly space out the real delta lists. The sizes are already zero, so + // we just need to set the starting offsets. + uint64_t spacing = (numBits - GUARD_BITS) / deltaMemory->numLists; + uint64_t offset = spacing / 2; + unsigned int i; + for (i = 1; i <= deltaMemory->numLists; i++) { + deltaLists[i].startOffset = offset; + offset += spacing; + } + + // Update the statistics + deltaMemory->discardCount += deltaMemory->recordCount; + deltaMemory->recordCount = 0; + deltaMemory->collisionCount = 0; +} + +/**********************************************************************/ +/** + * Compute the Huffman coding parameters for the given mean delta + * + * @param meanDelta The mean delta value + * @param minBits The number of bits in the minimal key code + * @param minKeys The number of keys used in a minimal code + * @param incrKeys The number of keys used for another code bit + **/ +static void computeCodingConstants(unsigned int meanDelta, + unsigned short *minBits, + unsigned int *minKeys, + unsigned int *incrKeys) +{ + // We want to compute the rounded value of log(2) * meanDelta. Since we + // cannot always use floating point, use a really good integer approximation. + *incrKeys = (836158UL * meanDelta + 603160UL) / 1206321UL; + *minBits = computeBits(*incrKeys + 1); + *minKeys = (1 << *minBits) - *incrKeys; +} + +/**********************************************************************/ +/** + * Rebalance a range of delta lists within memory. + * + * @param deltaMemory A delta memory structure + * @param first The first delta list index + * @param last The last delta list index + **/ +static void rebalanceDeltaMemory(const DeltaMemory *deltaMemory, + unsigned int first, unsigned int last) +{ + if (first == last) { + DeltaList *deltaList = &deltaMemory->deltaLists[first]; + uint64_t newStart = deltaMemory->tempOffsets[first]; + // We need to move only one list, and we know it is safe to do so + if (getDeltaListStart(deltaList) != newStart) { + // Compute the first source byte + uint64_t source = getDeltaListByteStart(deltaList); + // Update the delta list location + deltaList->startOffset = newStart; + // Now use the same computation to locate the first destination byte + uint64_t destination = getDeltaListByteStart(deltaList); + memmove(deltaMemory->memory + destination, deltaMemory->memory + source, + getDeltaListByteSize(deltaList)); + } + } else { + // There is more than one list. Divide the problem in half, and use + // recursive calls to process each half. Note that after this + // computation, first <= middle, and middle < last. + unsigned int middle = (first + last) / 2; + const DeltaList *deltaList = &deltaMemory->deltaLists[middle]; + uint64_t newStart = deltaMemory->tempOffsets[middle]; + // The direction that our middle list is moving determines which half + // of the problem must be processed first. 
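+    // If the middle list is moving toward higher addresses, the lists above
+    // it are relocated first so that the bytes it moves into have already
+    // been vacated; otherwise the lower half is relocated first.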
+ if (newStart > getDeltaListStart(deltaList)) { + rebalanceDeltaMemory(deltaMemory, middle + 1, last); + rebalanceDeltaMemory(deltaMemory, first, middle); + } else { + rebalanceDeltaMemory(deltaMemory, first, middle); + rebalanceDeltaMemory(deltaMemory, middle + 1, last); + } + } +} + +/**********************************************************************/ +int initializeDeltaMemory(DeltaMemory *deltaMemory, size_t size, + unsigned int firstList, unsigned int numLists, + unsigned int meanDelta, unsigned int numPayloadBits) +{ + if (numLists == 0) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot initialize delta memory with 0 " + "delta lists"); + } + byte *memory = NULL; + int result = ALLOCATE(size, byte, "delta list", &memory); + if (result != UDS_SUCCESS) { + return result; + } + uint64_t *tempOffsets = NULL; + result = ALLOCATE(numLists + 2, uint64_t, "delta list temp", + &tempOffsets); + if (result != UDS_SUCCESS) { + FREE(memory); + return result; + } + byte *flags = NULL; + result = ALLOCATE(getSizeOfFlags(numLists), byte, "delta list flags", + &flags); + if (result != UDS_SUCCESS) { + FREE(memory); + FREE(tempOffsets); + return result; + } + + computeCodingConstants(meanDelta, &deltaMemory->minBits, + &deltaMemory->minKeys, &deltaMemory->incrKeys); + deltaMemory->valueBits = numPayloadBits; + deltaMemory->memory = memory; + deltaMemory->deltaLists = NULL; + deltaMemory->tempOffsets = tempOffsets; + deltaMemory->flags = flags; + deltaMemory->bufferedWriter = NULL; + deltaMemory->size = size; + deltaMemory->rebalanceTime = 0; + deltaMemory->rebalanceCount = 0; + deltaMemory->recordCount = 0; + deltaMemory->collisionCount = 0; + deltaMemory->discardCount = 0; + deltaMemory->overflowCount = 0; + deltaMemory->firstList = firstList; + deltaMemory->numLists = numLists; + deltaMemory->numTransfers = 0; + deltaMemory->transferStatus = UDS_SUCCESS; + deltaMemory->tag = 'm'; + + // Allocate the delta lists. 
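+  // The two extra descriptors are the head and tail guard lists that
+  // emptyDeltaLists() places before the first and after the last real list.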
+ result = ALLOCATE(deltaMemory->numLists + 2, DeltaList, + "delta lists", &deltaMemory->deltaLists); + if (result != UDS_SUCCESS) { + uninitializeDeltaMemory(deltaMemory); + return result; + } + + emptyDeltaLists(deltaMemory); + return UDS_SUCCESS; +} + +/**********************************************************************/ +void uninitializeDeltaMemory(DeltaMemory *deltaMemory) +{ + FREE(deltaMemory->flags); + deltaMemory->flags = NULL; + FREE(deltaMemory->tempOffsets); + deltaMemory->tempOffsets = NULL; + FREE(deltaMemory->deltaLists); + deltaMemory->deltaLists = NULL; + FREE(deltaMemory->memory); + deltaMemory->memory = NULL; +} + +/**********************************************************************/ +void initializeDeltaMemoryPage(DeltaMemory *deltaMemory, byte *memory, + size_t size, unsigned int numLists, + unsigned int meanDelta, + unsigned int numPayloadBits) +{ + computeCodingConstants(meanDelta, &deltaMemory->minBits, + &deltaMemory->minKeys, &deltaMemory->incrKeys); + deltaMemory->valueBits = numPayloadBits; + deltaMemory->memory = memory; + deltaMemory->deltaLists = NULL; + deltaMemory->tempOffsets = NULL; + deltaMemory->flags = NULL; + deltaMemory->bufferedWriter = NULL; + deltaMemory->size = size; + deltaMemory->rebalanceTime = 0; + deltaMemory->rebalanceCount = 0; + deltaMemory->recordCount = 0; + deltaMemory->collisionCount = 0; + deltaMemory->discardCount = 0; + deltaMemory->overflowCount = 0; + deltaMemory->firstList = 0; + deltaMemory->numLists = numLists; + deltaMemory->numTransfers = 0; + deltaMemory->transferStatus = UDS_SUCCESS; + deltaMemory->tag = 'p'; +} + +/**********************************************************************/ +bool areDeltaMemoryTransfersDone(const DeltaMemory *deltaMemory) +{ + return deltaMemory->numTransfers == 0; +} + +/**********************************************************************/ +int startRestoringDeltaMemory(DeltaMemory *deltaMemory) +{ + // Extend and balance memory to receive the delta lists + int result = extendDeltaMemory(deltaMemory, 0, 0, false); + if (result != UDS_SUCCESS) { + return UDS_SUCCESS; + } + + // The tail guard list needs to be set to ones + DeltaList *deltaList = &deltaMemory->deltaLists[deltaMemory->numLists + 1]; + setOne(deltaMemory->memory, getDeltaListStart(deltaList), + getDeltaListSize(deltaList)); + + flagNonEmptyDeltaLists(deltaMemory); + return UDS_SUCCESS; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int readDeltaListSaveInfo(BufferedReader *reader, + DeltaListSaveInfo *dlsi) +{ + byte buffer[sizeof(DeltaListSaveInfo)]; + int result = readFromBufferedReader(reader, buffer, sizeof(buffer)); + if (result != UDS_SUCCESS) { + return result; + } + dlsi->tag = buffer[0]; + dlsi->bitOffset = buffer[1]; + dlsi->byteCount = getUInt16LE(&buffer[2]); + dlsi->index = getUInt32LE(&buffer[4]); + return result; +} + +/**********************************************************************/ +int readSavedDeltaList(DeltaListSaveInfo *dlsi, + byte data[DELTA_LIST_MAX_BYTE_COUNT], + BufferedReader *bufferedReader) +{ + int result = readDeltaListSaveInfo(bufferedReader, dlsi); + if (result == UDS_END_OF_FILE) { + return UDS_END_OF_FILE; + } + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, "failed to read delta list data"); + } + if ((dlsi->bitOffset >= CHAR_BIT) + || (dlsi->byteCount > DELTA_LIST_MAX_BYTE_COUNT)) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "corrupt delta list data"); + } + if 
(dlsi->tag == 'z') { + return UDS_END_OF_FILE; + } + result = readFromBufferedReader(bufferedReader, data, dlsi->byteCount); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, "failed to read delta list data"); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int restoreDeltaList(DeltaMemory *deltaMemory, const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) +{ + unsigned int listNumber = dlsi->index - deltaMemory->firstList; + if (listNumber >= deltaMemory->numLists) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "invalid delta list number %u not in" + " range [%u,%u)", + dlsi->index, deltaMemory->firstList, + deltaMemory->firstList + + deltaMemory->numLists); + } + + if (getField(deltaMemory->flags, listNumber, 1) == 0) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected delta list number %u", + dlsi->index); + } + + DeltaList *deltaList = &deltaMemory->deltaLists[listNumber + 1]; + uint16_t bitSize = getDeltaListSize(deltaList); + unsigned int byteCount + = ((unsigned int) dlsi->bitOffset + bitSize + CHAR_BIT - 1) / CHAR_BIT; + if (dlsi->byteCount != byteCount) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected delta list size %u != %u", + dlsi->byteCount, byteCount); + } + + moveBits(data, dlsi->bitOffset, deltaMemory->memory, + getDeltaListStart(deltaList), bitSize); + setZero(deltaMemory->flags, listNumber, 1); + deltaMemory->numTransfers--; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void abortRestoringDeltaMemory(DeltaMemory *deltaMemory) +{ + clearTransferFlags(deltaMemory); + emptyDeltaLists(deltaMemory); +} + +/**********************************************************************/ +void startSavingDeltaMemory(DeltaMemory *deltaMemory, + BufferedWriter *bufferedWriter) +{ + flagNonEmptyDeltaLists(deltaMemory); + deltaMemory->bufferedWriter = bufferedWriter; +} + +/**********************************************************************/ +int finishSavingDeltaMemory(DeltaMemory *deltaMemory) +{ + unsigned int i; + for (i = 0; + !areDeltaMemoryTransfersDone(deltaMemory) + && (i < deltaMemory->numLists); + i++) { + lazyFlushDeltaList(deltaMemory, i); + } + if (deltaMemory->numTransfers > 0) { + deltaMemory->transferStatus + = logWarningWithStringError(UDS_CORRUPT_DATA, + "Not all delta lists written"); + } + deltaMemory->bufferedWriter = NULL; + return deltaMemory->transferStatus; +} + +/**********************************************************************/ +void abortSavingDeltaMemory(DeltaMemory *deltaMemory) +{ + clearTransferFlags(deltaMemory); + deltaMemory->bufferedWriter = NULL; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int writeDeltaListSaveInfo(BufferedWriter *bufferedWriter, + DeltaListSaveInfo *dlsi) +{ + byte buffer[sizeof(DeltaListSaveInfo)]; + buffer[0] = dlsi->tag; + buffer[1] = dlsi->bitOffset; + storeUInt16LE(&buffer[2], dlsi->byteCount); + storeUInt32LE(&buffer[4], dlsi->index); + return writeToBufferedWriter(bufferedWriter, buffer, sizeof(buffer)); +} + +/**********************************************************************/ +void flushDeltaList(DeltaMemory *deltaMemory, unsigned int flushIndex) +{ + ASSERT_LOG_ONLY((getField(deltaMemory->flags, flushIndex, 1) != 0), + "flush bit is set"); + setZero(deltaMemory->flags, flushIndex, 1); + deltaMemory->numTransfers--; 
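+  // Each flushed list is written as an 8-byte DeltaListSaveInfo header
+  // (tag, bit offset, little-endian byte count and list number) followed
+  // by the list's raw bytes; see writeDeltaListSaveInfo() above.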
+ + DeltaList *deltaList = &deltaMemory->deltaLists[flushIndex + 1]; + DeltaListSaveInfo dlsi; + dlsi.tag = deltaMemory->tag; + dlsi.bitOffset = getDeltaListStart(deltaList) % CHAR_BIT; + dlsi.byteCount = getDeltaListByteSize(deltaList); + dlsi.index = deltaMemory->firstList + flushIndex; + + int result = writeDeltaListSaveInfo(deltaMemory->bufferedWriter, &dlsi); + if (result != UDS_SUCCESS) { + if (deltaMemory->transferStatus == UDS_SUCCESS) { + logWarningWithStringError(result, "failed to write delta list memory"); + deltaMemory->transferStatus = result; + } + } + result = writeToBufferedWriter(deltaMemory->bufferedWriter, + deltaMemory->memory + + getDeltaListByteStart(deltaList), + dlsi.byteCount); + if (result != UDS_SUCCESS) { + if (deltaMemory->transferStatus == UDS_SUCCESS) { + logWarningWithStringError(result, "failed to write delta list memory"); + deltaMemory->transferStatus = result; + } + } +} + +/**********************************************************************/ +int writeGuardDeltaList(BufferedWriter *bufferedWriter) +{ + DeltaListSaveInfo dlsi; + dlsi.tag = 'z'; + dlsi.bitOffset = 0; + dlsi.byteCount = 0; + dlsi.index = 0; + int result = writeToBufferedWriter(bufferedWriter, (const byte *) &dlsi, + sizeof(DeltaListSaveInfo)); + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "failed to write guard delta list"); + } + return result; +} + +/**********************************************************************/ +int extendDeltaMemory(DeltaMemory *deltaMemory, unsigned int growingIndex, + size_t growingSize, bool doCopy) +{ + if (!isMutable(deltaMemory)) { + return logErrorWithStringError(UDS_BAD_STATE, + "Attempt to read into an immutable delta" + " list memory"); + } + + AbsTime startTime = currentTime(CLOCK_MONOTONIC); + + // Calculate the amount of space that is in use. Include the space that + // has a planned use. + DeltaList *deltaLists = deltaMemory->deltaLists; + size_t usedSpace = growingSize; + unsigned int i; + for (i = 0; i <= deltaMemory->numLists + 1; i++) { + usedSpace += getDeltaListByteSize(&deltaLists[i]); + } + + if (deltaMemory->size < usedSpace) { + return UDS_OVERFLOW; + } + + // Compute the new offsets of the delta lists + size_t spacing = (deltaMemory->size - usedSpace) / deltaMemory->numLists; + deltaMemory->tempOffsets[0] = 0; + for (i = 0; i <= deltaMemory->numLists; i++) { + deltaMemory->tempOffsets[i + 1] = (deltaMemory->tempOffsets[i] + + getDeltaListByteSize(&deltaLists[i]) + + spacing); + deltaMemory->tempOffsets[i] *= CHAR_BIT; + deltaMemory->tempOffsets[i] + += getDeltaListStart(&deltaLists[i]) % CHAR_BIT; + if (i == 0) { + deltaMemory->tempOffsets[i + 1] -= spacing / 2; + } + if (i + 1 == growingIndex) { + deltaMemory->tempOffsets[i + 1] += growingSize; + } + } + deltaMemory->tempOffsets[deltaMemory->numLists + 1] + = (deltaMemory->size * CHAR_BIT + - getDeltaListSize(&deltaLists[deltaMemory->numLists + 1])); + // When we rebalance the delta list, we will include the end guard list + // in the rebalancing. It contains the end guard data, which must be + // copied. 
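+  // Note that each new offset is computed in bytes and then converted to a
+  // bit offset that preserves the list's original bit position within its
+  // first byte, so rebalanceDeltaMemory() can move lists with whole-byte
+  // memmove() calls.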
+ if (doCopy) { + rebalanceDeltaMemory(deltaMemory, 1, deltaMemory->numLists + 1); + AbsTime endTime = currentTime(CLOCK_MONOTONIC); + deltaMemory->rebalanceCount++; + deltaMemory->rebalanceTime += timeDifference(endTime, startTime); + } else { + for (i = 1; i <= deltaMemory->numLists + 1; i++) { + deltaLists[i].startOffset = deltaMemory->tempOffsets[i]; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int validateDeltaLists(const DeltaMemory *deltaMemory) +{ + // Validate the delta index fields set by restoring a delta index + if (deltaMemory->collisionCount > deltaMemory->recordCount) { + return logWarningWithStringError(UDS_BAD_STATE, + "delta index contains more collisions" + " (%ld) than records (%ld)", + deltaMemory->collisionCount, + deltaMemory->recordCount); + } + + // Validate the delta lists + DeltaList *deltaLists = deltaMemory->deltaLists; + if (getDeltaListStart(&deltaLists[0]) != 0) { + return logWarningWithStringError(UDS_BAD_STATE, + "the head guard delta list does not start" + " at 0: %llu", + getDeltaListStart(&deltaLists[0])); + } + uint64_t numBits = getDeltaListEnd(&deltaLists[deltaMemory->numLists + 1]); + if (numBits != deltaMemory->size * CHAR_BIT) { + return logWarningWithStringError(UDS_BAD_STATE, + "the tail guard delta list does not end " + "at end of allocated memory: %" PRIu64 + " != %zd", + numBits, deltaMemory->size * CHAR_BIT); + } + int numGuardBits = getDeltaListSize(&deltaLists[deltaMemory->numLists + 1]); + if (numGuardBits < GUARD_BITS) { + return logWarningWithStringError(UDS_BAD_STATE, + "the tail guard delta list does not " + "contain sufficient guard bits: %d < %d", + numGuardBits, GUARD_BITS); + } + unsigned int i; + for (i = 0; i <= deltaMemory->numLists + 1; i++) { + if (getDeltaListStart(&deltaLists[i]) > getDeltaListEnd(&deltaLists[i])) { + return logWarningWithStringError(UDS_BAD_STATE, + "invalid delta list %u: [%" PRIu64 + ", %llu)", + i, + getDeltaListStart(&deltaLists[i]), + getDeltaListEnd(&deltaLists[i])); + } + if (i > deltaMemory->numLists) { + // The rest of the checks do not apply to the tail guard list + continue; + } + if (getDeltaListEnd(&deltaLists[i]) + > getDeltaListStart(&deltaLists[i + 1])) { + return logWarningWithStringError(UDS_BAD_STATE, + "delta lists %u and %u overlap: %" + PRIu64 " > %llu", + i, i + 1, + getDeltaListEnd(&deltaLists[i]), + getDeltaListStart(&deltaLists[i + 1])); + } + if (i == 0) { + // The rest of the checks do not apply to the head guard list + continue; + } + if (deltaLists[i].saveOffset > getDeltaListSize(&deltaLists[i])) { + return logWarningWithStringError(UDS_BAD_STATE, + "delta lists %u saved offset is larger" + " than the list: %u > %u", + i, deltaLists[i].saveOffset, + getDeltaListSize(&deltaLists[i])); + } + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +size_t getDeltaMemoryAllocated(const DeltaMemory *deltaMemory) +{ + return (deltaMemory->size + + getSizeOfDeltaLists(deltaMemory->numLists) + + getSizeOfFlags(deltaMemory->numLists) + + getSizeOfTempOffsets(deltaMemory->numLists)); +} + +/**********************************************************************/ +size_t getDeltaMemorySize(unsigned long numEntries, unsigned int meanDelta, + unsigned int numPayloadBits) +{ + unsigned short minBits; + unsigned int incrKeys, minKeys; + computeCodingConstants(meanDelta, &minBits, &minKeys, &incrKeys); + // On average, each delta is encoded into about minBits+1.5 bits. 
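+  // The estimate below reflects that: numPayloadBits + minBits + 1 bits per
+  // entry, plus an extra numEntries / 2 bits to supply the remaining half
+  // bit per entry.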
+ return (numEntries * (numPayloadBits + minBits + 1) + numEntries / 2); +} diff --git a/uds/deltaMemory.h b/uds/deltaMemory.h new file mode 100644 index 0000000..1ffb3fd --- /dev/null +++ b/uds/deltaMemory.h @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/deltaMemory.h#1 $ + */ + +#ifndef DELTAMEMORY_H +#define DELTAMEMORY_H 1 + +#include "bits.h" +#include "bufferedReader.h" +#include "bufferedWriter.h" +#include "compiler.h" +#include "cpu.h" +#include "timeUtils.h" + +/* + * We encode the delta list information into 16 bytes per list. + * + * Because the master index has 1 million delta lists, each byte of header + * information ends up costing us 1MB. We have an incentive to keep the + * size down. + * + * The master index delta list memory is currently about 780MB in size, + * which is more than 6 gigabits. Therefore we need at least 33 bits to + * address the master index memory and we use the uint64_t type. + * + * The master index delta lists have 256 entries of about 24 bits each, + * which is 6K bits. The index needs 13 bits to represent the size of a + * delta list and we use the uint16_t type. + */ + +typedef struct deltaList { + uint64_t startOffset; // The offset of the delta list start within memory + uint16_t size; // The number of bits in the delta list + uint16_t saveOffset; // Where the last search "found" the key + unsigned int saveKey; // The key for the record just before saveOffset. 
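+  // 8 + 2 + 2 + 4 bytes = the 16 bytes per list cited above (assuming a
+  // 4-byte unsigned int and no structure padding).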
+} DeltaList; + +typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) deltaMemory { + byte *memory; // The delta list memory + DeltaList *deltaLists; // The delta list headers + uint64_t *tempOffsets; // Temporary starts of delta lists + byte *flags; // Transfer flags + BufferedWriter *bufferedWriter; // Buffered writer for saving an index + size_t size; // The size of delta list memory + RelTime rebalanceTime; // The time spent rebalancing + int rebalanceCount; // Number of memory rebalances + unsigned short valueBits; // The number of bits of value + unsigned short minBits; // The number of bits in the minimal key code + unsigned int minKeys; // The number of keys used in a minimal code + unsigned int incrKeys; // The number of keys used for another code bit + long recordCount; // The number of records in the index + long collisionCount; // The number of collision records + long discardCount; // The number of records removed + long overflowCount; // The number of UDS_OVERFLOWs detected + unsigned int firstList; // The index of the first delta list + unsigned int numLists; // The number of delta lists + unsigned int numTransfers; // Number of transfer flags that are set + int transferStatus; // Status of the transfers in progress + byte tag; // Tag belonging to this delta index +} DeltaMemory; + +typedef struct deltaListSaveInfo { + uint8_t tag; // Tag identifying which delta index this list is in + uint8_t bitOffset; // Bit offset of the start of the list data + uint16_t byteCount; // Number of bytes of list data + uint32_t index; // The delta list number within the delta index +} DeltaListSaveInfo; + +// The maximum size of a single delta list (in bytes). We add guard bytes +// to this because such a buffer can be used with moveBits. +enum { DELTA_LIST_MAX_BYTE_COUNT = ((UINT16_MAX + CHAR_BIT) / CHAR_BIT + + POST_FIELD_GUARD_BYTES) }; + +/** + * Initialize delta list memory. + * + * @param deltaMemory A delta memory structure + * @param size The initial size of the memory array + * @param firstList The index of the first delta list + * @param numLists The number of delta lists + * @param meanDelta The mean delta + * @param numPayloadBits The number of payload bits + * + * @return error code or UDS_SUCCESS + **/ +int initializeDeltaMemory(DeltaMemory *deltaMemory, size_t size, + unsigned int firstList, unsigned int numLists, + unsigned int meanDelta, unsigned int numPayloadBits) + __attribute__((warn_unused_result)); + +/** + * Uninitialize delta list memory. + * + * @param deltaMemory A delta memory structure + **/ +void uninitializeDeltaMemory(DeltaMemory *deltaMemory); + +/** + * Initialize delta list memory to refer to a cached page. + * + * @param deltaMemory A delta memory structure + * @param memory The memory page + * @param size The size of the memory page + * @param numLists The number of delta lists + * @param meanDelta The mean delta + * @param numPayloadBits The number of payload bits + **/ +void initializeDeltaMemoryPage(DeltaMemory *deltaMemory, byte *memory, + size_t size, unsigned int numLists, + unsigned int meanDelta, + unsigned int numPayloadBits); + +/** + * Empty the delta lists. + * + * @param deltaMemory The delta memory + **/ +void emptyDeltaLists(DeltaMemory *deltaMemory); + +/** + * Is there a delta list memory save or restore in progress? 
+ * + * @param deltaMemory A delta memory structure + * + * @return true if there are no delta lists that need to be saved or + * restored + **/ +bool areDeltaMemoryTransfersDone(const DeltaMemory *deltaMemory); + +/** + * Start restoring delta list memory from a file descriptor + * + * @param deltaMemory A delta memory structure + * + * @return error code or UDS_SUCCESS + **/ +int startRestoringDeltaMemory(DeltaMemory *deltaMemory) + __attribute__((warn_unused_result)); + +/** + * Read a saved delta list from a file descriptor + * + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * @param bufferedReader The buffered reader to read the delta list from + * + * @return error code or UDS_SUCCESS + * or UDS_END_OF_FILE at end of the data stream + **/ +int readSavedDeltaList(DeltaListSaveInfo *dlsi, + byte data[DELTA_LIST_MAX_BYTE_COUNT], + BufferedReader *bufferedReader) + __attribute__((warn_unused_result)); + +/** + * Restore a saved delta list + * + * @param deltaMemory A delta memory structure + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * + * @return error code or UDS_SUCCESS + **/ +int restoreDeltaList(DeltaMemory *deltaMemory, const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) + __attribute__((warn_unused_result)); + +/** + * Abort restoring delta list memory from an input stream. + * + * @param deltaMemory A delta memory structure + **/ +void abortRestoringDeltaMemory(DeltaMemory *deltaMemory); + +/** + * Start saving delta list memory to a buffered output stream + * + * @param deltaMemory A delta memory structure + * @param bufferedWriter The index state component being written + **/ +void startSavingDeltaMemory(DeltaMemory *deltaMemory, + BufferedWriter *bufferedWriter); + +/** + * Finish saving delta list memory to an output stream. Force the writing + * of all of the remaining data. If an error occurred asynchronously + * during the save operation, it will be returned here. + * + * @param deltaMemory A delta memory structure + * + * @return error code or UDS_SUCCESS + **/ +int finishSavingDeltaMemory(DeltaMemory *deltaMemory) + __attribute__((warn_unused_result)); + +/** + * Abort saving delta list memory to an output stream. If an error + * occurred asynchronously during the save operation, it will be dropped. + * + * @param deltaMemory A delta memory structure + **/ +void abortSavingDeltaMemory(DeltaMemory *deltaMemory); + +/** + * Flush a delta list to an output stream + * + * @param deltaMemory A delta memory structure + * @param flushIndex Index of the delta list that may need to be flushed. + **/ +void flushDeltaList(DeltaMemory *deltaMemory, unsigned int flushIndex); + +/** + * Write a guard delta list to mark the end of the saved data + * + * @param bufferedWriter The buffered writer to write the guard delta list to + * + * @return error code or UDS_SUCCESS + **/ +int writeGuardDeltaList(BufferedWriter *bufferedWriter) + __attribute__((warn_unused_result)); + +/** + * Extend the memory used by the delta lists and rebalance the lists in the + * new chunk. + * + *

The delta memory contains N delta lists, which are guarded by two + * empty delta lists. The valid delta lists are numbered 1 to N, and the + * guards are numbered 0 and (N+1). + * + *
When the delta lista are bit streams, it is possible that the tail + * of list J and the head of list (J+1) are in the same byte. In this case + * oldOffsets[j]+sizes[j]==oldOffset[j]-1. We handle this correctly. + * + * @param deltaMemory A delta memory structure + * @param growingIndex Index of the delta list that needs additional space + * left before it (from 1 to N+1). + * @param growingSize Number of additional bytes needed before growingIndex + * @param doCopy True to copy the data, False to just balance the space + * + * @return UDS_SUCCESS or an error code + **/ +int extendDeltaMemory(DeltaMemory *deltaMemory, unsigned int growingIndex, + size_t growingSize, bool doCopy) + __attribute__((warn_unused_result)); + +/** + * Validate the delta list headers. + * + * @param deltaMemory A delta memory structure + * + * @return UDS_SUCCESS or an error code + **/ +int validateDeltaLists(const DeltaMemory *deltaMemory) + __attribute__((warn_unused_result)); + +/** + * Get the number of bytes allocated for delta index entries and any + * associated overhead. + * + * @param deltaMemory A delta memory structure + * + * @return The number of bytes allocated + **/ +size_t getDeltaMemoryAllocated(const DeltaMemory *deltaMemory); + +/** + * Get the expected number of bits used in a delta index + * + * @param numEntries The number of index entries + * @param meanDelta The mean delta value + * @param numPayloadBits The number of bits in the payload or value + * + * @return The expected size of a delta index in bits + **/ +size_t getDeltaMemorySize(unsigned long numEntries, unsigned int meanDelta, + unsigned int numPayloadBits) + __attribute__((warn_unused_result)); + +/** + * Get the bit offset to the start of the delta list bit stream + * + * @param deltaList The delta list header + * + * @return the start of the delta list + **/ +static INLINE uint64_t getDeltaListStart(const DeltaList *deltaList) +{ + return deltaList->startOffset; +} + +/** + * Get the number of bits in a delta list bit stream + * + * @param deltaList The delta list header + * + * @return the size of the delta list + **/ +static INLINE uint16_t getDeltaListSize(const DeltaList *deltaList) +{ + return deltaList->size; +} + +/** + * Get the bit offset to the end of the delta list bit stream + * + * @param deltaList The delta list header + * + * @return the end of the delta list + **/ +static INLINE uint64_t getDeltaListEnd(const DeltaList *deltaList) +{ + return getDeltaListStart(deltaList) + getDeltaListSize(deltaList); +} + +/** + * Identify mutable vs. immutable delta memory + * + * Mutable delta memory contains delta lists that can be modified, and is + * initialized using initializeDeltaMemory(). + * + * Immutable delta memory contains packed delta lists, cannot be modified, + * and is initialized using initializeDeltaMemoryPage(). + * + * For mutable delta memory, all of the following expressions are true. + * And for immutable delta memory, all of the following expressions are + * false. + * deltaLists != NULL + * tempOffsets != NULL + * flags != NULL + * + * @param deltaMemory A delta memory structure + * + * @return true if the delta memory is mutable + **/ +static INLINE bool isMutable(const DeltaMemory *deltaMemory) +{ + return deltaMemory->deltaLists != NULL; +} + +/** + * Lazily flush a delta list to an output stream + * + * @param deltaMemory A delta memory structure + * @param flushIndex Index of the delta list that may need to be flushed. 
+ **/ +static INLINE void lazyFlushDeltaList(DeltaMemory *deltaMemory, + unsigned int flushIndex) +{ + if (getField(deltaMemory->flags, flushIndex, 1) != 0) { + flushDeltaList(deltaMemory, flushIndex); + } +} +#endif /* DELTAMEMORY_H */ diff --git a/uds/errors.c b/uds/errors.c new file mode 100644 index 0000000..5aab19e --- /dev/null +++ b/uds/errors.c @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/errors.c#11 $ + */ + +#include "errors.h" + +#include "common.h" +#include "permassert.h" +#include "stringUtils.h" + +#ifdef __KERNEL__ +#include +#endif + +static const struct errorInfo successful = { "UDS_SUCCESS", "Success" }; + +#ifdef __KERNEL__ +static const char *const messageTable[] = { + [EPERM] = "Operation not permitted", + [ENOENT] = "No such file or directory", + [ESRCH] = "No such process", + [EINTR] = "Interrupted system call", + [EIO] = "Input/output error", + [ENXIO] = "No such device or address", + [E2BIG] = "Argument list too long", + [ENOEXEC] = "Exec format error", + [EBADF] = "Bad file descriptor", + [ECHILD] = "No child processes", + [EAGAIN] = "Resource temporarily unavailable", + [ENOMEM] = "Cannot allocate memory", + [EACCES] = "Permission denied", + [EFAULT] = "Bad address", + [ENOTBLK] = "Block device required", + [EBUSY] = "Device or resource busy", + [EEXIST] = "File exists", + [EXDEV] = "Invalid cross-device link", + [ENODEV] = "No such device", + [ENOTDIR] = "Not a directory", + [EISDIR] = "Is a directory", + [EINVAL] = "Invalid argument", + [ENFILE] = "Too many open files in system", + [EMFILE] = "Too many open files", + [ENOTTY] = "Inappropriate ioctl for device", + [ETXTBSY] = "Text file busy", + [EFBIG] = "File too large", + [ENOSPC] = "No space left on device", + [ESPIPE] = "Illegal seek", + [EROFS] = "Read-only file system", + [EMLINK] = "Too many links", + [EPIPE] = "Broken pipe", + [EDOM] = "Numerical argument out of domain", + [ERANGE] = "Numerical result out of range" +}; +#endif + +static const struct errorInfo errorList[] = { + { "UDS_UNINITIALIZED", "UDS library is not initialized" }, + { "UDS_SHUTTINGDOWN", "UDS library is shutting down" }, + { "UDS_EMODULE_LOAD", "Could not load modules" }, + { "UDS_ENOTHREADS", "Could not create a new thread" }, + { "UDS_NOCONTEXT", "Could not find the requested library context" }, + { "UDS_DISABLED", "UDS library context is disabled" }, + { "UDS_CORRUPT_COMPONENT", "Corrupt saved component" }, + { "UDS_UNKNOWN_ERROR", "Unknown error" }, + { "UDS_UNUSED_CODE_8", "Unused error code 8" }, + { "UDS_UNUSED_CODE_9", "Unused error code 9" }, + { "UDS_UNSUPPORTED_VERSION", "Unsupported version" }, + { "UDS_NO_INDEXSESSION", "Index session not known" }, + { "UDS_CORRUPT_DATA", "Index data in memory is corrupt" }, + { "UDS_SHORT_READ", 
"Could not read requested number of bytes" }, + { "UDS_UNUSED_CODE_14", "Unused error code 14" }, + { "UDS_RESOURCE_LIMIT_EXCEEDED", "Internal resource limits exceeded" }, + { "UDS_VOLUME_OVERFLOW", "Memory overflow due to storage failure" }, + { "UDS_UNUSED_CODE_17", "Unused error code 17" }, + { "UDS_UNUSED_CODE_18", "Unused error code 18" }, + { "UDS_UNUSED_CODE_19", "Unused error code 19" }, + { "UDS_CONF_PTR_REQUIRED", "A configuration pointer is required" }, + { "UDS_INDEX_STATS_PTR_REQUIRED", "An index stats pointer is required" }, + { "UDS_CONTEXT_STATS_PTR_REQUIRED", "A context stats pointer is required" }, + { "UDS_UNUSED_CODE_23", "Unused error code 23" }, + { "UDS_UNUSED_CODE_24", "Unused error code 24" }, + { "UDS_UNUSED_CODE_25", "Unused error code 25" }, + { "UDS_UNUSED_CODE_26", "Unused error code 26" }, + { "UDS_UNUSED_CODE_27", "Unused error code 27" }, + { "UDS_INVALID_MEMORY_SIZE", + "Configured memory too small or unsupported size" }, + { "UDS_UNUSED_CODE_29", "Unused error code 29" }, + { "UDS_INDEX_NAME_REQUIRED", "An index name is required" }, + { "UDS_CONF_REQUIRED", "A configuration is required" }, + { "UDS_UNUSED_CODE_32", "Unused error code 32" }, + { "UDS_UNUSED_CODE_33", "Unused error code 33" }, + { "UDS_UNUSED_CODE_34", "Unused error code 34" }, + { "UDS_UNUSED_CODE_35", "Unused error code 35" }, + { "UDS_UNUSED_CODE_36", "Unused error code 36" }, + { "UDS_NO_INDEX", "No index found" }, + { "UDS_BAD_CHECKPOINT_FREQUENCY", "Checkpoint frequency out of range" }, + { "UDS_WRONG_INDEX_CONFIG", "Wrong type of index configuration" }, + { "UDS_UNUSED_CODE_40", "Unused error code 40" }, + { "UDS_UNUSED_CODE_41", "Unused error code 41" }, + { "UDS_UNUSED_CODE_42", "Unused error code 42" }, + { "UDS_UNUSED_CODE_43", "Unused error code 43" }, + { "UDS_END_OF_FILE", "Unexpected end of file" }, + { "UDS_INDEX_NOT_SAVED_CLEANLY", "Index not saved cleanly" }, + { "UDS_UNUSED_CODE_46", "Unused error code 46" }, + { "UDS_INSUFFICIENT_INDEX_SPACE", "Insufficient index space" }, + { "UDS_UNUSED_CODE_48", "Unused error code 48" }, + { "UDS_UNUSED_CODE_49", "Unused error code 49" }, + { "UDS_SUSPENDED", "Index suspended"}, + { "UDS_UNUSED_CODE_51", "Unused error code 51" }, + { "UDS_INDEXSESSION_IN_USE", "Index session in use"}, + { "UDS_CALLBACK_REQUIRED", "A callback function is required"}, + { "UDS_INVALID_OPERATION_TYPE", "Invalid type of request operation"}, +}; + +static const struct errorInfo internalErrorList[] = { + { "UDS_INTERNAL_UNUSED_0", "Unused internal error 0" }, + { "UDS_OVERFLOW", "Index overflow" }, + { "UDS_INTERNAL_UNUSED_2", "Unused internal error 2" }, + { "UDS_INVALID_ARGUMENT", "Invalid argument passed to internal routine" }, + { "UDS_BAD_STATE", "UDS data structures are in an invalid state" }, + { "UDS_DUPLICATE_NAME", + "Attempt to enter the same name into a delta index twice" }, + { "UDS_UNEXPECTED_RESULT", "Unexpected result from internal routine" }, + { "UDS_INJECTED_ERROR", "Injected error" }, + { "UDS_ASSERTION_FAILED", "Assertion failed" }, + { "UDS_INTERNAL_UNUSED_9", "Unused internal error 9" }, + { "UDS_QUEUED", "Request queued" }, + { "UDS_INTERNAL_UNUSED_11", "Unused internal error 11" }, + { "UDS_INTERNAL_UNUSED_12", "Unused internal error 12" }, + { "UDS_BUFFER_ERROR", "Buffer error" }, + { "UDS_INTERNAL_UNUSED_14", "Unused internal error 14" }, + { "UDS_INTERNAL_UNUSED_15", "Unused internal error 15" }, + { "UDS_NO_DIRECTORY", "Expected directory is missing" }, + { "UDS_CHECKPOINT_INCOMPLETE", "Checkpoint not completed" }, + { 
"UDS_INTERNAL_UNUSED_18", "Unused internal error 18" }, + { "UDS_INTERNAL_UNUSED_19", "Unused internal error 19" }, + { "UDS_ALREADY_REGISTERED", "Error range already registered" }, + { "UDS_BAD_IO_DIRECTION", "Bad I/O direction" }, + { "UDS_INCORRECT_ALIGNMENT", "Offset not at block alignment" }, + { "UDS_OUT_OF_RANGE", "Cannot access data outside specified limits" }, +}; + +typedef struct errorBlock { + const char *name; + int base; + int last; + int max; + const ErrorInfo *infos; +} ErrorBlock; + +enum { + MAX_ERROR_BLOCKS = 6 // needed for testing +}; + +static struct errorInformation { + int allocated; + int count; + ErrorBlock blocks[MAX_ERROR_BLOCKS]; +} registeredErrors = { + .allocated = MAX_ERROR_BLOCKS, + .count = 2, + .blocks = { + { + .name = "UDS Error", + .base = UDS_ERROR_CODE_BASE, + .last = UDS_ERROR_CODE_LAST, + .max = UDS_ERROR_CODE_BLOCK_END, + .infos = errorList, + }, + { + .name = "UDS Internal Error", + .base = UDS_INTERNAL_ERROR_CODE_BASE, + .last = UDS_INTERNAL_ERROR_CODE_LAST, + .max = UDS_INTERNAL_ERROR_CODE_BLOCK_END, + .infos = internalErrorList, + } + } +}; + +/** + * Fetch the error info (if any) for the error number. + * + * @param errnum the error number + * @param infoPtr the place to store the info for this error (if known), + * otherwise set to NULL + * + * @return the name of the error block (if known), NULL othersise + **/ +static const char *getErrorInfo(int errnum, const ErrorInfo **infoPtr) +{ + + if (errnum == UDS_SUCCESS) { + if (infoPtr != NULL) { + *infoPtr = &successful; + } + return NULL; + } + + ErrorBlock *block; + for (block = registeredErrors.blocks; + block < registeredErrors.blocks + registeredErrors.count; + ++block) { + if ((errnum >= block->base) && (errnum < block->last)) { + if (infoPtr != NULL) { + *infoPtr = block->infos + (errnum - block->base); + } + return block->name; + } else if ((errnum >= block->last) && (errnum < block->max)) { + if (infoPtr != NULL) { + *infoPtr = NULL; + } + return block->name; + } + } + if (infoPtr != NULL) { + *infoPtr = NULL; + } + return NULL; +} + +/** + * Return string describing a system error message + * + * @param errnum System error number + * @param buf Buffer that can be used to contain the return value + * @param buflen Length of the buffer + * + * @return The error string, which may be a string constant or may be + * returned in the buf argument + **/ +#ifdef __KERNEL__ +static const char *systemStringError(int errnum, char *buf, size_t buflen) +{ + const char *errorString = NULL; + if ((errnum > 0) && (errnum < COUNT_OF(messageTable))) { + errorString = messageTable[errnum]; + } + + size_t len = ((errorString == NULL) + ? 
snprintf(buf, buflen, "Unknown error %d", errnum) + : snprintf(buf, buflen, "%s", errorString)); + if (len < buflen) { + return buf; + } + + buf[0] = '\0'; + return "System error"; +} +#else +static INLINE const char *systemStringError(int errnum, char *buf, + size_t buflen) +{ + return strerror_r(errnum, buf, buflen); +} +#endif + +/*****************************************************************************/ +const char *stringError(int errnum, char *buf, size_t buflen) +{ + if (buf == NULL) { + return NULL; + } + + char *buffer = buf; + char *bufEnd = buf + buflen; + + if (isUnrecoverable(errnum)) { + buffer = appendToBuffer(buffer, bufEnd, "Unrecoverable error: "); + errnum = sansUnrecoverable(errnum); + } + + const ErrorInfo *info = NULL; + const char *blockName = getErrorInfo(errnum, &info); + + if (blockName != NULL) { + if (info != NULL) { + buffer = appendToBuffer(buffer, bufEnd, + "%s: %s", blockName, info->message); + } else { + buffer = appendToBuffer(buffer, bufEnd, + "Unknown %s %d", blockName, errnum); + } + } else if (info != NULL) { + buffer = appendToBuffer(buffer, bufEnd, "%s", info->message); + } else { + const char *tmp = systemStringError(errnum, buffer, bufEnd - buffer); + if (tmp != buffer) { + buffer = appendToBuffer(buffer, bufEnd, "%s", tmp); + } else { + buffer += strlen(tmp); + } + } + return buf; +} + +/*****************************************************************************/ +const char *stringErrorName(int errnum, char *buf, size_t buflen) +{ + errnum = sansUnrecoverable(errnum); + + char *buffer = buf; + char *bufEnd = buf + buflen; + + const ErrorInfo *info = NULL; + const char *blockName = getErrorInfo(errnum, &info); + + if (blockName != NULL) { + if (info != NULL) { + buffer = appendToBuffer(buffer, bufEnd, "%s", info->name); + } else { + buffer = appendToBuffer(buffer, bufEnd, "%s %d", blockName, errnum); + } + } else if (info != NULL) { + buffer = appendToBuffer(buffer, bufEnd, "%s", info->name); + } else { + const char *tmp = systemStringError(errnum, buffer, bufEnd - buffer); + if (tmp != buffer) { + buffer = appendToBuffer(buffer, bufEnd, "%s", tmp); + } else { + buffer += strlen(tmp); + } + } + return buf; +} + +/*****************************************************************************/ +int registerErrorBlock(const char *blockName, + int firstError, + int lastReservedError, + const ErrorInfo *infos, + size_t infoSize) +{ + int result = ASSERT(firstError < lastReservedError, + "bad error block range"); + if (result != UDS_SUCCESS) { + return result; + } + + if (registeredErrors.count == registeredErrors.allocated) { + // could reallocate and grow, but should never happen + return UDS_OVERFLOW; + } + + ErrorBlock *block; + for (block = registeredErrors.blocks; + block < registeredErrors.blocks + registeredErrors.count; + ++block) { + if (strcmp(blockName, block->name) == 0) { + return UDS_DUPLICATE_NAME; + } + // check for overlap in error ranges + if ((firstError < block->max) && (lastReservedError > block->base)) { + return UDS_ALREADY_REGISTERED; + } + } + + registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { + .name = blockName, + .base = firstError, + .last = firstError + (infoSize / sizeof(ErrorInfo)), + .max = lastReservedError, + .infos = infos + }; + + return UDS_SUCCESS; +} diff --git a/uds/errors.h b/uds/errors.h new file mode 100644 index 0000000..faccd5a --- /dev/null +++ b/uds/errors.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/errors.h#4 $ + */ + +#ifndef ERRORS_H +#define ERRORS_H + +#include "compiler.h" +#include "typeDefs.h" +#include "uds-error.h" + +enum udsInternalErrorCodes { + /** Used as a base value for reporting internal errors */ + UDS_INTERNAL_ERROR_CODE_BASE = 66560, + /** Unused */ + UDS_INTERNAL_UNUSED_0 = UDS_INTERNAL_ERROR_CODE_BASE + 0, + /** Index overflow */ + UDS_OVERFLOW = UDS_INTERNAL_ERROR_CODE_BASE + 1, + /** Unused */ + UDS_INTERNAL_UNUSED_2 = UDS_INTERNAL_ERROR_CODE_BASE + 2, + /** Invalid argument passed to internal routine */ + UDS_INVALID_ARGUMENT = UDS_INTERNAL_ERROR_CODE_BASE + 3, + /** UDS data structures are in an invalid state */ + UDS_BAD_STATE = UDS_INTERNAL_ERROR_CODE_BASE + 4, + /** Attempt to enter the same name into an internal structure twice */ + UDS_DUPLICATE_NAME = UDS_INTERNAL_ERROR_CODE_BASE + 5, + /** An internal protocol violation between system components */ + UDS_UNEXPECTED_RESULT = UDS_INTERNAL_ERROR_CODE_BASE + 6, + /** An error created by test case processing */ + UDS_INJECTED_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 7, + /** An assertion failed */ + UDS_ASSERTION_FAILED = UDS_INTERNAL_ERROR_CODE_BASE + 8, + /** Unused */ + UDS_INTERNAL_UNUSED_9 = UDS_INTERNAL_ERROR_CODE_BASE + 9, + /** Not an actual error, but reporting that the result will be delayed */ + UDS_QUEUED = UDS_INTERNAL_ERROR_CODE_BASE + 10, + /** Unused */ + UDS_INTERNAL_UNUSED_11 = UDS_INTERNAL_ERROR_CODE_BASE + 11, + /** Unused */ + UDS_INTERNAL_UNUSED_12 = UDS_INTERNAL_ERROR_CODE_BASE + 12, + /** A problem has occured with a Buffer */ + UDS_BUFFER_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 13, + /** Unused */ + UDS_INTERNAL_UNUSED_14 = UDS_INTERNAL_ERROR_CODE_BASE + 14, + /** Unused */ + UDS_INTERNAL_UNUSED_15 = UDS_INTERNAL_ERROR_CODE_BASE + 15, + /** No directory was found where one was expected */ + UDS_NO_DIRECTORY = UDS_INTERNAL_ERROR_CODE_BASE + 16, + /** Checkpoint not completed */ + UDS_CHECKPOINT_INCOMPLETE = UDS_INTERNAL_ERROR_CODE_BASE + 17, + /** Unused */ + UDS_INTERNAL_UNUSED_18 = UDS_INTERNAL_ERROR_CODE_BASE + 18, + /** Unused */ + UDS_INTERNAL_UNUSED_19 = UDS_INTERNAL_ERROR_CODE_BASE + 19, + /** This error range has already been registered */ + UDS_ALREADY_REGISTERED = UDS_INTERNAL_ERROR_CODE_BASE + 20, + /** Either read-only or write-only */ + UDS_BAD_IO_DIRECTION = UDS_INTERNAL_ERROR_CODE_BASE + 21, + /** Cannot do I/O at this offset */ + UDS_INCORRECT_ALIGNMENT = UDS_INTERNAL_ERROR_CODE_BASE + 22, + /** Attempt to read or write data outside the bounds established for it */ + UDS_OUT_OF_RANGE = UDS_INTERNAL_ERROR_CODE_BASE + 23, + /** One more than the last UDS_INTERNAL error code */ + UDS_INTERNAL_ERROR_CODE_LAST, + /** One more than the last error this block will ever use */ + 
UDS_INTERNAL_ERROR_CODE_BLOCK_END = UDS_INTERNAL_ERROR_CODE_BASE + 440 +}; + +enum { + ERRBUF_SIZE = 128 // default size for buffer passed to stringError +}; + +// Error attributes - or into top half of error code +enum { UDS_UNRECOVERABLE = (1 << 17) }; + +const char *stringError(int errnum, char *buf, size_t buflen); +const char *stringErrorName(int errnum, char *buf, size_t buflen); + +/* + * Identify that an result code is a successful result. + * + * @param result A result code + * + * @return true if the result represents a success. + */ +__attribute__((warn_unused_result)) +static INLINE bool isSuccessful(int result) +{ + return (result == UDS_SUCCESS) || (result == UDS_QUEUED); +} + +/* + * Identify that an result code has been marked unrecoverable. + * + * @param result A result code + * + * @return true if the result has been marked unrecoverable. + */ +__attribute__((warn_unused_result)) +static INLINE bool isUnrecoverable(int result) +{ + return (result & UDS_UNRECOVERABLE) != 0; +} + +/* + * Mark a result code as unrecoverable. + * + * @param result A result code + * + * @return the result code with the unrecoverable marker added + */ +__attribute__((warn_unused_result)) +static INLINE int makeUnrecoverable(int result) +{ + return isSuccessful(result) ? result : (result | UDS_UNRECOVERABLE); +} + +/* + * Remove the unrecoverable marker from a result code. + * + * @param result A result code + * + * @return the result code with the unrecoverable marker removed + */ +__attribute__((warn_unused_result)) +static INLINE int sansUnrecoverable(int result) +{ + return result & ~UDS_UNRECOVERABLE; +} + +typedef struct errorInfo { + const char *name; + const char *message; +} ErrorInfo; + +/** + * Register an error code block for stringError and stringErrorName. + * + * @param blockName the name of the block of error codes + * @param firstError the first error code in the block + * @param lastReservedError one past the highest possible error in the bloc + * @param infos a pointer to the error info array for the block + * @param infoSize the size of the error info array, which + * determines the last actual error for which + * information is available + * + * @return a success or error code, particularly UDS_DUPLICATE_NAME if the + * block name is already present, or UDS_ALREADY_REGISTERED if a + * block with the specified error code is present + **/ +int registerErrorBlock(const char *blockName, + int firstError, + int lastReservedError, + const ErrorInfo *infos, + size_t infoSize); + +/** + * Return the first error between result1 and result2. + * + * @param result1 A success or error code. + * @param result2 A success or error code. + * + * @return result1 if that is an error, else result2 + **/ +static INLINE int firstError(int result1, int result2) +{ + return result1 == UDS_SUCCESS ? result2 : result1; +} + +#endif /* ERRORS_H */ diff --git a/uds/geometry.c b/uds/geometry.c new file mode 100644 index 0000000..6d8cfa6 --- /dev/null +++ b/uds/geometry.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/geometry.c#3 $ + */ + +#include "geometry.h" + +#include "deltaIndex.h" +#include "errors.h" +#include "hashUtils.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "uds.h" + +/**********************************************************************/ +static int initializeGeometry(Geometry *geometry, + size_t bytesPerPage, + unsigned int recordPagesPerChapter, + unsigned int chaptersPerVolume, + unsigned int sparseChaptersPerVolume) +{ + int result = ASSERT_WITH_ERROR_CODE(bytesPerPage >= BYTES_PER_RECORD, + UDS_BAD_STATE, + "page is smaller than a record: %zu", + bytesPerPage); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT_WITH_ERROR_CODE(chaptersPerVolume > sparseChaptersPerVolume, + UDS_INVALID_ARGUMENT, + "sparse chapters per volume (%u) must be less" + " than chapters per volume (%u)", + sparseChaptersPerVolume, + chaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + + geometry->bytesPerPage = bytesPerPage; + geometry->recordPagesPerChapter = recordPagesPerChapter; + geometry->chaptersPerVolume = chaptersPerVolume; + geometry->sparseChaptersPerVolume = sparseChaptersPerVolume; + geometry->denseChaptersPerVolume = + chaptersPerVolume - sparseChaptersPerVolume; + + // Calculate the number of records in a page, chapter, and volume. + geometry->recordsPerPage = bytesPerPage / BYTES_PER_RECORD; + geometry->recordsPerChapter + = geometry->recordsPerPage * recordPagesPerChapter; + geometry->recordsPerVolume + = (unsigned long) geometry->recordsPerChapter * chaptersPerVolume; + geometry->openChapterLoadRatio = DEFAULT_OPEN_CHAPTER_LOAD_RATIO; + + // Initialize values for delta chapter indexes. + geometry->chapterMeanDelta = 1 << DEFAULT_CHAPTER_MEAN_DELTA_BITS; + geometry->chapterPayloadBits = computeBits(recordPagesPerChapter - 1); + // We want 1 delta list for every 64 records in the chapter. The "| 077" + // ensures that the chapterDeltaListBits computation does not underflow. + geometry->chapterDeltaListBits + = computeBits((geometry->recordsPerChapter - 1) | 077) - 6; + geometry->deltaListsPerChapter = 1 << geometry->chapterDeltaListBits; + // We need enough address bits to achieve the desired mean delta. + geometry->chapterAddressBits + = (DEFAULT_CHAPTER_MEAN_DELTA_BITS - geometry->chapterDeltaListBits + + computeBits(geometry->recordsPerChapter - 1)); + // Let the delta index code determine how many pages are needed for the index + geometry->indexPagesPerChapter + = getDeltaIndexPageCount(geometry->recordsPerChapter, + geometry->deltaListsPerChapter, + geometry->chapterMeanDelta, + geometry->chapterPayloadBits, + bytesPerPage); + + // Now that we have the size of a chapter index, we can calculate the + // space used by chapters and volumes. 
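For the default layout declared in geometry.h (1024 records per page, 256 record pages per chapter, 1024 chapters, mean delta 2^16), these formulas come out to round numbers: 2^18 records per chapter, 2^28 records per volume, 12 delta-list bits (one list per 64 records), 8 payload bits, and 22 address bits. A standalone sketch that rechecks the arithmetic with a local copy of the bit-counting loop (the real computeBits() lives in hashUtils.c):

#include <assert.h>

/* Local stand-in for computeBits() from hashUtils.c. */
static unsigned int bitsFor(unsigned int maxValue)
{
  unsigned int bits = 0;
  while (maxValue > 0) {
    maxValue >>= 1;
    bits++;
  }
  return bits;
}

int main(void)
{
  /* Default parameters, per the enums in geometry.h. */
  unsigned int recordsPerPage        = 1024; /* DEFAULT_BYTES_PER_PAGE / BYTES_PER_RECORD */
  unsigned int recordPagesPerChapter = 256;
  unsigned int chaptersPerVolume     = 1024;

  unsigned int recordsPerChapter = recordsPerPage * recordPagesPerChapter;
  assert(recordsPerChapter == (1 << 18));
  assert((unsigned long) recordsPerChapter * chaptersPerVolume == (1UL << 28));

  /* One delta list per 64 records: computeBits(2^18 - 1) - 6 == 12. */
  unsigned int chapterDeltaListBits = bitsFor((recordsPerChapter - 1) | 077) - 6;
  assert(chapterDeltaListBits == 12);
  assert((1 << chapterDeltaListBits) == 4096);

  /* The payload is a record page number: computeBits(255) == 8. */
  assert(bitsFor(recordPagesPerChapter - 1) == 8);

  /* Address bits sized to reach the 2^16 mean delta: 16 - 12 + 18 == 22. */
  unsigned int chapterAddressBits
    = 16 - chapterDeltaListBits + bitsFor(recordsPerChapter - 1);
  assert(chapterAddressBits == 22);
  return 0;
}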
+ geometry->pagesPerChapter + = geometry->indexPagesPerChapter + recordPagesPerChapter; + geometry->pagesPerVolume = geometry->pagesPerChapter * chaptersPerVolume; + geometry->headerPagesPerVolume = 1; + geometry->bytesPerVolume = bytesPerPage * + (geometry->pagesPerVolume + geometry->headerPagesPerVolume); + geometry->bytesPerChapter = bytesPerPage * geometry->pagesPerChapter; + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int makeGeometry(size_t bytesPerPage, + unsigned int recordPagesPerChapter, + unsigned int chaptersPerVolume, + unsigned int sparseChaptersPerVolume, + Geometry **geometryPtr) +{ + Geometry *geometry; + int result = ALLOCATE(1, Geometry, "geometry", &geometry); + if (result != UDS_SUCCESS) { + return result; + } + result = initializeGeometry(geometry, bytesPerPage, recordPagesPerChapter, + chaptersPerVolume, sparseChaptersPerVolume); + if (result != UDS_SUCCESS) { + freeGeometry(geometry); + return result; + } + + *geometryPtr = geometry; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int copyGeometry(Geometry *source, Geometry **geometryPtr) +{ + return makeGeometry(source->bytesPerPage, + source->recordPagesPerChapter, + source->chaptersPerVolume, + source->sparseChaptersPerVolume, + geometryPtr); +} + +/**********************************************************************/ +void freeGeometry(Geometry *geometry) +{ + FREE(geometry); +} + +/**********************************************************************/ +uint64_t mapToVirtualChapterNumber(Geometry *geometry, + uint64_t newestVirtualChapter, + unsigned int physicalChapter) +{ + unsigned int newestPhysicalChapter + = mapToPhysicalChapter(geometry, newestVirtualChapter); + uint64_t virtualChapter + = newestVirtualChapter - newestPhysicalChapter + physicalChapter; + if (physicalChapter > newestPhysicalChapter) { + virtualChapter -= geometry->chaptersPerVolume; + } + return virtualChapter; +} + +/**********************************************************************/ +bool hasSparseChapters(const Geometry *geometry, + uint64_t oldestVirtualChapter, + uint64_t newestVirtualChapter) +{ + return (isSparse(geometry) + && ((newestVirtualChapter - oldestVirtualChapter + 1) + > geometry->denseChaptersPerVolume)); +} + +/**********************************************************************/ +bool isChapterSparse(const Geometry *geometry, + uint64_t oldestVirtualChapter, + uint64_t newestVirtualChapter, + uint64_t virtualChapterNumber) +{ + return (hasSparseChapters(geometry, oldestVirtualChapter, + newestVirtualChapter) + && ((virtualChapterNumber + geometry->denseChaptersPerVolume) + <= newestVirtualChapter)); +} + +/**********************************************************************/ +bool areSamePhysicalChapter(const Geometry *geometry, + uint64_t chapter1, + uint64_t chapter2) +{ + return ((chapter1 % geometry->chaptersPerVolume) + == (chapter2 % geometry->chaptersPerVolume)); +} diff --git a/uds/geometry.h b/uds/geometry.h new file mode 100644 index 0000000..47f771d --- /dev/null +++ b/uds/geometry.h @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
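A caller-side sketch of this small API, using the default constants from geometry.h; the helper name and the specific chapter numbers are only illustrative. The geometry is released with freeGeometry() when the caller is done with it.

#include "geometry.h"

static int buildDefaultGeometry(Geometry **geometryPtr)
{
  /* 1024 chapters of 256 record pages each, no sparse chapters. */
  int result = makeGeometry(DEFAULT_BYTES_PER_PAGE,
                            DEFAULT_RECORD_PAGES_PER_CHAPTER,
                            DEFAULT_CHAPTERS_PER_VOLUME,
                            DEFAULT_SPARSE_CHAPTERS_PER_VOLUME,
                            geometryPtr);
  if (result != UDS_SUCCESS) {
    return result;
  }

  /* Virtual chapter numbers grow without bound; physical slots wrap. */
  Geometry *geometry = *geometryPtr;
  unsigned int physicalChapter = mapToPhysicalChapter(geometry, 1030);
  /* physicalChapter == 1030 % 1024 == 6 */
  uint64_t virtualChapter
    = mapToVirtualChapterNumber(geometry, 1030, physicalChapter);
  /* virtualChapter == 1030, since chapter 6 is the newest physical slot. */
  (void) virtualChapter;
  return UDS_SUCCESS;
}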
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/geometry.h#3 $ + */ + +#ifndef GEOMETRY_H +#define GEOMETRY_H 1 + +#include "compiler.h" +#include "typeDefs.h" +#include "uds.h" +#include "uds-block.h" + +/** + * Geometry defines constants and a record that parameterize the layout of an + * Albireo index volume. + * + *
An index volume is divided into a fixed number of fixed-size + * chapters, each consisting of a fixed number of fixed-size + * pages. The volume layout is defined by two assumptions and four + * parameters. The assumptions (constants) are that index records are + * 64 bytes (32-byte block name plus 32-byte metadata) and that open + * chapter index hash slots are one byte long. The four parameters are + * the number of bytes in a page, the number of chapters in a volume, + * the number of record pages in a chapter, and the number of chapters + * that are sparse. From these parameters, we derive the rest of the + * layout and derived properties, ranging from the number of pages in + * a chapter to the number of records in the volume. + * + *
The default geometry is 64 KByte pages, 1024 chapters, 256 + * record pages in a chapter, and zero sparse chapters. This will + * allow us to store 2^28 entries (indexing 1TB of 4K blocks) in an + * approximately 16.5 MByte volume using fourteen index pages in each + * chapter. + **/ +typedef struct geometry { + /** Length of a page in a chapter, in bytes */ + size_t bytesPerPage; + /** Number of record pages in a chapter */ + unsigned int recordPagesPerChapter; + /** Number of (total) chapters in a volume */ + unsigned int chaptersPerVolume; + /** Number of sparsely-indexed chapters in a volume */ + unsigned int sparseChaptersPerVolume; + /** Number of bits used to determine delta list numbers */ + unsigned int chapterDeltaListBits; + + // These are derived properties, expressed as fields for convenience. + /** Total number of pages in a volume, excluding header */ + unsigned int pagesPerVolume; + /** Total number of header pages per volume */ + unsigned int headerPagesPerVolume; + /** Total number of bytes in a volume, including header */ + size_t bytesPerVolume; + /** Total number of bytes in a chapter */ + size_t bytesPerChapter; + /** Number of pages in a chapter */ + unsigned int pagesPerChapter; + /** Number of index pages in a chapter index */ + unsigned int indexPagesPerChapter; + /** The minimum ratio of hash slots to records in an open chapter */ + unsigned int openChapterLoadRatio; + /** Number of records that fit on a page */ + unsigned int recordsPerPage; + /** Number of records that fit in a chapter */ + unsigned int recordsPerChapter; + /** Number of records that fit in a volume */ + uint64_t recordsPerVolume; + /** Number of deltaLists per chapter index */ + unsigned int deltaListsPerChapter; + /** Mean delta in chapter indexes */ + unsigned int chapterMeanDelta; + /** Number of bits needed for record page numbers */ + unsigned int chapterPayloadBits; + /** Number of bits used to compute addresses for chapter delta lists */ + unsigned int chapterAddressBits; + /** Number of densely-indexed chapters in a volume */ + unsigned int denseChaptersPerVolume; +} Geometry; + +enum { + /* The number of bytes in a record (name + metadata) */ + BYTES_PER_RECORD = (UDS_CHUNK_NAME_SIZE + UDS_MAX_BLOCK_DATA_SIZE), + + /* The default length of a page in a chapter, in bytes */ + DEFAULT_BYTES_PER_PAGE = 1024 * BYTES_PER_RECORD, + + /* The default maximum number of records per page */ + DEFAULT_RECORDS_PER_PAGE = DEFAULT_BYTES_PER_PAGE / BYTES_PER_RECORD, + + /** The default number of record pages in a chapter */ + DEFAULT_RECORD_PAGES_PER_CHAPTER = 256, + + /** The default number of record pages in a chapter for a small index */ + SMALL_RECORD_PAGES_PER_CHAPTER = 64, + + /** The default number of chapters in a volume */ + DEFAULT_CHAPTERS_PER_VOLUME = 1024, + + /** The default number of sparsely-indexed chapters in a volume */ + DEFAULT_SPARSE_CHAPTERS_PER_VOLUME = 0, + + /** The log2 of the default mean delta */ + DEFAULT_CHAPTER_MEAN_DELTA_BITS = 16, + + /** The log2 of the number of delta lists in a large chapter */ + DEFAULT_CHAPTER_DELTA_LIST_BITS = 12, + + /** The log2 of the number of delta lists in a small chapter */ + SMALL_CHAPTER_DELTA_LIST_BITS = 10, + + /** The default min ratio of slots to records in an open chapter */ + DEFAULT_OPEN_CHAPTER_LOAD_RATIO = 2, + + /** Checkpoint every n chapters written. 
Default is to not checkpoint */ + DEFAULT_CHECKPOINT_FREQUENCY = 0 +}; + +/** + * Allocate and initialize all fields of a volume geometry using the + * specified layout parameters. + * + * @param bytesPerPage The length of a page in a chapter, in bytes + * @param recordPagesPerChapter The number of pages in a chapter + * @param chaptersPerVolume The number of chapters in a volume + * @param sparseChaptersPerVolume The number of sparse chapters in a volume + * @param geometryPtr A pointer to hold the new geometry + * + * @return UDS_SUCCESS or an error code + **/ +int makeGeometry(size_t bytesPerPage, + unsigned int recordPagesPerChapter, + unsigned int chaptersPerVolume, + unsigned int sparseChaptersPerVolume, + Geometry **geometryPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate a new geometry and initialize it with the same parameters as an + * existing geometry. + * + * @param source The geometry record to copy + * @param geometryPtr A pointer to hold the new geometry + * + * @return UDS_SUCCESS or an error code + **/ +int copyGeometry(Geometry *source, + Geometry **geometryPtr) + __attribute__((warn_unused_result)); + +/** + * Clean up a geometry and its memory. + * + * @param geometry The geometry record to free + **/ +void freeGeometry(Geometry *geometry); + +/** + * Map a virtual chapter number to a physical chapter number + * + * @param geometry The geometry + * @param virtualChapter The virtual chapter number + * + * @return the corresponding physical chapter number + **/ +__attribute__((warn_unused_result)) +static INLINE unsigned int mapToPhysicalChapter(const Geometry *geometry, + uint64_t virtualChapter) +{ + return (virtualChapter % geometry->chaptersPerVolume); +} + +/** + * Convert a physical chapter number to its current virtual chapter number. + * + * @param geometry The geometry + * @param newestVirtualChapter The number of the newest virtual chapter + * @param physicalChapter The physical chapter number to convert + * + * @return The current virtual chapter number of the physical chapter + * in question + **/ +uint64_t mapToVirtualChapterNumber(Geometry *geometry, + uint64_t newestVirtualChapter, + unsigned int physicalChapter); + +/** + * Check whether this geometry is for a sparse index. + * + * @param geometry The geometry to check + * + * @return true if this geometry has sparse chapters + **/ +__attribute__((warn_unused_result)) +static INLINE bool isSparse(const Geometry *geometry) +{ + return (geometry->sparseChaptersPerVolume > 0); +} + +/** + * Check whether any sparse chapters have been filled. + * + * @param geometry The geometry of the index + * @param oldestVirtualChapter The number of the oldest chapter in the + * index + * @param newestVirtualChapter The number of the newest chapter in the + * index + * + * @return true if the index has filled at least one sparse chapter + **/ +bool hasSparseChapters(const Geometry *geometry, + uint64_t oldestVirtualChapter, + uint64_t newestVirtualChapter) + __attribute__((warn_unused_result)); + +/** + * Check whether a chapter is sparse or dense. 
+ * + * @param geometry The geometry of the index containing the chapter + * @param oldestVirtualChapter The number of the oldest chapter in the index + * @param newestVirtualChapter The number of the newest chapter in the index + * @param virtualChapterNumber The number of the chapter to check + * + * @return true if the chapter is sparse + **/ +bool isChapterSparse(const Geometry *geometry, + uint64_t oldestVirtualChapter, + uint64_t newestVirtualChapter, + uint64_t virtualChapterNumber) + __attribute__((warn_unused_result)); + +/** + * Check whether two virtual chapter numbers correspond to the same + * physical chapter. + * + * @param geometry The geometry of the index + * @param chapter1 The first chapter to compare + * @param chapter2 The second chapter to compare + * + * @return true if both chapters correspond to the same + * physical chapter + **/ +bool areSamePhysicalChapter(const Geometry *geometry, + uint64_t chapter1, + uint64_t chapter2) + __attribute__((warn_unused_result)); + +#endif /* GEOMETRY_H */ diff --git a/uds/hashUtils.c b/uds/hashUtils.c new file mode 100644 index 0000000..45b2c81 --- /dev/null +++ b/uds/hashUtils.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/hashUtils.c#2 $ + */ + +#include "hashUtils.h" + +#include "errors.h" +#include "logger.h" +#include "permassert.h" +#include "stringUtils.h" +#include "uds.h" + +/** + * Convert a byte string to the hex representation. + * + * @param data binary data to convert + * @param dataLen length of binary data + * @param hex target to write hex string into + * @param hexLen capacity of target string + * + * @return UDS_SUCCESS, + * or UDS_INVALID_ARGUMENT if hexLen + * is too short. 
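The hex helpers in this file expect a destination buffer with room for two characters per byte plus a terminating NUL; index.c sizes it as (2 * UDS_CHUNK_NAME_SIZE) + 1. A brief caller-side sketch following that same pattern (the printing wrapper is illustrative only):

#include <stdio.h>
#include <string.h>

#include "hashUtils.h"

/* Illustrative: render a chunk name for a log or error message. */
static void printChunkName(const UdsChunkName *name)
{
  char hexName[(2 * UDS_CHUNK_NAME_SIZE) + 1];
  if (chunkNameToHex(name, hexName, sizeof(hexName)) != UDS_SUCCESS) {
    strncpy(hexName, "", sizeof(hexName));
  }
  printf("chunk name: %s\n", hexName);
}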
+ **/ +static int dataToHex(const unsigned char *data, size_t dataLen, + char *hex, size_t hexLen) +{ + if (hexLen < 2 * dataLen + 1) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "hex data incorrect size"); + } + size_t i; + for (i = 0; i < dataLen; ++i) { + int rc = fixedSprintf(__func__, &hex[2 * i], hexLen - (2 * i), + UDS_INVALID_ARGUMENT, "%02X", data[i]); + + if (rc != UDS_SUCCESS) { + return rc; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int chunkNameToHex(const UdsChunkName *chunkName, + char *hexData, size_t hexDataLen) +{ + return dataToHex(chunkName->name, UDS_CHUNK_NAME_SIZE, + hexData, hexDataLen); +} + +/**********************************************************************/ +int chunkDataToHex(const UdsChunkData *chunkData, + char *hexData, size_t hexDataLen) +{ + return dataToHex(chunkData->data, UDS_MAX_BLOCK_DATA_SIZE, + hexData, hexDataLen); +} + +/**********************************************************************/ +unsigned int computeBits(unsigned int maxValue) +{ + // __builtin_clz() counts leading (high-order) zero bits, so if + // we ever need this to be fast, under GCC we can do: + // return ((maxValue == 0) ? 0 : (32 - __builtin_clz(maxValue))); + + unsigned int bits = 0; + while (maxValue > 0) { + maxValue >>= 1; + bits++; + } + return bits; +} + +/**********************************************************************/ +void hashUtilsCompileTimeAssertions(void) +{ + STATIC_ASSERT((UDS_CHUNK_NAME_SIZE % sizeof(uint64_t)) == 0); + STATIC_ASSERT(UDS_CHUNK_NAME_SIZE == 16); +} diff --git a/uds/hashUtils.h b/uds/hashUtils.h new file mode 100644 index 0000000..2d6d0a8 --- /dev/null +++ b/uds/hashUtils.h @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/hashUtils.h#1 $ + */ + +#ifndef HASH_UTILS_H +#define HASH_UTILS_H 1 + +#include "compiler.h" +#include "common.h" +#include "geometry.h" +#include "numeric.h" +#include "uds.h" + +// How various portions of a hash are apportioned. Size dependent. +enum { + MASTER_INDEX_BYTES_OFFSET = 0, // size 8 + CHAPTER_INDEX_BYTES_OFFSET = 8, // size 6 + SAMPLE_BYTES_OFFSET = 14, // size 2 + MASTER_INDEX_BYTES_COUNT = 8, + CHAPTER_INDEX_BYTES_COUNT = 6, + SAMPLE_BYTES_COUNT = 2, +}; + +/** + * Extract the portion of a block name used by the chapter index. 
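computeBits() returns the position of the highest set bit plus one, so computeBits(0) == 0, computeBits(255) == 8, and computeBits(256) == 9; geometry.c leans on this when sizing the delta-list, payload, and address fields. A few spot checks of that behavior:

#include <assert.h>

#include "hashUtils.h"

static void checkComputeBits(void)
{
  assert(computeBits(0) == 0);
  assert(computeBits(1) == 1);
  assert(computeBits(255) == 8);            /* highest record page number */
  assert(computeBits(256) == 9);
  assert(computeBits((1 << 18) - 1) == 18); /* records per default chapter, minus one */
}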
+ * + * @param name The block name + * + * @return The chapter index bytes + **/ +static INLINE uint64_t extractChapterIndexBytes(const UdsChunkName *name) +{ + // Get the high order 16 bits, then the low order 32 bits + uint64_t bytes + = (uint64_t) getUInt16BE(&name->name[CHAPTER_INDEX_BYTES_OFFSET]) << 32; + bytes |= getUInt32BE(&name->name[CHAPTER_INDEX_BYTES_OFFSET + 2]); + return bytes; +} + +/** + * Extract the portion of a block name used by the master index. + * + * @param name The block name + * + * @return The master index portion of the block name + **/ +static INLINE uint64_t extractMasterIndexBytes(const UdsChunkName *name) +{ + return getUInt64BE(&name->name[MASTER_INDEX_BYTES_OFFSET]); +} + +/** + * Extract the portion of a block name used for sparse sampling. + * + * @param name The block name + * + * @return The sparse sample portion of the block name + **/ +static INLINE uint32_t extractSamplingBytes(const UdsChunkName *name) +{ + return getUInt16BE(&name->name[SAMPLE_BYTES_OFFSET]); +} + +/** + * For a given block, find the chapter delta list to use + * + * @param name The block name to hash + * @param geometry The geometry to use + * + * @return The chapter delta list where we expect to find the given blockname + **/ +static INLINE unsigned int hashToChapterDeltaList(const UdsChunkName *name, + const Geometry *geometry) +{ + return (unsigned int) ((extractChapterIndexBytes(name) + >> geometry->chapterAddressBits) + & ((1 << geometry->chapterDeltaListBits) - 1)); +} + +/** + * For a given block, find the chapter delta address to use + * + * @param name The block name to hash + * @param geometry The geometry to use + * + * @return The chapter delta address to use + **/ +static INLINE unsigned int hashToChapterDeltaAddress(const UdsChunkName *name, + const Geometry *geometry) +{ + return (unsigned int) (extractChapterIndexBytes(name) + & ((1 << geometry->chapterAddressBits) - 1)); +} + +/** + * For a given block name, find the slot in the open chapter hash table + * where it is expected to reside. + * + * @param name The block name to hash + * @param slotCount The size of the hash table + * + * @return the record number in the index page where we expect to find + # the given blockname + **/ +static INLINE unsigned int nameToHashSlot(const UdsChunkName *name, + unsigned int slotCount) +{ + return (unsigned int) (extractChapterIndexBytes(name) % slotCount); +} + +/** + * Convert a chunk name to hex to make it more readable. + * + * @param chunkName The chunk name + * @param hexData The resulting hexdata from the given chunk name + * @param hexDataLen The capacity of hexData + * + * @return UDS_SUCCESS, + * or UDS_INVALID_ARGUMENT if hexDataLen + * is too short. + **/ +int chunkNameToHex(const UdsChunkName *chunkName, + char *hexData, + size_t hexDataLen) + __attribute__((warn_unused_result)); + +/** + * Convert chunk data to hex to make it more readable. + * + * @param chunkData The chunk data + * @param hexData The resulting hexdata from the given chunk data + * @param hexDataLen The capacity of hexData + * + * @return UDS_SUCCESS, + * or UDS_INVALID_ARGUMENT if hexDataLen + * is too short. + **/ +int chunkDataToHex(const UdsChunkData *chunkData, + char *hexData, + size_t hexDataLen) + __attribute__((warn_unused_result)); + +/** + * Compute the number of bits required to store a field with the given + * maximum value. 
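With the default geometry worked out earlier (22 address bits, 12 delta-list bits), the 48-bit chapter-index field splits into a low 22-bit delta address and a 12-bit delta-list selector; hashToChapterDeltaAddress() and hashToChapterDeltaList() are just those two masks. A small standalone check of the split, using hypothetical field values rather than a real block name:

#include <assert.h>
#include <stdint.h>

enum {
  EXAMPLE_ADDRESS_BITS    = 22, /* geometry->chapterAddressBits for the defaults */
  EXAMPLE_DELTA_LIST_BITS = 12, /* geometry->chapterDeltaListBits for the defaults */
};

static void checkChapterIndexSplit(void)
{
  uint64_t deltaList = 0xABC;   /* any 12-bit value */
  uint64_t address   = 0x12345; /* any 22-bit value */
  uint64_t chapterIndexBytes = (deltaList << EXAMPLE_ADDRESS_BITS) | address;

  /* Mirrors hashToChapterDeltaAddress(). */
  assert((chapterIndexBytes & ((1 << EXAMPLE_ADDRESS_BITS) - 1)) == address);
  /* Mirrors hashToChapterDeltaList(). */
  assert(((chapterIndexBytes >> EXAMPLE_ADDRESS_BITS)
          & ((1 << EXAMPLE_DELTA_LIST_BITS) - 1)) == deltaList);
}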
+ * + * @param maxValue The maximum value of the field + * + * @return the number of bits required + **/ +unsigned int computeBits(unsigned int maxValue) + __attribute__((warn_unused_result)); + +/** + * FOR TESTING. Set the portion of a block name used by the chapter index. + * + * @param name The block name + * @param value The value to store + **/ +static INLINE void setChapterIndexBytes(UdsChunkName *name, uint64_t value) +{ + // Store the high order bytes, then the low-order bytes + storeUInt16BE(&name->name[CHAPTER_INDEX_BYTES_OFFSET], + (uint16_t)(value >> 32)); + storeUInt32BE(&name->name[CHAPTER_INDEX_BYTES_OFFSET + 2], + (uint32_t)value); +} + +/** + * FOR TESTING. Set the bits used to find a chapter delta list + * + * @param name The block name + * @param geometry The geometry to use + * @param value The value to store + **/ +static INLINE void setChapterDeltaListBits(UdsChunkName *name, + const Geometry *geometry, + uint64_t value) +{ + uint64_t deltaAddress = hashToChapterDeltaAddress(name, geometry); + deltaAddress |= value << geometry->chapterAddressBits; + setChapterIndexBytes(name, deltaAddress); +} + +/** + * FOR TESTING. Set the portion of a block name used by the master index. + * + * @param name The block name + * @param val The value to store + **/ +static INLINE void setMasterIndexBytes(UdsChunkName *name, uint64_t val) +{ + storeUInt64BE(&name->name[MASTER_INDEX_BYTES_OFFSET], val); +} + +/** + * Set the portion of a block name used for sparse sampling. + * + * @param name The block name + * @param value The value to store + **/ +static INLINE void setSamplingBytes(UdsChunkName *name, uint32_t value) +{ + storeUInt16BE(&name->name[SAMPLE_BYTES_OFFSET], (uint16_t)value); +} + +/** + * Special function wrapper required for compile-time assertions. This + * function will fail to compile if UDS_CHUNK_NAME_SIZE is not an integer + * multiple of 8. + **/ +void hashUtilsCompileTimeAssertions(void); + +#endif /* HASH_UTILS_H */ diff --git a/uds/index.c b/uds/index.c new file mode 100644 index 0000000..a84d50f --- /dev/null +++ b/uds/index.c @@ -0,0 +1,908 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/index.c#15 $ + */ + +#include "index.h" + +#include "hashUtils.h" +#include "indexCheckpoint.h" +#include "indexInternals.h" +#include "logger.h" + +static const uint64_t NO_LAST_CHECKPOINT = UINT_MAX; + + +/** + * Replay an index which was loaded from a checkpoint. + * + * @param index The index to replay + * @param lastCheckpointChapter The number of the chapter where the + * last checkpoint was made + * + * @return UDS_SUCCESS or an error code. 
+ **/ +static int replayIndexFromCheckpoint(Index *index, + uint64_t lastCheckpointChapter) +{ + // Find the volume chapter boundaries + uint64_t lowestVCN, highestVCN; + bool isEmpty = false; + IndexLookupMode oldLookupMode = index->volume->lookupMode; + index->volume->lookupMode = LOOKUP_FOR_REBUILD; + int result = findVolumeChapterBoundaries(index->volume, &lowestVCN, + &highestVCN, &isEmpty); + index->volume->lookupMode = oldLookupMode; + if (result != UDS_SUCCESS) { + return logFatalWithStringError(result, + "cannot replay index: " + "unknown volume chapter boundaries"); + } + if (lowestVCN > highestVCN) { + logFatal("cannot replay index: no valid chapters exist"); + return UDS_CORRUPT_COMPONENT; + } + + if (isEmpty) { + // The volume is empty, so the index should also be empty + if (index->newestVirtualChapter != 0) { + logFatal("cannot replay index from empty volume"); + return UDS_CORRUPT_COMPONENT; + } + return UDS_SUCCESS; + } + + unsigned int chaptersPerVolume = index->volume->geometry->chaptersPerVolume; + index->oldestVirtualChapter = lowestVCN; + index->newestVirtualChapter = highestVCN + 1; + if (index->newestVirtualChapter == lowestVCN + chaptersPerVolume) { + // skip the chapter shadowed by the open chapter + index->oldestVirtualChapter++; + } + + uint64_t firstReplayChapter = lastCheckpointChapter; + if (firstReplayChapter < index->oldestVirtualChapter) { + firstReplayChapter = index->oldestVirtualChapter; + } + return replayVolume(index, firstReplayChapter); +} + +/**********************************************************************/ +static int loadIndex(Index *index, bool allowReplay) +{ + bool replayRequired = false; + + int result = loadIndexState(index->state, &replayRequired); + if (result != UDS_SUCCESS) { + return result; + } + + if (replayRequired && !allowReplay) { + return logErrorWithStringError( + UDS_INDEX_NOT_SAVED_CLEANLY, + "index not saved cleanly: open chapter missing"); + } + + uint64_t lastCheckpointChapter + = ((index->lastCheckpoint != NO_LAST_CHECKPOINT) + ? index->lastCheckpoint : 0); + + logInfo("loaded index from chapter %llu through chapter %llu", + index->oldestVirtualChapter, lastCheckpointChapter); + + if (replayRequired) { + result = replayIndexFromCheckpoint(index, lastCheckpointChapter); + if (result != UDS_SUCCESS) { + return result; + } + } + + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + setActiveChapters(index->zones[i]); + } + + index->loadedType = replayRequired ? 
LOAD_REPLAY : LOAD_LOAD; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int rebuildIndex(Index *index) +{ + // Find the volume chapter boundaries + uint64_t lowestVCN, highestVCN; + bool isEmpty = false; + IndexLookupMode oldLookupMode = index->volume->lookupMode; + index->volume->lookupMode = LOOKUP_FOR_REBUILD; + int result = findVolumeChapterBoundaries(index->volume, &lowestVCN, + &highestVCN, &isEmpty); + index->volume->lookupMode = oldLookupMode; + if (result != UDS_SUCCESS) { + return logFatalWithStringError(result, + "cannot rebuild index: " + "unknown volume chapter boundaries"); + } + if (lowestVCN > highestVCN) { + logFatal("cannot rebuild index: no valid chapters exist"); + return UDS_CORRUPT_COMPONENT; + } + + if (isEmpty) { + index->newestVirtualChapter = index->oldestVirtualChapter = 0; + } else { + unsigned int numChapters = index->volume->geometry->chaptersPerVolume; + index->newestVirtualChapter = highestVCN + 1; + index->oldestVirtualChapter = lowestVCN; + if (index->newestVirtualChapter + == (index->oldestVirtualChapter + numChapters)) { + // skip the chapter shadowed by the open chapter + index->oldestVirtualChapter++; + } + } + + if ((index->newestVirtualChapter - index->oldestVirtualChapter) > + index->volume->geometry->chaptersPerVolume) { + return logFatalWithStringError(UDS_CORRUPT_COMPONENT, + "cannot rebuild index: " + "volume chapter boundaries too large"); + } + + setMasterIndexOpenChapter(index->masterIndex, 0); + if (isEmpty) { + index->loadedType = LOAD_EMPTY; + return UDS_SUCCESS; + } + + result = replayVolume(index, index->oldestVirtualChapter); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + setActiveChapters(index->zones[i]); + } + + index->loadedType = LOAD_REBUILD; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int makeIndex(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + unsigned int zoneCount, + LoadType loadType, + IndexLoadContext *loadContext, + Index **newIndex) +{ + Index *index; + int result = allocateIndex(layout, config, userParams, zoneCount, loadType, + &index); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "could not allocate index"); + } + + index->loadContext = loadContext; + + uint64_t nonce = getVolumeNonce(layout); + result = makeMasterIndex(config, zoneCount, nonce, &index->masterIndex); + if (result != UDS_SUCCESS) { + freeIndex(index); + return logErrorWithStringError(result, "could not make master index"); + } + + result = addIndexStateComponent(index->state, MASTER_INDEX_INFO, NULL, + index->masterIndex); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + result = addIndexStateComponent(index->state, &INDEX_PAGE_MAP_INFO, + index->volume->indexPageMap, NULL); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + result = makeChapterWriter(index, getIndexVersion(layout), + &index->chapterWriter); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + if ((loadType == LOAD_LOAD) || (loadType == LOAD_REBUILD)) { + if (!index->existed) { + freeIndex(index); + return UDS_NO_INDEX; + } + result = loadIndex(index, loadType == LOAD_REBUILD); + switch (result) { + case UDS_SUCCESS: + break; + case ENOMEM: + // We should not try a rebuild for this error. 
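The boundary arithmetic shared by replayIndexFromCheckpoint() and rebuildIndex() is easiest to see with concrete numbers: with 1024 chapters per volume, a full volume whose on-disk chapters run from 4096 through 5119 gives a newest virtual chapter of 5120, and because 5120 == 4096 + 1024 the oldest chapter is advanced to 4097, since its physical slot is about to be shadowed by the open chapter. A small standalone check of that rule (the chapter numbers are hypothetical):

#include <assert.h>
#include <stdint.h>

static void checkChapterWindow(void)
{
  const unsigned int chaptersPerVolume = 1024;
  uint64_t lowestVCN  = 4096; /* oldest chapter found on disk */
  uint64_t highestVCN = 5119; /* newest chapter found on disk */

  uint64_t newestVirtualChapter = highestVCN + 1;
  uint64_t oldestVirtualChapter = lowestVCN;
  if (newestVirtualChapter == lowestVCN + chaptersPerVolume) {
    /* The open chapter will reuse the oldest chapter's physical slot. */
    oldestVirtualChapter++;
  }

  assert(newestVirtualChapter == 5120);
  assert(oldestVirtualChapter == 4097);
  assert(newestVirtualChapter - oldestVirtualChapter < chaptersPerVolume);
}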
+ logErrorWithStringError(result, "index could not be loaded"); + break; + default: + logErrorWithStringError(result, "index could not be loaded"); + if (loadType == LOAD_REBUILD) { + result = rebuildIndex(index); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "index could not be rebuilt"); + } + } + break; + } + } else { + index->loadedType = LOAD_CREATE; + discardIndexStateData(index->state); + } + + if (result != UDS_SUCCESS) { + freeIndex(index); + return logUnrecoverable(result, "fatal error in makeIndex"); + } + + if (index->loadContext != NULL) { + lockMutex(&index->loadContext->mutex); + index->loadContext->status = INDEX_READY; + // If we get here, suspend is meaningless, but notify any thread trying + // to suspend us so it doesn't hang. + broadcastCond(&index->loadContext->cond); + unlockMutex(&index->loadContext->mutex); + } + + index->hasSavedOpenChapter = index->loadedType == LOAD_LOAD; + *newIndex = index; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeIndex(Index *index) +{ + if (index == NULL) { + return; + } + freeChapterWriter(index->chapterWriter); + + if (index->masterIndex != NULL) { + freeMasterIndex(index->masterIndex); + } + releaseIndex(index); +} + +/**********************************************************************/ +int saveIndex(Index *index) +{ + waitForIdleChapterWriter(index->chapterWriter); + int result = finishCheckpointing(index); + if (result != UDS_SUCCESS) { + logInfo("save index failed"); + return result; + } + beginSave(index, false, index->newestVirtualChapter); + + result = saveIndexState(index->state); + if (result != UDS_SUCCESS) { + logInfo("save index failed"); + index->lastCheckpoint = index->prevCheckpoint; + } else { + index->hasSavedOpenChapter = true; + logInfo("finished save (vcn %llu)", index->lastCheckpoint); + } + return result; +} + +/** + * Get the zone for a request. + * + * @param index The index + * @param request The request + * + * @return The zone for the request + **/ +static IndexZone *getRequestZone(Index *index, Request *request) +{ + return index->zones[request->zoneNumber]; +} + +/** + * Search an index zone. This function is only correct for LRU. + * + * @param zone The index zone to query. + * @param request The request originating the query. + * + * @return UDS_SUCCESS or an error code + **/ +static int searchIndexZone(IndexZone *zone, Request *request) +{ + MasterIndexRecord record; + int result = getMasterIndexRecord(zone->index->masterIndex, + &request->chunkName, &record); + if (result != UDS_SUCCESS) { + return result; + } + + bool found = false; + if (record.isFound) { + result = getRecordFromZone(zone, request, &found, record.virtualChapter); + if (result != UDS_SUCCESS) { + return result; + } + if (found) { + request->location = computeIndexRegion(zone, record.virtualChapter); + } + } + + /* + * If a record has overflowed a chapter index in more than one chapter + * (or overflowed in one chapter and collided with an existing record), + * it will exist as a collision record in the master index, but we won't + * find it in the volume. This case needs special handling. 
+ */ + bool overflowRecord = (record.isFound && record.isCollision && !found); + uint64_t chapter = zone->newestVirtualChapter; + if (found || overflowRecord) { + if ((request->action == REQUEST_QUERY) + && (!request->update || overflowRecord)) { + /* This is a query without update, or with nothing to update */ + return UDS_SUCCESS; + } + + if (record.virtualChapter != chapter) { + /* + * Update the master index to reference the new chapter for the block. + * If the record had been deleted or dropped from the chapter index, it + * will be back. + */ + result = setMasterIndexRecordChapter(&record, chapter); + } else if (request->action != REQUEST_UPDATE) { + /* The record is already in the open chapter, so we're done */ + return UDS_SUCCESS; + } + } else { + // The record wasn't in the master index, so check whether the name + // is in a cached sparse chapter. + if (!isMasterIndexSample(zone->index->masterIndex, &request->chunkName) + && isSparse(zone->index->volume->geometry)) { + // Passing UINT64_MAX triggers a search of the entire sparse cache. + result = searchSparseCacheInZone(zone, request, UINT64_MAX, &found); + if (result != UDS_SUCCESS) { + return result; + } + + if (found) { + request->location = LOC_IN_SPARSE; + } + } + + if (request->action == REQUEST_QUERY) { + if (!found || !request->update) { + // This is a query without update or for a new record, so we're done. + return UDS_SUCCESS; + } + } + + /* + * Add a new entry to the master index referencing the open chapter. + * This needs to be done both for new records, and for records from + * cached sparse chapters. + */ + result = putMasterIndexRecord(&record, chapter); + } + + if (result == UDS_OVERFLOW) { + /* + * The master index encountered a delta list overflow. The condition + * was already logged. We will go on without adding the chunk to the + * open chapter. + */ + return UDS_SUCCESS; + } + + if (result != UDS_SUCCESS) { + return result; + } + + UdsChunkData *metadata; + if (!found || (request->action == REQUEST_UPDATE)) { + // This is a new record or we're updating an existing record. + metadata = &request->newMetadata; + } else { + // This is a duplicate, so move the record to the open chapter (for LRU). + metadata = &request->oldMetadata; + } + return putRecordInZone(zone, request, metadata); +} + +/**********************************************************************/ +static int removeFromIndexZone(IndexZone *zone, Request *request) +{ + MasterIndexRecord record; + int result = getMasterIndexRecord(zone->index->masterIndex, + &request->chunkName, &record); + if (result != UDS_SUCCESS) { + return result; + } + + if (!record.isFound) { + // The name does not exist in master index, so there is nothing to remove. + return UDS_SUCCESS; + } + + if (!record.isCollision) { + // Non-collision records are hints, so resolve the name in the chapter. + bool found; + int result = getRecordFromZone(zone, request, &found, + record.virtualChapter); + if (result != UDS_SUCCESS) { + return result; + } + + if (!found) { + // The name does not exist in the chapter, so there is nothing to remove. + return UDS_SUCCESS; + } + } + + request->location = computeIndexRegion(zone, record.virtualChapter); + + /* + * Delete the master index entry for the named record only. Note that a + * later search might later return stale advice if there is a colliding name + * in the same chapter, but it's a very rare case (1 in 2^21). 
+ */ + result = removeMasterIndexRecord(&record); + if (result != UDS_SUCCESS) { + return result; + } + + // If the record is in the open chapter, we must remove it or mark it + // deleted to avoid trouble if the record is added again later. + if (request->location == LOC_IN_OPEN_CHAPTER) { + bool hashExists = false; + removeFromOpenChapter(zone->openChapter, &request->chunkName, &hashExists); + result = ASSERT(hashExists, "removing record not found in open chapter"); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/** + * Simulate the creation of a sparse cache barrier message by the triage + * queue, and the later execution of that message in an index zone. + * + * If the index receiving the request is multi-zone or dense, this function + * does nothing. This simulation is an optimization for single-zone sparse + * indexes. It also supports unit testing of indexes without routers and + * queues. + * + * @param zone the index zone responsible for the index request + * @param request the index request about to be executed + * + * @return UDS_SUCCESS always + **/ +static int simulateIndexZoneBarrierMessage(IndexZone *zone, Request *request) +{ + // Do nothing unless this is a single-zone sparse index. + if ((zone->index->zoneCount > 1) + || !isSparse(zone->index->volume->geometry)) { + return UDS_SUCCESS; + } + + // Check if the index request is for a sampled name in a sparse chapter. + uint64_t sparseVirtualChapter = triageIndexRequest(zone->index, request); + if (sparseVirtualChapter == UINT64_MAX) { + // Not indexed, not a hook, or in a chapter that is still dense, which + // means there should be no change to the sparse chapter index cache. + return UDS_SUCCESS; + } + + /* + * The triage queue would have generated and enqueued a barrier message + * preceding this request, which we simulate by directly invoking the + * execution hook for an equivalent message. + */ + BarrierMessageData barrier = { .virtualChapter = sparseVirtualChapter }; + return executeSparseCacheBarrierMessage(zone, &barrier); +} + +/**********************************************************************/ +static int dispatchIndexZoneRequest(IndexZone *zone, Request *request) +{ + if (!request->requeued) { + // Single-zone sparse indexes don't have a triage queue to generate cache + // barrier requests, so see if we need to synthesize a barrier. + int result = simulateIndexZoneBarrierMessage(zone, request); + if (result != UDS_SUCCESS) { + return result; + } + } + + // Set the default location. It will be overwritten if we find the chunk. 
+ request->location = LOC_UNAVAILABLE; + + int result; + switch (request->action) { + case REQUEST_INDEX: + case REQUEST_UPDATE: + case REQUEST_QUERY: + result = makeUnrecoverable(searchIndexZone(zone, request)); + break; + + case REQUEST_DELETE: + result = makeUnrecoverable(removeFromIndexZone(zone, request)); + break; + + default: + result = logWarningWithStringError(UDS_INVALID_ARGUMENT, + "attempted to execute invalid action:" + " %d", + request->action); + break; + } + + return result; +} + +/**********************************************************************/ +int dispatchIndexRequest(Index *index, Request *request) +{ + return dispatchIndexZoneRequest(getRequestZone(index, request), request); +} + +/**********************************************************************/ +static int rebuildIndexPageMap(Index *index, uint64_t vcn) +{ + Geometry *geometry = index->volume->geometry; + unsigned int chapter = mapToPhysicalChapter(geometry, vcn); + unsigned int expectedListNumber = 0; + unsigned int indexPageNumber; + for (indexPageNumber = 0; + indexPageNumber < geometry->indexPagesPerChapter; + indexPageNumber++) { + DeltaIndexPage *chapterIndexPage; + int result = getPage(index->volume, chapter, indexPageNumber, + CACHE_PROBE_INDEX_FIRST, NULL, &chapterIndexPage); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "failed to read index page %u" + " in chapter %u", + indexPageNumber, chapter); + } + unsigned int lowestDeltaList = chapterIndexPage->lowestListNumber; + unsigned int highestDeltaList = chapterIndexPage->highestListNumber; + if (lowestDeltaList != expectedListNumber) { + return logErrorWithStringError(UDS_CORRUPT_DATA, + "chapter %u index page %u is corrupt", + chapter, indexPageNumber); + } + result = updateIndexPageMap(index->volume->indexPageMap, vcn, chapter, + indexPageNumber, highestDeltaList); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "failed to update chapter %u index page" + " %u", + chapter, indexPageNumber); + } + expectedListNumber = highestDeltaList + 1; + } + return UDS_SUCCESS; +} + +/** + * Add an entry to the master index when rebuilding. + * + * @param index The index to query. + * @param name The block name of interest. + * @param virtualChapter The virtual chapter number to write to the + * master index + * @param willBeSparseChapter True if this entry will be in the sparse portion + * of the index at the end of rebuilding + * + * @return UDS_SUCCESS or an error code + **/ +static int replayRecord(Index *index, + const UdsChunkName *name, + uint64_t virtualChapter, + bool willBeSparseChapter) +{ + if (willBeSparseChapter && !isMasterIndexSample(index->masterIndex, name)) { + // This entry will be in a sparse chapter after the rebuild completes, + // and it is not a sample, so just skip over it. + return UDS_SUCCESS; + } + + MasterIndexRecord record; + int result = getMasterIndexRecord(index->masterIndex, name, &record); + if (result != UDS_SUCCESS) { + return result; + } + + bool updateRecord; + if (record.isFound) { + if (record.isCollision) { + if (record.virtualChapter == virtualChapter) { + /* The record is already correct, so we don't need to do anything */ + return UDS_SUCCESS; + } + updateRecord = true; + } else if (record.virtualChapter == virtualChapter) { + /* + * There is a master index entry pointing to the current + * chapter, but we don't know if it is for the same name as the + * one we are currently working on or not. For now, we're just + * going to assume that it isn't. 
This will create one extra + * collision record if there was a deleted record in the current + * chapter. + */ + updateRecord = false; + } else { + /* + * If we're rebuilding, we don't normally want to go to disk to see if + * the record exists, since we will likely have just read the record from + * disk (i.e. we know it's there). The exception to this is when we + * already find an entry in the master index that has a different chapter. + * In this case, we need to search that chapter to determine if the + * master index entry was for the same record or a different one. + */ + result = searchVolumePageCache(index->volume, NULL, name, + record.virtualChapter, NULL, + &updateRecord); + if (result != UDS_SUCCESS) { + return result; + } + } + } else { + updateRecord = false; + } + + if (updateRecord) { + /* + * Update the master index to reference the new chapter for the block. + * If the record had been deleted or dropped from the chapter index, it + * will be back. + */ + result = setMasterIndexRecordChapter(&record, virtualChapter); + } else { + /* + * Add a new entry to the master index referencing the open + * chapter. This should be done regardless of whether we are a brand + * new record or a sparse record, i.e. one that doesn't exist in the + * index but does on disk, since for a sparse record, we would want to + * un-sparsify if it did exist. + */ + result = putMasterIndexRecord(&record, virtualChapter); + } + + if ((result == UDS_DUPLICATE_NAME) || (result == UDS_OVERFLOW)) { + /* Ignore duplicate record and delta list overflow errors */ + return UDS_SUCCESS; + } + + return result; +} + +/**********************************************************************/ +void beginSave(Index *index, bool checkpoint, uint64_t openChapterNumber) +{ + index->prevCheckpoint = index->lastCheckpoint; + index->lastCheckpoint = ((openChapterNumber == 0) + ? NO_LAST_CHECKPOINT + : openChapterNumber - 1); + + const char *what = (checkpoint ? "checkpoint" : "save"); + logInfo("beginning %s (vcn %llu)", what, index->lastCheckpoint); +} + +/** + * Suspend the index if necessary and wait for a signal to resume. + * + * @param index The index to replay + * + * @return true if the replay should terminate + **/ +static bool checkForSuspend(Index *index) +{ + if (index->loadContext == NULL) { + return false; + } + + lockMutex(&index->loadContext->mutex); + if (index->loadContext->status != INDEX_SUSPENDING) { + unlockMutex(&index->loadContext->mutex); + return false; + } + + // Notify that we are suspended and wait for the resume. + index->loadContext->status = INDEX_SUSPENDED; + broadcastCond(&index->loadContext->cond); + + while ((index->loadContext->status != INDEX_OPENING) + && (index->loadContext->status != INDEX_FREEING)) { + waitCond(&index->loadContext->cond, &index->loadContext->mutex); + } + + bool retVal = (index->loadContext->status == INDEX_FREEING); + unlockMutex(&index->loadContext->mutex); + return retVal; +} + +/**********************************************************************/ +int replayVolume(Index *index, uint64_t fromVCN) +{ + int result; + uint64_t uptoVCN = index->newestVirtualChapter; + logInfo("Replaying volume from chapter %llu through chapter %" + PRIu64, + fromVCN, uptoVCN); + setMasterIndexOpenChapter(index->masterIndex, uptoVCN); + setMasterIndexOpenChapter(index->masterIndex, fromVCN); + + /* + * At least two cases to deal with here! 
+ * - index loaded but replaying from lastCheckpoint; maybe full, maybe not + * - index failed to load, full rebuild + * Starts empty, then dense-only, then dense-plus-sparse. + * Need to sparsify while processing individual chapters. + */ + IndexLookupMode oldLookupMode = index->volume->lookupMode; + index->volume->lookupMode = LOOKUP_FOR_REBUILD; + /* + * Go through each record page of each chapter and add the records back to + * the master index. This should not cause anything to be written to either + * the open chapter or on disk volume. Also skip the on disk chapter + * corresponding to upto, as this would have already been + * purged from the master index when the chapter was opened. + * + * Also, go through each index page for each chapter and rebuild the + * index page map. + */ + const Geometry *geometry = index->volume->geometry; + uint64_t oldIPMupdate = getLastUpdate(index->volume->indexPageMap); + uint64_t vcn; + for (vcn = fromVCN; vcn < uptoVCN; ++vcn) { + if (checkForSuspend(index)) { + logInfo("Replay interrupted by index shutdown at chapter %llu", vcn); + return UDS_SHUTTINGDOWN; + } + + bool willBeSparseChapter = isChapterSparse(geometry, fromVCN, uptoVCN, + vcn); + unsigned int chapter = mapToPhysicalChapter(geometry, vcn); + prefetchVolumePages(&index->volume->volumeStore, + mapToPhysicalPage(geometry, chapter, 0), + geometry->pagesPerChapter); + setMasterIndexOpenChapter(index->masterIndex, vcn); + result = rebuildIndexPageMap(index, vcn); + if (result != UDS_SUCCESS) { + index->volume->lookupMode = oldLookupMode; + return logErrorWithStringError(result, + "could not rebuild index page map for" + " chapter %u", + chapter); + } + + unsigned int j; + for (j = 0; j < geometry->recordPagesPerChapter; j++) { + unsigned int recordPageNumber = geometry->indexPagesPerChapter + j; + byte *recordPage; + result = getPage(index->volume, chapter, recordPageNumber, + CACHE_PROBE_RECORD_FIRST, &recordPage, NULL); + if (result != UDS_SUCCESS) { + index->volume->lookupMode = oldLookupMode; + return logUnrecoverable(result, "could not get page %d", + recordPageNumber); + } + unsigned int k; + for (k = 0; k < geometry->recordsPerPage; k++) { + const byte *nameBytes = recordPage + (k * BYTES_PER_RECORD); + + UdsChunkName name; + memcpy(&name.name, nameBytes, UDS_CHUNK_NAME_SIZE); + + result = replayRecord(index, &name, vcn, willBeSparseChapter); + if (result != UDS_SUCCESS) { + char hexName[(2 * UDS_CHUNK_NAME_SIZE) + 1]; + if (chunkNameToHex(&name, hexName, sizeof(hexName)) != UDS_SUCCESS) { + strncpy(hexName, "", sizeof(hexName)); + } + index->volume->lookupMode = oldLookupMode; + return logUnrecoverable(result, + "could not find block %s during rebuild", + hexName); + } + } + } + } + index->volume->lookupMode = oldLookupMode; + + // We also need to reap the chapter being replaced by the open chapter + setMasterIndexOpenChapter(index->masterIndex, uptoVCN); + + uint64_t newIPMupdate = getLastUpdate(index->volume->indexPageMap); + + if (newIPMupdate != oldIPMupdate) { + logInfo("replay changed index page map update from %llu to %llu", + oldIPMupdate, newIPMupdate); + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +void getIndexStats(Index *index, UdsIndexStats *counters) +{ + uint64_t cwAllocated = getChapterWriterMemoryAllocated(index->chapterWriter); + // We're accessing the master index while not on a zone thread, but that's + // safe to do when acquiring statistics. 
+ MasterIndexStats denseStats, sparseStats; + getMasterIndexStats(index->masterIndex, &denseStats, &sparseStats); + + counters->entriesIndexed = (denseStats.recordCount + + sparseStats.recordCount); + counters->memoryUsed = ((uint64_t) denseStats.memoryAllocated + + (uint64_t) sparseStats.memoryAllocated + + (uint64_t) getCacheSize(index->volume) + + cwAllocated); + counters->collisions = (denseStats.collisionCount + + sparseStats.collisionCount); + counters->entriesDiscarded = (denseStats.discardCount + + sparseStats.discardCount); + counters->checkpoints = getCheckpointCount(index->checkpoint); +} + +/**********************************************************************/ +void advanceActiveChapters(Index *index) +{ + index->newestVirtualChapter++; + if (areSamePhysicalChapter(index->volume->geometry, + index->newestVirtualChapter, + index->oldestVirtualChapter)) { + index->oldestVirtualChapter++; + } +} + +/**********************************************************************/ +uint64_t triageIndexRequest(Index *index, Request *request) +{ + MasterIndexTriage triage; + lookupMasterIndexName(index->masterIndex, &request->chunkName, &triage); + if (!triage.inSampledChapter) { + // Not indexed or not a hook. + return UINT64_MAX; + } + + IndexZone *zone = getRequestZone(index, request); + if (!isZoneChapterSparse(zone, triage.virtualChapter)) { + return UINT64_MAX; + } + + // XXX Optimize for a common case by remembering the chapter from the most + // recent barrier message and skipping this chapter if is it the same. + + // Return the sparse chapter number to trigger the barrier messages. + return triage.virtualChapter; +} diff --git a/uds/index.h b/uds/index.h new file mode 100644 index 0000000..d2bc805 --- /dev/null +++ b/uds/index.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/index.h#3 $ + */ + +#ifndef INDEX_H +#define INDEX_H + +#include "chapterWriter.h" +#include "indexLayout.h" +#include "indexSession.h" +#include "indexZone.h" +#include "loadType.h" +#include "masterIndexOps.h" +#include "volume.h" + + +/** + * Index checkpoint state private to indexCheckpoint.c. + **/ +typedef struct indexCheckpoint IndexCheckpoint; + +typedef struct index { + bool existed; + bool hasSavedOpenChapter; + LoadType loadedType; + IndexLoadContext *loadContext; + IndexLayout *layout; + IndexState *state; + MasterIndex *masterIndex; + Volume *volume; + unsigned int zoneCount; + IndexZone **zones; + + /* + * ATTENTION!!! + * The meaning of the next two fields has changed. + * + * They now represent the oldest and newest chapters only at load time, + * and when the index is quiescent. At other times, they may lag individual + * zones' views of the index depending upon the progress made by the chapter + * writer. 
+ */ + uint64_t oldestVirtualChapter; + uint64_t newestVirtualChapter; + + uint64_t lastCheckpoint; + uint64_t prevCheckpoint; + ChapterWriter *chapterWriter; + + // checkpoint state used by indexCheckpoint.c + IndexCheckpoint *checkpoint; +} Index; + +/** + * Construct a new index from the given configuration. + * + * @param layout The index layout + * @param config The configuration to use + * @param userParams The index session parameters. If NULL, the default + * session parameters will be used. + * @param zoneCount The number of zones for this index to use + * @param loadType How to create the index: it can be create only, allow + * loading from files, and allow rebuilding from the volume + * @param loadContext The load context to use + * @param newIndex A pointer to hold a pointer to the new index + * + * @return UDS_SUCCESS or an error code + **/ +int makeIndex(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + unsigned int zoneCount, + LoadType loadType, + IndexLoadContext *loadContext, + Index **newIndex) + __attribute__((warn_unused_result)); + +/** + * Save an index. + * + * Before saving an index and while saving an index, the caller must ensure + * that there are no index requests in progress. + * + * Some users follow saveIndex immediately with a freeIndex. But some tests + * use the IndexLayout to modify the saved index. The Index will then have + * some cached information that does not reflect these updates. + * + * @param index The index to save + * + * @return UDS_SUCCESS if successful + **/ +int saveIndex(Index *index) __attribute__((warn_unused_result)); + +/** + * Clean up the index and its memory. + * + * @param index The index to destroy. + **/ +void freeIndex(Index *index); + +/** + * Perform the index operation specified by the action field of a UDS request. + * + * For UDS API requests, this searches the index for the chunk name in the + * request. If the chunk name is already present in the index, the location + * field of the request will be set to the IndexRegion where it was found. If + * the action is not DELETE, the oldMetadata field of the request will also be + * filled in with the prior metadata for the name. + * + * If the API request action is: + * + * REQUEST_INDEX, a record will be added to the open chapter with the + * metadata in the request for new records, and the existing metadata for + * existing records + * + * REQUEST_UPDATE, a record will be added to the open chapter with the + * metadata in the request + * + * REQUEST_QUERY, if the update flag is set in the request, any record + * found will be moved to the open chapter. In all other cases the contents + * of the index will remain unchanged. + * + * REQUEST_REMOVE, the any entry with the name will removed from the index + * + * For non-API requests, no chunk name search is involved. + * + * @param index The index + * @param request The originating request + * + * @return UDS_SUCCESS, UDS_QUEUED, or an error code + **/ +int dispatchIndexRequest(Index *index, Request *request) + __attribute__((warn_unused_result)); + +/** + * Internal helper to prepare the index for saving. + * + * @param index the index + * @param checkpoint whether the save is a checkpoint + * @param openChapterNumber the virtual chapter number of the open chapter + **/ +void beginSave(Index *index, bool checkpoint, uint64_t openChapterNumber); + +/** + * Replay the volume file to repopulate the master index. 
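Putting this header's API together, a typical caller builds the index from an existing layout and configuration, services requests through dispatchIndexRequest(), and then saves and frees it. A sketch under simplifying assumptions (single zone, no load context, default session parameters); request handling is left to the surrounding session code:

#include "index.h"

static int exerciseIndex(IndexLayout *layout, const Configuration *config)
{
  Index *index = NULL;
  int result = makeIndex(layout, config,
                         NULL,         /* default session parameters */
                         1,            /* zoneCount */
                         LOAD_REBUILD, /* load, falling back to a rebuild */
                         NULL,         /* no load context */
                         &index);
  if (result != UDS_SUCCESS) {
    return result;
  }

  /* ... call dispatchIndexRequest(index, request) as requests arrive ... */

  result = saveIndex(index);
  freeIndex(index);
  return result;
}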
+ * + * @param index The index + * @param fromVCN The virtual chapter to start replaying + * + * @return UDS_SUCCESS if successful + **/ +int replayVolume(Index *index, uint64_t fromVCN) + __attribute__((warn_unused_result)); + +/** + * Gather statistics from the master index, volume, and cache. + * + * @param index The index + * @param counters the statistic counters for the index + **/ +void getIndexStats(Index *index, UdsIndexStats *counters); + +/** + * Set lookup state for this index. Disabling lookups means assume + * all records queried are new (intended for debugging uses, e.g., + * albfill). + * + * @param index The index + * @param enabled The new lookup state + **/ +void setIndexLookupState(Index *index, bool enabled); + +/** + * Advance the newest virtual chapter. If this will overwrite the oldest + * virtual chapter, advance that also. + * + * @param index The index to advance + **/ +void advanceActiveChapters(Index *index); + +/** + * Triage an index request, deciding whether it requires that a sparse cache + * barrier message precede it. + * + * This resolves the chunk name in the request in the master index, + * determining if it is a hook or not, and if a hook, what virtual chapter (if + * any) it might be found in. If a virtual chapter is found, it checks whether + * that chapter appears in the sparse region of the index. If all these + * conditions are met, the (sparse) virtual chapter number is returned. In all + * other cases it returns UINT64_MAX. + * + * @param index the index that will process the request + * @param request the index request containing the chunk name to triage + * + * @return the sparse chapter number for the sparse cache barrier message, or + * UINT64_MAX if the request does not require a barrier + **/ +uint64_t triageIndexRequest(Index *index, Request *request) + __attribute__((warn_unused_result)); + +#endif /* INDEX_H */ diff --git a/uds/indexCheckpoint.c b/uds/indexCheckpoint.c new file mode 100644 index 0000000..9c803b6 --- /dev/null +++ b/uds/indexCheckpoint.c @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexCheckpoint.c#2 $ + */ + +#include "indexCheckpoint.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "threads.h" +#include "typeDefs.h" + +/** + * index checkpointState values + * + * @note The order of these values is significant, + * see indexState.c doIndexStateCheckpointInZone(). + **/ +typedef enum checkpointState { + NOT_CHECKPOINTING, + CHECKPOINT_IN_PROGRESS, + CHECKPOINT_ABORTING +} CheckpointState; + +/** + * Private structure which tracks checkpointing. 
+ **/ +struct indexCheckpoint { + Mutex mutex; // covers this group of fields + uint64_t chapter; // vcn of the starting chapter + CheckpointState state; // is checkpoint in progress or aborting + unsigned int zonesBusy; // count of zones not yet done + unsigned int frequency; // number of chapters between checkpoints + uint64_t checkpoints; // number of checkpoints this session +}; + +/** + * Enum return value of indexCheckpointTrigger function. + **/ +typedef enum indexCheckpointTriggerValue { + ICTV_IDLE, //< no checkpointing right now + ICTV_START, //< start a new checkpoint now + ICTV_CONTINUE, //< continue checkpointing if needed + ICTV_FINISH, //< finish checkpointing, next time will start new cycle + ICTV_ABORT //< immediately abort checkpointing +} IndexCheckpointTriggerValue; + +typedef int CheckpointFunction(Index *index, unsigned int zone); + +// These functions are called while holding the checkpoint->mutex but are +// expected to release it. +// +static CheckpointFunction doCheckpointStart; +static CheckpointFunction doCheckpointProcess; +static CheckpointFunction doCheckpointFinish; +static CheckpointFunction doCheckpointAbort; + +CheckpointFunction *const checkpointFuncs[] = { + NULL, + doCheckpointStart, + doCheckpointProcess, + doCheckpointFinish, + doCheckpointAbort +}; + +/**********************************************************************/ +int makeIndexCheckpoint(Index *index) +{ + IndexCheckpoint *checkpoint; + int result + = ALLOCATE(1, IndexCheckpoint, "IndexCheckpoint", &checkpoint); + if (result != UDS_SUCCESS) { + return result; + } + + result = initMutex(&checkpoint->mutex); + if (result != UDS_SUCCESS) { + FREE(checkpoint); + return result; + } + + checkpoint->checkpoints = 0; + + index->checkpoint = checkpoint; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeIndexCheckpoint(IndexCheckpoint *checkpoint) +{ + if (checkpoint != NULL) { + destroyMutex(&checkpoint->mutex); + FREE(checkpoint); + } +} + +/**********************************************************************/ +unsigned int getIndexCheckpointFrequency(IndexCheckpoint *checkpoint) +{ + lockMutex(&checkpoint->mutex); + unsigned int frequency = checkpoint->frequency; + unlockMutex(&checkpoint->mutex); + return frequency; +} + +/**********************************************************************/ +unsigned int setIndexCheckpointFrequency(IndexCheckpoint *checkpoint, + unsigned int frequency) +{ + lockMutex(&checkpoint->mutex); + unsigned int oldFrequency = checkpoint->frequency; + checkpoint->frequency = frequency; + unlockMutex(&checkpoint->mutex); + return oldFrequency; +} + +/**********************************************************************/ +uint64_t getCheckpointCount(IndexCheckpoint *checkpoint) +{ + return checkpoint->checkpoints; +} + +/**********************************************************************/ +static IndexCheckpointTriggerValue +getCheckpointAction(IndexCheckpoint *checkpoint, + uint64_t virtualChapter) +{ + if (checkpoint->frequency == 0) { + return ICTV_IDLE; + } + unsigned int value = virtualChapter % checkpoint->frequency; + if (checkpoint->state == CHECKPOINT_ABORTING) { + return ICTV_ABORT; + } else if (checkpoint->state == CHECKPOINT_IN_PROGRESS) { + if (value == checkpoint->frequency - 1) { + return ICTV_FINISH; + } else { + return ICTV_CONTINUE; + } + } else { + if (value == 0) { + return ICTV_START; + } else { + return ICTV_IDLE; + } + } +} + 
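+/*
+ * Illustrative sketch only: how the trigger values above translate into a
+ * checkpoint schedule. Assume a hypothetical frequency of 4 (as set by
+ * setIndexCheckpointFrequency()) and let value = virtualChapter % 4. Then,
+ * barring an abort:
+ *
+ *   value == 0, not checkpointing  -> ICTV_START     begin a new checkpoint
+ *   value == 1 or 2, in progress   -> ICTV_CONTINUE  make incremental progress
+ *   value == 3, in progress        -> ICTV_FINISH    complete the checkpoint
+ *   value != 0, not checkpointing  -> ICTV_IDLE      nothing to do
+ *
+ * A frequency of 0 disables checkpointing entirely (always ICTV_IDLE). Each
+ * zone drives this schedule by calling processCheckpointing() as it opens a
+ * new chapter, roughly:
+ *
+ *   int result = processCheckpointing(index, zone, newVirtualChapter);
+ *   if (result != UDS_SUCCESS) {
+ *     return result;
+ *   }
+ */
+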
+/**********************************************************************/ +int processCheckpointing(Index *index, + unsigned int zone, + uint64_t newVirtualChapter) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + lockMutex(&checkpoint->mutex); + + IndexCheckpointTriggerValue ictv + = getCheckpointAction(checkpoint, newVirtualChapter); + + if (ictv == ICTV_START) { + checkpoint->chapter = newVirtualChapter; + } + + CheckpointFunction *func = checkpointFuncs[ictv]; + if (func == NULL) { + // nothing to do in idle state + unlockMutex(&checkpoint->mutex); + return UDS_SUCCESS; + } + + return (*func)(index, zone); +} + +/**********************************************************************/ +int processChapterWriterCheckpointSaves(Index *index) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + + int result = UDS_SUCCESS; + + lockMutex(&checkpoint->mutex); + if (checkpoint->state == CHECKPOINT_IN_PROGRESS) { + result = + performIndexStateCheckpointChapterSynchronizedSaves(index->state); + + if (result != UDS_SUCCESS) { + checkpoint->state = CHECKPOINT_ABORTING; + logInfo("checkpoint failed"); + index->lastCheckpoint = index->prevCheckpoint; + } + } + + unlockMutex(&checkpoint->mutex); + return result; +} + +/** + * Helper function used to abort checkpoint if an error has occurred. + * + * @param index the index + * @param result the error result + * + * @return result + **/ +static int abortCheckpointing(Index *index, int result) +{ + if (index->checkpoint->state != NOT_CHECKPOINTING) { + index->checkpoint->state = CHECKPOINT_ABORTING; + logInfo("checkpoint failed"); + index->lastCheckpoint = index->prevCheckpoint; + } + return result; +} + +/**********************************************************************/ +int finishCheckpointing(Index *index) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + + int result = processChapterWriterCheckpointSaves(index); + if (result != UDS_SUCCESS) { + return result; + } + + lockMutex(&checkpoint->mutex); + + unsigned int z; + for (z = 0; z < index->zoneCount; ++z) { + if (checkpoint->state != CHECKPOINT_IN_PROGRESS) { + break; + } + result = doCheckpointFinish(index, z); + // reacquire mutex released by doCheckpointFinish + lockMutex(&checkpoint->mutex); + if (result != UDS_SUCCESS) { + break; + } + } + + if ((result == UDS_SUCCESS) && + (checkpoint->state == CHECKPOINT_IN_PROGRESS)) { + result = finishIndexStateCheckpoint(index->state); + if (result == UDS_SUCCESS) { + checkpoint->state = NOT_CHECKPOINTING; + } + } + + unlockMutex(&checkpoint->mutex); + return result; +} + +/** + * Starts an incremental checkpoint. + * + * Called by the first zone to finish a chapter which starts a checkpoint. 
+ * + * @param index the index + * @param zone the zone number + * + * @return UDS_SUCCESS or an error code + **/ +static int doCheckpointStart(Index *index, unsigned int zone) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + beginSave(index, true, checkpoint->chapter); + int result = startIndexStateCheckpoint(index->state); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "cannot start index checkpoint"); + index->lastCheckpoint = index->prevCheckpoint; + unlockMutex(&checkpoint->mutex); + return result; + } + + checkpoint->state = CHECKPOINT_IN_PROGRESS; + checkpoint->zonesBusy = index->zoneCount; + + return doCheckpointProcess(index, zone); +} + +/**********************************************************************/ +static int doCheckpointProcess(Index *index, unsigned int zone) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + unlockMutex(&checkpoint->mutex); + CompletionStatus status = CS_NOT_COMPLETED; + int result = performIndexStateCheckpointInZone(index->state, zone, &status); + if (result != UDS_SUCCESS) { + lockMutex(&checkpoint->mutex); + logErrorWithStringError(result, "cannot continue index checkpoint"); + result = abortCheckpointing(index, result); + unlockMutex(&checkpoint->mutex); + } else if (status == CS_JUST_COMPLETED) { + lockMutex(&checkpoint->mutex); + if (--checkpoint->zonesBusy == 0) { + checkpoint->checkpoints += 1; + logInfo("finished checkpoint"); + result = finishIndexStateCheckpoint(index->state); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "%s checkpoint finish failed", + __func__); + } + checkpoint->state = NOT_CHECKPOINTING; + } + unlockMutex(&checkpoint->mutex); + } + return result; +} + +/**********************************************************************/ +static int doCheckpointAbort(Index *index, unsigned int zone) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + CompletionStatus status = CS_NOT_COMPLETED; + int result = abortIndexStateCheckpointInZone(index->state, zone, &status); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "cannot abort index checkpoint"); + } else if (status == CS_JUST_COMPLETED) { + if (--checkpoint->zonesBusy == 0) { + logInfo("aborted checkpoint"); + result = abortIndexStateCheckpoint(index->state); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "checkpoint abort failed"); + } + checkpoint->state = NOT_CHECKPOINTING; + } + } + unlockMutex(&checkpoint->mutex); + + return result; +} + +/**********************************************************************/ +static int doCheckpointFinish(Index *index, unsigned int zone) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + CompletionStatus status = CS_NOT_COMPLETED; + unlockMutex(&checkpoint->mutex); + int result = finishIndexStateCheckpointInZone(index->state, zone, &status); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "cannot finish index checkpoint"); + lockMutex(&checkpoint->mutex); + result = abortCheckpointing(index, result); + unlockMutex(&checkpoint->mutex); + } else if (status == CS_JUST_COMPLETED) { + lockMutex(&checkpoint->mutex); + if (--checkpoint->zonesBusy == 0) { + checkpoint->checkpoints += 1; + logInfo("finished checkpoint"); + result = finishIndexStateCheckpoint(index->state); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "%s checkpoint finish failed", + __func__); + } + checkpoint->state = NOT_CHECKPOINTING; + } + unlockMutex(&checkpoint->mutex); + } + return result; +} diff --git a/uds/indexCheckpoint.h 
b/uds/indexCheckpoint.h
new file mode 100644
index 0000000..02d2936
--- /dev/null
+++ b/uds/indexCheckpoint.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/indexCheckpoint.h#1 $
+ */
+
+#ifndef INDEX_CHECKPOINT_H
+#define INDEX_CHECKPOINT_H
+
+#include "index.h"
+
+/**
+ * Construct and initialize the checkpoint sub-structure of an index.
+ *
+ * @param index the index to receive the new checkpoint structure.
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int makeIndexCheckpoint(Index *index) __attribute__((warn_unused_result));
+
+/**
+ * Free the checkpoint sub-structure of an index.
+ *
+ * @param checkpoint the structure to free
+ **/
+void freeIndexCheckpoint(IndexCheckpoint *checkpoint);
+
+/**
+ * Get the current checkpointing frequency of an index.
+ *
+ * @param checkpoint the checkpoint state of the index
+ *
+ * @return the number of chapters between checkpoints
+ **/
+unsigned int getIndexCheckpointFrequency(IndexCheckpoint *checkpoint)
+  __attribute__((warn_unused_result));
+
+/**
+ * Set checkpointing frequency for the index.
+ *
+ * @param checkpoint the checkpoint state of the index
+ * @param frequency  The new checkpointing frequency
+ *
+ * @return the old checkpointing frequency
+ **/
+unsigned int setIndexCheckpointFrequency(IndexCheckpoint *checkpoint,
+                                         unsigned int frequency);
+
+/**
+ * Get the number of checkpoints completed during the lifetime of this index.
+ *
+ * @param checkpoint the checkpoint state of the index
+ *
+ * @return the number of checkpoints completed
+ **/
+uint64_t getCheckpointCount(IndexCheckpoint *checkpoint)
+  __attribute__((warn_unused_result));
+
+/**
+ * If incremental checkpointing is in progress, finish it.
+ *
+ * @param index The index
+ *
+ * @return UDS_SUCCESS or an error code
+ *
+ * @note This function is called automatically during normal operation;
+ *       its presence here is for tests that expect checkpointing to
+ *       have completed at some point in their logic. It is not an
+ *       error to call this function if checkpointing is not in
+ *       progress; it silently returns success.
+ **/
+int finishCheckpointing(Index *index) __attribute__((warn_unused_result));
+
+/**
+ * Process one zone's incremental checkpoint operation. Automatically
+ * starts, processes, and finishes a checkpoint over multiple invocations
+ * as successive chapters are closed and written.
+ *
+ * Uses its own mutex to serialize the starting and finishing or aborting,
+ * but allows parallel execution of the incremental progress.
+ *
+ * @param index             The index to checkpoint
+ * @param zone              The current zone number
+ * @param newVirtualChapter The number of the chapter which the calling
+ *                          zone has just opened
+ *
+ * @return UDS_SUCCESS or an error code.
+ **/ +int processCheckpointing(Index *index, + unsigned int zone, + uint64_t newVirtualChapter) + __attribute__((warn_unused_result)); + +/** + * Process saves done outside any zone by the chapter writer. + * + * Grabs the mutex associated with processCheckpointing(). + * + * @param index The index to process. + * + * @return UDS_SUCCESS or an error code. + **/ +int processChapterWriterCheckpointSaves(Index *index) + __attribute__((warn_unused_result)); + +#endif // INDEX_CHECKPOINT_H diff --git a/uds/indexComponent.c b/uds/indexComponent.c new file mode 100644 index 0000000..c932b8d --- /dev/null +++ b/uds/indexComponent.c @@ -0,0 +1,745 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexComponent.c#8 $ + */ + +#include "indexComponent.h" + +#include "compiler.h" +#include "errors.h" +#include "indexLayout.h" +#include "indexState.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "typeDefs.h" + +/*****************************************************************************/ +int makeIndexComponent(IndexState *state, + const IndexComponentInfo *info, + unsigned int zoneCount, + void *data, + void *context, + IndexComponent **componentPtr) +{ + if ((info == NULL) || (info->name == NULL)) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "invalid component or directory specified"); + } + if (info->loader == NULL) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "no .loader function specified " + "for component %s", + info->name); + } + if ((info->saver == NULL) && (info->incremental == NULL)) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "neither .saver function nor .incremental " + "function specified for component %s", + info->name); + } + + IndexComponent *component = NULL; + int result = ALLOCATE(1, IndexComponent, "index component", &component); + if (result != UDS_SUCCESS) { + return result; + } + + component->componentData = data; + component->context = context; + component->info = info; + component->numZones = info->multiZone ? 
zoneCount : 1; + component->state = state; + component->writeZones = NULL; + *componentPtr = component; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static void freeWriteZones(IndexComponent *component) +{ + if (component->writeZones != NULL) { + unsigned int z; + for (z = 0; z < component->numZones; ++z) { + WriteZone *wz = component->writeZones[z]; + if (wz == NULL) { + continue; + } + freeBufferedWriter(wz->writer); + FREE(wz); + } + FREE(component->writeZones); + component->writeZones = NULL; + } +} + +/*****************************************************************************/ +void freeIndexComponent(IndexComponent **componentPtr) +{ + if (componentPtr == NULL) { + return; + } + IndexComponent *component = *componentPtr; + if (component == NULL) { + return; + } + *componentPtr = NULL; + + freeWriteZones(component); + FREE(component); +} + +/** + * Destroy, deallocate, and expunge a read portal. + * + * @param readPortal the readzone array + **/ +static void freeReadPortal(ReadPortal *readPortal) +{ + if (readPortal == NULL) { + return; + } + unsigned int z; + for (z = 0; z < readPortal->zones; ++z) { + if (readPortal->readers[z] != NULL) { + freeBufferedReader(readPortal->readers[z]); + } + } + FREE(readPortal->readers); + FREE(readPortal); +} + +/*****************************************************************************/ +int getBufferedReaderForPortal(ReadPortal *portal, + unsigned int part, + BufferedReader **readerPtr) +{ + if (part >= portal->zones) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "%s: cannot access zone %u of %u", + __func__, part, portal->zones); + } + IndexComponent *component = portal->component; + if (component->info->ioStorage && (portal->readers[part] == NULL)) { + int result = openStateBufferedReader(component->state, + component->info->kind, part, + &portal->readers[part]); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "%s: cannot make buffered reader " + "for zone %u", __func__, part); + } + } + *readerPtr = portal->readers[part]; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int readIndexComponent(IndexComponent *component) +{ + ReadPortal *portal; + int result = ALLOCATE(1, ReadPortal, "index component read portal", &portal); + if (result != UDS_SUCCESS) { + return result; + } + int readZones = component->state->loadZones; + result = ALLOCATE(readZones, BufferedReader *, "read zone buffered readers", + &portal->readers); + if (result != UDS_SUCCESS) { + FREE(portal); + return result; + } + + portal->component = component; + portal->zones = readZones; + result = (*component->info->loader)(portal); + freeReadPortal(portal); + return result; +} + +/** + * Determine the writeZone structure for the specified component and zone. 
+ * + * @param [in] component the index component + * @param [in] zone the zone number + * @param [out] writeZonePtr the resulting write zone instance + * + * @return UDS_SUCCESS or an error code + **/ +static int resolveWriteZone(const IndexComponent *component, + unsigned int zone, + WriteZone **writeZonePtr) +{ + int result = ASSERT(writeZonePtr != NULL, + "output parameter is null"); + if (result != UDS_SUCCESS) { + return result; + } + + if (component->writeZones == NULL) { + return logErrorWithStringError(UDS_BAD_STATE, + "cannot resolve index component write zone:" + " not allocated"); + } + + if (zone >= component->numZones) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "cannot resolve index component write zone:" + " zone out of range"); + } + *writeZonePtr = component->writeZones[zone]; + return UDS_SUCCESS; +} + +/** + * Non-incremental save function used to emulate a regular save + * using an incremental save function as a basis. + * + * @param component the index component + * @param writer the buffered writer + * @param zone the zone number + * + * @return UDS_SUCCESS or an error code + **/ +static int indexComponentSaverIncrementalWrapper(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone) +{ + IncrementalWriter incrFunc = component->info->incremental; + bool completed = false; + + int result = (*incrFunc)(component, writer, zone, IWC_START, &completed); + if (result != UDS_SUCCESS) { + return result; + } + + if (!completed) { + result = (*incrFunc)(component, writer, zone, IWC_FINISH, &completed); + if (result != UDS_SUCCESS) { + return result; + } + } + + result = flushBufferedWriter(writer); + if (result != UDS_SUCCESS) { + return result; + } + + return UDS_SUCCESS; +} + +/** + * Specify that writing to a specific zone file has finished. + * + * If a syncer has been registered with the index component, the file + * descriptor will be enqueued upon it for fsyncing and closing. + * If not, or if the enqueue fails, the file will be fsynced and closed + * immediately. + * + * @param writeZone the index component write zone + * + * @return UDS_SUCCESS or an error code + **/ +static int doneWithZone(WriteZone *writeZone) +{ + const IndexComponent *component = writeZone->component; + if (writeZone->writer != NULL) { + int result = flushBufferedWriter(writeZone->writer); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot flush buffered writer for " + "%s component (zone %u)", + component->info->name, writeZone->zone); + } + } + return UDS_SUCCESS; +} + +/** + * Construct the array of WriteZone instances for this component. + * + * @param component the index component + * + * @return UDS_SUCCESS or an error code + * + * If this is a multizone component, each zone will be fully defined, + * otherwise zone 0 stands in for the single state file. 
+ **/ +static int makeWriteZones(IndexComponent *component) +{ + unsigned int z; + if (component->writeZones != NULL) { + // just reinitialize states + for (z = 0; z < component->numZones; ++z) { + WriteZone *wz = component->writeZones[z]; + wz->phase = IWC_IDLE; + } + return UDS_SUCCESS; + } + + int result = ALLOCATE(component->numZones, WriteZone *, + "index component write zones", &component->writeZones); + if (result != UDS_SUCCESS) { + return result; + } + + for (z = 0; z < component->numZones; ++z) { + result = ALLOCATE(1, WriteZone, "plain write zone", + &component->writeZones[z]); + if (result != UDS_SUCCESS) { + freeWriteZones(component); + return result; + } + *component->writeZones[z] = (WriteZone) { + .component = component, + .phase = IWC_IDLE, + .zone = z, + }; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static int openBufferedWriters(IndexComponent *component) +{ + int result = UDS_SUCCESS; + WriteZone **wzp; + for (wzp = component->writeZones; + wzp < component->writeZones + component->numZones; + ++wzp) { + WriteZone *wz = *wzp; + wz->phase = IWC_START; + + result = ASSERT(wz->writer == NULL, "write zone writer already exists"); + if (result != UDS_SUCCESS) { + return result; + } + + if (component->info->ioStorage) { + int result = openStateBufferedWriter(component->state, + component->info->kind, wz->zone, + &wz->writer); + if (result != UDS_SUCCESS) { + return result; + } + } + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static int startIndexComponentSave(IndexComponent *component) +{ + int result = makeWriteZones(component); + if (result != UDS_SUCCESS) { + return result; + } + + result = openBufferedWriters(component); + if (result != UDS_SUCCESS) { + return result; + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int startIndexComponentIncrementalSave(IndexComponent *component) +{ + return startIndexComponentSave(component); +} + +/*****************************************************************************/ +int writeIndexComponent(IndexComponent *component) +{ + Saver saver = component->info->saver; + if ((saver == NULL) && (component->info->incremental != NULL)) { + saver = indexComponentSaverIncrementalWrapper; + } + + int result = startIndexComponentSave(component); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int z; + for (z = 0; z < component->numZones; ++z) { + WriteZone *writeZone = component->writeZones[z]; + + result = (*saver)(component, writeZone->writer, z); + if (result != UDS_SUCCESS) { + break; + } + + result = doneWithZone(writeZone); + if (result != UDS_SUCCESS) { + break; + } + + freeBufferedWriter(writeZone->writer); + writeZone->writer = NULL; + } + + if (result != UDS_SUCCESS) { + freeWriteZones(component); + return logErrorWithStringError(result, "index component write failed"); + } + + return UDS_SUCCESS; +} + +/** + * Close a specific buffered writer in a component write zone. 
+ * + * @param writeZone the write zone + * + * @return UDS_SUCCESS or an error code + * + * @note closing a buffered writer causes its file descriptor to be + * passed to doneWithZone + **/ +static int closeBufferedWriter(WriteZone *writeZone) +{ + if (writeZone->writer == NULL) { + return UDS_SUCCESS; + } + + int result = doneWithZone(writeZone); + freeBufferedWriter(writeZone->writer); + writeZone->writer = NULL; + + return result; +} + +/** + * Faux incremental saver function for index components which only define + * a simple saver. Conforms to IncrementalWriter signature. + * + * @param [in] component the index component + * @param [in] writer the buffered writer that does the output + * @param [in] zone the zone number + * @param [in] command the incremental writer command + * @param [out] completed if non-NULL, set to whether the save is complete + * + * @return UDS_SUCCESS or an error code + * + * @note This wrapper always calls the non-incremental saver when + * the IWC_START command is issued, and always reports that + * the save is complete unless the saver failed. + **/ +static int wrapSaverAsIncremental(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone, + IncrementalWriterCommand command, + bool *completed) +{ + int result = UDS_SUCCESS; + + if ((command >= IWC_START) && (command <= IWC_FINISH)) { + result = (*component->info->saver)(component, writer, zone); + if ((result == UDS_SUCCESS) && (writer != NULL)) { + noteBufferedWriterUsed(writer); + } + } + if ((result == UDS_SUCCESS) && (completed != NULL)) { + *completed = true; + } + return result; +} + +/** + * Return the appropriate incremental writer function depending on + * the component's type and whether this is the first zone. + * + * @param component the index component + * + * @return the correct IncrementalWriter function to use, or + * NULL signifying no progress can be made at this time. 
+ **/ +static IncrementalWriter getIncrementalWriter(IndexComponent *component) +{ + IncrementalWriter incrFunc = component->info->incremental; + + if (incrFunc == NULL) { + incrFunc = &wrapSaverAsIncremental; + } + + return incrFunc; +} + +/*****************************************************************************/ +int performIndexComponentZoneSave(IndexComponent *component, + unsigned int zone, + CompletionStatus *completed) +{ + CompletionStatus comp = CS_NOT_COMPLETED; + + WriteZone *wz = NULL; + int result = resolveWriteZone(component, zone, &wz); + if (result != UDS_SUCCESS) { + return result; + } + + if (wz->phase == IWC_IDLE) { + comp = CS_COMPLETED_PREVIOUSLY; + } else if (wz->phase == IWC_DONE) { + comp = CS_JUST_COMPLETED; + wz->phase = IWC_IDLE; + } else if (!component->info->chapterSync) { + bool done = false; + IncrementalWriter incrFunc = getIncrementalWriter(component); + int result = (*incrFunc)(component, wz->writer, zone, wz->phase, &done); + if (result != UDS_SUCCESS) { + if (wz->phase == IWC_ABORT) { + wz->phase = IWC_IDLE; + } else { + wz->phase = IWC_ABORT; + } + return result; + } + if (done) { + comp = CS_JUST_COMPLETED; + wz->phase = IWC_IDLE; + } else if (wz->phase == IWC_START) { + wz->phase = IWC_CONTINUE; + } + } + + if (completed != NULL) { + *completed = comp; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int performIndexComponentChapterWriterSave(IndexComponent *component) +{ + WriteZone *wz = NULL; + int result = resolveWriteZone(component, 0, &wz); + if (result != UDS_SUCCESS) { + return result; + } + + if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { + bool done = false; + IncrementalWriter incrFunc = getIncrementalWriter(component); + int result = ASSERT(incrFunc != NULL, "no writer function"); + if (result != UDS_SUCCESS) { + return result; + } + result = (*incrFunc)(component, wz->writer, 0, wz->phase, &done); + if (result != UDS_SUCCESS) { + if (wz->phase == IWC_ABORT) { + wz->phase = IWC_IDLE; + } else { + wz->phase = IWC_ABORT; + } + return result; + } + if (done) { + wz->phase = IWC_DONE; + } else if (wz->phase == IWC_START) { + wz->phase = IWC_CONTINUE; + } + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int finishIndexComponentZoneSave(IndexComponent *component, + unsigned int zone, + CompletionStatus *completed) +{ + WriteZone *wz = NULL; + int result = resolveWriteZone(component, zone, &wz); + if (result != UDS_SUCCESS) { + return result; + } + + CompletionStatus comp; + switch (wz->phase) { + case IWC_IDLE: + comp = CS_COMPLETED_PREVIOUSLY; + break; + + case IWC_DONE: + comp = CS_JUST_COMPLETED; + break; + + default: + comp = CS_NOT_COMPLETED; + } + + IncrementalWriter incrFunc = getIncrementalWriter(component); + if ((wz->phase >= IWC_START) && (wz->phase < IWC_ABORT)) { + bool done = false; + int result = (*incrFunc)(component, wz->writer, zone, IWC_FINISH, &done); + if (result != UDS_SUCCESS) { + wz->phase = IWC_ABORT; + return result; + } + if (!done) { + logWarning("finish incremental save did not complete for %s zone %u", + component->info->name, zone); + return UDS_CHECKPOINT_INCOMPLETE; + } + wz->phase = IWC_IDLE; + comp = CS_JUST_COMPLETED; + } + + if (completed != NULL) { + *completed = comp; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int finishIndexComponentIncrementalSave(IndexComponent *component) +{ + unsigned int 
zone; + for (zone = 0; zone < component->numZones; ++zone) { + WriteZone *wz = component->writeZones[zone]; + IncrementalWriter incrFunc = getIncrementalWriter(component); + if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { + // Note: this is only safe if no other threads are currently processing + // this particular index + bool done = false; + int result = (*incrFunc)(component, wz->writer, zone, IWC_FINISH, &done); + if (result != UDS_SUCCESS) { + return result; + } + if (!done) { + logWarning("finishing incremental save did not complete for %s zone %u", + component->info->name, zone); + return UDS_UNEXPECTED_RESULT; + } + wz->phase = IWC_IDLE; + } + + if ((wz->writer != NULL) && !wasBufferedWriterUsed(wz->writer)) { + return logErrorWithStringError(UDS_CHECKPOINT_INCOMPLETE, + "component %s zone %u did not get written", + component->info->name, zone); + } + + int result = closeBufferedWriter(wz); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int abortIndexComponentZoneSave(IndexComponent *component, + unsigned int zone, + CompletionStatus *status) +{ + WriteZone *wz = NULL; + int result = resolveWriteZone(component, zone, &wz); + if (result != UDS_SUCCESS) { + return result; + } + + CompletionStatus comp = CS_COMPLETED_PREVIOUSLY; + + IncrementalWriter incrFunc = getIncrementalWriter(component); + if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { + result = (*incrFunc)(component, wz->writer, zone, IWC_ABORT, NULL); + wz->phase = IWC_IDLE; + if (result != UDS_SUCCESS) { + return result; + } + comp = CS_JUST_COMPLETED; + } + + if (status != NULL) { + *status = comp; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int abortIndexComponentIncrementalSave(IndexComponent *component) +{ + int result = UDS_SUCCESS; + unsigned int zone; + for (zone = 0; zone < component->numZones; ++zone) { + WriteZone *wz = component->writeZones[zone]; + IncrementalWriter incrFunc = getIncrementalWriter(component); + if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { + // Note: this is only safe if no other threads are currently processing + // this particular index + result = (*incrFunc)(component, wz->writer, zone, IWC_ABORT, NULL); + wz->phase = IWC_IDLE; + if (result != UDS_SUCCESS) { + return result; + } + } + + int result = closeBufferedWriter(wz); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int discardIndexComponent(IndexComponent *component) +{ + if (!component->info->ioStorage) { + return UDS_INVALID_ARGUMENT; + } + + unsigned int numZones = 0; + unsigned int saveSlot = 0; + int result = findLatestIndexSaveSlot(component->state->layout, &numZones, + &saveSlot); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int oldSaveSlot = component->state->saveSlot; + component->state->saveSlot = saveSlot; + + unsigned int z; + for (z = 0; z < numZones; ++z) { + BufferedWriter *writer; + int result = openStateBufferedWriter(component->state, + component->info->kind, z, &writer); + if (result != UDS_SUCCESS) { + break; + } + result = writeZerosToBufferedWriter(writer, UDS_BLOCK_SIZE); + if (result != UDS_SUCCESS) { + break; + } + result = flushBufferedWriter(writer); + if (result != UDS_SUCCESS) { + break; + } + freeBufferedWriter(writer); + } + + component->state->saveSlot 
= oldSaveSlot; + return result; +} diff --git a/uds/indexComponent.h b/uds/indexComponent.h new file mode 100644 index 0000000..22066b1 --- /dev/null +++ b/uds/indexComponent.h @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexComponent.h#5 $ + */ + +#ifndef INDEX_COMPONENT_H +#define INDEX_COMPONENT_H 1 + +#include "common.h" + +#include "bufferedReader.h" +#include "bufferedWriter.h" +#include "compiler.h" +#include "regionIdentifiers.h" + +typedef enum completionStatus { + CS_NOT_COMPLETED, // operation has not completed + CS_JUST_COMPLETED, // operation just completed + CS_COMPLETED_PREVIOUSLY // operation completed previously +} CompletionStatus; + +typedef struct readPortal { + struct indexComponent *component; + BufferedReader **readers; + unsigned int zones; +} ReadPortal; + +/** + * Prototype for functions which can load an index component from its + * saved state. + * + * @param portal A component portal which can be used to load the + * specified component. + * @return UDS_SUCCESS or an error code + **/ +typedef int (*Loader)(ReadPortal *portal); + +/** + * Prototype for functions which can save an index component. + * + * @param component The index component. + * @param writer A buffered writer. + * @param zone The zone number. + * + * @return UDS_SUCCESS or an error code + **/ +typedef int (*Saver)(struct indexComponent *component, + BufferedWriter *writer, + unsigned int zone); + +/** + * Command code used by IncrementalWriter function protocol. + **/ +typedef enum incrementalWriterCommand { + IWC_START, //< start an incremental save + IWC_CONTINUE, //< continue an incremental save + IWC_FINISH, //< force finish of incremental save + IWC_ABORT, //< abort incremental save + IWC_IDLE = -1,//< not a command, used internally to signify not in progress + IWC_DONE = -2 //< not a command, used internally to signify async completion +} IncrementalWriterCommand; + +typedef struct writeZone { + struct indexComponent *component; + IncrementalWriterCommand phase; + BufferedWriter *writer; + unsigned int zone; +} WriteZone; + +/** + * @param [in] component The index component. + * @param [in] writer A buffered writer. + * @param [in] zone The zone number (0 for non-multi-zone). + * @param [in] command The incremental writer command. + * @param [out] completed If non-NULL, set to whether save is done. + * + * @return UDS_SUCCESS or an error code + **/ +typedef int (*IncrementalWriter)(struct indexComponent *component, + BufferedWriter *writer, + unsigned int zone, + IncrementalWriterCommand command, + bool *completed); + +/** + * The structure describing how to load or save an index component. + * At least one of saver or incremental must be specified. 
+ **/
+typedef struct indexComponentInfo {
+  RegionKind         kind;        // Region kind
+  const char        *name;        // The name of the component (for logging)
+  bool               saveOnly;    // Used for saves but not checkpoints
+  bool               chapterSync; // Saved by the chapter writer
+  bool               multiZone;   // Does this component have multiple zones?
+  bool               ioStorage;   // Do we do I/O directly to storage?
+  Loader             loader;      // The function to load this component
+  Saver              saver;       // The function to store this component
+  IncrementalWriter  incremental; // The function for incremental writing
+} IndexComponentInfo;
+
+/**
+ * The structure representing a savable (and loadable) part of an index.
+ **/
+typedef struct indexComponent {
+  const IndexComponentInfo *info;          // IndexComponentInfo specification
+  void                     *componentData; // The object to load or save
+  void                     *context;       // The context used to load or save
+  struct indexState        *state;         // The index state
+  unsigned int              numZones;      // Number of zones in write portal
+  WriteZone               **writeZones;    // State for writing component
+} IndexComponent;
+
+/**
+ * Make an index component.
+ *
+ * @param state         The index state in which this component instance
+ *                      shall reside.
+ * @param info          The component info specification for this component.
+ * @param zoneCount     How many active zones are in use.
+ * @param data          Component-specific data.
+ * @param context       Component-specific context.
+ * @param componentPtr  Where to store the resulting component.
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int makeIndexComponent(struct indexState *state,
+                       const IndexComponentInfo *info,
+                       unsigned int zoneCount,
+                       void *data,
+                       void *context,
+                       IndexComponent **componentPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Destroy an index component.
+ *
+ * @param componentPtr A pointer to the component to be freed.
+ **/
+void freeIndexComponent(IndexComponent **componentPtr);
+
+/**
+ * Return the index component name for this component.
+ **/
+static INLINE const char *indexComponentName(IndexComponent *component)
+{
+  return component->info->name;
+}
+
+/**
+ * Return the index component data for this component.
+ **/
+static INLINE void *indexComponentData(IndexComponent *component)
+{
+  return component->componentData;
+}
+
+/**
+ * Return the index component context for this component.
+ **/
+static INLINE void *indexComponentContext(IndexComponent *component)
+{
+  return component->context;
+}
+
+/**
+ * Determine whether this component may be skipped for a checkpoint.
+ *
+ * @param component the component
+ *
+ * @return whether the component may be skipped
+ **/
+static INLINE bool skipIndexComponentOnCheckpoint(IndexComponent *component)
+{
+  return component->info->saveOnly;
+}
+
+/**
+ * Determine whether actual saving during a checkpoint should be
+ * invoked by the chapter writer thread.
+ **/
+static INLINE bool
+deferIndexComponentCheckpointToChapterWriter(IndexComponent *component)
+{
+  return component->info->chapterSync;
+}
+
+/**
+ * Determine whether a replay is required if the component is missing.
+ *
+ * @param component the component
+ *
+ * @return whether the component is final (that is, contains shutdown state)
+ **/
+static INLINE bool
+missingIndexComponentRequiresReplay(IndexComponent *component)
+{
+  return component->info->saveOnly;
+}
+
+/**
+ * Read a component's state.
+ *
+ * @param component The component to read.
+ *
+ * @return UDS_SUCCESS, an error code from reading, or UDS_INVALID_ARGUMENT
+ *         if the component is NULL.
+ **/ +int readIndexComponent(IndexComponent *component) + __attribute__((warn_unused_result)); + +/** + * Write a state file. + * + * @param component The component to write + * + * @return UDS_SUCCESS, an error code from writing, or UDS_INVALID_ARGUMENT + * if the component is NULL. + **/ +int writeIndexComponent(IndexComponent *component) + __attribute__((warn_unused_result)); + +/** + * Start an incremental save for this component (all zones). + * + * @param [in] component The index component. + * + * @return UDS_SUCCESS or an error code. + **/ +int startIndexComponentIncrementalSave(IndexComponent *component) + __attribute__((warn_unused_result)); + +/** + * Perform an incremental save for a component in a particular zone. + * + * @param [in] component The index component. + * @param [in] zone The zone number. + * @param [out] completed Pointer to hold completion status result. + * + * @return UDS_SUCCESS or an error code. + * + * @note If an incremental save is not supported, a regular + * save will be performed if this is the first call in zone 0. + **/ + int performIndexComponentZoneSave(IndexComponent *component, + unsigned int zone, + CompletionStatus *completed) + __attribute__((warn_unused_result)); + +/** + * Perform an incremental save for a non-multizone component synchronized + * with the chapter writer. + * + * @param component The index component. + **/ +int performIndexComponentChapterWriterSave(IndexComponent *component) + __attribute__((warn_unused_result)); + +/** + * Force the completion of an incremental save currently in progress in + * a particular zone. + * + * @param [in] component The index component. + * @param [in] zone The zone number. + * @param [out] completed Pointer to hold completion status result. + * + * @return UDS_SUCCESS or an error code. + **/ +int finishIndexComponentZoneSave(IndexComponent *component, + unsigned int zone, + CompletionStatus *completed) + __attribute__((warn_unused_result)); + +/** + * Force the completion of an incremental save in all zones and complete + * the overal save. + * + * @param [in] component The index component. + * + * @return UDS_SUCCESS or an error code. + * + * @note If all zones call finishIndexComponentZoneSave first, only + * the common non-index-related completion code is required, + * which protects access to the index data structures from the + * invoking thread. + **/ +int finishIndexComponentIncrementalSave(IndexComponent *component) + __attribute__((warn_unused_result)); + +/** + * Abort the incremental save currently in progress in a particular zone. + * + * @param [in] component The index component. + * @param [in] zone The zone number. + * @param [out] completed Pointer to hold completion status result. + * + * @return UDS_SUCCESS or an error code. + * + * @note "Completed" in this case means completed or aborted. + * Once any zone calls this function the entire save is + * useless unless every zone indicates CS_COMPLETED_PREVIOUSLY. + **/ +int abortIndexComponentZoneSave(IndexComponent *component, + unsigned int zone, + CompletionStatus *completed) + __attribute__((warn_unused_result)); + +/** + * Abort an incremental save currently in progress + * + * @param [in] component The index component. + * + * @return UDS_SUCCESS or an error code. + * + * @note If all zones call abortIndexComponentZoneSave first, only + * the common non-index-related completion code is required, + * which protects access to the index data structures from the + * invoking thread. 
+ **/ +int abortIndexComponentIncrementalSave(IndexComponent *component) + __attribute__((warn_unused_result)); + +/** + * Remove or invalidate component state. + * + * @param component The component whose file is to be removed. If NULL + * no action is taken. + **/ +__attribute__((warn_unused_result)) +int discardIndexComponent(IndexComponent *component); + +/** + * Get a buffered reader for the specified component part. + * + * @param [in] portal The component portal. + * @param [in] part The component ordinal number. + * @param [out] readerPtr Where to put the buffered reader. + * + * @return UDS_SUCCESS or an error code. + * + * @note the reader is managed by the component portal + **/ +__attribute__((warn_unused_result)) +int getBufferedReaderForPortal(ReadPortal *portal, + unsigned int part, + BufferedReader **readerPtr); + +#endif /* INDEX_COMPONENT_H */ diff --git a/uds/indexConfig.c b/uds/indexConfig.c new file mode 100644 index 0000000..7ef86f2 --- /dev/null +++ b/uds/indexConfig.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/indexConfig.c#2 $ + */ + +#include "indexConfig.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" + +static const byte INDEX_CONFIG_MAGIC[] = "ALBIC"; +static const byte INDEX_CONFIG_VERSION[] = "06.02"; +static const byte INDEX_CONFIG_VERSION_6_01[] = "06.01"; + +enum { + INDEX_CONFIG_MAGIC_LENGTH = sizeof(INDEX_CONFIG_MAGIC) - 1, + INDEX_CONFIG_VERSION_LENGTH = sizeof(INDEX_CONFIG_VERSION) - 1 +}; + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeIndexConfig(Buffer *buffer, UdsConfiguration config) +{ + int result = getUInt32LEFromBuffer(buffer, &config->recordPagesPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->chaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->sparseChaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->cacheChapters); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->checkpointFrequency); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->masterIndexMeanDelta); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->bytesPerPage); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->sparseSampleRate); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &config->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer) - contentLength(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + result = UDS_CORRUPT_COMPONENT; + } + return result; +} + +/**********************************************************************/ +static int readVersion(BufferedReader *reader, + UdsConfiguration conf, + const char **versionPtr) +{ + byte buffer[INDEX_CONFIG_VERSION_LENGTH]; + int result = readFromBufferedReader(reader, buffer, + INDEX_CONFIG_VERSION_LENGTH); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot read index config version"); + } + if (memcmp(INDEX_CONFIG_VERSION, buffer, INDEX_CONFIG_VERSION_LENGTH) == 0) { + Buffer *buffer; + result = makeBuffer(sizeof(*conf), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logErrorWithStringError(result, "cannot read config data"); + } + clearBuffer(buffer); + result = decodeIndexConfig(buffer, conf); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + if (versionPtr != NULL) { + *versionPtr = "current"; + } + return result; + } else if (memcmp(INDEX_CONFIG_VERSION_6_01, buffer, + INDEX_CONFIG_VERSION_LENGTH) == 0) { + struct udsConfiguration6_01 oldConf; + result = readFromBufferedReader(reader, &oldConf, sizeof(oldConf)); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, + "failed to read version 6.01 config file"); + return result; + } + conf->recordPagesPerChapter = oldConf.recordPagesPerChapter; + conf->chaptersPerVolume = oldConf.chaptersPerVolume; + conf->sparseChaptersPerVolume 
= oldConf.sparseChaptersPerVolume; + conf->cacheChapters = oldConf.cacheChapters; + conf->checkpointFrequency = oldConf.checkpointFrequency; + conf->masterIndexMeanDelta = oldConf.masterIndexMeanDelta; + conf->bytesPerPage = oldConf.bytesPerPage; + conf->sparseSampleRate = oldConf.sparseSampleRate; + conf->nonce = 0; + if (versionPtr != NULL) { + *versionPtr = "6.01"; + } + return UDS_UNSUPPORTED_VERSION; + } + + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unsupported configuration version: '%.*s'", + INDEX_CONFIG_VERSION_LENGTH, buffer); +} + +/**********************************************************************/ +int readConfigContents(BufferedReader *reader, + UdsConfiguration config) +{ + int result = verifyBufferedData(reader, INDEX_CONFIG_MAGIC, + INDEX_CONFIG_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + const char *version = NULL; + result = readVersion(reader, config, &version); + if (result != UDS_SUCCESS) { + if (result == UDS_UNSUPPORTED_VERSION) { + logNoticeWithStringError(result, "Found index config version %s", + version); + } else { + logErrorWithStringError(result, "Failed to read index config"); + } + } + return result; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int encodeIndexConfig(Buffer *buffer, UdsConfiguration config) +{ + int result = putUInt32LEIntoBuffer(buffer, config->recordPagesPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->chaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->sparseChaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->cacheChapters); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config-> checkpointFrequency); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->masterIndexMeanDelta); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->bytesPerPage); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->sparseSampleRate); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, config->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(*config), + "%zu bytes encoded, of %zu expected", + contentLength(buffer), sizeof(*config)); + return result; +} + +/**********************************************************************/ +int writeConfigContents(BufferedWriter *writer, + UdsConfiguration config) +{ + int result = writeToBufferedWriter(writer, INDEX_CONFIG_MAGIC, + INDEX_CONFIG_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + result = writeToBufferedWriter(writer, INDEX_CONFIG_VERSION, + INDEX_CONFIG_VERSION_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + Buffer *buffer; + result = makeBuffer(sizeof(*config), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = encodeIndexConfig(buffer, config); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + return result; +} + +/**********************************************************************/ +int 
makeConfiguration(UdsConfiguration conf, Configuration **configPtr) +{ + *configPtr = NULL; + if (conf == NULL) { + return logErrorWithStringError(UDS_CONF_REQUIRED, + "received an invalid config"); + } + + Configuration *config; + int result = ALLOCATE(1, Configuration, "configuration", &config); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeGeometry(conf->bytesPerPage, + conf->recordPagesPerChapter, + conf->chaptersPerVolume, + conf->sparseChaptersPerVolume, + &config->geometry); + if (result != UDS_SUCCESS) { + freeConfiguration(config); + return result; + } + + config->sparseSampleRate = conf->sparseSampleRate; + config->cacheChapters = conf->cacheChapters; + config->masterIndexMeanDelta = conf->masterIndexMeanDelta; + + *configPtr = config; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeConfiguration(Configuration *config) +{ + if (config != NULL) { + freeGeometry(config->geometry); + FREE(config); + } +} diff --git a/uds/indexConfig.h b/uds/indexConfig.h new file mode 100644 index 0000000..dab3d6a --- /dev/null +++ b/uds/indexConfig.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexConfig.h#2 $ + */ + +#ifndef INDEX_CONFIG_H +#define INDEX_CONFIG_H 1 + +#include "config.h" +#include "geometry.h" + +/** + * A set of configuration parameters for the indexer. + **/ +struct configuration { + /* Parameters for the volume */ + + /* The volume layout */ + Geometry *geometry; + + /* Size of the page cache and sparse chapter index cache, in chapters */ + unsigned int cacheChapters; + + /** Parameters for the master index */ + + /* The mean delta for the master index */ + unsigned int masterIndexMeanDelta; + + /* Sampling rate for sparse indexing */ + unsigned int sparseSampleRate; +}; + +#endif /* INDEX_CONFIG_H */ diff --git a/uds/indexInternals.c b/uds/indexInternals.c new file mode 100644 index 0000000..48268c7 --- /dev/null +++ b/uds/indexInternals.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexInternals.c#7 $ + */ + +#include "indexInternals.h" + +#include "errors.h" +#include "indexCheckpoint.h" +#include "indexStateData.h" +#include "indexZone.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "openChapter.h" +#include "request.h" +#include "stringUtils.h" +#include "threads.h" +#include "typeDefs.h" +#include "volume.h" +#include "zone.h" + +static const unsigned int MAX_COMPONENT_COUNT = 4; + +/**********************************************************************/ +int allocateIndex(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + unsigned int zoneCount, + LoadType loadType, + Index **newIndex) +{ + unsigned int checkpoint_frequency + = userParams == NULL ? 0 : userParams->checkpoint_frequency; + if (checkpoint_frequency >= config->geometry->chaptersPerVolume) { + return UDS_BAD_CHECKPOINT_FREQUENCY; + } + + Index *index; + int result = ALLOCATE(1, Index, "index", &index); + if (result != UDS_SUCCESS) { + return result; + } + + index->existed = (loadType != LOAD_CREATE); + index->hasSavedOpenChapter = true; + index->loadedType = LOAD_UNDEFINED; + + result = makeIndexCheckpoint(index); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + setIndexCheckpointFrequency(index->checkpoint, checkpoint_frequency); + + getIndexLayout(layout, &index->layout); + index->zoneCount = zoneCount; + + result = ALLOCATE(index->zoneCount, IndexZone *, "zones", + &index->zones); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + result = makeIndexState(layout, index->zoneCount, MAX_COMPONENT_COUNT, + &index->state); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + result = addIndexStateComponent(index->state, &INDEX_STATE_INFO, index, + NULL); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + result = makeVolume(config, index->layout, userParams, + VOLUME_CACHE_DEFAULT_MAX_QUEUED_READS, index->zoneCount, + &index->volume); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + index->volume->lookupMode = LOOKUP_NORMAL; + + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + result = makeIndexZone(index, i); + if (result != UDS_SUCCESS) { + freeIndex(index); + return logErrorWithStringError(result, "Could not create index zone"); + } + } + + result = addIndexStateComponent(index->state, &OPEN_CHAPTER_INFO, index, + NULL); + if (result != UDS_SUCCESS) { + freeIndex(index); + return logErrorWithStringError(result, "Could not create open chapter"); + } + + *newIndex = index; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void releaseIndex(Index *index) +{ + if (index == NULL) { + return; + } + + if (index->zones != NULL) { + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + freeIndexZone(index->zones[i]); + } + FREE(index->zones); + } + + freeVolume(index->volume); + + freeIndexState(&index->state); + freeIndexCheckpoint(index->checkpoint); + putIndexLayout(&index->layout); + FREE(index); +} diff --git a/uds/indexInternals.h b/uds/indexInternals.h new file mode 100644 index 0000000..16cb56a --- /dev/null +++ b/uds/indexInternals.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Red 
Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexInternals.h#3 $ + */ + +#ifndef INDEX_INTERNALS_H +#define INDEX_INTERNALS_H + +#include "index.h" +#include "loadType.h" +#include "request.h" + +/** + * Construct a new index from the given configuration. + * + * @param layout The index layout to use + * @param config The configuration to use + * @param userParams The index session parameters. If NULL, the default + * session parameters will be used. + * @param zoneCount The number of zones for this index to use + * @param loadType How to create the index: it can be create only, allow + * loading from files, and allow rebuilding from the volume + * @param newIndex A pointer to hold a pointer to the new index + * + * @return UDS_SUCCESS or an error code + **/ +int allocateIndex(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + unsigned int zoneCount, + LoadType loadType, + Index **newIndex) + __attribute__((warn_unused_result)); + +/** + * Clean up the index and its memory. + * + * @param index The index to destroy. + **/ +void releaseIndex(Index *index); + +#endif /* INDEX_INTERNALS_H */ diff --git a/uds/indexLayout.c b/uds/indexLayout.c new file mode 100644 index 0000000..cb019ff --- /dev/null +++ b/uds/indexLayout.c @@ -0,0 +1,2409 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexLayout.c#19 $ + */ + +#include "indexLayout.h" + +#include "buffer.h" +#include "compiler.h" +#include "config.h" +#include "indexConfig.h" +#include "layoutRegion.h" +#include "logger.h" +#include "masterIndexOps.h" +#include "memoryAlloc.h" +#include "nonce.h" +#include "openChapter.h" + +/* + * Overall layout of an index on disk: + * + * The layout is divided into a number of fixed-size regions, the sizes of + * which are computed when the index is created. Every header and region + * begins on 4K block boundary. Save regions are further sub-divided into + * regions of their own. + * + * Each region has a kind and an instance number. 
Some kinds only have one + * instance and therefore use RL_SOLE_INSTANCE (-1) as the instance number. + * The RL_KIND_INDEX uses instances to represent sub-indices, where used. + * A save region can either hold a checkpoint or a clean shutdown (determined + * by the type). The instances determine which available save slot is used. + * The RL_KIND_MASTER_INDEX uses instances to record which zone is being saved. + * + * +-+-+--------+--------+--------+-----+--- -+-+ + * | | | I N D E X 0 101, 0 | ... | | + * |H|C+--------+--------+--------+-----+--- -+S| + * |D|f| Volume | Save | Save | | |e| + * |R|g| Region | Region | Region | ... | ... |a| + * | | | 201 -1 | 202 0 | 202 1 | | |l| + * +-+-+--------+--------+--------+-----+--- -+-+ + * + * The header contains the encoded regional layout table as well as + * the saved index configuration record. The sub-index regions and their + * subdivisions are maintained in the same table. + * + * There are at least two save regions per sub-index to preserve the old + * state should the saving of a state be incomplete. They are used in + * a round-robin fashion. + * + * Anatomy of a save region: + * + * +-+-----+------+------+-----+ -+-----+ + * |H| IPM | MI | MI | | | OC | + * |D| | zone | zone | ... | | | + * |R| 301 | 302 | 302 | | | 303 | + * | | -1 | 0 | 1 | | | -1 | + * +-+-----+------+------+-----+ -+-----+ + * + * Every region header has a type (and version). In save regions, + * the open chapter only appears in RL_TYPE_SAVE not RL_TYPE_CHECKPOINT, + * although the same space is reserved for both. + * + * The header contains the encoded regional layout table as well as the + * index state record for that save or checkpoint. Each save or checkpoint + * has a unique generation number and nonce which is used to seed the + * checksums of those regions. + */ + +typedef struct indexSaveData_v1 { + uint64_t timestamp; // ms since epoch... + uint64_t nonce; + uint32_t version; // 1 + uint32_t unused__; +} IndexSaveData; + +typedef struct indexSaveLayout { + LayoutRegion indexSave; + LayoutRegion header; + unsigned int numZones; + LayoutRegion indexPageMap; + LayoutRegion freeSpace; + LayoutRegion *masterIndexZones; + LayoutRegion *openChapter; + IndexSaveType saveType; + IndexSaveData saveData; + Buffer *indexStateBuffer; + bool read; + bool written; +} IndexSaveLayout; + +typedef struct subIndexLayout { + LayoutRegion subIndex; + uint64_t nonce; + LayoutRegion volume; + IndexSaveLayout *saves; +} SubIndexLayout; + +typedef struct superBlockData_v1 { + byte magicLabel[32]; + byte nonceInfo[32]; + uint64_t nonce; + uint32_t version; // 2 + uint32_t blockSize; // for verification + uint16_t numIndexes; // 1 + uint16_t maxSaves; + uint64_t openChapterBlocks; + uint64_t pageMapBlocks; +} SuperBlockData; + +struct indexLayout { + IOFactory *factory; + off_t offset; + struct index_version indexVersion; + SuperBlockData super; + LayoutRegion header; + LayoutRegion config; + SubIndexLayout index; + LayoutRegion seal; + uint64_t totalBlocks; + int refCount; +}; + +/** + * Structure used to compute single file layout sizes. + * + * Note that the masterIndexBlocks represent all zones and are sized for + * the maximum number of blocks that would be needed regardless of the number + * of zones (up to the maximum value) that are used at run time. + * + * Similarly, the number of saves is sized for the minimum safe value + * assuming checkpointing is enabled, since that is also a run-time parameter. 
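+ * + * In computeSizes() below, numSaves is 2 + numCheckpoints, and each + * sub-index is laid out as its volume followed by numSaves save regions.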
+ **/ +typedef struct saveLayoutSizes { + Configuration config; // this is a captive copy + Geometry geometry; // this is a captive copy + unsigned int numSaves; // per sub-index + size_t blockSize; // in bytes + uint64_t volumeBlocks; // per sub-index + uint64_t masterIndexBlocks; // per save + uint64_t pageMapBlocks; // per save + uint64_t openChapterBlocks; // per save + uint64_t saveBlocks; // per sub-index + uint64_t subIndexBlocks; // per sub-index + uint64_t totalBlocks; // for whole layout +} SaveLayoutSizes; + +enum { + INDEX_STATE_BUFFER_SIZE = 512, + MAX_SAVES = 5, +}; + +static const byte SINGLE_FILE_MAGIC_1[32] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; +enum { + SINGLE_FILE_MAGIC_1_LENGTH = sizeof(SINGLE_FILE_MAGIC_1), +}; + +static int reconstituteSingleFileLayout(IndexLayout *layout, + SuperBlockData *super, + RegionTable *table, + uint64_t firstBlock) + __attribute__((warn_unused_result)); +static int writeIndexSaveLayout(IndexLayout *layout, IndexSaveLayout *isl) + __attribute__((warn_unused_result)); + +/*****************************************************************************/ +static INLINE uint64_t blockCount(uint64_t bytes, uint32_t blockSize) +{ + uint64_t blocks = bytes / blockSize; + if (bytes % blockSize > 0) { + ++blocks; + } + return blocks; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int computeSizes(SaveLayoutSizes *sls, + const UdsConfiguration config, + size_t blockSize, + unsigned int numCheckpoints) +{ + if (config->bytesPerPage % blockSize != 0) { + return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, + "page size not a multiple of block size"); + } + + Configuration *cfg = NULL; + int result = makeConfiguration(config, &cfg); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot compute layout size"); + } + + memset(sls, 0, sizeof(*sls)); + + // internalize the configuration and geometry... 
+ + sls->geometry = *cfg->geometry; + sls->config = *cfg; + sls->config.geometry = &sls->geometry; + + freeConfiguration(cfg); + + sls->numSaves = 2 + numCheckpoints; + sls->blockSize = blockSize; + sls->volumeBlocks = sls->geometry.bytesPerVolume / blockSize; + + result = computeMasterIndexSaveBlocks(&sls->config, blockSize, + &sls->masterIndexBlocks); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot compute index save size"); + } + + sls->pageMapBlocks = + blockCount(computeIndexPageMapSaveSize(&sls->geometry), blockSize); + sls->openChapterBlocks = + blockCount(computeSavedOpenChapterSize(&sls->geometry), blockSize); + sls->saveBlocks = 1 + (sls->masterIndexBlocks + + sls->pageMapBlocks + sls->openChapterBlocks); + sls->subIndexBlocks = sls->volumeBlocks + (sls->numSaves * sls->saveBlocks); + sls->totalBlocks = 3 + sls->subIndexBlocks; + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int udsComputeIndexSize(const UdsConfiguration config, + unsigned int numCheckpoints, + uint64_t *indexSize) +{ + SaveLayoutSizes sizes; + int result = computeSizes(&sizes, config, UDS_BLOCK_SIZE, numCheckpoints); + if (result != UDS_SUCCESS) { + return result; + } + + if (indexSize != NULL) { + *indexSize = sizes.totalBlocks * sizes.blockSize; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int openLayoutReader(IndexLayout *layout, + LayoutRegion *lr, + BufferedReader **readerPtr) +{ + off_t start = lr->startBlock * layout->super.blockSize; + size_t size = lr->numBlocks * layout->super.blockSize; + return openBufferedReader(layout->factory, start, size, readerPtr); +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int openLayoutWriter(IndexLayout *layout, + LayoutRegion *lr, + BufferedWriter **writerPtr) +{ + off_t start = lr->startBlock * layout->super.blockSize; + size_t size = lr->numBlocks * layout->super.blockSize; + return openBufferedWriter(layout->factory, start, size, writerPtr); +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int decodeIndexSaveData(Buffer *buffer, IndexSaveData *saveData) +{ + int result = getUInt64LEFromBuffer(buffer, &saveData->timestamp); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &saveData->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &saveData->version); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &saveData->unused__); + if (result != UDS_SUCCESS) { + return result; + } + // The unused padding has to be zeroed for correct nonce calculation + if (saveData->unused__ != 0) { + return UDS_CORRUPT_COMPONENT; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer), sizeof(*saveData)); + if (result != UDS_SUCCESS) { + return UDS_CORRUPT_COMPONENT; + } + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int decodeRegionHeader(Buffer *buffer, RegionHeader *header) +{ + int result = getUInt64LEFromBuffer(buffer, &header->magic); + if (result != UDS_SUCCESS) { + return result; + } + result = 
getUInt64LEFromBuffer(buffer, &header->regionBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &header->type); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &header->version); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &header->numRegions); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &header->payload); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer), sizeof(*header)); + if (result != UDS_SUCCESS) { + return UDS_CORRUPT_COMPONENT; + } + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int decodeLayoutRegion(Buffer *buffer, LayoutRegion *region) +{ + size_t cl1 = contentLength(buffer); + + int result = getUInt64LEFromBuffer(buffer, ®ion->startBlock); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, ®ion->numBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, ®ion->checksum); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, ®ion->kind); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, ®ion->instance); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(cl1 - contentLength(buffer) == sizeof(*region), + "%zu bytes decoded, of %zu expected", + cl1 - contentLength(buffer), sizeof(*region)); + if (result != UDS_SUCCESS) { + return UDS_CORRUPT_COMPONENT; + } + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int loadRegionTable(BufferedReader *reader, RegionTable **tablePtr) +{ + Buffer *buffer; + int result = makeBuffer(sizeof(RegionHeader), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logErrorWithStringError(result, "cannot read region table header"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + RegionHeader header; + result = decodeRegionHeader(buffer, &header); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + if (header.magic != REGION_MAGIC) { + return UDS_NO_INDEX; + } + if (header.version != 1) { + return logErrorWithStringError(UDS_UNSUPPORTED_VERSION, + "unknown region table version %" PRIu16, + header.version); + } + + RegionTable *table; + result = ALLOCATE_EXTENDED(RegionTable, header.numRegions, LayoutRegion, + "single file layout region table", &table); + if (result != UDS_SUCCESS) { + return result; + } + + table->header = header; + result = makeBuffer(header.numRegions * sizeof(LayoutRegion), &buffer); + if (result != UDS_SUCCESS) { + FREE(table); + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + FREE(table); + freeBuffer(&buffer); + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "cannot read region table layouts"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if 
(result != UDS_SUCCESS) { + FREE(table); + freeBuffer(&buffer); + return result; + } + unsigned int i; + for (i = 0; i < header.numRegions; i++){ + result = decodeLayoutRegion(buffer, &table->regions[i]); + if (result != UDS_SUCCESS) { + FREE(table); + freeBuffer(&buffer); + return result; + } + } + freeBuffer(&buffer); + *tablePtr = table; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int decodeSuperBlockData(Buffer *buffer, SuperBlockData *super) +{ + int result = getBytesFromBuffer(buffer, 32, super->magicLabel); + if (result != UDS_SUCCESS) { + return result; + } + result = getBytesFromBuffer(buffer, 32, super->nonceInfo); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &super->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &super->version); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &super->blockSize); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &super->numIndexes); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &super->maxSaves); + if (result != UDS_SUCCESS) { + return result; + } + result = skipForward(buffer, 4); // aligment + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &super->openChapterBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &super->pageMapBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer), sizeof(*super)); + if (result != UDS_SUCCESS) { + return UDS_CORRUPT_COMPONENT; + } + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int readSuperBlockData(BufferedReader *reader, + SuperBlockData *super, + size_t savedSize) +{ + if (savedSize != sizeof(SuperBlockData)) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected super block data size %zu", + savedSize); + } + + if (sizeof(super->magicLabel) != SINGLE_FILE_MAGIC_1_LENGTH) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "super block magic label size incorrect"); + } + + Buffer *buffer; + int result = makeBuffer(savedSize, &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logErrorWithStringError(result, "cannot read region table header"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = decodeSuperBlockData(buffer, super); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot read super block data"); + } + + if (memcmp(super->magicLabel, SINGLE_FILE_MAGIC_1, + SINGLE_FILE_MAGIC_1_LENGTH) != 0) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unknown superblock magic label"); + } + + if ((super->version < SUPER_VERSION_MINIMUM) + || (super->version > SUPER_VERSION_MAXIMUM)) { + return logErrorWithStringError(UDS_UNSUPPORTED_VERSION, + "unknown superblock version number %" + PRIu32, + super->version); + } + + // 
We dropped the usage of multiple subindices before we ever ran UDS code in + // the kernel. We do not have code that will handle multiple subindices. + if (super->numIndexes != 1) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "invalid subindex count %" PRIu32, + super->numIndexes); + } + + if (generateMasterNonce(super->nonceInfo, sizeof(super->nonceInfo)) != + super->nonce) + { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "inconsistent superblock nonce"); + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int allocateSingleFileParts(IndexLayout *layout, + SuperBlockData *super) +{ + int result = ALLOCATE(super->maxSaves, IndexSaveLayout, __func__, + &layout->index.saves); + if (result != UDS_SUCCESS) { + return result; + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int loadSuperBlock(IndexLayout *layout, + size_t blockSize, + uint64_t firstBlock, + BufferedReader *reader) +{ + RegionTable *table = NULL; + int result = loadRegionTable(reader, &table); + if (result != UDS_SUCCESS) { + return result; + } + + if (table->header.type != RH_TYPE_SUPER) { + FREE(table); + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "not a superblock region table"); + } + + SuperBlockData superBlockData; + result = readSuperBlockData(reader, &superBlockData, table->header.payload); + if (result != UDS_SUCCESS) { + FREE(table); + return logErrorWithStringError(result, "unknown superblock format"); + } + + if (superBlockData.blockSize != blockSize) { + FREE(table); + return logErrorWithStringError(UDS_WRONG_INDEX_CONFIG, + "superblock saved blockSize %" PRIu32 + " differs from supplied blockSize %zu", + superBlockData.blockSize, blockSize); + } + initializeIndexVersion(&layout->indexVersion, superBlockData.version); + + result = allocateSingleFileParts(layout, &superBlockData); + if (result != UDS_SUCCESS) { + FREE(table); + return result; + } + + result = reconstituteSingleFileLayout(layout, &superBlockData, table, + firstBlock); + FREE(table); + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int readIndexSaveData(BufferedReader *reader, + IndexSaveData *saveData, + size_t savedSize, + Buffer **bufferPtr) +{ + int result = UDS_SUCCESS; + if (savedSize == 0) { + memset(saveData, 0, sizeof(*saveData)); + } else { + if (savedSize < sizeof(IndexSaveData)) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected index save data size %zu", + savedSize); + } + + Buffer *buffer; + result = makeBuffer(sizeof(*saveData), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logErrorWithStringError(result, "cannot read index save data"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + + result = decodeIndexSaveData(buffer, saveData); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + + savedSize -= sizeof(IndexSaveData); + + if (saveData->version > 1) { + return logErrorWithStringError(UDS_UNSUPPORTED_VERSION, + "unknown index save version number %" + PRIu32, + saveData->version); + } + 
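+ // Whatever remains of the payload after the fixed IndexSaveData fields + // is the saved index state, which must fit in the fixed-size state buffer.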
+ if (savedSize > INDEX_STATE_BUFFER_SIZE) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected index state buffer size %zu", + savedSize); + } + } + + Buffer *buffer = NULL; + + if (saveData->version != 0) { + result = makeBuffer(INDEX_STATE_BUFFER_SIZE, &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + if (savedSize > 0) { + result = readFromBufferedReader(reader, getBufferContents(buffer), + savedSize); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = resetBufferEnd(buffer, savedSize); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + } + } + + *bufferPtr = buffer; + return UDS_SUCCESS; +} + +/*****************************************************************************/ + +typedef struct { + LayoutRegion *nextRegion; + LayoutRegion *lastRegion; + uint64_t nextBlock; + int result; +} RegionIterator; + +/*****************************************************************************/ +__attribute__((format(printf, 2, 3))) +static void iterError(RegionIterator *iter, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + int r = vLogWithStringError(LOG_ERR, UDS_UNEXPECTED_RESULT, fmt, args); + va_end(args); + if (iter->result == UDS_SUCCESS) { + iter->result = r; + } +} + +/** + * Set the next layout region in the layout according to a region table + * iterator, unless the iterator already contains an error + * + * @param expect whether to record an error or return false + * @param lr the layout region field to set + * @param iter the region iterator, which also holds the cumulative + * result + * @param numBlocks if non-zero, the expected number of blocks + * @param kind the expected kind of the region + * @param instance the expected instance number of the region + * + * @return true if we meet expectations, false if we do not + **/ +static bool expectLayout(bool expect, + LayoutRegion *lr, + RegionIterator *iter, + uint64_t numBlocks, + RegionKind kind, + unsigned int instance) +{ + if (iter->result != UDS_SUCCESS) { + return false; + } + + if (iter->nextRegion == iter->lastRegion) { + if (expect) { + iterError(iter, "ran out of layout regions in region table"); + } + return false; + } + + if (iter->nextRegion->startBlock != iter->nextBlock) { + iterError(iter, "layout region not at expected offset"); + return false; + } + + if (iter->nextRegion->kind != kind) { + if (expect) { + iterError(iter, "layout region has incorrect kind"); + } + return false; + } + + if (iter->nextRegion->instance != instance) { + iterError(iter, "layout region has incorrect instance"); + return false; + } + + if (numBlocks > 0 && iter->nextRegion->numBlocks != numBlocks) { + iterError(iter, "layout region size is incorrect"); + return false; + } + + if (lr != NULL) { + *lr = *iter->nextRegion; + } + + iter->nextBlock += iter->nextRegion->numBlocks; + iter->nextRegion++; + return true; +} + +/*****************************************************************************/ +static void setupLayout(LayoutRegion *lr, + uint64_t *nextAddrPtr, + uint64_t regionSize, + unsigned int kind, + unsigned int instance) +{ + *lr = (LayoutRegion) { + .startBlock = *nextAddrPtr, + .numBlocks = regionSize, + .checksum = 0, + .kind = kind, + .instance = instance, + }; + *nextAddrPtr += regionSize; +} + +/*****************************************************************************/ +static void populateIndexSaveLayout(IndexSaveLayout *isl, + SuperBlockData *super, + unsigned int numZones, + IndexSaveType saveType) +{ + 
uint64_t nextBlock = isl->indexSave.startBlock; + + setupLayout(&isl->header, &nextBlock, 1, RL_KIND_HEADER, RL_SOLE_INSTANCE); + setupLayout(&isl->indexPageMap, &nextBlock, super->pageMapBlocks, + RL_KIND_INDEX_PAGE_MAP, RL_SOLE_INSTANCE); + + uint64_t blocksAvail = (isl->indexSave.numBlocks - + (nextBlock - isl->indexSave.startBlock) - + super->openChapterBlocks); + + if (numZones > 0) { + uint64_t miBlockCount = blocksAvail / numZones; + unsigned int z; + for (z = 0; z < numZones; ++z) { + LayoutRegion *miz = &isl->masterIndexZones[z]; + setupLayout(miz, &nextBlock, miBlockCount, RL_KIND_MASTER_INDEX, z); + } + } + if (saveType == IS_SAVE && isl->openChapter != NULL) { + setupLayout(isl->openChapter, &nextBlock, super->openChapterBlocks, + RL_KIND_OPEN_CHAPTER, RL_SOLE_INSTANCE); + } + setupLayout(&isl->freeSpace, &nextBlock, + (isl->indexSave.numBlocks - + (nextBlock - isl->indexSave.startBlock)), + RL_KIND_SCRATCH, RL_SOLE_INSTANCE); +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int reconstructIndexSave(IndexSaveLayout *isl, + IndexSaveData *saveData, + SuperBlockData *super, + RegionTable *table) +{ + isl->numZones = 0; + isl->saveData = *saveData; + isl->read = false; + isl->written = false; + + if (table->header.type == RH_TYPE_SAVE) { + isl->saveType = IS_SAVE; + } else if (table->header.type == RH_TYPE_CHECKPOINT) { + isl->saveType = IS_CHECKPOINT; + } else { + isl->saveType = NO_SAVE; + } + + if ((table->header.numRegions == 0) || + ((table->header.numRegions == 1) && + (table->regions[0].kind == RL_KIND_SCRATCH))) + { + populateIndexSaveLayout(isl, super, 0, NO_SAVE); + return UDS_SUCCESS; + } + + RegionIterator iter = { + .nextRegion = table->regions, + .lastRegion = table->regions + table->header.numRegions, + .nextBlock = isl->indexSave.startBlock, + .result = UDS_SUCCESS, + }; + + expectLayout(true, &isl->header, &iter, 1, RL_KIND_HEADER, RL_SOLE_INSTANCE); + expectLayout(true, &isl->indexPageMap, &iter, 0, + RL_KIND_INDEX_PAGE_MAP, RL_SOLE_INSTANCE); + unsigned int n = 0; + RegionIterator tmpIter; + for (tmpIter = iter; + expectLayout(false, NULL, &tmpIter, 0, RL_KIND_MASTER_INDEX, n); + ++n) + ; + isl->numZones = n; + + int result = UDS_SUCCESS; + + if (isl->numZones > 0) { + result = ALLOCATE(n, LayoutRegion, "master index layout regions", + &isl->masterIndexZones); + if (result != UDS_SUCCESS) { + return result; + } + } + + if (isl->saveType == IS_SAVE) { + result = ALLOCATE(1, LayoutRegion, "open chapter layout region", + &isl->openChapter); + if (result != UDS_SUCCESS) { + FREE(isl->masterIndexZones); + return result; + } + } + + unsigned int z; + for (z = 0; z < isl->numZones; ++z) { + expectLayout(true, &isl->masterIndexZones[z], &iter, 0, + RL_KIND_MASTER_INDEX, z); + } + if (isl->saveType == IS_SAVE) { + expectLayout(true, isl->openChapter, &iter, 0, + RL_KIND_OPEN_CHAPTER, RL_SOLE_INSTANCE); + } + if (!expectLayout(false, &isl->freeSpace, &iter, 0, + RL_KIND_SCRATCH, RL_SOLE_INSTANCE)) + { + isl->freeSpace = (LayoutRegion) { + .startBlock = iter.nextBlock, + .numBlocks = (isl->indexSave.startBlock + + isl->indexSave.numBlocks) - iter.nextBlock, + .checksum = 0, + .kind = RL_KIND_SCRATCH, + .instance = RL_SOLE_INSTANCE, + }; + iter.nextBlock = isl->freeSpace.startBlock + isl->freeSpace.numBlocks; + } + + if (iter.result != UDS_SUCCESS) { + return iter.result; + } + if (iter.nextRegion != iter.lastRegion) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "expected %ld 
additional regions", + iter.lastRegion - iter.nextRegion); + } + if (iter.nextBlock != isl->indexSave.startBlock + isl->indexSave.numBlocks) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "index save layout table incomplete"); + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int loadIndexSave(IndexSaveLayout *isl, + SuperBlockData *super, + BufferedReader *reader, + unsigned int saveId) +{ + RegionTable *table = NULL; + int result = loadRegionTable(reader, &table); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot read index 0 save %u header", + saveId); + } + + if (table->header.regionBlocks != isl->indexSave.numBlocks) { + uint64_t regionBlocks = table->header.regionBlocks; + FREE(table); + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected index 0 save %u " + "region block count %llu", + saveId, regionBlocks); + } + + if (table->header.type != RH_TYPE_SAVE && + table->header.type != RH_TYPE_CHECKPOINT && + table->header.type != RH_TYPE_UNSAVED) + { + unsigned int type = table->header.type; + FREE(table); + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, "unexpected" + " index 0 save %u header type %u", + saveId, type); + } + + IndexSaveData indexSaveData; + result = readIndexSaveData(reader, &indexSaveData, table->header.payload, + &isl->indexStateBuffer); + if (result != UDS_SUCCESS) { + FREE(table); + return logErrorWithStringError(result, + "unknown index 0 save %u data format", + saveId); + } + + result = reconstructIndexSave(isl, &indexSaveData, super, table); + FREE(table); + + if (result != UDS_SUCCESS) { + freeBuffer(&isl->indexStateBuffer); + return logErrorWithStringError(result, + "cannot reconstruct index 0 save %u", + saveId); + } + isl->read = true; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int loadSubIndexRegions(IndexLayout *layout) +{ + unsigned int j; + for (j = 0; j < layout->super.maxSaves; ++j) { + IndexSaveLayout *isl = &layout->index.saves[j]; + + BufferedReader *reader; + int result = openLayoutReader(layout, &isl->indexSave, &reader); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "cannot get reader for index 0 save %u", + j); + while (j-- > 0) { + IndexSaveLayout *isl = &layout->index.saves[j]; + FREE(isl->masterIndexZones); + FREE(isl->openChapter); + freeBuffer(&isl->indexStateBuffer); + } + return result; + } + + result = loadIndexSave(isl, &layout->super, reader, j); + freeBufferedReader(reader); + if (result != UDS_SUCCESS) { + while (j-- > 0) { + IndexSaveLayout *isl = &layout->index.saves[j]; + FREE(isl->masterIndexZones); + FREE(isl->openChapter); + freeBuffer(&isl->indexStateBuffer); + } + return result; + } + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static int loadIndexLayout(IndexLayout *layout) +{ + BufferedReader *reader; + int result = openBufferedReader(layout->factory, layout->offset, + UDS_BLOCK_SIZE, &reader); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "unable to read superblock"); + } + + result = loadSuperBlock(layout, UDS_BLOCK_SIZE, + layout->offset / UDS_BLOCK_SIZE, reader); + freeBufferedReader(reader); + if (result != UDS_SUCCESS) { + FREE(layout->index.saves); + layout->index.saves = NULL; + return result; + } + + result = 
loadSubIndexRegions(layout); + if (result != UDS_SUCCESS) { + FREE(layout->index.saves); + layout->index.saves = NULL; + return result; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static void generateSuperBlockData(size_t blockSize, + unsigned int maxSaves, + uint64_t openChapterBlocks, + uint64_t pageMapBlocks, + SuperBlockData *super) +{ + memset(super, 0, sizeof(*super)); + memcpy(super->magicLabel, SINGLE_FILE_MAGIC_1, SINGLE_FILE_MAGIC_1_LENGTH); + createUniqueNonceData(super->nonceInfo, sizeof(super->nonceInfo)); + + super->nonce = generateMasterNonce(super->nonceInfo, + sizeof(super->nonceInfo)); + super->version = SUPER_VERSION_CURRENT; + super->blockSize = blockSize; + super->numIndexes = 1; + super->maxSaves = maxSaves; + super->openChapterBlocks = openChapterBlocks; + super->pageMapBlocks = pageMapBlocks; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int resetIndexSaveLayout(IndexSaveLayout *isl, + uint64_t *nextBlockPtr, + uint64_t saveBlocks, + uint64_t pageMapBlocks, + unsigned int instance) +{ + uint64_t startBlock = *nextBlockPtr; + + if (isl->masterIndexZones) { + FREE(isl->masterIndexZones); + } + if (isl->openChapter) { + FREE(isl->openChapter); + } + if (isl->indexStateBuffer) { + freeBuffer(&isl->indexStateBuffer); + } + memset(isl, 0, sizeof(*isl)); + isl->saveType = NO_SAVE; + setupLayout(&isl->indexSave, &startBlock, saveBlocks, RL_KIND_SAVE, + instance); + setupLayout(&isl->header, nextBlockPtr, 1, RL_KIND_HEADER, + RL_SOLE_INSTANCE); + setupLayout(&isl->indexPageMap, nextBlockPtr, pageMapBlocks, + RL_KIND_INDEX_PAGE_MAP, RL_SOLE_INSTANCE); + uint64_t remaining = startBlock - *nextBlockPtr; + setupLayout(&isl->freeSpace, nextBlockPtr, remaining, RL_KIND_SCRATCH, + RL_SOLE_INSTANCE); + // number of zones is a save-time parameter + // presence of open chapter is a save-time parameter + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static void defineSubIndexNonce(SubIndexLayout *sil, + uint64_t masterNonce, + unsigned int indexId) +{ + struct subIndexNonceData { + uint64_t offset; + uint16_t indexId; + }; + byte buffer[sizeof(struct subIndexNonceData)] = { 0 }; + size_t offset = 0; + encodeUInt64LE(buffer, &offset, sil->subIndex.startBlock); + encodeUInt16LE(buffer, &offset, indexId); + sil->nonce = generateSecondaryNonce(masterNonce, buffer, sizeof(buffer)); + if (sil->nonce == 0) { + sil->nonce = generateSecondaryNonce(~masterNonce + 1, + buffer, sizeof(buffer)); + } +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int setupSubIndex(SubIndexLayout *sil, + uint64_t *nextBlockPtr, + SaveLayoutSizes *sls, + unsigned int instance, + uint64_t masterNonce) +{ + uint64_t startBlock = *nextBlockPtr; + + setupLayout(&sil->subIndex, &startBlock, sls->subIndexBlocks, + RL_KIND_INDEX, instance); + setupLayout(&sil->volume, nextBlockPtr, sls->volumeBlocks, + RL_KIND_VOLUME, RL_SOLE_INSTANCE); + unsigned int i; + for (i = 0; i < sls->numSaves; ++i) { + int result = resetIndexSaveLayout(&sil->saves[i], nextBlockPtr, + sls->saveBlocks, sls->pageMapBlocks, i); + if (result != UDS_SUCCESS) { + return result; + } + } + + if (startBlock != *nextBlockPtr) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "sub index layout regions don't agree"); + } + + defineSubIndexNonce(sil, 
masterNonce, instance); + return UDS_SUCCESS; +} + +/*****************************************************************************/ +/** + * Initialize a single file layout using the save layout sizes specified. + * + * @param layout the layout to initialize + * @param offset the offset in bytes from the start of the backing storage + * @param size the size in bytes of the backing storage + * @param sls a populated SaveLayoutSizes object + * + * @return UDS_SUCCESS or an error code, potentially + * UDS_INSUFFICIENT_INDEX_SPACE if the size of the backing store + * is not sufficient for the index configuration, + * UDS_BAD_INDEX_ALIGNMENT if the offset specified does not + * align properly with the index block and page sizes] + * various other errors + **/ +__attribute__((warn_unused_result)) +static int initSingleFileLayout(IndexLayout *layout, + uint64_t offset, + uint64_t size, + SaveLayoutSizes *sls) +{ + layout->totalBlocks = sls->totalBlocks; + + if (size < sls->totalBlocks * sls->blockSize) { + return logErrorWithStringError(UDS_INSUFFICIENT_INDEX_SPACE, + "not enough space for index as configured"); + } + + generateSuperBlockData(sls->blockSize, sls->numSaves, sls->openChapterBlocks, + sls->pageMapBlocks, &layout->super); + initializeIndexVersion(&layout->indexVersion, SUPER_VERSION_CURRENT); + + int result = allocateSingleFileParts(layout, &layout->super); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t nextBlock = offset / sls->blockSize; + + setupLayout(&layout->header, &nextBlock, 1, RL_KIND_HEADER, + RL_SOLE_INSTANCE); + setupLayout(&layout->config, &nextBlock, 1, RL_KIND_CONFIG, + RL_SOLE_INSTANCE); + result = setupSubIndex(&layout->index, &nextBlock, sls, 0, + layout->super.nonce); + if (result != UDS_SUCCESS) { + return result; + } + setupLayout(&layout->seal, &nextBlock, 1, RL_KIND_SEAL, RL_SOLE_INSTANCE); + if (nextBlock * sls->blockSize > offset + size) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "layout does not fit as expected"); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static void expectSubIndex(SubIndexLayout *sil, + RegionIterator *iter, + SuperBlockData *super, + unsigned int instance) +{ + if (iter->result != UDS_SUCCESS) { + return; + } + + uint64_t startBlock = iter->nextBlock; + + expectLayout(true, &sil->subIndex, iter, 0, RL_KIND_INDEX, instance); + + uint64_t endBlock = iter->nextBlock; + iter->nextBlock = startBlock; + + expectLayout(true, &sil->volume, iter, 0, RL_KIND_VOLUME, RL_SOLE_INSTANCE); + + unsigned int i; + for (i = 0; i < super->maxSaves; ++i) { + IndexSaveLayout *isl = &sil->saves[i]; + expectLayout(true, &isl->indexSave, iter, 0, RL_KIND_SAVE, i); + } + + if (iter->nextBlock != endBlock) { + iterError(iter, "sub index region does not span all saves"); + } + + defineSubIndexNonce(sil, super->nonce, instance); +} + +/*****************************************************************************/ + +/** + * Initialize a single file layout from the region table and super block data + * stored in stable storage. 
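+ * Each region's offset, kind, and instance are checked against the table + * as the layout is rebuilt.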
+ * + * @param layout the layout to initialize + * @param region the IO region for this layout + * @param super the super block data read from the superblock + * @param table the region table read from the superblock + * @param firstBlock the first block number in the region + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int reconstituteSingleFileLayout(IndexLayout *layout, + SuperBlockData *super, + RegionTable *table, + uint64_t firstBlock) +{ + layout->super = *super; + layout->totalBlocks = table->header.regionBlocks; + + RegionIterator iter = { + .nextRegion = table->regions, + .lastRegion = table->regions + table->header.numRegions, + .nextBlock = firstBlock, + .result = UDS_SUCCESS + }; + + expectLayout(true, &layout->header, &iter, 1, RL_KIND_HEADER, + RL_SOLE_INSTANCE); + expectLayout(true, &layout->config, &iter, 1, RL_KIND_CONFIG, + RL_SOLE_INSTANCE); + expectSubIndex(&layout->index, &iter, &layout->super, 0); + expectLayout(true, &layout->seal, &iter, 1, RL_KIND_SEAL, RL_SOLE_INSTANCE); + + if (iter.result != UDS_SUCCESS) { + return iter.result; + } + + if (iter.nextBlock != firstBlock + layout->totalBlocks) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "layout table does not span total blocks"); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int saveSubIndexRegions(IndexLayout *layout) +{ + SubIndexLayout *sil = &layout->index; + unsigned int j; + for (j = 0; j < layout->super.maxSaves; ++j) { + IndexSaveLayout *isl = &sil->saves[j]; + int result = writeIndexSaveLayout(layout, isl); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "unable to format index %u save 0 layout", + j); + } + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int makeSingleFileRegionTable(IndexLayout *layout, + unsigned int *numRegionsPtr, + RegionTable **tablePtr) +{ + unsigned int numRegions = + 1 + // header + 1 + // config + 1 + // index + 1 + // volume + layout->super.maxSaves + // saves + 1; // seal + + RegionTable *table; + int result = ALLOCATE_EXTENDED(RegionTable, numRegions, LayoutRegion, + "layout region table", &table); + if (result != UDS_SUCCESS) { + return result; + } + + LayoutRegion *lr = &table->regions[0]; + *lr++ = layout->header; + *lr++ = layout->config; + SubIndexLayout *sil = &layout->index; + *lr++ = sil->subIndex; + *lr++ = sil->volume; + unsigned int j; + for (j = 0; j < layout->super.maxSaves; ++j) { + *lr++ = sil->saves[j].indexSave; + } + *lr++ = layout->seal; + + result = ASSERT((lr == &table->regions[numRegions]), + "incorrect number of regions"); + if (result != UDS_SUCCESS) { + return result; + } + + *numRegionsPtr = numRegions; + *tablePtr = table; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int encodeIndexSaveData(Buffer *buffer, IndexSaveData *saveData) +{ + int result = putUInt64LEIntoBuffer(buffer, saveData->timestamp); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, saveData->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, saveData->version); + if (result != UDS_SUCCESS) { + return result; + } + result = zeroBytes(buffer, 4); /* padding */ + if (result != 
UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof *saveData, + "%zu bytes encoded of %zu expected", + contentLength(buffer), sizeof(*saveData)); + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int encodeRegionHeader(Buffer *buffer, RegionHeader *header) +{ + size_t startingLength = contentLength(buffer); + int result = putUInt64LEIntoBuffer(buffer, REGION_MAGIC); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->regionBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, header->type); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, header->version); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, header->numRegions); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, header->payload); + if (result != UDS_SUCCESS) { + return result; + } + result + = ASSERT_LOG_ONLY(contentLength(buffer) - startingLength == sizeof(*header), + "%zu bytes encoded, of %zu expected", + contentLength(buffer) - startingLength, sizeof(*header)); + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int encodeLayoutRegion(Buffer *buffer, LayoutRegion *region) +{ + size_t startingLength = contentLength(buffer); + int result = putUInt64LEIntoBuffer(buffer, region->startBlock); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, region->numBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, region->checksum); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, region->kind); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, region->instance); + if (result != UDS_SUCCESS) { + return result; + } + result + = ASSERT_LOG_ONLY(contentLength(buffer) - startingLength == sizeof(*region), + "%zu bytes encoded, of %zu expected", + contentLength(buffer) - startingLength, sizeof(*region)); + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int encodeSuperBlockData(Buffer *buffer, SuperBlockData *super) +{ + int result = putBytes(buffer, 32, &super->magicLabel); + if (result != UDS_SUCCESS) { + return result; + } + result = putBytes(buffer, 32, &super->nonceInfo); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, super->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, super->version); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, super->blockSize); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, super->numIndexes); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, super->maxSaves); + if (result != UDS_SUCCESS) { + return result; + } + result = zeroBytes(buffer, 4); // aligment + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, super->openChapterBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = 
putUInt64LEIntoBuffer(buffer, super->pageMapBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(SuperBlockData), + "%zu bytes encoded, of %zu expected", + contentLength(buffer), sizeof(SuperBlockData)); + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int writeSingleFileHeader(IndexLayout *layout, + RegionTable *table, + unsigned int numRegions, + BufferedWriter *writer) +{ + table->header = (RegionHeader) { + .magic = REGION_MAGIC, + .regionBlocks = layout->totalBlocks, + .type = RH_TYPE_SUPER, + .version = 1, + .numRegions = numRegions, + .payload = sizeof(layout->super), + }; + + size_t tableSize = sizeof(RegionTable) + numRegions * sizeof(LayoutRegion); + + Buffer *buffer; + int result = makeBuffer(tableSize, &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodeRegionHeader(buffer, &table->header); + + unsigned int i; + for (i = 0; i < numRegions; i++) { + if (result == UDS_SUCCESS) { + result = encodeLayoutRegion(buffer, &table->regions[i]); + } + } + + if (result == UDS_SUCCESS) { + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + } + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeBuffer(sizeof(layout->super), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodeSuperBlockData(buffer, &layout->super); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + return flushBufferedWriter(writer); +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int saveSingleFileConfiguration(IndexLayout *layout) +{ + int result = saveSubIndexRegions(layout); + if (result != UDS_SUCCESS) { + return result; + } + + RegionTable *table; + unsigned int numRegions; + result = makeSingleFileRegionTable(layout, &numRegions, &table); + if (result != UDS_SUCCESS) { + return result; + } + + BufferedWriter *writer = NULL; + result = openLayoutWriter(layout, &layout->header, &writer); + if (result != UDS_SUCCESS) { + FREE(table); + return result; + } + + result = writeSingleFileHeader(layout, table, numRegions, writer); + FREE(table); + freeBufferedWriter(writer); + + return result; +} + +/*****************************************************************************/ +void putIndexLayout(IndexLayout **layoutPtr) +{ + if (layoutPtr == NULL) { + return; + } + IndexLayout *layout = *layoutPtr; + *layoutPtr = NULL; + if ((layout == NULL) || (--layout->refCount > 0)) { + return; + } + + SubIndexLayout *sil = &layout->index; + if (sil->saves != NULL) { + unsigned int j; + for (j = 0; j < layout->super.maxSaves; ++j) { + IndexSaveLayout *isl = &sil->saves[j]; + FREE(isl->masterIndexZones); + FREE(isl->openChapter); + freeBuffer(&isl->indexStateBuffer); + } + } + FREE(sil->saves); + + if (layout->factory != NULL) { + putIOFactory(layout->factory); + } + FREE(layout); +} + +/*****************************************************************************/ +void getIndexLayout(IndexLayout *layout, IndexLayout **layoutPtr) +{ + ++layout->refCount; + *layoutPtr = layout; +} + +/*****************************************************************************/ 
+const struct index_version *getIndexVersion(IndexLayout *layout) +{ + return &layout->indexVersion; +} + +/*****************************************************************************/ +int writeIndexConfig(IndexLayout *layout, UdsConfiguration config) +{ + BufferedWriter *writer = NULL; + int result = openLayoutWriter(layout, &layout->config, &writer); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "failed to open config region"); + } + + result = writeConfigContents(writer, config); + if (result != UDS_SUCCESS) { + freeBufferedWriter(writer); + return logErrorWithStringError(result, "failed to write config region"); + } + result = flushBufferedWriter(writer); + if (result != UDS_SUCCESS) { + freeBufferedWriter(writer); + return logErrorWithStringError(result, "cannot flush config writer"); + } + freeBufferedWriter(writer); + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int verifyIndexConfig(IndexLayout *layout, UdsConfiguration config) +{ + BufferedReader *reader = NULL; + int result = openLayoutReader(layout, &layout->config, &reader); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "failed to open config reader"); + } + + struct udsConfiguration storedConfig; + result = readConfigContents(reader, &storedConfig); + if (result != UDS_SUCCESS) { + freeBufferedReader(reader); + return logErrorWithStringError(result, "failed to read config region"); + } + freeBufferedReader(reader); + + return (areUdsConfigurationsEqual(&storedConfig, config) + ? UDS_SUCCESS + : UDS_NO_INDEX); +} + +#ifdef __KERNEL__ +/*****************************************************************************/ +int openVolumeBufio(IndexLayout *layout, + size_t blockSize, + unsigned int reservedBuffers, + struct dm_bufio_client **clientPtr) +{ + off_t offset = layout->index.volume.startBlock * layout->super.blockSize; + return makeBufio(layout->factory, offset, blockSize, reservedBuffers, + clientPtr); +} +#else +/*****************************************************************************/ +int openVolumeRegion(IndexLayout *layout, IORegion **regionPtr) +{ + LayoutRegion *lr = &layout->index.volume; + off_t start = lr->startBlock * layout->super.blockSize; + size_t size = lr->numBlocks * layout->super.blockSize; + int result = makeIORegion(layout->factory, start, size, regionPtr); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot access index volume region"); + } + return UDS_SUCCESS; +} +#endif + +/*****************************************************************************/ +uint64_t getVolumeNonce(IndexLayout *layout) +{ + return layout->index.nonce; +} + +/*****************************************************************************/ +static uint64_t generateIndexSaveNonce(uint64_t volumeNonce, + IndexSaveLayout *isl) +{ + struct SaveNonceData { + IndexSaveData data; + uint64_t offset; + } nonceData; + + nonceData.data = isl->saveData; + nonceData.data.nonce = 0; + nonceData.offset = isl->indexSave.startBlock; + + byte buffer[sizeof(nonceData)]; + size_t offset = 0; + encodeUInt64LE(buffer, &offset, nonceData.data.timestamp); + encodeUInt64LE(buffer, &offset, nonceData.data.nonce); + encodeUInt32LE(buffer, &offset, nonceData.data.version); + encodeUInt32LE(buffer, &offset, 0U); // padding + encodeUInt64LE(buffer, &offset, nonceData.offset); + ASSERT_LOG_ONLY(offset == sizeof(nonceData), + "%zu bytes encoded of %zu expected", + offset, sizeof(nonceData)); + return 
generateSecondaryNonce(volumeNonce, buffer, sizeof(buffer)); +} + +/*****************************************************************************/ +static int validateIndexSaveLayout(IndexSaveLayout *isl, + uint64_t volumeNonce, + uint64_t *saveTimePtr) +{ + if (isl->saveType == NO_SAVE || isl->numZones == 0 || + isl->saveData.timestamp == 0) + { + return UDS_BAD_STATE; + } + if (isl->saveData.nonce != generateIndexSaveNonce(volumeNonce, isl)) { + return UDS_BAD_STATE; + } + if (saveTimePtr != NULL) { + *saveTimePtr = isl->saveData.timestamp; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int selectOldestIndexSaveLayout(SubIndexLayout *sil, + unsigned int maxSaves, + IndexSaveLayout **islPtr) +{ + IndexSaveLayout *oldest = NULL; + uint64_t oldestTime = 0; + + // find the oldest valid or first invalid slot + IndexSaveLayout *isl; + for (isl = sil->saves; isl < sil->saves + maxSaves; ++isl) { + uint64_t saveTime = 0; + int result = validateIndexSaveLayout(isl, sil->nonce, &saveTime); + if (result != UDS_SUCCESS) { + saveTime = 0; + } + if (oldest == NULL || saveTime < oldestTime) { + oldest = isl; + oldestTime = saveTime; + } + } + + int result = ASSERT((oldest != NULL), "no oldest or free save slot"); + if (result != UDS_SUCCESS) { + return result; + } + *islPtr = oldest; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int selectLatestIndexSaveLayout(SubIndexLayout *sil, + unsigned int maxSaves, + IndexSaveLayout **islPtr) +{ + IndexSaveLayout *latest = NULL; + uint64_t latestTime = 0; + + // find the latest valid save slot + IndexSaveLayout *isl; + for (isl = sil->saves; isl < sil->saves + maxSaves; ++isl) { + uint64_t saveTime = 0; + int result = validateIndexSaveLayout(isl, sil->nonce, &saveTime); + if (result != UDS_SUCCESS) { + continue; + } + if (saveTime > latestTime) { + latest = isl; + latestTime = saveTime; + } + } + + if (latest == NULL) { + return UDS_INDEX_NOT_SAVED_CLEANLY; + } + *islPtr = latest; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static uint64_t getTimeMS(AbsTime time) +{ + time_t t = asTimeT(time); + RelTime r = timeDifference(time, fromTimeT(t)); + return (uint64_t) t * 1000 + relTimeToMilliseconds(r); +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int instantiateIndexSaveLayout(IndexSaveLayout *isl, + SuperBlockData *super, + uint64_t volumeNonce, + unsigned int numZones, + IndexSaveType saveType) +{ + int result = UDS_SUCCESS; + if (isl->openChapter && saveType == IS_CHECKPOINT) { + FREE(isl->openChapter); + isl->openChapter = NULL; + } else if (isl->openChapter == NULL && saveType == IS_SAVE) { + result = ALLOCATE(1, LayoutRegion, "open chapter layout", + &isl->openChapter); + if (result != UDS_SUCCESS) { + return result; + } + } + if (numZones != isl->numZones) { + if (isl->masterIndexZones != NULL) { + FREE(isl->masterIndexZones); + } + result = ALLOCATE(numZones, LayoutRegion, "master index zone layouts", + &isl->masterIndexZones); + if (result != UDS_SUCCESS) { + return result; + } + isl->numZones = numZones; + } + + populateIndexSaveLayout(isl, super, numZones, saveType); + + result = makeBuffer(INDEX_STATE_BUFFER_SIZE, &isl->indexStateBuffer); + if (result != UDS_SUCCESS) { + return result; + } + + 
isl->read = isl->written = false; + isl->saveType = saveType; + memset(&isl->saveData, 0, sizeof(isl->saveData)); + isl->saveData.timestamp = getTimeMS(currentTime(CLOCK_REALTIME)); + isl->saveData.version = 1; + + isl->saveData.nonce = generateIndexSaveNonce(volumeNonce, isl); + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int invalidateOldSave(IndexLayout *layout, IndexSaveLayout *isl) +{ + uint64_t startBlock = isl->indexSave.startBlock; + uint64_t saveBlocks = isl->indexSave.numBlocks; + unsigned int save = isl->indexSave.instance; + + int result = resetIndexSaveLayout(isl, &startBlock, saveBlocks, + layout->super.pageMapBlocks, save); + if (result != UDS_SUCCESS) { + return result; + } + + return writeIndexSaveLayout(layout, isl); +} + +/*****************************************************************************/ +int setupIndexSaveSlot(IndexLayout *layout, + unsigned int numZones, + IndexSaveType saveType, + unsigned int *saveSlotPtr) +{ + SubIndexLayout *sil = &layout->index; + + IndexSaveLayout *isl = NULL; + int result = selectOldestIndexSaveLayout(sil, layout->super.maxSaves, &isl); + if (result != UDS_SUCCESS) { + return result; + } + + result = invalidateOldSave(layout, isl); + if (result != UDS_SUCCESS) { + return result; + } + + result = instantiateIndexSaveLayout(isl, &layout->super, sil->nonce, + numZones, saveType); + if (result != UDS_SUCCESS) { + return result; + } + + *saveSlotPtr = isl - sil->saves; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int findLatestIndexSaveSlot(IndexLayout *layout, + unsigned int *numZonesPtr, + unsigned int *slotPtr) +{ + SubIndexLayout *sil = &layout->index; + + IndexSaveLayout *isl = NULL; + int result = selectLatestIndexSaveLayout(sil, layout->super.maxSaves, &isl); + if (result != UDS_SUCCESS) { + return result; + } + + if (numZonesPtr != NULL) { + *numZonesPtr = isl->numZones; + } + if (slotPtr != NULL) { + *slotPtr = isl - sil->saves; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int makeIndexSaveRegionTable(IndexSaveLayout *isl, + unsigned int *numRegionsPtr, + RegionTable **tablePtr) +{ + unsigned int numRegions = + 1 + // header + 1 + // index page map + isl->numZones + // master index zones + (bool) isl->openChapter; // open chapter if needed + + if (isl->freeSpace.numBlocks > 0) { + numRegions++; + } + + RegionTable *table; + int result = ALLOCATE_EXTENDED(RegionTable, numRegions, LayoutRegion, + "layout region table for ISL", &table); + if (result != UDS_SUCCESS) { + return result; + } + + LayoutRegion *lr = &table->regions[0]; + *lr++ = isl->header; + *lr++ = isl->indexPageMap; + unsigned int z; + for (z = 0; z < isl->numZones; ++z) { + *lr++ = isl->masterIndexZones[z]; + } + if (isl->openChapter) { + *lr++ = *isl->openChapter; + } + if (isl->freeSpace.numBlocks > 0) { + *lr++ = isl->freeSpace; + } + + result = ASSERT((lr == &table->regions[numRegions]), + "incorrect number of ISL regions"); + if (result != UDS_SUCCESS) { + return result; + } + + *numRegionsPtr = numRegions; + *tablePtr = table; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static unsigned int regionTypeForSaveType(IndexSaveType saveType) +{ + switch (saveType) { + case IS_SAVE: + return RH_TYPE_SAVE; + + case 
IS_CHECKPOINT: + return RH_TYPE_CHECKPOINT; + + default: + break; + } + + return RH_TYPE_UNSAVED; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int writeIndexSaveHeader(IndexSaveLayout *isl, + RegionTable *table, + unsigned int numRegions, + BufferedWriter *writer) +{ + size_t payload = sizeof(isl->saveData); + if (isl->indexStateBuffer != NULL) { + payload += contentLength(isl->indexStateBuffer); + } + + table->header = (RegionHeader) { + .magic = REGION_MAGIC, + .regionBlocks = isl->indexSave.numBlocks, + .type = regionTypeForSaveType(isl->saveType), + .version = 1, + .numRegions = numRegions, + .payload = payload, + }; + + size_t tableSize = sizeof(RegionTable) + numRegions * sizeof(LayoutRegion); + Buffer *buffer; + int result = makeBuffer(tableSize, &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodeRegionHeader(buffer, &table->header); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + + unsigned int i; + for (i = 0; i < numRegions; i++) { + result = encodeLayoutRegion(buffer, &table->regions[i]); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == tableSize, + "%zu bytes encoded of %zu expected", + contentLength(buffer), tableSize); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeBuffer(sizeof(isl->saveData), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodeIndexSaveData(buffer, &isl->saveData); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + + if (isl->indexStateBuffer != NULL) { + result = writeToBufferedWriter(writer, + getBufferContents(isl->indexStateBuffer), + contentLength(isl->indexStateBuffer)); + if (result != UDS_SUCCESS) { + return result; + } + } + + return flushBufferedWriter(writer); +} + +/*****************************************************************************/ +static int writeIndexSaveLayout(IndexLayout *layout, IndexSaveLayout *isl) +{ + unsigned int numRegions; + RegionTable *table; + int result = makeIndexSaveRegionTable(isl, &numRegions, &table); + if (result != UDS_SUCCESS) { + return result; + } + + BufferedWriter *writer = NULL; + result = openLayoutWriter(layout, &isl->header, &writer); + if (result != UDS_SUCCESS) { + FREE(table); + return result; + } + + result = writeIndexSaveHeader(isl, table, numRegions, writer); + FREE(table); + freeBufferedWriter(writer); + + isl->written = true; + return result; +} + +/*****************************************************************************/ +int commitIndexSave(IndexLayout *layout, unsigned int saveSlot) +{ + int result = ASSERT((saveSlot < layout->super.maxSaves), + "save slot out of range"); + if (result != UDS_SUCCESS) { + return result; + } + + IndexSaveLayout *isl = &layout->index.saves[saveSlot]; + + if (bufferUsed(isl->indexStateBuffer) == 0) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "%s: no index state data saved", __func__); + } + + return writeIndexSaveLayout(layout, isl); +} + 
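Taken together, setupIndexSaveSlot(), the per-region writers, and commitIndexSave() define the lifecycle of a single save slot: the oldest slot is claimed and invalidated, the caller serializes its state, and the slot is either committed or cancelled. The sketch below shows only that slot bookkeeping, assuming the functions declared in indexLayout.h; serializeIndexState() is a hypothetical stand-in for the index-state code that fills the slot's buffer and writes the region data through openIndexBufferedWriter().

#include "indexLayout.h"

/* Hypothetical serializer: fills the slot's state buffer with index state. */
int serializeIndexState(Buffer *stateBuffer);

__attribute__((warn_unused_result))
static int saveToLayout(IndexLayout *layout, unsigned int numZones)
{
  unsigned int slot;
  // Claim the oldest save slot; this also invalidates its previous contents
  // on storage and regenerates the slot's timestamp and nonce.
  int result = setupIndexSaveSlot(layout, numZones, IS_SAVE, &slot);
  if (result != UDS_SUCCESS) {
    return result;
  }

  // commitIndexSave() refuses to run while the slot's index state buffer is
  // empty, so the state must be serialized into it first.
  Buffer *stateBuffer = getIndexStateBuffer(layout, slot);
  result = serializeIndexState(stateBuffer);
  if (result != UDS_SUCCESS) {
    int cancelResult = cancelIndexSave(layout, slot);
    return (cancelResult == UDS_SUCCESS) ? result : cancelResult;
  }

  // Writes the region table and IndexSaveData header, making the save
  // discoverable by findLatestIndexSaveSlot() on the next load.
  return commitIndexSave(layout, slot);
}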
+/*****************************************************************************/ + +static void mutilateIndexSaveInfo(IndexSaveLayout *isl) +{ + memset(&isl->saveData, 0, sizeof(isl->saveData)); + isl->read = isl->written = 0; + isl->saveType = NO_SAVE; + isl->numZones = 0; + freeBuffer(&isl->indexStateBuffer); +} + +/*****************************************************************************/ +int cancelIndexSave(IndexLayout *layout, unsigned int saveSlot) +{ + int result = ASSERT((saveSlot < layout->super.maxSaves), + "save slot out of range"); + if (result != UDS_SUCCESS) { + return result; + } + + mutilateIndexSaveInfo(&layout->index.saves[saveSlot]); + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int discardIndexSaves(IndexLayout *layout, bool all) +{ + int result = UDS_SUCCESS; + SubIndexLayout *sil = &layout->index; + + if (all) { + unsigned int i; + for (i = 0; i < layout->super.maxSaves; ++i) { + IndexSaveLayout *isl = &sil->saves[i]; + result = firstError(result, invalidateOldSave(layout, isl)); + } + } else { + IndexSaveLayout *isl; + result = selectLatestIndexSaveLayout(sil, layout->super.maxSaves, &isl); + if (result == UDS_SUCCESS) { + result = invalidateOldSave(layout, isl); + } + } + + return result; +} + +/*****************************************************************************/ +static int createIndexLayout(IndexLayout *layout, + uint64_t size, + const UdsConfiguration config) +{ + if (config == NULL) { + return UDS_CONF_PTR_REQUIRED; + } + + SaveLayoutSizes sizes; + int result = computeSizes(&sizes, config, UDS_BLOCK_SIZE, 0); + if (result != UDS_SUCCESS) { + return result; + } + + if (size < sizes.totalBlocks * sizes.blockSize) { + return logErrorWithStringError(UDS_INSUFFICIENT_INDEX_SPACE, + "layout requires at least %" PRIu64 + " bytes", + sizes.totalBlocks * sizes.blockSize); + } + + result = initSingleFileLayout(layout, layout->offset, size, &sizes); + if (result != UDS_SUCCESS) { + return result; + } + + result = saveSingleFileConfiguration(layout); + if (result != UDS_SUCCESS) { + return result; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +Buffer *getIndexStateBuffer(IndexLayout *layout, unsigned int slot) +{ + return layout->index.saves[slot].indexStateBuffer; +} + +/*****************************************************************************/ +static int findLayoutRegion(IndexLayout *layout, + unsigned int slot, + const char *operation, + RegionKind kind, + unsigned int zone, + LayoutRegion **lrPtr) +{ + int result = ASSERT((slot < layout->super.maxSaves), "%s not started", + operation); + if (result != UDS_SUCCESS) { + return result; + } + + IndexSaveLayout *isl = &layout->index.saves[slot]; + + LayoutRegion *lr = NULL; + switch (kind) { + case RL_KIND_INDEX_PAGE_MAP: + lr = &isl->indexPageMap; + break; + + case RL_KIND_OPEN_CHAPTER: + if (isl->openChapter == NULL) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "%s: %s has no open chapter", + __func__, operation); + } + lr = isl->openChapter; + break; + + case RL_KIND_MASTER_INDEX: + if (isl->masterIndexZones == NULL || zone >= isl->numZones) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "%s: %s has no master index zone %u", + __func__, operation, zone); + } + lr = &isl->masterIndexZones[zone]; + break; + + default: + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "%s: unexpected kind %u", + __func__, kind); + } + + *lrPtr = lr; + return 
UDS_SUCCESS; +} + +/*****************************************************************************/ +int openIndexBufferedReader(IndexLayout *layout, + unsigned int slot, + RegionKind kind, + unsigned int zone, + BufferedReader **readerPtr) +{ + LayoutRegion *lr = NULL; + int result = findLayoutRegion(layout, slot, "load", kind, zone, &lr); + if (result != UDS_SUCCESS) { + return result; + } + return openLayoutReader(layout, lr, readerPtr); +} + +/*****************************************************************************/ +int openIndexBufferedWriter(IndexLayout *layout, + unsigned int slot, + RegionKind kind, + unsigned int zone, + BufferedWriter **writerPtr) +{ + LayoutRegion *lr = NULL; + int result = findLayoutRegion(layout, slot, "save", kind, zone, &lr); + if (result != UDS_SUCCESS) { + return result; + } + return openLayoutWriter(layout, lr, writerPtr); +} + +/*****************************************************************************/ +int makeIndexLayoutFromFactory(IOFactory *factory, + off_t offset, + uint64_t namedSize, + bool newLayout, + const UdsConfiguration config, + IndexLayout **layoutPtr) +{ + // Get the device size and round it down to a multiple of UDS_BLOCK_SIZE. + size_t size = getWritableSize(factory) & -UDS_BLOCK_SIZE; + if (namedSize > size) { + return logErrorWithStringError(UDS_INSUFFICIENT_INDEX_SPACE, + "index storage (%zu) is smaller than the" + " requested size %llu", + size, namedSize); + } + if ((namedSize > 0) && (namedSize < size)) { + size = namedSize; + } + + // Get the index size according the the config + uint64_t configSize; + int result = udsComputeIndexSize(config, 0, &configSize); + if (result != UDS_SUCCESS) { + return result; + } + if (size < configSize) { + return logErrorWithStringError(UDS_INSUFFICIENT_INDEX_SPACE, + "index storage (%zu) is smaller than the" + " required size %llu", + size, configSize); + } + size = configSize; + + IndexLayout *layout = NULL; + result = ALLOCATE(1, IndexLayout, __func__, &layout); + if (result != UDS_SUCCESS) { + return result; + } + layout->refCount = 1; + + getIOFactory(factory); + layout->factory = factory; + layout->offset = offset; + + if (newLayout) { + // Populate the layout from the UDSConfiguration + result = createIndexLayout(layout, size, config); + } else { + // Populate the layout from the saved index. + result = loadIndexLayout(layout); + } + if (result != UDS_SUCCESS) { + putIndexLayout(&layout); + return result; + } + *layoutPtr = layout; + return UDS_SUCCESS; +} diff --git a/uds/indexLayout.h b/uds/indexLayout.h new file mode 100644 index 0000000..4144799 --- /dev/null +++ b/uds/indexLayout.h @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/indexLayout.h#13 $ + */ + +#ifndef INDEX_LAYOUT_H +#define INDEX_LAYOUT_H + +#include "buffer.h" +#include "indexState.h" +#include "indexVersion.h" +#include "ioFactory.h" +#include "uds.h" + +typedef struct indexLayout IndexLayout; + +/** + * Construct an index layout. This is a platform specific function that uses + * the name string, a flag that indicates old vs. new indices, and a + * UDSConfiguration (for new indices) to make an IOFactory and invoke + * makeIndexLayoutFromFactory. + * + * @param name String naming the index. Each platform will use its own + * conventions to interpret the string, but in general it is + * a space-separated sequence of param=value settings. For + * backward compatibility a string without an equals is + * treated as a platform-specific default parameter value. + * @param newLayout Whether this is a new layout. + * @param config The UdsConfiguration required for a new layout. + * @param layoutPtr Where to store the new index layout + * + * @return UDS_SUCCESS or an error code. + **/ +int makeIndexLayout(const char *name, + bool newLayout, + const UdsConfiguration config, + IndexLayout **layoutPtr) + __attribute__((warn_unused_result)); + +/** + * Construct an index layout using an IOFactory. This method is common to all + * platforms. + * + * @param factory The IOFactory for the block storage containing the index. + * @param offset The offset of the start of the index within the block + * storage address space. + * @param namedSize The size in bytes of the space within the block storage + * address space, as specified in the name string. + * @param newLayout Whether this is a new layout. + * @param config The UdsConfiguration required for a new layout. + * @param layoutPtr Where to store the new index layout + * + * @return UDS_SUCCESS or an error code. + **/ +int makeIndexLayoutFromFactory(IOFactory *factory, + off_t offset, + uint64_t namedSize, + bool newLayout, + const UdsConfiguration config, + IndexLayout **layoutPtr) + __attribute__((warn_unused_result)); + +/** + * Decrement the use count of an index layout. If the count goes to zero, free + * the index layout. + * + * @param layoutPtr Where the layout is being stored. Always reset to NULL. + **/ +void putIndexLayout(IndexLayout **layoutPtr); + +/*****************************************************************************/ +int cancelIndexSave(IndexLayout *layout, unsigned int saveSlot) + __attribute__((warn_unused_result)); + +/*****************************************************************************/ +int commitIndexSave(IndexLayout *layout, unsigned int saveSlot) + __attribute__((warn_unused_result)); + +/*****************************************************************************/ +int discardIndexSaves(IndexLayout *layout, bool all) + __attribute__((warn_unused_result)); + +/** + * Find the latest index save slot. + * + * @param [in] layout The single file layout. + * @param [out] numZonesPtr Where to store the actual number of zones + * that were saved. + * @param [out] slotPtr Where to store the slot number we found. + * + * @return UDS_SUCCESS or an error code. + **/ +int findLatestIndexSaveSlot(IndexLayout *layout, + unsigned int *numZonesPtr, + unsigned int *slotPtr) + __attribute__((warn_unused_result)); + +/** + * Get another reference to an index layout, incrementing it's use count. + * + * @param layout The index layout. + * @param layoutPtr Where the new layout pointer is being stored. 
+ **/
+void getIndexLayout(IndexLayout *layout, IndexLayout **layoutPtr);
+
+/**
+ * Open a BufferedReader for a specified state, kind, and zone.
+ *
+ * @param layout     The index layout
+ * @param slot       The save slot
+ * @param kind       The kind of index save region to open.
+ * @param zone       The zone number for the region.
+ * @param readerPtr  Where to store the BufferedReader.
+ *
+ * @return UDS_SUCCESS or an error code.
+ **/
+int openIndexBufferedReader(IndexLayout *layout,
+                            unsigned int slot,
+                            RegionKind kind,
+                            unsigned int zone,
+                            BufferedReader **readerPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Open a BufferedWriter for a specified state, kind, and zone.
+ *
+ * @param layout     The index layout
+ * @param slot       The save slot
+ * @param kind       The kind of index save region to open.
+ * @param zone       The zone number for the region.
+ * @param writerPtr  Where to store the BufferedWriter.
+ *
+ * @return UDS_SUCCESS or an error code.
+ **/
+int openIndexBufferedWriter(IndexLayout *layout,
+                            unsigned int slot,
+                            RegionKind kind,
+                            unsigned int zone,
+                            BufferedWriter **writerPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Obtain the nonce to be used to store or validate the loading of volume index
+ * pages.
+ *
+ * @param [in]  layout   The index layout.
+ *
+ * @return The nonce to use.
+ **/
+uint64_t getVolumeNonce(IndexLayout *layout)
+  __attribute__((warn_unused_result));
+
+#ifdef __KERNEL__
+/**
+ * Obtain a dm_bufio_client for the specified index volume.
+ *
+ * @param [in]  layout           The index layout.
+ * @param [in]  blockSize        The size of a volume page
+ * @param [in]  reservedBuffers  The count of reserved buffers
+ * @param [out] clientPtr        Where to put the new dm_bufio_client
+ *
+ * @return UDS_SUCCESS or an error code.
+ **/
+int openVolumeBufio(IndexLayout *layout,
+                    size_t blockSize,
+                    unsigned int reservedBuffers,
+                    struct dm_bufio_client **clientPtr)
+  __attribute__((warn_unused_result));
+#else
+/**
+ * Obtain an IORegion for the specified index volume.
+ *
+ * @param [in]  layout     The index layout.
+ * @param [out] regionPtr  Where to put the new region.
+ *
+ * @return UDS_SUCCESS or an error code.
+ **/
+int openVolumeRegion(IndexLayout *layout, struct ioRegion **regionPtr)
+  __attribute__((warn_unused_result));
+#endif
+
+/**
+ * Read the index configuration, and verify that it matches the given
+ * configuration.
+ *
+ * @param layout  the generic index layout
+ * @param config  the index configuration
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int verifyIndexConfig(IndexLayout *layout, UdsConfiguration config)
+  __attribute__((warn_unused_result));
+
+/**
+ * Determine which index save slot to use for a new index save.
+ *
+ * Also allocates the masterIndex regions and, if needed, the openChapter
+ * region.
+ *
+ * @param [in]  layout       The index layout.
+ * @param [in]  numZones     Actual number of zones currently in use.
+ * @param [in]  saveType     The index save type.
+ * @param [out] saveSlotPtr  Where to store the save slot number.
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int setupIndexSaveSlot(IndexLayout *layout,
+                       unsigned int numZones,
+                       IndexSaveType saveType,
+                       unsigned int *saveSlotPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Write the index configuration.
+ * + * @param layout the generic index layout + * @param config the index configuration to write + * + * @return UDS_SUCCESS or an error code + **/ +int writeIndexConfig(IndexLayout *layout, UdsConfiguration config) + __attribute__((warn_unused_result)); + +/** + * Get the index state buffer + * + * @param layout the index layout + * @param slot the save slot + * + * @return UDS_SUCCESS or an error code + **/ +Buffer *getIndexStateBuffer(IndexLayout *layout, unsigned int slot) + __attribute__((warn_unused_result)); + +/** + * Get the index version parameters. + * + * @param layout the index layout + * + * @return the index version parameters. + **/ +const struct index_version *getIndexVersion(IndexLayout *layout) + __attribute__((warn_unused_result)); + +#endif // INDEX_LAYOUT_H diff --git a/uds/indexLayoutLinuxKernel.c b/uds/indexLayoutLinuxKernel.c new file mode 100644 index 0000000..8301166 --- /dev/null +++ b/uds/indexLayoutLinuxKernel.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/indexLayoutLinuxKernel.c#5 $ + */ + +#include "indexLayout.h" +#include "indexLayoutParser.h" +#include "memoryAlloc.h" + +/*****************************************************************************/ +int makeIndexLayout(const char *name, + bool newLayout, + const UdsConfiguration config, + IndexLayout **layoutPtr) +{ + char *dev = NULL; + uint64_t offset = 0; + uint64_t size = 0; + + LayoutParameter parameterTable[] = { + { "dev", LP_STRING | LP_DEFAULT, { .str = &dev } }, + { "offset", LP_UINT64, { .num = &offset } }, + { "size", LP_UINT64, { .num = &size } }, + }; + size_t numParameters = sizeof(parameterTable) / sizeof(*parameterTable); + + char *params = NULL; + int result = duplicateString(name, "makeIndexLayout parameters", ¶ms); + if (result != UDS_SUCCESS) { + return result; + } + + // note dev will be set to memory owned by params + result = parseLayoutString(params, parameterTable, numParameters); + if (result != UDS_SUCCESS) { + FREE(params); + return result; + } + + IOFactory *factory = NULL; + result = makeIOFactory(dev, &factory); + FREE(params); + if (result != UDS_SUCCESS) { + return result; + } + IndexLayout *layout; + result = makeIndexLayoutFromFactory(factory, offset, size, newLayout, config, + &layout); + putIOFactory(factory); + if (result != UDS_SUCCESS) { + return result; + } + *layoutPtr = layout; + return UDS_SUCCESS; +} diff --git a/uds/indexLayoutParser.c b/uds/indexLayoutParser.c new file mode 100644 index 0000000..808def7 --- /dev/null +++ b/uds/indexLayoutParser.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/indexLayoutParser.c#2 $
+ */
+
+#include "indexLayoutParser.h"
+
+#include "errors.h"
+#include "logger.h"
+#include "permassert.h"
+#include "stringUtils.h"
+#include "typeDefs.h"
+#include "uds.h"
+
+/*****************************************************************************/
+__attribute__((warn_unused_result))
+static int setParameterValue(LayoutParameter *lp, char *data)
+{
+  if ((lp->type & LP_TYPE_MASK) == LP_UINT64) {
+    int result = parseUint64(data, lp->value.num);
+    if (result != UDS_SUCCESS) {
+      return logErrorWithStringError(UDS_INDEX_NAME_REQUIRED,
+                                     "bad numeric value %s", data);
+    }
+  } else if ((lp->type & LP_TYPE_MASK) == LP_STRING) {
+    *lp->value.str = data;
+  } else {
+    return logErrorWithStringError(UDS_INVALID_ARGUMENT,
+                                   "unknown LayoutParameter type code %x",
+                                   (lp->type & LP_TYPE_MASK));
+  }
+  return UDS_SUCCESS;
+}
+
+/*****************************************************************************/
+int parseLayoutString(char *info, LayoutParameter *params, size_t count)
+{
+  if (!strchr(info, '=')) {
+    LayoutParameter *lp;
+    for (lp = params; lp < params + count; ++lp) {
+      if (lp->type & LP_DEFAULT) {
+        int result = setParameterValue(lp, info);
+        if (result != UDS_SUCCESS) {
+          return result;
+        }
+        break;
+      }
+    }
+  } else {
+    char *data = NULL;
+    char *token;
+    for (token = nextToken(info, " ", &data);
+         token;
+         token = nextToken(NULL, " ", &data))
+    {
+      char *equal = strchr(token, '=');
+      LayoutParameter *lp;
+      for (lp = params; lp < params + count; ++lp) {
+        if (!equal && (lp->type & LP_DEFAULT)) {
+          break;
+        } else if (equal != NULL &&
+                   strncmp(token, lp->name, equal - token) == 0 &&
+                   strlen(lp->name) == (size_t) (equal - token)) {
+          // Only named (key=value) tokens can match a non-default parameter.
+          break;
+        }
+      }
+      // A token that matches no table entry leaves lp just past the end of
+      // the table (it is never NULL), so compare against params + count.
+      if (lp == params + count) {
+        return logErrorWithStringError(UDS_INDEX_NAME_REQUIRED,
+                                       "unknown index parameter %s",
+                                       token);
+      }
+      if (lp->seen) {
+        return logErrorWithStringError(UDS_INDEX_NAME_REQUIRED,
+                                       "duplicate index parameter %s",
+                                       token);
+      }
+      lp->seen = true;
+      int result = setParameterValue(lp, equal ? equal + 1 : token);
+      if (result != UDS_SUCCESS) {
+        return result;
+      }
+    }
+  }
+  return UDS_SUCCESS;
+}
diff --git a/uds/indexLayoutParser.h b/uds/indexLayoutParser.h
new file mode 100644
index 0000000..35b492a
--- /dev/null
+++ b/uds/indexLayoutParser.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexLayoutParser.h#1 $ + */ + +#ifndef INDEX_LAYOUT_PARSER_H +#define INDEX_LAYOUT_PARSER_H + +#include "typeDefs.h" + +typedef enum { + LP_STRING = 0x001, + LP_UINT64 = 0x002, + LP_TYPE_MASK = 0x0FF, + LP_DEFAULT = 0x100, +} LPType; + +typedef struct layoutParameter { + const char *name; + LPType type; + union { + char **str; + uint64_t *num; + } value; + bool seen; +} LayoutParameter; + +/** + * Function to parse an index layout specification. + * + * This parser treats the specification as a set of name=value parameters + * or, in the absence of an '=' character, a single value for a default + * parameter. The list of acceptable parameters is specified as an array + * of LayoutParameter entries. Each such parameter contains the address + * of the variable in which the value is to be stored. + * + * @param info A copy of the index layout specification that + * will be altered by the parser to insert null + * characters after each value. Note that string + * parameter values will point into the memory of + * this string, so this specification cannot be + * deallocated until all uses of the parameter + * values are over. + * @param params The table of parameters the caller expects to + * find in the ``info'' string. Currently this + * parser can handle string and uint64_t values. + * @param count The size of the parameter table. + * + * @return UDS_SUCCESS or an error code, particularly + * UDS_INDEX_NAME_REQUIRED for all parsing errors. + **/ +int parseLayoutString(char *info, LayoutParameter *params, size_t count) + __attribute__((warn_unused_result)); + +#endif // INDEX_LAYOUT_PARSER_H diff --git a/uds/indexPageMap.c b/uds/indexPageMap.c new file mode 100644 index 0000000..a915179 --- /dev/null +++ b/uds/indexPageMap.c @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/indexPageMap.c#4 $ + */ + +#include "indexPageMap.h" + +#include "buffer.h" +#include "bufferedWriter.h" +#include "compiler.h" +#include "errors.h" +#include "hashUtils.h" +#include "indexComponent.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "stringUtils.h" +#include "threads.h" +#include "uds.h" + +static int readIndexPageMap(ReadPortal *portal); +static int writeIndexPageMap(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone); + +static const byte INDEX_PAGE_MAP_MAGIC[] = "ALBIPM02"; +enum { + INDEX_PAGE_MAP_MAGIC_LENGTH = sizeof(INDEX_PAGE_MAP_MAGIC) - 1, +}; + +const IndexComponentInfo INDEX_PAGE_MAP_INFO = { + .kind = RL_KIND_INDEX_PAGE_MAP, + .name = "index page map", + .saveOnly = false, + .chapterSync = true, + .multiZone = false, + .ioStorage = true, + .loader = readIndexPageMap, + .saver = writeIndexPageMap, + .incremental = NULL, +}; + +/*****************************************************************************/ +static INLINE size_t numEntries(const Geometry *geometry) +{ + return geometry->chaptersPerVolume * (geometry->indexPagesPerChapter - 1); +} + +/*****************************************************************************/ +int makeIndexPageMap(const Geometry *geometry, IndexPageMap **mapPtr) +{ + unsigned int deltaListsPerChapter = geometry->deltaListsPerChapter; + int result + = ASSERT_WITH_ERROR_CODE(((deltaListsPerChapter - 1) <= UINT16_MAX), + UDS_BAD_STATE, + "delta lists per chapter (%u) is too large", + deltaListsPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + + IndexPageMap *map; + result = ALLOCATE(1, IndexPageMap, "Index Page Map", &map); + if (result != UDS_SUCCESS) { + return result; + } + + map->geometry = geometry; + + result = ALLOCATE(numEntries(geometry), + IndexPageMapEntry, + "Index Page Map Entries", + &map->entries); + if (result != UDS_SUCCESS) { + freeIndexPageMap(map); + return result; + } + + *mapPtr = map; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +void freeIndexPageMap(IndexPageMap *map) +{ + if (map != NULL) { + FREE(map->entries); + FREE(map); + } +} + +/*****************************************************************************/ +uint64_t getLastUpdate(const IndexPageMap *map) +{ + return map->lastUpdate; +} + +/*****************************************************************************/ +int updateIndexPageMap(IndexPageMap *map, + uint64_t virtualChapterNumber, + unsigned int chapterNumber, + unsigned int indexPageNumber, + unsigned int deltaListNumber) +{ + const Geometry *geometry = map->geometry; + if ((virtualChapterNumber < map->lastUpdate) + || (virtualChapterNumber > map->lastUpdate + 1)) { + // if the lastUpdate is 0, this is likely to be normal because we are + // replaying the volume + if (map->lastUpdate != 0) { + logWarning("unexpected index page map update, jumping from %" PRIu64 + " to %llu", + map->lastUpdate, virtualChapterNumber); + } + } + map->lastUpdate = virtualChapterNumber; + + if (chapterNumber >= geometry->chaptersPerVolume) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "chapter number %u exceeds maximum %u", + chapterNumber, geometry->chaptersPerVolume - 1); + } + if (indexPageNumber >= geometry->indexPagesPerChapter) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "index page number %u exceeds maximum %u", + indexPageNumber, geometry->indexPagesPerChapter - 1); + } + if (deltaListNumber 
>= geometry->deltaListsPerChapter) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "delta list number %u exceeds maximum %u", + deltaListNumber, geometry->deltaListsPerChapter - 1); + } + + if (indexPageNumber == (geometry->indexPagesPerChapter - 1)) { + /* + * There is no entry for the last index page of a chapter since its entry + * would always be geometry->deltaListsPerChapter - 1. + */ + return UDS_SUCCESS; + } + + size_t slot + = (chapterNumber * (geometry->indexPagesPerChapter - 1)) + indexPageNumber; + map->entries[slot] = (IndexPageMapEntry) deltaListNumber; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int findIndexPageNumber(const IndexPageMap *map, + const UdsChunkName *name, + unsigned int chapterNumber, + unsigned int *indexPageNumberPtr) +{ + const Geometry *geometry = map->geometry; + if (chapterNumber >= geometry->chaptersPerVolume) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "chapter number %u exceeds maximum %u", + chapterNumber, geometry->chaptersPerVolume - 1); + } + + unsigned int deltaListNumber = hashToChapterDeltaList(name, geometry); + unsigned int slot = (chapterNumber * (geometry->indexPagesPerChapter - 1)); + unsigned int limit = slot + (geometry->indexPagesPerChapter - 1); + unsigned int indexPageNumber = 0; + for (; slot < limit; indexPageNumber++, slot++) { + if (deltaListNumber <= map->entries[slot]) { + break; + } + } + + // This should be a clear post-condition of the loop above, but just in case + // it's not obvious, the check is cheap. + int result = ASSERT((indexPageNumber < geometry->indexPagesPerChapter), + "index page number too large"); + if (result != UDS_SUCCESS) { + return result; + } + + *indexPageNumberPtr = indexPageNumber; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getListNumberBounds(const IndexPageMap *map, + unsigned int chapterNumber, + unsigned int indexPageNumber, + IndexPageBounds *bounds) +{ + const Geometry *geometry = map->geometry; + int result = ASSERT((chapterNumber < geometry->chaptersPerVolume), + "chapter number is valid"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT((indexPageNumber < geometry->indexPagesPerChapter), + "index page number is valid"); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int slot = chapterNumber * (geometry->indexPagesPerChapter - 1); + bounds->lowestList = ((indexPageNumber == 0) + ? 0 + : map->entries[slot + indexPageNumber - 1] + 1); + bounds->highestList = ((indexPageNumber == geometry->indexPagesPerChapter - 1) + ? 
geometry->deltaListsPerChapter - 1 + : map->entries[slot + indexPageNumber]); + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +size_t indexPageMapSize(const Geometry *geometry) +{ + return sizeof(IndexPageMapEntry) * numEntries(geometry); +} + +/*****************************************************************************/ +static int writeIndexPageMap(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone) +{ + int result = ASSERT((zone == 0), "unimplemented zone %d", zone); + if (result != UDS_SUCCESS) { + return result; + } + + IndexPageMap *map = indexComponentData(component); + + Buffer *buffer; + result = makeBuffer(INDEX_PAGE_MAP_MAGIC_LENGTH + sizeof(map->lastUpdate), + &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = putBytes(buffer, INDEX_PAGE_MAP_MAGIC_LENGTH, INDEX_PAGE_MAP_MAGIC); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = putUInt64LEIntoBuffer(buffer, map->lastUpdate); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot write index page map header"); + } + result = makeBuffer(indexPageMapSize(map->geometry), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result + = putUInt16LEsIntoBuffer(buffer, numEntries(map->geometry), map->entries); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot write index page map data"); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +uint64_t computeIndexPageMapSaveSize(const Geometry *geometry) +{ + return indexPageMapSize(geometry) + + INDEX_PAGE_MAP_MAGIC_LENGTH + sizeof(((IndexPageMap *) 0)->lastUpdate); +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeIndexPageMap(Buffer *buffer, IndexPageMap *map) +{ + int result = getUInt64LEFromBuffer(buffer, &map->lastUpdate); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEsFromBuffer(buffer, numEntries(map->geometry), + map->entries); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer) - contentLength(buffer), + bufferLength(buffer)); + return result; +} + +/*****************************************************************************/ +static int readIndexPageMap(ReadPortal *portal) +{ + IndexPageMap *map = indexComponentData(portal->component); + + BufferedReader *reader = NULL; + + int result = getBufferedReaderForPortal(portal, 0, &reader); + if (result != UDS_SUCCESS) { + return result; + } + + result = verifyBufferedData(reader, INDEX_PAGE_MAP_MAGIC, + INDEX_PAGE_MAP_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "bad index page map saved magic"); + } + + Buffer *buffer; + result + = makeBuffer(sizeof(map->lastUpdate) + indexPageMapSize(map->geometry), + &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, 
getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + logErrorWithStringError(result, "cannot read index page map data"); + return result; + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = decodeIndexPageMap(buffer, map); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + logDebug("read index page map, last update %llu", map->lastUpdate); + return UDS_SUCCESS; +} diff --git a/uds/indexPageMap.h b/uds/indexPageMap.h new file mode 100644 index 0000000..3767cdd --- /dev/null +++ b/uds/indexPageMap.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexPageMap.h#2 $ + */ + +#ifndef INDEX_PAGE_MAP_H +#define INDEX_PAGE_MAP_H 1 + +#include "common.h" +#include "geometry.h" +#include "indexComponent.h" + +extern const IndexComponentInfo INDEX_PAGE_MAP_INFO; + +typedef struct indexPageMap IndexPageMap; + +typedef struct { + unsigned int lowestList; + unsigned int highestList; +} IndexPageBounds; + +/* + * Notes on IndexPageMap + * + * Each volume maintains an index page map which records how the chapter delta + * lists are distributed among the index pages for that chapter. + * + * The map is conceptually a two-dimensional array indexed by chapter number + * and index page number within the chapter. Each entry contains the number + * of the last delta list on that index page. In order to save memory, the + * information for the last page in each chapter is not recorded, as it is + * known from the geometry. + */ + +typedef uint16_t IndexPageMapEntry; + +struct indexPageMap { + const Geometry *geometry; + uint64_t lastUpdate; + IndexPageMapEntry *entries; +}; + +/** + * Create an index page map. + * + * @param geometry The geometry governing the index. + * @param mapPtr A pointer to hold the new map. + * + * @return A success or error code. + **/ +int makeIndexPageMap(const Geometry *geometry, IndexPageMap **mapPtr) + __attribute__((warn_unused_result)); + +/** + * Free an index page map. + * + * @param map The index page map to destroy. + **/ +void freeIndexPageMap(IndexPageMap *map); + +/** + * Get the virtual chapter number of the last update to the index page map. + * + * @param map The index page map + * + * @return the virtual chapter number of the last chapter updated + **/ +uint64_t getLastUpdate(const IndexPageMap *map); + +/** + * Update an index page map entry. + * + * @param map The map to update + * @param virtualChapterNumber The virtual chapter number being updated. 
+ * @param chapterNumber The chapter of the entry to update + * @param indexPageNumber The index page of the entry to update + * @param deltaListNumber The value of the new entry + * + * @return UDS_SUCCESS or an error code + **/ +int updateIndexPageMap(IndexPageMap *map, + uint64_t virtualChapterNumber, + unsigned int chapterNumber, + unsigned int indexPageNumber, + unsigned int deltaListNumber) + __attribute__((warn_unused_result)); + +/** + * Find the page number of the index page in a chapter that will contain the + * chapter index entry for a given chunk name, if it exists. + * + * @param [in] map The map to search + * @param [in] name The chunk name + * @param [in] chapterNumber The chapter containing the index page + * @param [out] indexPageNumberPtr A pointer to hold the result, guaranteed to + * be a valid index page number on UDS_SUCCESS + * + * @return UDS_SUCCESS, or UDS_INVALID_ARGUMENT if the chapter number + * is out of range + **/ +int findIndexPageNumber(const IndexPageMap *map, + const UdsChunkName *name, + unsigned int chapterNumber, + unsigned int *indexPageNumberPtr) + __attribute__((warn_unused_result)); + +/** + * Get the lowest and highest numbered delta lists for the given immutable + * chapter index page from the index page map. + * + * @param map The index page map + * @param chapterNumber The chapter containing the delta list + * @param indexPageNumber The index page number within the chapter + * @param bounds A structure to hold the list number bounds + * for the given page + * + * @return UDS_SUCCESS or an error code + **/ +int getListNumberBounds(const IndexPageMap *map, + unsigned int chapterNumber, + unsigned int indexPageNumber, + IndexPageBounds *bounds) + __attribute__((warn_unused_result)); + +/** + * Compute the size of the index page map save image, including all headers. + * + * @param geometry The index geometry. + * + * @return The number of bytes required to save the index page map. + **/ +uint64_t computeIndexPageMapSaveSize(const Geometry *geometry); + +/** + * Escaped for testing.... + * + * @param geometry The index geometry. + * + * @return The number of bytes required for the page map data, + * exclusive of headers. + **/ +size_t indexPageMapSize(const Geometry *geometry) + __attribute__((warn_unused_result)); + +#endif // INDEX_PAGE_MAP_H diff --git a/uds/indexRouter.c b/uds/indexRouter.c new file mode 100644 index 0000000..b9b0a9e --- /dev/null +++ b/uds/indexRouter.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/indexRouter.c#7 $ + */ + +#include "indexRouter.h" + +#include "compiler.h" +#include "indexCheckpoint.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "requestQueue.h" +#include "zone.h" + +/** + * This is the request processing function invoked by the zone's RequestQueue + * worker thread. + * + * @param request the request to be indexed or executed by the zone worker + **/ +static void executeZoneRequest(Request *request) +{ + executeIndexRouterRequest(request->router, request); +} + +/** + * Construct and enqueue asynchronous control messages to add the chapter + * index for a given virtual chapter to the sparse chapter index cache. + * + * @param router the router containing the relevant queues + * @param index the index with the relevant cache and chapter + * @param virtualChapter the virtual chapter number of the chapter to cache + **/ +static void enqueueBarrierMessages(IndexRouter *router, + Index *index, + uint64_t virtualChapter) +{ + ZoneMessage barrier = { + .index = index, + .data = { + .barrier = { + .virtualChapter = virtualChapter, + } + } + }; + unsigned int zone; + for (zone = 0; zone < router->zoneCount; zone++) { + int result = launchZoneControlMessage(REQUEST_SPARSE_CACHE_BARRIER, + barrier, zone, router); + ASSERT_LOG_ONLY((result == UDS_SUCCESS), "barrier message allocation"); + } +} + +/** + * This is the request processing function for the triage stage queue. Each + * request is resolved in the master index, determining if it is a hook or + * not, and if a hook, what virtual chapter (if any) it might be found in. If + * a virtual chapter is found, this enqueues a sparse chapter cache barrier in + * every zone before enqueueing the request in its zone. + * + * @param request the request to triage + **/ +static void triageRequest(Request *request) +{ + IndexRouter *router = request->router; + Index *index = router->index; + + // Check if the name is a hook in the index pointing at a sparse chapter. + uint64_t sparseVirtualChapter = triageIndexRequest(index, request); + if (sparseVirtualChapter != UINT64_MAX) { + // Generate and place a barrier request on every zone queue. + enqueueBarrierMessages(router, index, sparseVirtualChapter); + } + + enqueueRequest(request, STAGE_INDEX); +} + +/** + * Initialize the zone queues and the triage queue. + * + * @param router the router containing the queues + * @param geometry the geometry governing the indexes + * + * @return UDS_SUCCESS or error code + **/ +static int initializeLocalIndexQueues(IndexRouter *router, + const Geometry *geometry) +{ + unsigned int i; + for (i = 0; i < router->zoneCount; i++) { + int result = makeRequestQueue("indexW", &executeZoneRequest, + &router->zoneQueues[i]); + if (result != UDS_SUCCESS) { + return result; + } + } + + // The triage queue is only needed for sparse multi-zone indexes. 
+ if ((router->zoneCount > 1) && isSparse(geometry)) { + int result = makeRequestQueue("triageW", &triageRequest, + &router->triageQueue); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +static INLINE RequestQueue *getZoneQueue(IndexRouter *router, + unsigned int zoneNumber) +{ + return router->zoneQueues[zoneNumber]; +} + +/**********************************************************************/ +int makeIndexRouter(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + LoadType loadType, + IndexLoadContext *loadContext, + IndexRouterCallback callback, + IndexRouter **routerPtr) +{ + unsigned int zoneCount = getZoneCount(userParams); + IndexRouter *router; + int result = ALLOCATE_EXTENDED(IndexRouter, zoneCount, RequestQueue *, + "index router", &router); + if (result != UDS_SUCCESS) { + return result; + } + + router->callback = callback; + router->zoneCount = zoneCount; + + result = initializeLocalIndexQueues(router, config->geometry); + if (result != UDS_SUCCESS) { + freeIndexRouter(router); + return result; + } + + result = makeIndex(layout, config, userParams, router->zoneCount, loadType, + loadContext, &router->index); + if (result != UDS_SUCCESS) { + freeIndexRouter(router); + return logErrorWithStringError(result, "failed to create index"); + } + + router->needToSave = (router->index->loadedType != LOAD_LOAD); + *routerPtr = router; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int saveIndexRouter(IndexRouter *router) +{ + if (!router->needToSave) { + return UDS_SUCCESS; + } + int result = saveIndex(router->index); + router->needToSave = (result != UDS_SUCCESS); + return result; +} + +/**********************************************************************/ +void freeIndexRouter(IndexRouter *router) +{ + if (router == NULL) { + return; + } + requestQueueFinish(router->triageQueue); + unsigned int i; + for (i = 0; i < router->zoneCount; i++) { + requestQueueFinish(router->zoneQueues[i]); + } + freeIndex(router->index); + FREE(router); +} + +/**********************************************************************/ +RequestQueue *selectIndexRouterQueue(IndexRouter *router, + Request *request, + RequestStage nextStage) +{ + if (request->isControlMessage) { + return getZoneQueue(router, request->zoneNumber); + } + + if (nextStage == STAGE_TRIAGE) { + // The triage queue is only needed for multi-zone sparse indexes and won't + // be allocated by the router if not needed, so simply check for NULL. + if (router->triageQueue != NULL) { + return router->triageQueue; + } + // Dense index or single zone, so route it directly to the zone queue. 
+ } else if (nextStage != STAGE_INDEX) { + ASSERT_LOG_ONLY(false, "invalid index stage: %d", nextStage); + return NULL; + } + + Index *index = router->index; + request->zoneNumber = getMasterIndexZone(index->masterIndex, + &request->chunkName); + return getZoneQueue(router, request->zoneNumber); +} + +/**********************************************************************/ +void executeIndexRouterRequest(IndexRouter *router, Request *request) +{ + if (request->isControlMessage) { + int result = dispatchIndexZoneControlRequest(request); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "error executing control message: %d", + request->action); + } + request->status = result; + enterCallbackStage(request); + return; + } + + router->needToSave = true; + if (request->requeued && !isSuccessful(request->status)) { + request->status = makeUnrecoverable(request->status); + router->callback(request); + return; + } + + Index *index = router->index; + int result = dispatchIndexRequest(index, request); + if (result == UDS_QUEUED) { + // Take the request off the pipeline. + return; + } + + request->status = result; + router->callback(request); +} diff --git a/uds/indexRouter.h b/uds/indexRouter.h new file mode 100644 index 0000000..a96262b --- /dev/null +++ b/uds/indexRouter.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexRouter.h#3 $ + */ + +#ifndef INDEX_ROUTER_H +#define INDEX_ROUTER_H + +#include "compiler.h" +#include "index.h" +#include "indexSession.h" +#include "request.h" + +/** + * Callback after a query, update or remove request completes and fills in + * select fields in the request: status for all requests, oldMetadata and + * hashExists for query and update requests. + * + * @param request request object. + **/ +typedef void (*IndexRouterCallback)(Request *request); + +struct indexRouter { + IndexRouterCallback callback; + unsigned int zoneCount; + bool needToSave; + Index *index; + RequestQueue *triageQueue; + RequestQueue *zoneQueues[]; +}; + +/** + * Construct and initialize an IndexRouter instance. + * + * @param layout the IndexLayout that describes the stored index + * @param config the configuration to use + * @param userParams the index session parameters. If NULL, the default + * session parameters will be used. 
+ * @param loadType selects whether to create, load, or rebuild the index + * @param loadContext the index load context to use + * @param callback the function to invoke when a request completes or fails + * @param routerPtr a pointer in which to store the new router + * + * @return UDS_SUCCESS or an error code + **/ +int makeIndexRouter(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + LoadType loadType, + IndexLoadContext *loadContext, + IndexRouterCallback callback, + IndexRouter **routerPtr) + __attribute__((warn_unused_result)); + +/** + * Executes the index operation for a UDS request and calls the callback upon + * completion. + * + * @param router The index router. + * @param request A pointer to the Request to process. + **/ +void executeIndexRouterRequest(IndexRouter *router, Request *request); + +/** + * Save the index router state to persistent storage. + * + * It is the responsibility of the caller to ensure that there are no other + * uses of the index during a call to this method. It is necessary that there + * be no index requests from any block context nor any other attempt to save + * the index until after a call to saveIndexRouter returns. + * + * @param router the index router to save + * + * @return UDS_SUCCESS if successful. + **/ +int saveIndexRouter(IndexRouter *router) __attribute__((warn_unused_result)); + +/** + * Destroy the index router and free its memory. + * + * @param router the index router to destroy (may be NULL) + * + * @return UDS_SUCCESS if successful. + **/ +void freeIndexRouter(IndexRouter *router); + +/** + * Select and return the request queue responsible for executing the next + * index stage of a request, updating the request with any associated state + * (such as the zone number for UDS requests on a local index). + * + * @param router The index router. + * @param request The Request destined for the queue. + * @param nextStage The next request stage (STAGE_TRIAGE or STAGE_INDEX). + * + * @return the next index stage queue (the local triage queue, local zone + * queue, or remote RPC send queue) + **/ +RequestQueue *selectIndexRouterQueue(IndexRouter *router, + Request *request, + RequestStage nextStage); + +/** + * Wait for the index router to finish all operations that access a local + * storage device. + * + * @param router The index router. + **/ +static INLINE void waitForIdleIndexRouter(IndexRouter *router) +{ + waitForIdleChapterWriter(router->index->chapterWriter); +} + +#endif /* INDEX_ROUTER_H */ diff --git a/uds/indexSession.c b/uds/indexSession.c new file mode 100644 index 0000000..15e5b3f --- /dev/null +++ b/uds/indexSession.c @@ -0,0 +1,554 @@ +/* + * %Copyright% + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/indexSession.c#10 $ + */ + +#include "indexSession.h" + +#include "indexCheckpoint.h" +#include "indexRouter.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "requestQueue.h" + +/**********************************************************************/ +static void collectStats(const struct uds_index_session *indexSession, + UdsContextStats *stats) +{ + const SessionStats *sessionStats = &indexSession->stats; + + stats->currentTime = asTimeT(currentTime(CLOCK_REALTIME)); + + stats->postsFound = READ_ONCE(sessionStats->postsFound); + stats->inMemoryPostsFound = READ_ONCE(sessionStats->postsFoundOpenChapter); + stats->densePostsFound = READ_ONCE(sessionStats->postsFoundDense); + stats->sparsePostsFound = READ_ONCE(sessionStats->postsFoundSparse); + stats->postsNotFound = READ_ONCE(sessionStats->postsNotFound); + stats->updatesFound = READ_ONCE(sessionStats->updatesFound); + stats->updatesNotFound = READ_ONCE(sessionStats->updatesNotFound); + stats->deletionsFound = READ_ONCE(sessionStats->deletionsFound); + stats->deletionsNotFound = READ_ONCE(sessionStats->deletionsNotFound); + stats->queriesFound = READ_ONCE(sessionStats->queriesFound); + stats->queriesNotFound = READ_ONCE(sessionStats->queriesNotFound); + stats->requests = READ_ONCE(sessionStats->requests); +} + +/**********************************************************************/ +static void handleCallbacks(Request *request) +{ + if (request->status == UDS_SUCCESS) { + // Measure the turnaround time of this request and include that time, + // along with the rest of the request, in the context's StatCounters. + updateRequestContextStats(request); + } + + if (request->callback != NULL) { + // The request has specified its own callback and does not expect to be + // freed. + struct uds_index_session *indexSession = request->session; + request->found = (request->location != LOC_UNAVAILABLE); + request->callback((UdsRequest *) request); + // We do this release after the callback because of the contract of the + // udsFlushIndexSession method. + releaseIndexSession(indexSession); + return; + } + + // Should not get here, because this is either a control message or it has a + // callback method. 
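+  // Nothing is waiting on this request, so just reclaim its memory.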
+ freeRequest(request); +} + +/**********************************************************************/ +int checkIndexSession(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + unsigned int state = indexSession->state; + unlockMutex(&indexSession->requestMutex); + + if (state == IS_FLAG_LOADED) { + return UDS_SUCCESS; + } else if (state & IS_FLAG_DISABLED) { + return UDS_DISABLED; + } else if ((state & IS_FLAG_LOADING) + || (state & IS_FLAG_SUSPENDED) + || (state & IS_FLAG_WAITING)) { + return UDS_SUSPENDED; + } + + return UDS_NO_INDEXSESSION; +} + +/**********************************************************************/ +int getIndexSession(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + indexSession->requestCount++; + unlockMutex(&indexSession->requestMutex); + + int result = checkIndexSession(indexSession); + if (result != UDS_SUCCESS) { + releaseIndexSession(indexSession); + return result; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +void releaseIndexSession(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + if (--indexSession->requestCount == 0) { + broadcastCond(&indexSession->requestCond); + } + unlockMutex(&indexSession->requestMutex); +} + +/**********************************************************************/ +int startLoadingIndexSession(struct uds_index_session *indexSession) +{ + int result; + lockMutex(&indexSession->requestMutex); + if (indexSession->state & IS_FLAG_SUSPENDED) { + result = UDS_SUSPENDED; + } else if (indexSession->state != 0) { + result = UDS_INDEXSESSION_IN_USE; + } else { + indexSession->state |= IS_FLAG_LOADING; + result = UDS_SUCCESS; + } + unlockMutex(&indexSession->requestMutex); + return result; +} + +/**********************************************************************/ +void finishLoadingIndexSession(struct uds_index_session *indexSession, + int result) +{ + lockMutex(&indexSession->requestMutex); + indexSession->state &= ~IS_FLAG_LOADING; + if (result == UDS_SUCCESS) { + indexSession->state |= IS_FLAG_LOADED; + } + broadcastCond(&indexSession->requestCond); + unlockMutex(&indexSession->requestMutex); +} + +/**********************************************************************/ +void disableIndexSession(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + indexSession->state |= IS_FLAG_DISABLED; + unlockMutex(&indexSession->requestMutex); +} + +/**********************************************************************/ +int makeEmptyIndexSession(struct uds_index_session **indexSessionPtr) +{ + struct uds_index_session *session; + int result = ALLOCATE(1, struct uds_index_session, __func__, &session); + if (result != UDS_SUCCESS) { + return result; + } + + result = initMutex(&session->requestMutex); + if (result != UDS_SUCCESS) { + FREE(session); + return result; + } + + result = initCond(&session->requestCond); + if (result != UDS_SUCCESS) { + destroyMutex(&session->requestMutex); + FREE(session); + return result; + } + + result = initMutex(&session->loadContext.mutex); + if (result != UDS_SUCCESS) { + destroyCond(&session->requestCond); + destroyMutex(&session->requestMutex); + FREE(session); + return result; + } + + result = initCond(&session->loadContext.cond); + if (result != UDS_SUCCESS) { + destroyMutex(&session->loadContext.mutex); + destroyCond(&session->requestCond); + destroyMutex(&session->requestMutex); + FREE(session); + return 
result; + } + + result = makeRequestQueue("callbackW", &handleCallbacks, + &session->callbackQueue); + if (result != UDS_SUCCESS) { + destroyCond(&session->loadContext.cond); + destroyMutex(&session->loadContext.mutex); + destroyCond(&session->requestCond); + destroyMutex(&session->requestMutex); + FREE(session); + return result; + } + + *indexSessionPtr = session; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int udsSuspendIndexSession(struct uds_index_session *session, bool save) +{ + int result; + bool saveIndex = false; + bool suspendIndex = false; + lockMutex(&session->requestMutex); + // Wait for any pending close operation to complete. + while (session->state & IS_FLAG_CLOSING) { + waitCond(&session->requestCond, &session->requestMutex); + } + if ((session->state & IS_FLAG_WAITING) + || (session->state & IS_FLAG_DESTROYING)) { + result = EBUSY; + } else if (session->state & IS_FLAG_SUSPENDED) { + result = UDS_SUCCESS; + } else if (session->state & IS_FLAG_LOADING) { + session->state |= IS_FLAG_WAITING; + suspendIndex = true; + result = UDS_SUCCESS; + } else if (!(session->state & IS_FLAG_LOADED)) { + session->state |= IS_FLAG_SUSPENDED; + broadcastCond(&session->requestCond); + result = UDS_SUCCESS; + } else { + saveIndex = save; + if (saveIndex) { + session->state |= IS_FLAG_WAITING; + } else { + session->state |= IS_FLAG_SUSPENDED; + broadcastCond(&session->requestCond); + } + result = UDS_SUCCESS; + } + unlockMutex(&session->requestMutex); + + if (!saveIndex && !suspendIndex) { + return result; + } + + if (saveIndex) { + result = udsSaveIndex(session); + lockMutex(&session->requestMutex); + session->state &= ~IS_FLAG_WAITING; + session->state |= IS_FLAG_SUSPENDED; + broadcastCond(&session->requestCond); + unlockMutex(&session->requestMutex); + return result; + } + + lockMutex(&session->loadContext.mutex); + switch (session->loadContext.status) { + case INDEX_OPENING: + session->loadContext.status = INDEX_SUSPENDING; + + // Wait until the index indicates that it is not replaying. + while ((session->loadContext.status != INDEX_SUSPENDED) + && (session->loadContext.status != INDEX_READY)) { + waitCond(&session->loadContext.cond, + &session->loadContext.mutex); + } + break; + + case INDEX_READY: + // Index load does not need to be suspended. + break; + + case INDEX_SUSPENDED: + case INDEX_SUSPENDING: + case INDEX_FREEING: + default: + // These cases should not happen. 
+ ASSERT_LOG_ONLY(false, "Bad load context state %u", + session->loadContext.status); + break; + } + unlockMutex(&session->loadContext.mutex); + + lockMutex(&session->requestMutex); + session->state &= ~IS_FLAG_WAITING; + session->state |= IS_FLAG_SUSPENDED; + broadcastCond(&session->requestCond); + unlockMutex(&session->requestMutex); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int udsResumeIndexSession(struct uds_index_session *session) +{ + lockMutex(&session->requestMutex); + if (session->state & IS_FLAG_WAITING) { + unlockMutex(&session->requestMutex); + return EBUSY; + } + + /* If not suspended, just succeed */ + if (!(session->state & IS_FLAG_SUSPENDED)) { + unlockMutex(&session->requestMutex); + return UDS_SUCCESS; + } + + if (!(session->state & IS_FLAG_LOADING)) { + session->state &= ~IS_FLAG_SUSPENDED; + unlockMutex(&session->requestMutex); + return UDS_SUCCESS; + } + + session->state |= IS_FLAG_WAITING; + unlockMutex(&session->requestMutex); + + lockMutex(&session->loadContext.mutex); + switch (session->loadContext.status) { + case INDEX_SUSPENDED: + session->loadContext.status = INDEX_OPENING; + // Notify the index to start replaying again. + broadcastCond(&session->loadContext.cond); + break; + + case INDEX_READY: + // There is no index rebuild to resume. + break; + + case INDEX_OPENING: + case INDEX_SUSPENDING: + case INDEX_FREEING: + default: + // These cases should not happen; do nothing. + ASSERT_LOG_ONLY(false, "Bad load context state %u", + session->loadContext.status); + break; + } + unlockMutex(&session->loadContext.mutex); + + lockMutex(&session->requestMutex); + session->state &= ~IS_FLAG_WAITING; + session->state &= ~IS_FLAG_SUSPENDED; + broadcastCond(&session->requestCond); + unlockMutex(&session->requestMutex); + return UDS_SUCCESS; +} + +/**********************************************************************/ +static void waitForNoRequestsInProgress(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + while (indexSession->requestCount > 0) { + waitCond(&indexSession->requestCond, &indexSession->requestMutex); + } + unlockMutex(&indexSession->requestMutex); +} + +/**********************************************************************/ +int saveAndFreeIndex(struct uds_index_session *indexSession) +{ + int result = UDS_SUCCESS; + IndexRouter *router = indexSession->router; + if (router != NULL) { + lockMutex(&indexSession->requestMutex); + bool suspended = (indexSession->state & IS_FLAG_SUSPENDED); + unlockMutex(&indexSession->requestMutex); + if (!suspended) { + result = saveIndexRouter(router); + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "ignoring error from saveIndexRouter"); + } + } + freeIndexRouter(router); + indexSession->router = NULL; + + // Reset all index state that happens to be in the index session, so it + // doesn't affect any future index. + lockMutex(&indexSession->loadContext.mutex); + indexSession->loadContext.status = INDEX_OPENING; + unlockMutex(&indexSession->loadContext.mutex); + + lockMutex(&indexSession->requestMutex); + // Only the suspend bit will remain relevant. 
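+    // The masking assignment clears every flag except IS_FLAG_SUSPENDED.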
+ indexSession->state &= IS_FLAG_SUSPENDED; + unlockMutex(&indexSession->requestMutex); + } + + logDebug("Closed index"); + return result; +} + +/**********************************************************************/ +int udsCloseIndex(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + + // Wait for any pending suspend, resume or close operations to complete. + while ((indexSession->state & IS_FLAG_WAITING) + || (indexSession->state & IS_FLAG_CLOSING)) { + waitCond(&indexSession->requestCond, &indexSession->requestMutex); + } + + int result = UDS_SUCCESS; + if (indexSession->state & IS_FLAG_SUSPENDED) { + result = UDS_SUSPENDED; + } else if ((indexSession->state & IS_FLAG_DESTROYING) + || !(indexSession->state & IS_FLAG_LOADED)) { + // The index doesn't exist, hasn't finished loading, or is being destroyed. + result = UDS_NO_INDEXSESSION; + } else { + indexSession->state |= IS_FLAG_CLOSING; + } + unlockMutex(&indexSession->requestMutex); + if (result != UDS_SUCCESS) { + return result; + } + + logDebug("Closing index"); + waitForNoRequestsInProgress(indexSession); + result = saveAndFreeIndex(indexSession); + + lockMutex(&indexSession->requestMutex); + indexSession->state &= ~IS_FLAG_CLOSING; + broadcastCond(&indexSession->requestCond); + unlockMutex(&indexSession->requestMutex); + return result; +} + +/**********************************************************************/ +int udsDestroyIndexSession(struct uds_index_session *indexSession) +{ + logDebug("Destroying index session"); + + bool loadPending = false; + lockMutex(&indexSession->requestMutex); + + // Wait for any pending suspend, resume, or close operations to complete. + while ((indexSession->state & IS_FLAG_WAITING) + || (indexSession->state & IS_FLAG_CLOSING)) { + waitCond(&indexSession->requestCond, &indexSession->requestMutex); + } + + if (indexSession->state & IS_FLAG_DESTROYING) { + unlockMutex(&indexSession->requestMutex); + return EBUSY; + } + + indexSession->state |= IS_FLAG_DESTROYING; + loadPending = ((indexSession->state & IS_FLAG_LOADING) + && (indexSession->state & IS_FLAG_SUSPENDED)); + unlockMutex(&indexSession->requestMutex); + + if (loadPending) { + // Tell the index to terminate the rebuild. + lockMutex(&indexSession->loadContext.mutex); + if (indexSession->loadContext.status == INDEX_SUSPENDED) { + indexSession->loadContext.status = INDEX_FREEING; + broadcastCond(&indexSession->loadContext.cond); + } + unlockMutex(&indexSession->loadContext.mutex); + + // Wait until the load exits before proceeding. 
+ lockMutex(&indexSession->requestMutex); + while (indexSession->state & IS_FLAG_LOADING) { + waitCond(&indexSession->requestCond, &indexSession->requestMutex); + } + unlockMutex(&indexSession->requestMutex); + } + + waitForNoRequestsInProgress(indexSession); + int result = saveAndFreeIndex(indexSession); + requestQueueFinish(indexSession->callbackQueue); + indexSession->callbackQueue = NULL; + destroyCond(&indexSession->loadContext.cond); + destroyMutex(&indexSession->loadContext.mutex); + destroyCond(&indexSession->requestCond); + destroyMutex(&indexSession->requestMutex); + logDebug("Destroyed index session"); + FREE(indexSession); + return result; +} + +/**********************************************************************/ +int udsFlushIndexSession(struct uds_index_session *indexSession) +{ + waitForNoRequestsInProgress(indexSession); + // Wait until any open chapter writes are complete + waitForIdleIndexRouter(indexSession->router); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int udsSaveIndex(struct uds_index_session *indexSession) +{ + waitForNoRequestsInProgress(indexSession); + // saveIndexRouter waits for open chapter writes to complete + return saveIndexRouter(indexSession->router); +} + +/**********************************************************************/ +int udsSetCheckpointFrequency(struct uds_index_session *indexSession, + unsigned int frequency) +{ + setIndexCheckpointFrequency(indexSession->router->index->checkpoint, + frequency); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int udsGetIndexConfiguration(struct uds_index_session *indexSession, + UdsConfiguration *conf) +{ + if (conf == NULL) { + return logErrorWithStringError(UDS_CONF_PTR_REQUIRED, + "received a NULL config pointer"); + } + int result = ALLOCATE(1, struct udsConfiguration, __func__, conf); + if (result == UDS_SUCCESS) { + **conf = indexSession->userConfig; + } + return result; +} + +/**********************************************************************/ +int udsGetIndexStats(struct uds_index_session *indexSession, + UdsIndexStats *stats) +{ + if (stats == NULL) { + return logErrorWithStringError(UDS_INDEX_STATS_PTR_REQUIRED, + "received a NULL index stats pointer"); + } + getIndexStats(indexSession->router->index, stats); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int udsGetIndexSessionStats(struct uds_index_session *indexSession, + UdsContextStats *stats) +{ + if (stats == NULL) { + return logWarningWithStringError(UDS_CONTEXT_STATS_PTR_REQUIRED, + "received a NULL context stats pointer"); + } + collectStats(indexSession, stats); + return UDS_SUCCESS; +} diff --git a/uds/indexSession.h b/uds/indexSession.h new file mode 100644 index 0000000..1467fd2 --- /dev/null +++ b/uds/indexSession.h @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/indexSession.h#6 $
+ */
+
+#ifndef INDEX_SESSION_H
+#define INDEX_SESSION_H
+
+#include "atomicDefs.h"
+#include "config.h"
+#include "cpu.h"
+#include "opaqueTypes.h"
+#include "threads.h"
+#include "uds.h"
+
+/**
+ * The bit position of flags used to indicate index session states.
+ **/
+typedef enum {
+  IS_FLAG_BIT_START = 8,
+  /** Flag indicating that the session is loading */
+  IS_FLAG_BIT_LOADING = IS_FLAG_BIT_START,
+  /** Flag indicating that the session has been loaded */
+  IS_FLAG_BIT_LOADED,
+  /** Flag indicating that the session is disabled permanently */
+  IS_FLAG_BIT_DISABLED,
+  /** Flag indicating that the session is suspended */
+  IS_FLAG_BIT_SUSPENDED,
+  /** Flag indicating that the session is waiting for an index state change */
+  IS_FLAG_BIT_WAITING,
+  /** Flag indicating that the session is closing */
+  IS_FLAG_BIT_CLOSING,
+  /** Flag indicating that the session is being destroyed */
+  IS_FLAG_BIT_DESTROYING,
+} IndexSessionFlagBit;
+
+/**
+ * The index session state flags.
+ **/
+typedef enum {
+  IS_FLAG_LOADED = (1 << IS_FLAG_BIT_LOADED),
+  IS_FLAG_LOADING = (1 << IS_FLAG_BIT_LOADING),
+  IS_FLAG_DISABLED = (1 << IS_FLAG_BIT_DISABLED),
+  IS_FLAG_SUSPENDED = (1 << IS_FLAG_BIT_SUSPENDED),
+  IS_FLAG_WAITING = (1 << IS_FLAG_BIT_WAITING),
+  IS_FLAG_CLOSING = (1 << IS_FLAG_BIT_CLOSING),
+  IS_FLAG_DESTROYING = (1 << IS_FLAG_BIT_DESTROYING),
+} IndexSessionFlag;
+
+typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) sessionStats {
+  uint64_t postsFound;            /* Post calls that found an entry */
+  uint64_t postsFoundOpenChapter; /* Post calls found in the open chapter */
+  uint64_t postsFoundDense;       /* Post calls found in the dense index */
+  uint64_t postsFoundSparse;      /* Post calls found in the sparse index */
+  uint64_t postsNotFound;         /* Post calls that did not find an entry */
+  uint64_t updatesFound;          /* Update calls that found an entry */
+  uint64_t updatesNotFound;       /* Update calls that did not find an entry */
+  uint64_t deletionsFound;        /* Delete calls that found an entry */
+  uint64_t deletionsNotFound;     /* Delete calls that did not find an entry */
+  uint64_t queriesFound;          /* Query calls that found an entry */
+  uint64_t queriesNotFound;       /* Query calls that did not find an entry */
+  uint64_t requests;              /* Total number of requests */
+} SessionStats;
+
+/**
+ * States used in the index load context, reflecting the state of the index.
+ **/
+typedef enum {
+  /** The index has not been loaded or rebuilt completely */
+  INDEX_OPENING = 0,
+  /** The index is able to handle requests */
+  INDEX_READY,
+  /** The index has a pending request to suspend */
+  INDEX_SUSPENDING,
+  /** The index is suspended in the midst of a rebuild */
+  INDEX_SUSPENDED,
+  /** The index is being shut down while suspended */
+  INDEX_FREEING,
+} IndexSuspendStatus;
+
+/**
+ * The CondVar here must be notified when the status changes to
+ * INDEX_SUSPENDED, in order to wake up the waiting udsSuspendIndexSession()
+ * call. It must also be notified when the status changes away from
+ * INDEX_SUSPENDED, to resume rebuilding the index from checkForSuspend() in
+ * the index.
+ **/
+typedef struct indexLoadContext {
+  Mutex mutex;
+  CondVar cond;
+  IndexSuspendStatus status;  // Covered by indexLoadContext.mutex.
+} IndexLoadContext; + +/** + * The request CondVar here must be notified when IS_FLAG_WAITING is cleared, + * in case udsCloseIndex() or udsDestroyIndexSession() is waiting on that flag. + * It must also be notified when IS_FLAG_CLOSING is cleared, in case + * udsSuspendIndexSession(), udsCloseIndex() or udsDestroyIndexSession() is + * waiting on that flag. + * Finally, it must also be notified when IS_FLAG_LOADING is cleared, to inform + * udsDestroyIndexSession() that the index session can be safely freed. + **/ +struct uds_index_session { + unsigned int state; // Covered by requestMutex. + IndexRouter *router; + RequestQueue *callbackQueue; + struct udsConfiguration userConfig; + IndexLoadContext loadContext; + // Asynchronous Request synchronization + Mutex requestMutex; + CondVar requestCond; + int requestCount; + // Request statistics, all owned by the callback thread + SessionStats stats; +}; + +/** + * Check that the index session is usable. + * + * @param indexSession the session to query + * + * @return UDS_SUCCESS or an error code + **/ +int checkIndexSession(struct uds_index_session *indexSession) + __attribute__((warn_unused_result)); + +/** + * Make sure that the IndexSession is allowed to load an index, and if so, set + * its state to indicate that the load has started. + * + * @param indexSession the session to load with + * + * @return UDS_SUCCESS, or an error code if an index already exists. + **/ +int startLoadingIndexSession(struct uds_index_session *indexSession) + __attribute__((warn_unused_result)); + +/** + * Update the IndexSession state after attempting to load an index, to indicate + * that the load has completed, and whether or not it succeeded. + * + * @param indexSession the session that was loading + * @param result the result of the load operation + **/ +void finishLoadingIndexSession(struct uds_index_session *indexSession, + int result); + +/** + * Disable an index session due to an error. + * + * @param indexSession the session to be disabled + **/ +void disableIndexSession(struct uds_index_session *indexSession); + +/** + * Acquire the index session for an asynchronous index request. + * + * The pointer must eventually be released with a corresponding call to + * releaseIndexSession(). + * + * @param indexSession The index session + * + * @return UDS_SUCCESS or an error code + **/ +int getIndexSession(struct uds_index_session *indexSession) + __attribute__((warn_unused_result)); + +/** + * Release a pointer to an index session. + * + * @param indexSession The session to release + **/ +void releaseIndexSession(struct uds_index_session *indexSession); + +/** + * Construct a new, empty index session. + * + * @param indexSessionPtr The pointer to receive the new session + * + * @return UDS_SUCCESS or an error code + **/ +int makeEmptyIndexSession(struct uds_index_session **indexSessionPtr) + __attribute__((warn_unused_result)); + +/** + * Save an index while the session is quiescent. + * + * During the call to #udsSaveIndex, there should be no other call to + * #udsSaveIndex and there should be no calls to #udsStartChunkOperation. + * + * @param indexSession The session to save + * + * @return Either #UDS_SUCCESS or an error code + **/ +int udsSaveIndex(struct uds_index_session *indexSession) + __attribute__((warn_unused_result)); + +/** + * Close the index by saving the underlying index. 
+ * + * @param indexSession The index session to be shut down and freed + **/ +int saveAndFreeIndex(struct uds_index_session *indexSession); + +/** + * Set the checkpoint frequency of the grid. + * + * @param session The index session to be modified. + * @param frequency New checkpoint frequency. + * + * @return Either UDS_SUCCESS or an error code. + * + **/ +int udsSetCheckpointFrequency(struct uds_index_session *session, + unsigned int frequency) + __attribute__((warn_unused_result)); + +#endif /* INDEX_SESSION_H */ diff --git a/uds/indexState.c b/uds/indexState.c new file mode 100644 index 0000000..86b9fd3 --- /dev/null +++ b/uds/indexState.c @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexState.c#6 $ + */ + +#include "indexState.h" + +#include "errors.h" +#include "indexComponent.h" +#include "indexLayout.h" +#include "logger.h" +#include "memoryAlloc.h" + + +/*****************************************************************************/ +int makeIndexState(IndexLayout *layout, + unsigned int numZones, + unsigned int maxComponents, + IndexState **statePtr) +{ + if (maxComponents == 0) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "cannot make index state with maxComponents 0"); + } + + IndexState *state = NULL; + int result = ALLOCATE_EXTENDED(IndexState, maxComponents, IndexComponent *, + "index state", &state); + if (result != UDS_SUCCESS) { + return result; + } + + state->count = 0; + state->layout = layout; + state->length = maxComponents; + state->loadZones = 0; + state->loadSlot = UINT_MAX; + state->saveSlot = UINT_MAX; + state->saving = false; + state->zoneCount = numZones; + + *statePtr = state; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +void freeIndexState(IndexState **statePtr) +{ + IndexState *state = *statePtr; + *statePtr = NULL; + if (state != NULL) { + unsigned int i; + for (i = 0; i < state->count; ++i) { + freeIndexComponent(&state->entries[i]); + } + FREE(state); + } +} + +/*****************************************************************************/ +/** + * Add a component to the index state. + * + * @param state The index state. + * @param component The index component. + * + * @return UDS_SUCCESS or an error code. 
+ **/ +static int addComponentToIndexState(IndexState *state, + IndexComponent *component) +{ + if (findIndexComponent(state, component->info) != NULL) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "cannot add state component %s: already present", + component->info->name); + } + + if (state->count >= state->length) { + return logErrorWithStringError( + UDS_RESOURCE_LIMIT_EXCEEDED, + "cannot add state component %s, %u components already added", + component->info->name, state->count); + } + + state->entries[state->count] = component; + ++state->count; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int addIndexStateComponent(IndexState *state, + const IndexComponentInfo *info, + void *data, + void *context) +{ + IndexComponent *component = NULL; + int result = makeIndexComponent(state, info, state->zoneCount, data, context, + &component); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot make region index component"); + } + + result = addComponentToIndexState(state, component); + if (result != UDS_SUCCESS) { + freeIndexComponent(&component); + return result; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +IndexComponent *findIndexComponent(const IndexState *state, + const IndexComponentInfo *info) +{ + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (info == component->info) { + return component; + } + } + return NULL; +} + +/*****************************************************************************/ +static const char *indexSaveTypeName(IndexSaveType saveType) +{ + return saveType == IS_SAVE ? "save" : "checkpoint"; +} + +/*****************************************************************************/ +int loadIndexState(IndexState *state, bool *replayPtr) +{ + int result = findLatestIndexSaveSlot(state->layout, &state->loadZones, + &state->loadSlot); + if (result != UDS_SUCCESS) { + return result; + } + + bool replayRequired = false; + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + result = readIndexComponent(component); + if (result != UDS_SUCCESS) { + if (!missingIndexComponentRequiresReplay(component)) { + state->loadZones = 0; + state->loadSlot = UINT_MAX; + return logErrorWithStringError(result, "index component %s", + indexComponentName(component)); + } + replayRequired = true; + } + } + + state->loadZones = 0; + state->loadSlot = UINT_MAX; + if (replayPtr != NULL) { + *replayPtr = replayRequired; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int prepareToSaveIndexState(IndexState *state, IndexSaveType saveType) +{ + if (state->saving) { + return logErrorWithStringError(UDS_BAD_STATE, + "already saving the index state"); + } + int result = setupIndexSaveSlot(state->layout, state->zoneCount, saveType, + &state->saveSlot); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot prepare index %s", + indexSaveTypeName(saveType)); + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +/** + * Complete the saving of an index state. 
+ * + * @param state the index state + * + * @return UDS_SUCCESS or an error code + **/ +static int completeIndexSaving(IndexState *state) +{ + state->saving = false; + int result = commitIndexSave(state->layout, state->saveSlot); + state->saveSlot = UINT_MAX; + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot commit index state"); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static int cleanupSave(IndexState *state) +{ + int result = cancelIndexSave(state->layout, state->saveSlot); + state->saveSlot = UINT_MAX; + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot cancel index save"); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int saveIndexState(IndexState *state) +{ + int result = prepareToSaveIndexState(state, IS_SAVE); + if (result != UDS_SUCCESS) { + return result; + } + + +unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + result = writeIndexComponent(component); + if (result != UDS_SUCCESS) { + cleanupSave(state); + return result; + } + } + return completeIndexSaving(state); +} + +/*****************************************************************************/ +int writeIndexStateCheckpoint(IndexState *state) +{ + int result = prepareToSaveIndexState(state, IS_CHECKPOINT); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component)) { + continue; + } + result = writeIndexComponent(component); + if (result != UDS_SUCCESS) { + cleanupSave(state); + return result; + } + } + + return completeIndexSaving(state); +} + +/*****************************************************************************/ +int startIndexStateCheckpoint(IndexState *state) +{ + int result = prepareToSaveIndexState(state, IS_CHECKPOINT); + if (result != UDS_SUCCESS) { + return result; + } + + state->saving = true; + + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component)) { + continue; + } + result = startIndexComponentIncrementalSave(component); + if (result != UDS_SUCCESS) { + abortIndexStateCheckpoint(state); + return result; + } + } + + return result; +} + +/*****************************************************************************/ +int performIndexStateCheckpointChapterSynchronizedSaves(IndexState *state) +{ + if (!state->saving) { + return UDS_SUCCESS; + } + + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component) || + !deferIndexComponentCheckpointToChapterWriter(component)) { + continue; + } + int result = performIndexComponentChapterWriterSave(component); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/** + * Wrapper function to do a zone-based checkpoint operation. 
+ * + * @param [in] state the index state + * @param [in] zone the zone number + * @param [in] compFunc the index component function to use + * @param [out] completed if non-NULL, where to save the completion status + * + * @return UDS_SUCCESS or an error code + * + **/ +static int doIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + int (*compFunc)(IndexComponent *, + unsigned int, + CompletionStatus *), + CompletionStatus *completed) +{ + if (!state->saving) { + if (completed != NULL) { + *completed = CS_COMPLETED_PREVIOUSLY; + } + return UDS_SUCCESS; + } + + CompletionStatus status = CS_COMPLETED_PREVIOUSLY; + + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component)) { + continue; + } + if (zone > 0 && !component->info->multiZone) { + continue; + } + CompletionStatus componentStatus = CS_NOT_COMPLETED; + int result = (*compFunc)(component, zone, &componentStatus); + if (result != UDS_SUCCESS) { + return result; + } + // compute rolling least status + if (componentStatus < status) { + status = componentStatus; + } + } + + if (completed != NULL) { + *completed = status; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int performIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed) +{ + return doIndexStateCheckpointInZone(state, zone, + &performIndexComponentZoneSave, + completed); +} + +/*****************************************************************************/ +int finishIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed) +{ + return doIndexStateCheckpointInZone(state, zone, + &finishIndexComponentZoneSave, + completed); +} + +/*****************************************************************************/ +int abortIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed) +{ + return doIndexStateCheckpointInZone(state, zone, + &abortIndexComponentZoneSave, completed); +} + +/*****************************************************************************/ +int finishIndexStateCheckpoint(IndexState *state) +{ + if (!state->saving) { + return UDS_SUCCESS; + } + + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component)) { + continue; + } + int result = finishIndexComponentIncrementalSave(component); + if (result != UDS_SUCCESS) { + abortIndexStateCheckpoint(state); + return result; + } + } + + int result = completeIndexSaving(state); + if (result != UDS_SUCCESS) { + return result; + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int abortIndexStateCheckpoint(IndexState *state) +{ + if (!state->saving) { + return logErrorWithStringError(UDS_BAD_STATE, + "not saving the index state"); + } + + logError("aborting index state checkpoint"); + + int result = UDS_SUCCESS; + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component)) { + continue; + } + int tmp = abortIndexComponentIncrementalSave(component); + if (result == UDS_SUCCESS) { + result = tmp; + } + } + + cleanupSave(state); + state->saving = false; + + return result; +} + +/*****************************************************************************/ +int 
discardIndexStateData(IndexState *state) +{ + int result = discardIndexSaves(state->layout, true); + state->saveSlot = UINT_MAX; + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "%s: cannot destroy all index saves", + __func__); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int discardLastIndexStateSave(IndexState *state) +{ + int result = discardIndexSaves(state->layout, false); + state->saveSlot = UINT_MAX; + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "%s: cannot destroy latest index save", + __func__); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +Buffer *getStateIndexStateBuffer(IndexState *state, IOAccessMode mode) +{ + unsigned int slot = mode == IO_READ ? state->loadSlot : state->saveSlot; + return getIndexStateBuffer(state->layout, slot); +} + +/*****************************************************************************/ +int openStateBufferedReader(IndexState *state, + RegionKind kind, + unsigned int zone, + BufferedReader **readerPtr) +{ + return openIndexBufferedReader(state->layout, state->loadSlot, kind, zone, + readerPtr); +} + +/*****************************************************************************/ +int openStateBufferedWriter(IndexState *state, + RegionKind kind, + unsigned int zone, + BufferedWriter **writerPtr) +{ + return openIndexBufferedWriter(state->layout, state->saveSlot, kind, zone, + writerPtr); +} diff --git a/uds/indexState.h b/uds/indexState.h new file mode 100644 index 0000000..82899c1 --- /dev/null +++ b/uds/indexState.h @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexState.h#5 $ + */ + +#ifndef INDEX_STATE_H +#define INDEX_STATE_H 1 + +#include "buffer.h" +#include "indexComponent.h" + + +/** + * Used here and in SingleFileLayout. + **/ +typedef enum { + IS_SAVE, + IS_CHECKPOINT, + NO_SAVE = 9999, +} IndexSaveType; + +/* + * Used in getStateIndexStateBuffer to identify whether the index state buffer + * is for the index being loaded or the index being saved. + */ +typedef enum { + IO_READ = 0x1, + IO_WRITE = 0x2, +} IOAccessMode; + +/** + * The index state structure controls the loading and saving of the index + * state. 
+ **/ +typedef struct indexState { + struct indexLayout *layout; + unsigned int zoneCount; // number of index zones to use + unsigned int loadZones; + unsigned int loadSlot; + unsigned int saveSlot; + unsigned int count; // count of registered entries (<= length) + unsigned int length; // total span of array allocation + bool saving; // incremental save in progress + IndexComponent *entries[]; // array of index component entries +} IndexState; + +/** + * Make an index state object, + * + * @param [in] layout The index layout. + * @param [in] numZones The number of zones to use. + * @param [in] maxComponents The maximum number of components to be handled. + * @param [out] statePtr Where to store the index state object. + * + * @return UDS_SUCCESS or an error code + **/ +int makeIndexState(struct indexLayout *layout, + unsigned int numZones, + unsigned int maxComponents, + IndexState **statePtr) + __attribute__((warn_unused_result)); + +/** + * Free an index state (generically). + * + * @param statePtr The pointer to the index state to be freed and + * set to NULL. + **/ +void freeIndexState(IndexState **statePtr); + +/** + * Add an index component to an index state. + * + * @param state The index directory in which to add this component. + * @param info The index component file specification. + * @param data The per-component data structure. + * @param context The load/save context of the component. + * + * @return UDS_SUCCESS or an error code. + **/ +int addIndexStateComponent(IndexState *state, + const IndexComponentInfo *info, + void *data, + void *context) + __attribute__((warn_unused_result)); + +/** + * Load index state + * + * @param state The index state. + * @param replayPtr If set, the place to hold whether a replay is required. + * + * @return UDS_SUCCESS or error + **/ +int loadIndexState(IndexState *state, bool *replayPtr) + __attribute__((warn_unused_result)); + +/** + * Save the current index state, including the open chapter. + * + * @param state The index state. + * + * @return UDS_SUCCESS or error + **/ +int saveIndexState(IndexState *state) __attribute__((warn_unused_result)); + +/** + * Prepare to save the index state. + * + * @param state the index state + * @param saveType whether a checkpoint or save + * + * @return UDS_SUCCESS or an error code + **/ +int prepareToSaveIndexState(IndexState *state, IndexSaveType saveType) + __attribute__((warn_unused_result)); + +/** + * Write index checkpoint non-incrementally (for testing). + * + * @param state The index state. + * + * @return UDS_SUCCESS or error + **/ +int writeIndexStateCheckpoint(IndexState *state) + __attribute__((warn_unused_result)); + +/** + * Sets up an index state checkpoint which will proceed incrementally. + * May create the directory but does not actually write any data. + * + * @param state The index state. + * + * @return UDS_SUCCESS or an error code. + **/ +int startIndexStateCheckpoint(IndexState *state) + __attribute__((warn_unused_result)); + +/** + * Perform operations on index state checkpoints that are synchronized to + * the chapter writer thread. + * + * @param state The index state. + * + * @return UDS_SUCCESS or an error code. + **/ +int performIndexStateCheckpointChapterSynchronizedSaves(IndexState *state) + __attribute__((warn_unused_result)); + +/** + * Performs zone-specific (and, for zone 0, general) incremental checkpointing. + * + * @param [in] state The index state. + * @param [in] zone The zone number. 
+ * @param [out] completed Set to whether the checkpoint has completed + * for this zone. + * + * @return UDS_SUCCESS or an error code. + **/ +int performIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed) + __attribute__((warn_unused_result)); + +/** + * Force the completion of an incremental index state checkpoint + * for a particular zone. + * + * @param [in] state The index state. + * @param [in] zone The zone number. + * @param [out] completed Set to whether the checkpoint has completed + * for this zone. + * + * @return UDS_SUCCESS or an error code. + **/ +int finishIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed) + __attribute__((warn_unused_result)); + +/** + * Force the completion of an incremental index state checkpoint once + * all zones are completed. + * + * @param [in] state The index state. + * + * @return UDS_SUCCESS or an error code. + **/ +int finishIndexStateCheckpoint(IndexState *state) + __attribute__((warn_unused_result)); + +/** + * Aborts an index state checkpoint which is proceeding incrementally + * for a particular zone. + * + * @param [in] state The index state. + * @param [in] zone The zone number. + * @param [out] completed Set to whether the checkpoint has completed or + * aborted for this zone. + * + * @return UDS_SUCCESS or an error code. + **/ +int abortIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed); + +/** + * Aborts an index state checkpoint which is proceeding incrementally, + * once all the zones are aborted. + * + * @param [in] state The index state. + * + * @return UDS_SUCCESS or an error code. + **/ +int abortIndexStateCheckpoint(IndexState *state); + +/** + * Remove or disable the index state data, for testing. + * + * @param state The index state + * + * @return UDS_SUCCESS or an error code + * + * @note the return value of this function is frequently ignored + **/ +int discardIndexStateData(IndexState *state); + +/** + * Discard the last index state save, for testing. + * + * @param state The index state + * + * @return UDS_SUCCESS or an error code + * + * @note the return value of this function is frequently ignored + **/ +int discardLastIndexStateSave(IndexState *state); + +/** + * Find index component, for testing. + * + * @param state The index state + * @param info The index component file specification + * + * @return The index component, or NULL if not found + **/ +IndexComponent *findIndexComponent(const IndexState *state, + const IndexComponentInfo *info) + __attribute__((warn_unused_result)); + +/** + * Get the indexStateBuffer for a specified mode. + * + * @param state The index state. + * @param mode One of IO_READ or IO_WRITE. + * + * @return the index state buffer + **/ +Buffer *getStateIndexStateBuffer(IndexState *state, IOAccessMode mode) + __attribute__((warn_unused_result)); + +/** + * Open a BufferedReader for a specified state, kind, and zone. + * This helper function is used by IndexComponent. + * + * @param state The index state. + * @param kind The kind if index save region to open. + * @param zone The zone number for the region. + * @param readerPtr Where to store the BufferedReader. + * + * @return UDS_SUCCESS or an error code. + **/ +int openStateBufferedReader(IndexState *state, + RegionKind kind, + unsigned int zone, + BufferedReader **readerPtr) + __attribute__((warn_unused_result)); + +/** + * Open a BufferedWriter for a specified state, kind, and zone. 
+ * This helper function is used by IndexComponent. + * + * @param state The index state. + * @param kind The kind if index save region to open. + * @param zone The zone number for the region. + * @param writerPtr Where to store the BufferedWriter. + * + * @return UDS_SUCCESS or an error code. + **/ +int openStateBufferedWriter(IndexState *state, + RegionKind kind, + unsigned int zone, + BufferedWriter **writerPtr) + __attribute__((warn_unused_result)); + +#endif // INDEX_STATE_H diff --git a/uds/indexStateData.c b/uds/indexStateData.c new file mode 100644 index 0000000..62038f0 --- /dev/null +++ b/uds/indexStateData.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexStateData.c#3 $ + */ + +#include "indexStateData.h" + +#include "buffer.h" +#include "errors.h" +#include "index.h" +#include "logger.h" +#include "uds.h" + +/* The index state version header */ +typedef struct { + int32_t signature; + int32_t versionID; +} IndexStateVersion; + +/* The version 301 index state */ +typedef struct { + uint64_t newestChapter; + uint64_t oldestChapter; + uint64_t lastCheckpoint; + uint32_t unused; + uint32_t padding; +} IndexStateData301; + +static const IndexStateVersion INDEX_STATE_VERSION_301 = { + .signature = -1, + .versionID = 301, +}; + +/** + * The index state index component reader. 
+ * + * @param portal the ReadPortal that handles the read of the component + * + * @return UDS_SUCCESS or an error code + **/ +static int readIndexStateData(ReadPortal *portal) +{ + Buffer *buffer = getStateIndexStateBuffer(portal->component->state, IO_READ); + int result = rewindBuffer(buffer, uncompactedAmount(buffer)); + if (result != UDS_SUCCESS) { + return result; + } + + IndexStateVersion fileVersion; + result = getInt32LEFromBuffer(buffer, &fileVersion.signature); + if (result != UDS_SUCCESS) { + return result; + } + result = getInt32LEFromBuffer(buffer, &fileVersion.versionID); + if (result != UDS_SUCCESS) { + return result; + } + + if (fileVersion.signature != -1 || fileVersion.versionID != 301) { + return logErrorWithStringError(UDS_UNSUPPORTED_VERSION, + "Index state version %d,%d is unsupported", + fileVersion.signature, + fileVersion.versionID); + } + + IndexStateData301 state; + result = getUInt64LEFromBuffer(buffer, &state.newestChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &state.oldestChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &state.lastCheckpoint); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &state.unused); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &state.padding); + if (result != UDS_SUCCESS) { + return result; + } + + if ((state.unused != 0) || (state.padding != 0)) { + return UDS_CORRUPT_COMPONENT; + } + + Index *index = indexComponentData(portal->component); + index->newestVirtualChapter = state.newestChapter; + index->oldestVirtualChapter = state.oldestChapter; + index->lastCheckpoint = state.lastCheckpoint; + return UDS_SUCCESS; +} + +/** + * The index state index component writer. + * + * @param component The component whose state is to be saved (an Index) + * @param writer The buffered writer. + * @param zone The zone to write. 
+ * + * @return UDS_SUCCESS or an error code + **/ +static int writeIndexStateData(IndexComponent *component, + BufferedWriter *writer __attribute__((unused)), + unsigned int zone __attribute__((unused))) +{ + Buffer *buffer = getStateIndexStateBuffer(component->state, IO_WRITE); + int result = resetBufferEnd(buffer, 0); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, INDEX_STATE_VERSION_301.signature); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, INDEX_STATE_VERSION_301.versionID); + if (result != UDS_SUCCESS) { + return result; + } + + Index *index = indexComponentData(component); + IndexStateData301 state = { + .newestChapter = index->newestVirtualChapter, + .oldestChapter = index->oldestVirtualChapter, + .lastCheckpoint = index->lastCheckpoint, + }; + + result = putUInt64LEIntoBuffer(buffer, state.newestChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, state.oldestChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, state.lastCheckpoint); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, state.unused); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, state.padding); + if (result != UDS_SUCCESS) { + return result; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ + +const IndexComponentInfo INDEX_STATE_INFO = { + .kind = RL_KIND_INDEX_STATE, + .name = "index state", + .saveOnly = false, + .chapterSync = true, + .multiZone = false, + .ioStorage = false, + .loader = readIndexStateData, + .saver = writeIndexStateData, + .incremental = NULL, +}; diff --git a/uds/indexStateData.h b/uds/indexStateData.h new file mode 100644 index 0000000..b6aa9b2 --- /dev/null +++ b/uds/indexStateData.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexStateData.h#1 $ + */ + +#ifndef INDEX_STATE_DATA_H +#define INDEX_STATE_DATA_H 1 + +#include "indexComponent.h" + +extern const IndexComponentInfo INDEX_STATE_INFO; + +#endif /* not INDEX_STATE_DATA_H */ diff --git a/uds/indexVersion.c b/uds/indexVersion.c new file mode 100644 index 0000000..df16e73 --- /dev/null +++ b/uds/indexVersion.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/indexVersion.c#1 $
+ */
+
+#include "indexVersion.h"
+
+void initializeIndexVersion(struct index_version *version,
+                            uint32_t superVersion)
+{
+  /*
+   * Version 1 was introduced for the first single file layout. It was used in
+   * RHEL7 and in RHEL8.0 Beta. No kernel index ever used an earlier version.
+   */
+
+  /*
+   * Version 2 was created when we discovered that the volume header page was
+   * written in native endian format. It was used in RHEL8.0 and RHEL8.1. We
+   * stopped reading the volume header page, and changed to version 2 so
+   * that an index created on RHEL8 cannot be taken back and used on RHEL7.
+   *
+   * Versions 1 and 2 are identical in normal operation (i.e. after the index
+   * is loaded).
+   */
+
+  /*
+   * Version 3 was created when we discovered that the chapter index headers
+   * were written in native endian format. It was first used in RHEL8.2 and is
+   * the current version for new indices.
+   *
+   * Versions before 3 read and write native endian chapter headers. Version 3
+   * reads chapter headers in any endian order, and writes little-endian
+   * chapter headers.
+   */
+  bool chapterIndexHeaderNativeEndian = superVersion < 3;
+
+  *version = (struct index_version) {
+    .chapterIndexHeaderNativeEndian = chapterIndexHeaderNativeEndian,
+  };
+}
diff --git a/uds/indexVersion.h b/uds/indexVersion.h
new file mode 100644
index 0000000..f46b2e9
--- /dev/null
+++ b/uds/indexVersion.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/indexVersion.h#1 $
+ */
+
+#ifndef INDEX_VERSION_H
+#define INDEX_VERSION_H
+
+#include "typeDefs.h"
+
+struct index_version {
+  bool chapterIndexHeaderNativeEndian;
+};
+
+enum {
+  SUPER_VERSION_MINIMUM = 1,
+  SUPER_VERSION_MAXIMUM = 3,
+  SUPER_VERSION_CURRENT = 3,
+};
+
+/**
+ * Initialize the version parameters that we normally learn when loading the
+ * index but need to use during index operation.
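+ *
+ * For example, an index whose super block records version 2 (written by
+ * RHEL8.0 or RHEL8.1) gets chapterIndexHeaderNativeEndian == true, so its
+ * chapter index headers are treated as native-endian; a version 3 super
+ * block yields false, so chapter index headers are written little-endian.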
+ * + * @param version The version parameters + * @param superVersion The SuperBlock version number + **/ +void initializeIndexVersion(struct index_version *version, + uint32_t superVersion); + +#endif // INDEX_VERSION_H diff --git a/uds/indexZone.c b/uds/indexZone.c new file mode 100644 index 0000000..f3cd8ed --- /dev/null +++ b/uds/indexZone.c @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexZone.c#4 $ + */ + +#include "indexZone.h" + +#include "errors.h" +#include "index.h" +#include "indexCheckpoint.h" +#include "indexRouter.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "request.h" +#include "sparseCache.h" +#include "uds.h" + +/**********************************************************************/ +int makeIndexZone(struct index *index, unsigned int zoneNumber) +{ + IndexZone *zone; + int result = ALLOCATE(1, IndexZone, "index zone", &zone); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeOpenChapter(index->volume->geometry, index->zoneCount, + &zone->openChapter); + if (result != UDS_SUCCESS) { + freeIndexZone(zone); + return result; + } + + result = makeOpenChapter(index->volume->geometry, index->zoneCount, + &zone->writingChapter); + if (result != UDS_SUCCESS) { + freeIndexZone(zone); + return result; + } + + zone->index = index; + zone->id = zoneNumber; + index->zones[zoneNumber] = zone; + + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeIndexZone(IndexZone *zone) +{ + if (zone == NULL) { + return; + } + + freeOpenChapter(zone->openChapter); + freeOpenChapter(zone->writingChapter); + FREE(zone); +} + +/**********************************************************************/ +bool isZoneChapterSparse(const IndexZone *zone, + uint64_t virtualChapter) +{ + return isChapterSparse(zone->index->volume->geometry, + zone->oldestVirtualChapter, + zone->newestVirtualChapter, + virtualChapter); +} + +/**********************************************************************/ +void setActiveChapters(IndexZone *zone) +{ + zone->oldestVirtualChapter = zone->index->oldestVirtualChapter; + zone->newestVirtualChapter = zone->index->newestVirtualChapter; +} + +/** + * Swap the open and writing chapters after blocking until there are no active + * chapter writers on the index. 
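+ *
+ * Each zone keeps two OpenChapterZone structures and alternates between
+ * them: new records accumulate in openChapter while the chapter writer
+ * persists writingChapter, and this swap exchanges the two roles once the
+ * previous chapter write has finished.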
+ * + * @param zone The zone swapping chapters + * + * @return UDS_SUCCESS or a return code + **/ +static int swapOpenChapter(IndexZone *zone) +{ + // Wait for any currently writing chapter to complete + int result = finishPreviousChapter(zone->index->chapterWriter, + zone->newestVirtualChapter); + if (result != UDS_SUCCESS) { + return result; + } + + // Swap the writing and open chapters + OpenChapterZone *tempChapter = zone->openChapter; + zone->openChapter = zone->writingChapter; + zone->writingChapter = tempChapter; + return UDS_SUCCESS; +} + +/** + * Advance to a new open chapter, and forget the oldest chapter in the + * index if necessary. + * + * @param zone The zone containing the chapter to reap + * + * @return UDS_SUCCESS or an error code + **/ +static int reapOldestChapter(IndexZone *zone) +{ + Index *index = zone->index; + unsigned int chaptersPerVolume = index->volume->geometry->chaptersPerVolume; + int result + = ASSERT(((zone->newestVirtualChapter - zone->oldestVirtualChapter) + <= chaptersPerVolume), + "newest (%llu) and oldest (%llu) virtual chapters " + "less than or equal to chapters per volume (%u)", + zone->newestVirtualChapter, zone->oldestVirtualChapter, + chaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + + setMasterIndexZoneOpenChapter(index->masterIndex, zone->id, + zone->newestVirtualChapter); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int executeSparseCacheBarrierMessage(IndexZone *zone, + BarrierMessageData *barrier) +{ + /* + * Check if the chapter index for the virtual chapter is already in the + * cache, and if it's not, rendezvous with the other zone threads to add the + * chapter index to the sparse index cache. + */ + return updateSparseCache(zone, barrier->virtualChapter); +} + +/** + * Handle notification that some other zone has closed its open chapter. If + * the chapter that was closed is still the open chapter for this zone, + * close it now in order to minimize skew. + * + * @param zone The zone receiving the notification + * @param chapterClosed The notification + * + * @return UDS_SUCCESS or an error code + **/ +static int handleChapterClosed(IndexZone *zone, + ChapterClosedMessageData *chapterClosed) +{ + if (zone->newestVirtualChapter == chapterClosed->virtualChapter) { + return openNextChapter(zone, NULL); + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int dispatchIndexZoneControlRequest(Request *request) +{ + ZoneMessage *message = &request->zoneMessage; + IndexZone *zone = message->index->zones[request->zoneNumber]; + + switch (request->action) { + case REQUEST_SPARSE_CACHE_BARRIER: + return executeSparseCacheBarrierMessage(zone, &message->data.barrier); + + case REQUEST_ANNOUNCE_CHAPTER_CLOSED: + return handleChapterClosed(zone, &message->data.chapterClosed); + + default: + return ASSERT_FALSE("valid control message type: %d", request->action); + } +} + +/** + * Announce the closure of the current open chapter to the other zones. + * + * @param request The request which caused the chapter to close + * (may be NULL) + * @param zone The zone which first closed the chapter + * @param closedChapter The chapter which was closed + * + * @return UDS_SUCCESS or an error code + **/ +static int announceChapterClosed(Request *request, + IndexZone *zone, + uint64_t closedChapter) +{ + IndexRouter *router = ((request != NULL) ? 
request->router : NULL); + + ZoneMessage zoneMessage = { + .index = zone->index, + .data = { + .chapterClosed = { .virtualChapter = closedChapter } + } + }; + + unsigned int i; + for (i = 0; i < zone->index->zoneCount; i++) { + if (zone->id == i) { + continue; + } + int result; + if (router != NULL) { + result = launchZoneControlMessage(REQUEST_ANNOUNCE_CHAPTER_CLOSED, + zoneMessage, i, router); + } else { + // We're in a test which doesn't have zone queues, so we can just + // call the message function directly. + result = handleChapterClosed(zone->index->zones[i], + &zoneMessage.data.chapterClosed); + } + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int openNextChapter(IndexZone *zone, Request *request) +{ + logDebug("closing chapter %llu of zone %d after %u entries (%u short)", + zone->newestVirtualChapter, zone->id, zone->openChapter->size, + zone->openChapter->capacity - zone->openChapter->size); + + int result = swapOpenChapter(zone); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t closedChapter = zone->newestVirtualChapter++; + result = reapOldestChapter(zone); + if (result != UDS_SUCCESS) { + return logUnrecoverable(result, "reapOldestChapter failed"); + } + + resetOpenChapter(zone->openChapter); + + // begin, continue, or finish the checkpoint processing + // moved above startClosingChapter because some of the + // checkpoint processing now done by the chapter writer thread + result = processCheckpointing(zone->index, + zone->id, + zone->newestVirtualChapter); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int finishedZones = startClosingChapter(zone->index->chapterWriter, + zone->id, + zone->writingChapter); + if ((finishedZones == 1) && (zone->index->zoneCount > 1)) { + // This is the first zone of a multi-zone index to close this chapter, + // so inform the other zones in order to control zone skew. + result = announceChapterClosed(request, zone, closedChapter); + if (result != UDS_SUCCESS) { + return result; + } + } + + // If the chapter being opened won't overwrite the oldest chapter, we're + // done. + if (!areSamePhysicalChapter(zone->index->volume->geometry, + zone->newestVirtualChapter, + zone->oldestVirtualChapter)) { + return UDS_SUCCESS; + } + + uint64_t victim = zone->oldestVirtualChapter++; + if (finishedZones < zone->index->zoneCount) { + // We are not the last zone to close the chapter, so we're done + return UDS_SUCCESS; + } + + /* + * We are the last zone to close the chapter, so clean up the cache. That + * it is safe to let the last thread out of the previous chapter to do this + * relies on the fact that although the new open chapter shadows the oldest + * chapter in the cache, until we write the new open chapter to disk, we'll + * never look for it in the cache. + */ + return forgetChapter(zone->index->volume, victim, INVALIDATION_EXPIRE); +} + +/**********************************************************************/ +IndexRegion computeIndexRegion(const IndexZone *zone, + uint64_t virtualChapter) +{ + if (virtualChapter == zone->newestVirtualChapter) { + return LOC_IN_OPEN_CHAPTER; + } + return (isZoneChapterSparse(zone, virtualChapter) + ? 
LOC_IN_SPARSE : LOC_IN_DENSE); +} + +/**********************************************************************/ +int getRecordFromZone(IndexZone *zone, + Request *request, + bool *found, + uint64_t virtualChapter) +{ + if (virtualChapter == zone->newestVirtualChapter) { + searchOpenChapter(zone->openChapter, &request->chunkName, + &request->oldMetadata, found); + return UDS_SUCCESS; + } + + if ((zone->newestVirtualChapter > 0) + && (virtualChapter == (zone->newestVirtualChapter - 1)) + && (zone->writingChapter->size > 0)) { + // Only search the writing chapter if it is full, else look on disk. + searchOpenChapter(zone->writingChapter, &request->chunkName, + &request->oldMetadata, found); + return UDS_SUCCESS; + } + + // The slow lane thread has determined the location previously. We don't need + // to search again. Just return the location. + if (request->slLocationKnown) { + *found = request->slLocation != LOC_UNAVAILABLE; + return UDS_SUCCESS; + } + + Volume *volume = zone->index->volume; + if (isZoneChapterSparse(zone, virtualChapter) + && sparseCacheContains(volume->sparseCache, virtualChapter, + request->zoneNumber)) { + // The named chunk, if it exists, is in a sparse chapter that is cached, + // so just run the chunk through the sparse chapter cache search. + return searchSparseCacheInZone(zone, request, virtualChapter, found); + } + + return searchVolumePageCache(volume, request, &request->chunkName, + virtualChapter, &request->oldMetadata, found); +} + +/**********************************************************************/ +int putRecordInZone(IndexZone *zone, + Request *request, + const UdsChunkData *metadata) +{ + unsigned int remaining; + int result = putOpenChapter(zone->openChapter, &request->chunkName, metadata, + &remaining); + if (result != UDS_SUCCESS) { + return result; + } + + if (remaining == 0) { + return openNextChapter(zone, request); + } + + return UDS_SUCCESS; +} + +/**************************************************************************/ +int searchSparseCacheInZone(IndexZone *zone, + Request *request, + uint64_t virtualChapter, + bool *found) +{ + int recordPageNumber; + int result = searchSparseCache(zone, &request->chunkName, &virtualChapter, + &recordPageNumber); + if ((result != UDS_SUCCESS) || (virtualChapter == UINT64_MAX)) { + return result; + } + + Volume *volume = zone->index->volume; + // XXX map to physical chapter and validate. It would be nice to just pass + // the virtual in to the slow lane, since it's tracking invalidations. + unsigned int chapter + = mapToPhysicalChapter(volume->geometry, virtualChapter); + + return searchCachedRecordPage(volume, request, &request->chunkName, chapter, + recordPageNumber, &request->oldMetadata, + found); +} diff --git a/uds/indexZone.h b/uds/indexZone.h new file mode 100644 index 0000000..8301894 --- /dev/null +++ b/uds/indexZone.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexZone.h#2 $ + */ + +#ifndef INDEX_ZONE_H +#define INDEX_ZONE_H + +#include "common.h" +#include "openChapterZone.h" +#include "request.h" + +typedef struct { + struct index *index; + OpenChapterZone *openChapter; + OpenChapterZone *writingChapter; + uint64_t oldestVirtualChapter; + uint64_t newestVirtualChapter; + unsigned int id; +} IndexZone; + +/** + * Allocate an index zone. + * + * @param index The index receiving the zone + * @param zoneNumber The number of the zone to allocate + * + * @return UDS_SUCCESS or an error code. + **/ +int makeIndexZone(struct index *index, unsigned int zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Clean up an index zone. + * + * @param zone The index zone to free + * + * @return UDS_SUCCESS or an error code. + **/ +void freeIndexZone(IndexZone *zone); + +/** + * Check whether a chapter is sparse or dense based on the current state of + * the index zone. + * + * @param zone The index zone to check against + * @param virtualChapter The virtual chapter number of the chapter to check + * + * @return true if the chapter is in the sparse part of the volume + **/ +bool isZoneChapterSparse(const IndexZone *zone, + uint64_t virtualChapter) + __attribute__((warn_unused_result)); + +/** + * Set the active chapter numbers for a zone based on its index. The active + * chapters consist of the range of chapters from the current oldest to + * the current newest virtual chapter. + * + * @param zone The zone to set + **/ +void setActiveChapters(IndexZone *zone); + +/** + * Dispatch a control request to an index zone. + * + * @param request The request to dispatch + * + * @return UDS_SUCCESS or an error code + **/ +int dispatchIndexZoneControlRequest(Request *request) + __attribute__((warn_unused_result)); + +/** + * Execute a sparse chapter index cache barrier control request on the zone + * worker thread. This call into the sparse cache to coordinate the cache + * update with the other zones. + * + * @param zone The index zone receiving the barrier message + * @param barrier The barrier control message data + * + * @return UDS_SUCCESS or an error code if the chapter index could not be + * read or decoded + **/ +int executeSparseCacheBarrierMessage(IndexZone *zone, + BarrierMessageData *barrier) + __attribute__((warn_unused_result)); + +/** + * Open the next chapter. + * + * @param zone The zone containing the open chapter + * @param request The request which requires the next chapter to be + * opened + * + * @return UDS_SUCCESS if successful. + **/ +int openNextChapter(IndexZone *zone, Request *request) + __attribute__((warn_unused_result)); + +/** + * Determine the IndexRegion in which a block was found. + * + * @param zone The zone that was searched + * @param virtualChapter The virtual chapter number + * + * @return the IndexRegion of the chapter in which the block was found + **/ +IndexRegion computeIndexRegion(const IndexZone *zone, + uint64_t virtualChapter); + +/** + * Get a record from either the volume or the open chapter in a zone. + * + * @param zone The index zone to query + * @param request The request originating the query + * @param found A pointer to a bool which will be set to + * true if the record was found. 
+ * @param virtualChapter The chapter in which to search + * + * @return UDS_SUCCESS or an error code + **/ +int getRecordFromZone(IndexZone *zone, + Request *request, + bool *found, + uint64_t virtualChapter) + __attribute__((warn_unused_result)); + +/** + * Put a record in the open chapter. If this fills the chapter, the chapter + * will be closed and a new one will be opened. + * + * @param zone The index zone containing the chapter + * @param request The request containing the name of the record + * @param metadata The record metadata + * + * @return UDS_SUCCESS or an error + **/ +int putRecordInZone(IndexZone *zone, + Request *request, + const UdsChunkData *metadata) + __attribute__((warn_unused_result)); + +/** + * Search the cached sparse chapter index, either for a cached sparse hook, or + * as the last chance for finding the record named by a request. + * + * @param [in] zone the index zone + * @param [in] request the request originating the search + * @param [in] virtualChapter if UINT64_MAX, search the entire cache; + * otherwise search this chapter, if cached + * @param [out] found A pointer to a bool which will be set to + * true if the record was found + * + * @return UDS_SUCCESS or an error code + **/ +int searchSparseCacheInZone(IndexZone *zone, + Request *request, + uint64_t virtualChapter, + bool *found) + __attribute__((warn_unused_result)); + +#endif /* INDEX_ZONE_H */ diff --git a/uds/ioFactory.h b/uds/ioFactory.h new file mode 100644 index 0000000..ef6cc90 --- /dev/null +++ b/uds/ioFactory.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/ioFactory.h#7 $ + */ + +#ifndef IO_FACTORY_H +#define IO_FACTORY_H + +#include "bufferedReader.h" +#include "bufferedWriter.h" +#ifdef __KERNEL__ +#include +#else +#include "fileUtils.h" +#include "ioRegion.h" +#endif + +/* + * An IOFactory object is responsible for controlling access to index storage. + * The index is a contiguous range of blocks on a block device or within a + * file. + * + * The IOFactory holds the open device or file and is responsible for closing + * it. The IOFactory has methods to make IORegions that are used to access + * sections of the index. + */ +typedef struct ioFactory IOFactory; + +/* + * Define the UDS block size as 4K. Historically, we wrote the volume file in + * large blocks, but wrote all the other index data into byte streams stored in + * files. When we converted to writing an index into a block device, we + * changed to writing the byte streams into page sized blocks. Now that we + * support multiple architectures, we write into 4K blocks on all platforms. + * + * XXX We must convert all the rogue 4K constants to use UDS_BLOCK_SIZE. 
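+ *
+ * For example, a region handed to openBufferedReader() or
+ * openBufferedWriter() must cover a whole number of these blocks: a 1 MiB
+ * region is 1048576 bytes = 256 * UDS_BLOCK_SIZE, while a size that is not
+ * a multiple of UDS_BLOCK_SIZE is rejected with UDS_INCORRECT_ALIGNMENT by
+ * the kernel-mode implementation in ioFactoryLinuxKernel.c.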
+ */ +enum { UDS_BLOCK_SIZE = 4096 }; + +#ifdef __KERNEL__ +/** + * Create an IOFactory. The IOFactory is returned with a reference count of 1. + * + * @param path The path to the block device or file that contains the + * block stream + * @param factoryPtr The IOFactory is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int makeIOFactory(const char *path, IOFactory **factoryPtr) + __attribute__((warn_unused_result)); +#else +/** + * Create an IOFactory. The IOFactory is returned with a reference count of 1. + * + * @param path The path to the block device or file that contains the + * block stream + * @param access The requested access kind. + * @param factoryPtr The IOFactory is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int makeIOFactory(const char *path, + FileAccess access, + IOFactory **factoryPtr) + __attribute__((warn_unused_result)); +#endif + +/** + * Get another reference to an IOFactory, incrementing its reference count. + * + * @param factory The IOFactory + **/ +void getIOFactory(IOFactory *factory); + +/** + * Free a reference to an IOFactory. If the reference count drops to zero, + * free the IOFactory and release all its resources. + * + * @param factory The IOFactory + **/ +void putIOFactory(IOFactory *factory); + +/** + * Get the maximum potential size of the device or file. For a device, this is + * the actual size of the device. For a file, this is the largest file that we + * can possibly write. + * + * @param factory The IOFactory + * + * @return the writable size (in bytes) + **/ +size_t getWritableSize(IOFactory *factory) __attribute__((warn_unused_result)); + +#ifdef __KERNEL__ +/** + * Create a struct dm_bufio_client for a region of the index. + * + * @param factory The IOFactory + * @param offset The byte offset to the region within the index + * @param size The size of a block, in bytes + * @param reservedBuffers The number of buffers that can be reserved + * @param clientPtr The struct dm_bufio_client is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int makeBufio(IOFactory *factory, + off_t offset, + size_t blockSize, + unsigned int reservedBuffers, + struct dm_bufio_client **clientPtr) + __attribute__((warn_unused_result)); +#else +/** + * Create an IORegion for a region of the index. + * + * @param factory The IOFactory + * @param offset The byte offset to the region within the index + * @param size The size in bytes of the region + * @param regionPtr The IORegion is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int makeIORegion(IOFactory *factory, + off_t offset, + size_t size, + IORegion **regionPtr) + __attribute__((warn_unused_result)); +#endif + +/** + * Create a BufferedReader for a region of the index. + * + * @param factory The IOFactory + * @param offset The byte offset to the region within the index + * @param size The size in bytes of the region + * @param regionPtr The IORegion is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int openBufferedReader(IOFactory *factory, + off_t offset, + size_t size, + BufferedReader **readerPtr) + __attribute__((warn_unused_result)); + +/** + * Create a BufferedWriter for a region of the index. 
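+ *
+ * A minimal kernel-mode usage sketch (the device path is hypothetical,
+ * error handling is abbreviated, and offset and size are byte values
+ * assumed to satisfy the alignment rules described above):
+ *
+ *   IOFactory      *factory;
+ *   BufferedWriter *writer;
+ *   int result = makeIOFactory("/dev/sda1", &factory);
+ *   if (result == UDS_SUCCESS) {
+ *     result = openBufferedWriter(factory, offset, size, &writer);
+ *   }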
+ * + * @param factory The IOFactory + * @param offset The byte offset to the region within the index + * @param size The size in bytes of the region + * @param regionPtr The IORegion is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int openBufferedWriter(IOFactory *factory, + off_t offset, + size_t size, + BufferedWriter **writerPtr) + __attribute__((warn_unused_result)); + +#endif // IO_FACTORY_H diff --git a/uds/ioFactoryLinuxKernel.c b/uds/ioFactoryLinuxKernel.c new file mode 100644 index 0000000..9e45920 --- /dev/null +++ b/uds/ioFactoryLinuxKernel.c @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/ioFactoryLinuxKernel.c#9 $ + */ + +#include +#include + +#include "atomicDefs.h" +#include "ioFactory.h" +#include "logger.h" +#include "memoryAlloc.h" + +enum { BLK_FMODE = FMODE_READ | FMODE_WRITE }; + +/* + * A kernel mode IOFactory object controls access to an index stored on a block + * device. + */ +struct ioFactory { + struct block_device *bdev; + atomic_t refCount; +}; + +/*****************************************************************************/ +void getIOFactory(IOFactory *factory) +{ + atomic_inc(&factory->refCount); +} + +/*****************************************************************************/ +int makeIOFactory(const char *path, IOFactory **factoryPtr) +{ + struct block_device *bdev; + dev_t device = name_to_dev_t(path); + if (device != 0) { + bdev = blkdev_get_by_dev(device, BLK_FMODE, NULL); + } else { + bdev = blkdev_get_by_path(path, BLK_FMODE, NULL); + } + if (IS_ERR(bdev)) { + logErrorWithStringError(-PTR_ERR(bdev), "%s is not a block device", path); + return UDS_INVALID_ARGUMENT; + } + + IOFactory *factory; + int result = ALLOCATE(1, IOFactory, __func__, &factory); + if (result != UDS_SUCCESS) { + blkdev_put(bdev, BLK_FMODE); + return result; + } + + factory->bdev = bdev; + atomic_set_release(&factory->refCount, 1); + + *factoryPtr = factory; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +void putIOFactory(IOFactory *factory) +{ + if (atomic_add_return(-1, &factory->refCount) <= 0) { + blkdev_put(factory->bdev, BLK_FMODE); + FREE(factory); + } +} + +/*****************************************************************************/ +size_t getWritableSize(IOFactory *factory) +{ + return i_size_read(factory->bdev->bd_inode); +} + +/*****************************************************************************/ +int makeBufio(IOFactory *factory, + off_t offset, + size_t blockSize, + unsigned int reservedBuffers, + struct dm_bufio_client **clientPtr) +{ + if (offset % SECTOR_SIZE != 0) { + return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, + "offset %zd not multiple of %d", + offset, 
SECTOR_SIZE); + } + if (blockSize % UDS_BLOCK_SIZE != 0) { + return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, + "blockSize %zd not multiple of %d", + blockSize, UDS_BLOCK_SIZE); + } + + struct dm_bufio_client *client = dm_bufio_client_create(factory->bdev, + blockSize, + reservedBuffers, 0, + NULL, NULL); + if (IS_ERR(client)) { + return -PTR_ERR(client); + } + + dm_bufio_set_sector_offset(client, offset >> SECTOR_SHIFT); + *clientPtr = client; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int openBufferedReader(IOFactory *factory, + off_t offset, + size_t size, + BufferedReader **readerPtr) +{ + if (size % UDS_BLOCK_SIZE != 0) { + return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, + "region size %zd is not multiple of %d", + size, UDS_BLOCK_SIZE); + } + + struct dm_bufio_client *client = NULL; + int result = makeBufio(factory, offset, UDS_BLOCK_SIZE, 1, &client); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeBufferedReader(factory, client, size / UDS_BLOCK_SIZE, + readerPtr); + if (result != UDS_SUCCESS) { + dm_bufio_client_destroy(client); + } + return result; +} + +/*****************************************************************************/ +int openBufferedWriter(IOFactory *factory, + off_t offset, + size_t size, + BufferedWriter **writerPtr) +{ + if (size % UDS_BLOCK_SIZE != 0) { + return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, + "region size %zd is not multiple of %d", + size, UDS_BLOCK_SIZE); + } + + struct dm_bufio_client *client = NULL; + int result = makeBufio(factory, offset, UDS_BLOCK_SIZE, 1, &client); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeBufferedWriter(factory, client, size / UDS_BLOCK_SIZE, + writerPtr); + if (result != UDS_SUCCESS) { + dm_bufio_client_destroy(client); + } + return result; +} diff --git a/uds/layoutRegion.h b/uds/layoutRegion.h new file mode 100644 index 0000000..b49f979 --- /dev/null +++ b/uds/layoutRegion.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/layoutRegion.h#1 $ + */ + +#ifndef LAYOUT_REGION_H +#define LAYOUT_REGION_H + +/** + * Single file layouts are defined in terms of data regions. Each data region + * is a sub-section of the available space. Some data regions may contain + * subsidiary data regions, for example, a checkpoint or index save will + * contain master index regions (according to the number of zones), an + * index page map region, and possibly an open chapter region. + **/ + +static const uint64_t REGION_MAGIC = 0x416c6252676e3031; // 'AlbRgn01' + +typedef struct regionHeader { + uint64_t magic; // REGION_MAGIC + uint64_t regionBlocks; // size of whole region + uint16_t type; // RH_TYPE_... 
+ uint16_t version; // 1 + uint16_t numRegions; // number of layouts in the table + uint16_t payload; // extra data beyond region table +} RegionHeader; + +typedef struct layoutRegion { + uint64_t startBlock; + uint64_t numBlocks; + uint32_t checksum; // only used for save regions + uint16_t kind; + uint16_t instance; +} LayoutRegion; + +typedef struct regionTable { + RegionHeader header; + LayoutRegion regions[]; +} RegionTable; + +#endif // LAYOUT_REGION_H diff --git a/uds/loadType.c b/uds/loadType.c new file mode 100644 index 0000000..125f8b0 --- /dev/null +++ b/uds/loadType.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/loadType.c#1 $ + */ + +#include "loadType.h" + +#include "logger.h" + +/**********************************************************************/ +const char *getLoadType(LoadType loadType) +{ + switch (loadType) { + case LOAD_CREATE: + return "creating index"; + case LOAD_LOAD: + return "loading index"; + case LOAD_REBUILD: + return "loading or rebuilding index"; + default: + return "no load method specified"; + } +} diff --git a/uds/loadType.h b/uds/loadType.h new file mode 100644 index 0000000..2b93e72 --- /dev/null +++ b/uds/loadType.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/loadType.h#1 $ + */ + +#ifndef LOAD_TYPE_H +#define LOAD_TYPE_H + +/** + * Methods of starting the index. (Keep getLoadType() in sync.) + * + * Usage number 1 is to note the interface method that initiates loading the + * index. As in this table: + * + * name type opened by + * =========== ====== ==================== + * LOAD_CREATE local udsCreateLocalIndex + * LOAD_LOAD local udsLoadLocalIndex + * LOAD_REBUILD local udsRebuildLocalIndex + * + * Usage number 2 is to record how an index was really opened. 
As in this + * table: + * + * LOAD_CREATE new empty index + * LOAD_LOAD loaded saved index + * LOAD_REPLAY loaded checkpoint and replayed new chapters + * LOAD_EMPTY empty master index from empty volume data + * LOAD_REBUILD rebuilt master index from volume data + **/ +typedef enum { + LOAD_UNDEFINED = 0, + LOAD_CREATE, + LOAD_LOAD, + LOAD_REBUILD, + LOAD_EMPTY, + LOAD_REPLAY, +} LoadType; + +/** + * get a string indicating how an index is to be loaded. + * + * @param loadType The load type to log + **/ +const char *getLoadType(LoadType loadType); + +#endif /* LOAD_TYPE_H */ diff --git a/uds/logger.c b/uds/logger.c new file mode 100644 index 0000000..311bae1 --- /dev/null +++ b/uds/logger.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/logger.c#3 $ + */ + +#include "logger.h" + +#include "common.h" +#include "errors.h" +#include "stringUtils.h" +#include "threads.h" +#include "uds.h" + +typedef struct { + const char *name; + const int priority; +} PriorityName; + +static const PriorityName PRIORITIES[] = { + { "ALERT", LOG_ALERT }, + { "CRITICAL", LOG_CRIT }, + { "CRIT", LOG_CRIT }, + { "DEBUG", LOG_DEBUG }, + { "EMERGENCY", LOG_EMERG }, + { "EMERG", LOG_EMERG }, + { "ERROR", LOG_ERR }, + { "ERR", LOG_ERR }, + { "INFO", LOG_INFO }, + { "NOTICE", LOG_NOTICE }, + { "PANIC", LOG_EMERG }, + { "WARN", LOG_WARNING }, + { "WARNING", LOG_WARNING }, + { NULL, -1 }, +}; + +static const char *const PRIORITY_STRINGS[] = { + "EMERGENCY", + "ALERT", + "CRITICAL", + "ERROR", + "WARN", + "NOTICE", + "INFO", + "DEBUG", +}; + +static int logLevel = LOG_INFO; + +/*****************************************************************************/ +int getLogLevel(void) +{ + return logLevel; +} + +/*****************************************************************************/ +void setLogLevel(int newLogLevel) +{ + logLevel = newLogLevel; +} + +/*****************************************************************************/ +int stringToPriority(const char *string) +{ + int i; + for (i = 0; PRIORITIES[i].name != NULL; i++) { + if (strcasecmp(string, PRIORITIES[i].name) == 0) { + return PRIORITIES[i].priority; + } + } + return LOG_INFO; +} + +/*****************************************************************************/ +const char *priorityToString(int priority) +{ + if ((priority < 0) || (priority >= (int) COUNT_OF(PRIORITY_STRINGS))) { + return "unknown"; + } + return PRIORITY_STRINGS[priority]; +} + +/*****************************************************************************/ +void logEmbeddedMessage(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + ...) 
+{ + va_list ap; + va_start(ap, fmt2); + logMessagePack(priority, prefix, fmt1, args1, fmt2, ap); + va_end(ap); +} + +#pragma GCC diagnostic push +/* + * GCC (version 8.1.1 20180502 (Red Hat 8.1.1-1)) on Fedora 28 seems + * to think that this function should get a printf format + * attribute. But we have no second format string, and no additional + * arguments at the call site, and GCC also gets unhappy trying to + * analyze the format and values when there are none. So we'll just + * shut it up. + */ +#pragma GCC diagnostic ignored "-Wsuggest-attribute=format" +/** + * Log a message. + * + * This helper function exists solely to create a valid va_list with + * no useful info. It does the real work of vLogMessage, which wants a + * second va_list object to pass down. + * + * @param priority The syslog priority value for the message. + * @param format The format of the message (a printf style format) + * @param args The variadic argument list of format parameters. + **/ +static void vLogMessageHelper(int priority, + const char *format, + va_list args, + ...) +{ + va_list dummy; + va_start(dummy, args); + logMessagePack(priority, NULL, format, args, NULL, dummy); + va_end(dummy); +} +#pragma GCC diagnostic pop + +/*****************************************************************************/ +void vLogMessage(int priority, const char *format, va_list args) +{ + vLogMessageHelper(priority, format, args); +} + +/*****************************************************************************/ +void logMessage(int priority, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(priority, format, args); + va_end(args); +} + +/*****************************************************************************/ +void logDebug(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_DEBUG, format, args); + va_end(args); +} + +/*****************************************************************************/ +void logInfo(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_INFO, format, args); + va_end(args); +} + +/*****************************************************************************/ +void logNotice(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_NOTICE, format, args); + va_end(args); +} + +/*****************************************************************************/ +void logWarning(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_WARNING, format, args); + va_end(args); +} + +/*****************************************************************************/ +void logError(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_ERR, format, args); + va_end(args); +} + +/*****************************************************************************/ +int vLogWithStringError(int priority, + int errnum, + const char *format, + va_list args) +{ + char errbuf[ERRBUF_SIZE]; + logEmbeddedMessage(priority, NULL, format, args, ": %s (%d)", + stringError(errnum, errbuf, sizeof(errbuf)), + errnum); + return errnum; +} + +/*****************************************************************************/ +int logWithStringError(int priority, int errnum, const char *format, ...) 
+{ + va_list args; + + va_start(args, format); + vLogWithStringError(priority, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logErrorWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_ERR, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logWarningWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_WARNING, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logDebugWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_DEBUG, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logInfoWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_INFO, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logNoticeWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_NOTICE, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logFatalWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_CRIT, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logUnrecoverable(int errnum, const char *format, ...) +{ + if (isSuccessful(errnum)) { + return errnum; + } + va_list args; + va_start(args, format); + vLogWithStringError(LOG_CRIT, errnum, format, args); + va_end(args); + return makeUnrecoverable(errnum); +} + +/*****************************************************************************/ +void logFatal(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_CRIT, format, args); + va_end(args); +} diff --git a/uds/logger.h b/uds/logger.h new file mode 100644 index 0000000..b1f9d56 --- /dev/null +++ b/uds/logger.h @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/logger.h#5 $ + */ + +#ifndef LOGGER_H +#define LOGGER_H 1 + +#ifdef __KERNEL__ +#include +#include +#else +#include +#include "minisyslog.h" +#endif + +#ifdef __KERNEL__ +#define LOG_EMERG 0 /* system is unusable */ +#define LOG_ALERT 1 /* action must be taken immediately */ +#define LOG_CRIT 2 /* critical conditions */ +#define LOG_ERR 3 /* error conditions */ +#define LOG_WARNING 4 /* warning conditions */ +#define LOG_NOTICE 5 /* normal but significant condition */ +#define LOG_INFO 6 /* informational */ +#define LOG_DEBUG 7 /* debug-level messages */ +#endif + +#ifdef __KERNEL__ +// Make it easy to log real pointer values using %px when in development. +#ifdef LOG_INTERNAL +#define PRIptr "px" +#else +#define PRIptr "pK" +#endif +#else // not __KERNEL__ +// For compatibility with hooks we need when compiling in kernel mode. +#define PRIptr "p" +#endif + +/* + * Apply a rate limiter to a log method call. + * + * @param logFunc A method that does logging, which is not invoked if we are + * running in the kernel and the ratelimiter detects that we + * are calling it frequently. + */ +#ifdef __KERNEL__ +#define logRatelimit(logFunc, ...) \ + do { \ + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) { \ + logFunc(__VA_ARGS__); \ + } \ + } while (0) +#else +#define logRatelimit(logFunc, ...) logFunc(__VA_ARGS__) +#endif + +/** + * @file + * + * All of the log() functions will preserve the callers value of errno. + **/ + +#ifndef __KERNEL__ +/* + * In user mode, the functions in this file are not thread safe in the sense + * that nothing prevents multiple threads from closing loggers out from under + * other threads. In reality this isn't a problem since there are no calls to + * closeLogger() in production code. + */ + +/** + * Start the logger. + **/ +void openLogger(void); + +/** + * Stop the logger. + **/ +void closeLogger(void); +#endif + +/** + * Get the current logging level. + * + * @return the current logging priority level. + **/ +int getLogLevel(void); + +/** + * Set the current logging level. + * + * @param newLogLevel the new value for the logging priority level. + **/ +void setLogLevel(int newLogLevel); + +/** + * Return the integer logging priority represented by a name. + * + * @param string the name of the logging priority (case insensitive). + * + * @return the integer priority named by string, or LOG_INFO if not recognized. + **/ +int stringToPriority(const char *string); + +/** + * Return the printable name of a logging priority. + * + * @return the priority name + **/ +const char *priorityToString(int priority); + +/** + * Log a debug message. + * + * @param format The format of the message (a printf style format) + **/ +void logDebug(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log an informational message. + * + * @param format The format of the message (a printf style format) + **/ +void logInfo(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a normal (but notable) condition. + * + * @param format The format of the message (a printf style format) + **/ +void logNotice(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a warning. + * + * @param format The format of the message (a printf style format) + **/ +void logWarning(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log an error. 
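+ *
+ * For example, a (hypothetical) call such as
+ *
+ *   logError("cannot read chapter %llu", (unsigned long long) chapter);
+ *
+ * logs the formatted message at LOG_ERR priority and, like the other
+ * logging functions here, preserves the caller's errno.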
+ * + * @param format The format of the message (a printf style format) + **/ +void logError(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a message embedded within another message. + * + * @param priority the priority at which to log the message + * @param prefix optional string prefix to message, may be NULL + * @param fmt1 format of message first part, may be NULL + * @param args1 arguments for message first part + * @param fmt2 format of message second part + **/ +void logEmbeddedMessage(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + ...) + __attribute__((format(printf, 3, 0), format(printf, 5, 6))); + +/** + * Log a message pack consisting of multiple variable sections. + * + * @param priority the priority at which to log the message + * @param prefix optional string prefix to message, may be NULL + * @param fmt1 format of message first part, may be NULL + * @param args1 arguments for message first part + * @param fmt2 format of message second part, may be NULL + * @param args2 arguments for message second part + **/ +void logMessagePack(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + va_list args2) + __attribute__((format(printf, 3, 0))); + +/** + * Log a stack backtrace. + * + * @param priority The priority at which to log the backtrace + **/ +void logBacktrace(int priority); + +/** + * Log a message with an error from an error code. + * + * @param priority The priority of the logging entry + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * + * @return errnum + **/ +int logWithStringError(int priority, int errnum, const char *format, ...) + __attribute__((format(printf, 3, 4))); + +/** + * Log a message with an error from an error code. + * + * @param priority The priority of the logging entry + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * @param args The list of arguments with format. + * + * @return errnum + **/ +int vLogWithStringError(int priority, + int errnum, + const char *format, + va_list args) + __attribute__((format(printf, 3, 0))); + +/** + * Log an error prefixed with the string associated with the errnum. + * + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * + * @return errnum + **/ +int logErrorWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logDebugWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logInfoWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logNoticeWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logWarningWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logFatalWithStringError(int errnum, const char *format, ...) 
+ __attribute__((format(printf, 2, 3))); + +/** + * IF the result is an error, log a FATAL level message and return the result + * after marking it unrecoverable. The UDS_SUCCESS and UDS_QUEUED results are + * not considered errors and are returned unmodified. + * + * @param errnum int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * + * @return makeUnrecoverable(errnum) or UDS_SUCCESS or UDS_QUEUED + **/ +int logUnrecoverable(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/** + * Log a fatal error. + * + * @param format The format of the message (a printf style format) + **/ +void logFatal(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a message -- for internal use only. + * + * @param priority The syslog priority value for the message. + * @param format The format of the message (a printf style format) + * @param args The variadic argument list of format parameters. + **/ +void vLogMessage(int priority, const char *format, va_list args) + __attribute__((format(printf, 2, 0))); + +/** + * Log a message + * + * @param priority The syslog priority value for the message. + * @param format The format of the message (a printf style format) + **/ +void logMessage(int priority, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/** + * Sleep or delay a short time (likely a few milliseconds) in an attempt allow + * the log buffers to be written out in case they might be overrun. This is + * unnecessary in user-space (and is a no-op there), but is needed when + * quickly issuing a lot of log output in the Linux kernel, as when dumping a + * large number of data structures. + **/ +void pauseForLogger(void); + +#endif /* LOGGER_H */ diff --git a/uds/loggerLinuxKernel.c b/uds/loggerLinuxKernel.c new file mode 100644 index 0000000..bb1ad0b --- /dev/null +++ b/uds/loggerLinuxKernel.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/loggerLinuxKernel.c#2 $ + */ + +#include +#include +#include +#include + +#include "logger.h" + +/**********************************************************************/ +static const char *priorityToLogLevel(int priority) +{ + switch (priority) { + case LOG_EMERG: + case LOG_ALERT: + case LOG_CRIT: + return KERN_CRIT; + case LOG_ERR: + return KERN_ERR; + case LOG_WARNING: + return KERN_WARNING; + case LOG_NOTICE: + return KERN_NOTICE; + case LOG_INFO: + return KERN_INFO; + case LOG_DEBUG: + return KERN_DEBUG; + default: + return ""; + } +} + +/**********************************************************************/ +static const char *getCurrentInterruptType(void) +{ + if (in_nmi()) { + return "NMI"; + } + if (in_irq()) { + return "HI"; + } + if (in_softirq()) { + return "SI"; + } + return "INTR"; +} + +/**********************************************************************/ +void logMessagePack(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + va_list args2) +{ + if (priority > getLogLevel()) { + return; + } + + /* + * The kernel's printk has some magic for indirection to a secondary + * va_list. It wants us to supply a pointer to the va_list. + * + * However, va_list varies across platforms and can be an array + * type, which makes passing it around as an argument kind of + * tricky, due to the automatic conversion to a pointer. This makes + * taking the address of the argument a dicey thing; if we use "&a" + * it works fine for non-array types, but for array types we get the + * address of a pointer. Functions like va_copy and sprintf don't + * care as they get "va_list" values passed and are written to do + * the right thing, but printk explicitly wants the address of the + * va_list. + * + * So, we copy the va_list values to ensure that "&" consistently + * works the way we want. + */ + va_list args1Copy; + va_copy(args1Copy, args1); + va_list args2Copy; + va_copy(args2Copy, args2); + struct va_format vaf1 = { + .fmt = (fmt1 != NULL) ? fmt1 : "", + .va = &args1Copy, + }; + struct va_format vaf2 = { + .fmt = (fmt2 != NULL) ? fmt2 : "", + .va = &args2Copy, + }; + + if (prefix == NULL) { + prefix = ""; + } + + /* + * Context info formats: + * + * interrupt: uds[NMI]: blah + * process: uds: myprog: blah + * + * Fields: module name, interrupt level or process name. + * + * XXX need the equivalent of VDO's deviceInstance here + */ + if (in_interrupt()) { + printk("%s%s[%s]: %s%pV%pV\n", priorityToLogLevel(priority), + THIS_MODULE->name, getCurrentInterruptType(), prefix, &vaf1, &vaf2); + } else { + printk("%s%s: %s: %s%pV%pV\n", priorityToLogLevel(priority), + THIS_MODULE->name, current->comm, prefix, &vaf1, &vaf2); + } + + va_end(args1Copy); + va_end(args2Copy); +} + +/**********************************************************************/ +void logBacktrace(int priority) +{ + if (priority > getLogLevel()) { + return; + } + logMessage(priority, "[backtrace]"); + dump_stack(); +} + +/**********************************************************************/ +void pauseForLogger(void) +{ + // Hopefully, a few milliseconds of sleep will be large enough + // for the kernel log buffer to be flushed. + msleep(4); +} diff --git a/uds/masterIndex005.c b/uds/masterIndex005.c new file mode 100644 index 0000000..3f9a5b2 --- /dev/null +++ b/uds/masterIndex005.c @@ -0,0 +1,1470 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/masterIndex005.c#3 $
+ */
+#include "masterIndex005.h"
+
+#include "buffer.h"
+#include "compiler.h"
+#include "errors.h"
+#include "hashUtils.h"
+#include "logger.h"
+#include "memoryAlloc.h"
+#include "uds.h"
+#include "zone.h"
+
+/*
+ * The master index is kept as a delta index where the payload is a
+ * chapter number. The master index adds two basic functions to the delta
+ * index:
+ *
+ * (1) How to get the delta list number and address out of the chunk name.
+ *
+ * (2) Dealing with chapter numbers, and especially the lazy flushing of
+ *     chapters from the index.
+ *
+ * There are three ways of expressing chapter numbers: virtual, index, and
+ * rolling. The interface to the master index uses virtual chapter
+ * numbers, which are 64 bits long. We do not store such large values in
+ * memory, so we internally use a binary value using the minimal number of
+ * bits.
+ *
+ * The delta index stores the index chapter number, which is the low-order
+ * bits of the virtual chapter number.
+ *
+ * When we need to deal with ordering of index chapter numbers, we roll the
+ * index chapter number around so that the smallest one we are using has
+ * the representation 0. See convertIndexToVirtual() or
+ * flushInvalidEntries() for an example of this technique.
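+ *
+ * As a concrete illustration (the numbers are only an example, not a
+ * required configuration): suppose chapterBits is 10, so chapterMask is
+ * 1023 and the delta index stores virtual chapter numbers modulo 1024.
+ * If a zone currently indexes virtual chapters [5000, 6000], then
+ * virtualChapterLow is 5000 (stored as index chapter 5000 & 1023 == 904)
+ * and virtualChapterHigh is 6000 (stored as 880). A stale entry left over
+ * from virtual chapter 4996 is stored as index chapter 900; converting it
+ * back gives
+ *
+ *   rolling = (900 - 5000) & 1023 == 1020
+ *   virtual = 5000 + 1020       == 6020
+ *
+ * which lies outside [5000, 6000], so the entry is recognized as stale and
+ * removed by the lazy flushing code. An entry for chapter 5005 (stored as
+ * 909) converts to rolling == 5 and virtual == 5005, which is in range and
+ * therefore still valid.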
+ */ + +typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) masterIndexZone { + uint64_t virtualChapterLow; // The lowest virtual chapter indexed + uint64_t virtualChapterHigh; // The highest virtual chapter indexed + long numEarlyFlushes; // The number of early flushes +} MasterIndexZone; + +typedef struct { + MasterIndex common; // Common master index methods + DeltaIndex deltaIndex; // The delta index + uint64_t *flushChapters; // The first chapter to be flushed + MasterIndexZone *masterZones; // The Zones + uint64_t volumeNonce; // The volume nonce + uint64_t chapterZoneBits; // Expected size of a chapter (per zone) + uint64_t maxZoneBits; // Maximum size index (per zone) + unsigned int addressBits; // Number of bits in address mask + unsigned int addressMask; // Mask to get address within delta list + unsigned int chapterBits; // Number of bits in chapter number + unsigned int chapterMask; // Largest storable chapter number + unsigned int numChapters; // Number of chapters used + unsigned int numDeltaLists; // The number of delta lists + unsigned int numZones; // The number of zones +} MasterIndex5; + +typedef struct chapterRange { + unsigned int chapterStart; // The first chapter + unsigned int chapterCount; // The number of chapters +} ChapterRange; + +// Constants for the magic byte of a MasterIndexRecord +static const byte masterIndexRecordMagic = 0xAA; +static const byte badMagic = 0; + +/* + * In production, the default value for minMasterIndexDeltaLists will be + * replaced by MAX_ZONES*MAX_ZONES. Some unit tests will replace + * minMasterIndexDeltaLists with the non-default value 1, because those + * tests really want to run with a single delta list. + */ +unsigned int minMasterIndexDeltaLists; + +/** + * Maximum of two unsigned ints + * + * @param a One unsigned int + * @param b Another unsigned int + * + * @return the bigger one + **/ +static INLINE unsigned int maxUint(unsigned int a, unsigned int b) +{ + return a > b ? a : b; +} + +/** + * Extract the address from a block name. + * + * @param mi5 The master index + * @param name The block name + * + * @return the address + **/ +static INLINE unsigned int extractAddress(const MasterIndex5 *mi5, + const UdsChunkName *name) +{ + return extractMasterIndexBytes(name) & mi5->addressMask; +} + +/** + * Extract the delta list number from a block name. + * + * @param mi5 The master index + * @param name The block name + * + * @return the delta list number + **/ +static INLINE unsigned int extractDListNum(const MasterIndex5 *mi5, + const UdsChunkName *name) +{ + uint64_t bits = extractMasterIndexBytes(name); + return (bits >> mi5->addressBits) % mi5->numDeltaLists; +} + +/** + * Get the master index zone containing a given master index record + * + * @param record The master index record + * + * @return the master index zone + **/ +static INLINE const MasterIndexZone *getMasterZone(const MasterIndexRecord *record) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + return &mi5->masterZones[record->zoneNumber]; +} + +/** + * Convert an index chapter number to a virtual chapter number. 
+ * + * @param record The master index record + * @param indexChapter The index chapter number + * + * @return the virtual chapter number + **/ +static INLINE uint64_t convertIndexToVirtual(const MasterIndexRecord *record, + unsigned int indexChapter) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + const MasterIndexZone *masterZone = getMasterZone(record); + unsigned int rollingChapter + = ((indexChapter - masterZone->virtualChapterLow) & mi5->chapterMask); + return masterZone->virtualChapterLow + rollingChapter; +} + +/** + * Convert a virtual chapter number to an index chapter number. + * + * @param mi5 The master index + * @param virtualChapter The virtual chapter number + * + * @return the index chapter number + **/ +static INLINE unsigned int convertVirtualToIndex(const MasterIndex5 *mi5, + uint64_t virtualChapter) +{ + return virtualChapter & mi5->chapterMask; +} + +/** + * Determine whether a virtual chapter number is in the range being indexed + * + * @param record The master index record + * @param virtualChapter The virtual chapter number + * + * @return true if the virtual chapter number is being indexed + **/ +static INLINE bool isVirtualChapterIndexed(const MasterIndexRecord *record, + uint64_t virtualChapter) +{ + const MasterIndexZone *masterZone = getMasterZone(record); + return ((virtualChapter >= masterZone->virtualChapterLow) + && (virtualChapter <= masterZone->virtualChapterHigh)); +} + +/***********************************************************************/ +/** + * Flush an invalid entry from the master index, advancing to the next + * valid entry. + * + * @param record Updated to describe the next valid record + * @param flushRange Range of chapters to flush from the index + * @param nextChapterToInvalidate Updated to record the next chapter that we + * will need to invalidate + * + * @return UDS_SUCCESS or an error code + **/ +static INLINE int flushInvalidEntries(MasterIndexRecord *record, + ChapterRange *flushRange, + unsigned int *nextChapterToInvalidate) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + int result = nextDeltaIndexEntry(&record->deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + while (!record->deltaEntry.atEnd) { + unsigned int indexChapter = getDeltaEntryValue(&record->deltaEntry); + unsigned int relativeChapter = ((indexChapter - flushRange->chapterStart) + & mi5->chapterMask); + if (likely(relativeChapter >= flushRange->chapterCount)) { + if (relativeChapter < *nextChapterToInvalidate) { + *nextChapterToInvalidate = relativeChapter; + } + break; + } + result = removeDeltaIndexEntry(&record->deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +/** + * Find the delta index entry, or the insertion point for a delta index + * entry, while processing chapter LRU flushing. 
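+ *
+ * As an illustration (the numbers are only an example): suppose
+ * chapterMask == 1023, the zone currently indexes virtual chapters
+ * [5000, 6000], and this delta list was last flushed when virtual chapter
+ * 4990 was the oldest indexed chapter. The caller then passes
+ * flushRange->chapterStart == 4990 & 1023 == 894 and
+ * flushRange->chapterCount == 10. While walking the list, any entry whose
+ * stored chapter C satisfies ((C - 894) & 1023) < 10 belongs to a chapter
+ * that has since been expired and is removed on the fly; an entry for
+ * chapter 5005 (stored as 909, relative chapter 15) is kept. On return the
+ * flush range is collapsed so that the caller can record how far this
+ * delta list has now been flushed.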
+ * + * @param record Updated to describe the entry being looked for + * @param listNumber The delta list number + * @param key The address field being looked for + * @param flushRange The range of chapters to flush from the index + * + * @return UDS_SUCCESS or an error code + **/ +static int getMasterIndexEntry(MasterIndexRecord *record, + unsigned int listNumber, + unsigned int key, + ChapterRange *flushRange) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + unsigned int nextChapterToInvalidate = mi5->chapterMask; + + int result = startDeltaIndexSearch(&mi5->deltaIndex, listNumber, 0, + false, &record->deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + do { + result = flushInvalidEntries(record, flushRange, &nextChapterToInvalidate); + if (result != UDS_SUCCESS) { + return result; + } + } while (!record->deltaEntry.atEnd && (key > record->deltaEntry.key)); + + result = rememberDeltaIndexOffset(&record->deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + + // We probably found the record we want, but we need to keep going + MasterIndexRecord otherRecord = *record; + if (!otherRecord.deltaEntry.atEnd && (key == otherRecord.deltaEntry.key)) { + for (;;) { + result = flushInvalidEntries(&otherRecord, flushRange, + &nextChapterToInvalidate); + if (result != UDS_SUCCESS) { + return result; + } + if (otherRecord.deltaEntry.atEnd + || !otherRecord.deltaEntry.isCollision) { + break; + } + byte collisionName[UDS_CHUNK_NAME_SIZE]; + result = getDeltaEntryCollision(&otherRecord.deltaEntry, collisionName); + if (result != UDS_SUCCESS) { + return result; + } + if (memcmp(collisionName, record->name, UDS_CHUNK_NAME_SIZE) == 0) { + // This collision record is the one we are looking for + *record = otherRecord; + break; + } + } + } + while (!otherRecord.deltaEntry.atEnd) { + result = flushInvalidEntries(&otherRecord, flushRange, + &nextChapterToInvalidate); + if (result != UDS_SUCCESS) { + return result; + } + } + nextChapterToInvalidate += flushRange->chapterStart; + nextChapterToInvalidate &= mi5->chapterMask; + flushRange->chapterStart = nextChapterToInvalidate; + flushRange->chapterCount = 0; + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Terminate and clean up the master index + * + * @param masterIndex The master index to terminate + **/ +static void freeMasterIndex_005(MasterIndex *masterIndex) +{ + if (masterIndex != NULL) { + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + FREE(mi5->flushChapters); + mi5->flushChapters = NULL; + FREE(mi5->masterZones); + mi5->masterZones = NULL; + uninitializeDeltaIndex(&mi5->deltaIndex); + FREE(masterIndex); + } +} + +/** + * Constants and structures for the saved master index file. "MI5" is for + * masterIndex005, and "-XXXX" is a number to increment when the format of + * the data changes. + **/ +enum { MAGIC_SIZE = 8 }; +static const char MAGIC_MI_START[] = "MI5-0005"; + +struct mi005_data { + char magic[MAGIC_SIZE]; // MAGIC_MI_START + uint64_t volumeNonce; + uint64_t virtualChapterLow; + uint64_t virtualChapterHigh; + unsigned int firstList; + unsigned int numLists; +}; + +/***********************************************************************/ +/** + * Set the tag value used when saving and/or restoring a master index. 
+ * + * @param masterIndex The master index + * @param tag The tag value + **/ +static void setMasterIndexTag_005(MasterIndex *masterIndex, byte tag) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + setDeltaIndexTag(&mi5->deltaIndex, tag); +} + +/***********************************************************************/ +__attribute__((warn_unused_result)) +static int encodeMasterIndexHeader(Buffer *buffer, struct mi005_data *header) +{ + int result = putBytes(buffer, MAGIC_SIZE, MAGIC_MI_START); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->volumeNonce); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->virtualChapterLow); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->virtualChapterHigh); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->firstList); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->numLists); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(struct mi005_data), + "%zu bytes of config written, of %zu expected", + contentLength(buffer), sizeof(struct mi005_data)); + return result; +} + +/** + * Start saving a master index to a buffered output stream. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * @param bufferedWriter The index state component being written + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int startSavingMasterIndex_005(const MasterIndex *masterIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + MasterIndexZone *masterZone = &mi5->masterZones[zoneNumber]; + unsigned int firstList = getDeltaIndexZoneFirstList(&mi5->deltaIndex, + zoneNumber); + unsigned int numLists = getDeltaIndexZoneNumLists(&mi5->deltaIndex, + zoneNumber); + + struct mi005_data header; + memset(&header, 0, sizeof(header)); + memcpy(header.magic, MAGIC_MI_START, MAGIC_SIZE); + header.volumeNonce = mi5->volumeNonce; + header.virtualChapterLow = masterZone->virtualChapterLow; + header.virtualChapterHigh = masterZone->virtualChapterHigh; + header.firstList = firstList; + header.numLists = numLists; + + Buffer *buffer; + int result = makeBuffer(sizeof(struct mi005_data), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = encodeMasterIndexHeader(buffer, &header); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(bufferedWriter, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write master index header"); + } + result = makeBuffer(numLists * sizeof(uint64_t), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + uint64_t *firstFlushChapter = &mi5->flushChapters[firstList]; + result = putUInt64LEsIntoBuffer(buffer, numLists, firstFlushChapter); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(bufferedWriter, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write master index flush " + "ranges"); + 
} + + return startSavingDeltaIndex(&mi5->deltaIndex, zoneNumber, bufferedWriter); +} + +/***********************************************************************/ +/** + * Have all the data been written while saving a master index to an output + * stream? If the answer is yes, it is still necessary to call + * finishSavingMasterIndex(), which will return quickly. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return true if all the data are written + **/ +static bool isSavingMasterIndexDone_005(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + return isSavingDeltaIndexDone(&mi5->deltaIndex, zoneNumber); +} + +/***********************************************************************/ +/** + * Finish saving a master index to an output stream. Force the writing of + * all of the remaining data. If an error occurred asynchronously during + * the save operation, it will be returned here. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int finishSavingMasterIndex_005(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + return finishSavingDeltaIndex(&mi5->deltaIndex, zoneNumber); +} + +/***********************************************************************/ +/** + * Abort saving a master index to an output stream. If an error occurred + * asynchronously during the save operation, it will be dropped. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int abortSavingMasterIndex_005(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + return abortSavingDeltaIndex(&mi5->deltaIndex, zoneNumber); +} + +/***********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeMasterIndexHeader(Buffer *buffer, struct mi005_data *header) +{ + int result = getBytesFromBuffer(buffer, sizeof(header->magic), + &header->magic); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &header->volumeNonce); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &header->virtualChapterLow); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &header->virtualChapterHigh); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->firstList); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->numLists); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer) - contentLength(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + result = UDS_CORRUPT_COMPONENT; + } + return result; +} + +/** + * Start restoring the master index from multiple buffered readers + * + * @param masterIndex The master index to restore into + * @param bufferedReaders The buffered readers to read the master index from + * @param numReaders The number of buffered 
readers + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int startRestoringMasterIndex_005(MasterIndex *masterIndex, + BufferedReader **bufferedReaders, + int numReaders) +{ + if (masterIndex == NULL) { + return logWarningWithStringError(UDS_BAD_STATE, + "cannot restore to null master index"); + } + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + emptyDeltaIndex(&mi5->deltaIndex); + + uint64_t virtualChapterLow = 0; + uint64_t virtualChapterHigh = 0; + int i; + for (i = 0; i < numReaders; i++) { + Buffer *buffer; + int result = makeBuffer(sizeof(struct mi005_data), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(bufferedReaders[i], + getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logWarningWithStringError(result, + "failed to read master index header"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + struct mi005_data header; + result = decodeMasterIndexHeader(buffer, &header); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + if (memcmp(header.magic, MAGIC_MI_START, MAGIC_SIZE) != 0) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "master index file had bad magic" + " number"); + } + if (mi5->volumeNonce == 0) { + mi5->volumeNonce = header.volumeNonce; + } else if (header.volumeNonce != mi5->volumeNonce) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "master index volume nonce incorrect"); + } + if (i == 0) { + virtualChapterLow = header.virtualChapterLow; + virtualChapterHigh = header.virtualChapterHigh; + } else if (virtualChapterHigh != header.virtualChapterHigh) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "Inconsistent master index zone files:" + " Chapter range is [%llu,%" + PRIu64 "], chapter range %d is [%" + PRIu64 ",%llu]", + virtualChapterLow, virtualChapterHigh, + i, header.virtualChapterLow, + header.virtualChapterHigh); + } else if (virtualChapterLow < header.virtualChapterLow) { + virtualChapterLow = header.virtualChapterLow; + } + uint64_t *firstFlushChapter = &mi5->flushChapters[header.firstList]; + result = makeBuffer(header.numLists * sizeof(uint64_t), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(bufferedReaders[i], + getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logWarningWithStringError(result, + "failed to read master index flush" + " ranges"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = getUInt64LEsFromBuffer(buffer, header.numLists, + firstFlushChapter); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + } + + unsigned int z; + for (z = 0; z < mi5->numZones; z++) { + memset(&mi5->masterZones[z], 0, sizeof(MasterIndexZone)); + mi5->masterZones[z].virtualChapterLow = virtualChapterLow; + mi5->masterZones[z].virtualChapterHigh = virtualChapterHigh; + } + + int result = startRestoringDeltaIndex(&mi5->deltaIndex, bufferedReaders, + numReaders); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, "restoring delta index failed"); + } + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Have all the data been read while 
restoring a master index from an + * input stream? + * + * @param masterIndex The master index to restore into + * + * @return true if all the data are read + **/ +static bool isRestoringMasterIndexDone_005(const MasterIndex *masterIndex) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + return isRestoringDeltaIndexDone(&mi5->deltaIndex); +} + +/***********************************************************************/ +/** + * Restore a saved delta list + * + * @param masterIndex The master index to restore into + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * + * @return error code or UDS_SUCCESS + **/ +static int restoreDeltaListToMasterIndex_005(MasterIndex *masterIndex, + const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + return restoreDeltaListToDeltaIndex(&mi5->deltaIndex, dlsi, data); +} + +/***********************************************************************/ +/** + * Abort restoring a master index from an input stream. + * + * @param masterIndex The master index + **/ +static void abortRestoringMasterIndex_005(MasterIndex *masterIndex) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + abortRestoringDeltaIndex(&mi5->deltaIndex); +} + +/***********************************************************************/ +static void removeNewestChapters(MasterIndex5 *mi5, + unsigned int zoneNumber, + uint64_t virtualChapter) +{ + // Get the range of delta lists belonging to this zone + unsigned int firstList = getDeltaIndexZoneFirstList(&mi5->deltaIndex, + zoneNumber); + unsigned int numLists = getDeltaIndexZoneNumLists(&mi5->deltaIndex, + zoneNumber); + unsigned int lastList = firstList + numLists - 1; + + if (virtualChapter > mi5->chapterMask) { + // The virtual chapter number is large enough so that we can use the + // normal LRU mechanism without an unsigned underflow. + virtualChapter -= mi5->chapterMask + 1; + // Eliminate the newest chapters by renumbering them to become the + // oldest chapters + unsigned int i; + for (i = firstList; i <= lastList; i++) { + if (virtualChapter < mi5->flushChapters[i]) { + mi5->flushChapters[i] = virtualChapter; + } + } + } else { + // Underflow will prevent the fast path. Do it the slow and painful way. + MasterIndexZone *masterZone = &mi5->masterZones[zoneNumber]; + ChapterRange range; + range.chapterStart = convertVirtualToIndex(mi5, virtualChapter); + range.chapterCount = (mi5->chapterMask + 1 + - (virtualChapter - masterZone->virtualChapterLow)); + UdsChunkName name; + memset(&name, 0, sizeof(UdsChunkName)); + MasterIndexRecord record = (MasterIndexRecord) { + .magic = masterIndexRecordMagic, + .masterIndex = &mi5->common, + .name = &name, + .zoneNumber = zoneNumber, + }; + unsigned int i; + for (i = firstList; i <= lastList; i++) { + ChapterRange tempRange = range; + getMasterIndexEntry(&record, i, 0, &tempRange); + } + } +} + +/***********************************************************************/ +/** + * Set the open chapter number on a zone. The master index zone will be + * modified to index the proper number of chapters ending with the new open + * chapter. 
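+ *
+ * For example (illustrative values): if the zone currently indexes virtual
+ * chapters [5000, 6000] and numChapters == 1024, then setting the open
+ * chapter to:
+ *
+ *   6001 is normal forward motion; the range becomes [5000, 6001],
+ *        since 6001 - 1024 + 1 is still below 5000;
+ *   6500 is forward motion that expires old chapters; the range
+ *        becomes [5477, 6500];
+ *   8000 jumps entirely past the old range; the range collapses
+ *        to [8000, 8000];
+ *   5500 is backward motion overlapping the old range; the newest
+ *        chapters are removed and the range becomes [5000, 5500].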
+ * + * @param masterIndex The master index + * @param zoneNumber The zone number + * @param virtualChapter The new open chapter number + **/ +static void setMasterIndexZoneOpenChapter_005(MasterIndex *masterIndex, + unsigned int zoneNumber, + uint64_t virtualChapter) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + MasterIndexZone *masterZone = &mi5->masterZones[zoneNumber]; + // Take care here to avoid underflow of an unsigned value. Note that + // this is the smallest valid virtual low. We may or may not actually + // use this value. + uint64_t newVirtualLow = (virtualChapter >= mi5->numChapters + ? virtualChapter - mi5->numChapters + 1 + : 0); + + if (virtualChapter <= masterZone->virtualChapterLow) { + /* + * Moving backwards and the new range is totally before the old range. + * Note that moving to the lowest virtual chapter counts as totally before + * the old range, as we need to remove the entries in the open chapter. + */ + emptyDeltaIndexZone(&mi5->deltaIndex, zoneNumber); + masterZone->virtualChapterLow = virtualChapter; + masterZone->virtualChapterHigh = virtualChapter; + } else if (virtualChapter <= masterZone->virtualChapterHigh) { + // Moving backwards and the new range overlaps the old range. Note + // that moving to the same open chapter counts as backwards, as we need + // to remove the entries in the open chapter. + removeNewestChapters(mi5, zoneNumber, virtualChapter); + masterZone->virtualChapterHigh = virtualChapter; + } else if (newVirtualLow < masterZone->virtualChapterLow) { + // Moving forwards and we can keep all the old chapters + masterZone->virtualChapterHigh = virtualChapter; + } else if (newVirtualLow <= masterZone->virtualChapterHigh) { + // Moving forwards and we can keep some old chapters + masterZone->virtualChapterLow = newVirtualLow; + masterZone->virtualChapterHigh = virtualChapter; + } else { + // Moving forwards and the new range is totally after the old range + masterZone->virtualChapterLow = virtualChapter; + masterZone->virtualChapterHigh = virtualChapter; + } + // Check to see if the zone data has grown to be too large + if (masterZone->virtualChapterLow < masterZone->virtualChapterHigh) { + uint64_t usedBits = getDeltaIndexZoneDlistBitsUsed(&mi5->deltaIndex, + zoneNumber); + if (usedBits > mi5->maxZoneBits) { + // Expire enough chapters to free the desired space + uint64_t expireCount + = 1 + (usedBits - mi5->maxZoneBits) / mi5->chapterZoneBits; + if (expireCount == 1) { + logRatelimit(logInfo, + "masterZone %u: At chapter %" PRIu64 + ", expiring chapter %llu early", + zoneNumber, virtualChapter, + masterZone->virtualChapterLow); + masterZone->numEarlyFlushes++; + masterZone->virtualChapterLow++; + } else { + uint64_t firstExpired = masterZone->virtualChapterLow; + if (firstExpired + expireCount < masterZone->virtualChapterHigh) { + masterZone->numEarlyFlushes += expireCount; + masterZone->virtualChapterLow += expireCount; + } else { + masterZone->numEarlyFlushes + += masterZone->virtualChapterHigh - masterZone->virtualChapterLow; + masterZone->virtualChapterLow = masterZone->virtualChapterHigh; + } + logRatelimit(logInfo, + "masterZone %u: At chapter %" PRIu64 + ", expiring chapters %llu to %llu early", + zoneNumber, virtualChapter, firstExpired, + masterZone->virtualChapterLow - 1); + } + } + } +} + +/***********************************************************************/ +/** + * Set the open chapter number. 
The master index will be modified to index + * the proper number of chapters ending with the new open chapter. + * + * @param masterIndex The master index + * @param virtualChapter The new open chapter number + **/ +static void setMasterIndexOpenChapter_005(MasterIndex *masterIndex, + uint64_t virtualChapter) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + unsigned int z; + for (z = 0; z < mi5->numZones; z++) { + // In normal operation, we advance forward one chapter at a time. + // Log all abnormal changes. + MasterIndexZone *masterZone = &mi5->masterZones[z]; + bool logMove = virtualChapter != masterZone->virtualChapterHigh + 1; + if (logMove) { + logDebug("masterZone %u: The range of indexed chapters is moving from [%" + PRIu64 ", %llu] ...", + z, + masterZone->virtualChapterLow, + masterZone->virtualChapterHigh); + } + + setMasterIndexZoneOpenChapter_005(masterIndex, z, virtualChapter); + + if (logMove) { + logDebug("masterZone %u: ... and moving to [%llu, %llu]", + z, + masterZone->virtualChapterLow, + masterZone->virtualChapterHigh); + } + } +} + +/***********************************************************************/ +/** + * Find the master index zone associated with a chunk name + * + * @param masterIndex The master index + * @param name The chunk name + * + * @return the zone that the chunk name belongs to + **/ +static unsigned int getMasterIndexZone_005(const MasterIndex *masterIndex, + const UdsChunkName *name) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + unsigned int deltaListNumber = extractDListNum(mi5, name); + return getDeltaIndexZone(&mi5->deltaIndex, deltaListNumber); +} + +/***********************************************************************/ +/** + * Do a quick read-only lookup of the chunk name and return information + * needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name + * + * @return UDS_SUCCESS or an error code + **/ +static int lookupMasterIndexName_005(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage) +{ + triage->isSample = false; + triage->inSampledChapter = false; + triage->zone = getMasterIndexZone_005(masterIndex, name); + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Do a quick read-only lookup of the sampled chunk name and return + * information needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name. The zone and + * isSample fields are already filled in. Set + * inSampledChapter and virtualChapter if the chunk + * name is found in the index. 
+ * + * @return UDS_SUCCESS or an error code + **/ +static int lookupMasterIndexSampledName_005(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + unsigned int address = extractAddress(mi5, name); + unsigned int deltaListNumber = extractDListNum(mi5, name); + DeltaIndexEntry deltaEntry; + int result = getDeltaIndexEntry(&mi5->deltaIndex, deltaListNumber, address, + name->name, true, &deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + triage->inSampledChapter = !deltaEntry.atEnd && (deltaEntry.key == address); + if (triage->inSampledChapter) { + const MasterIndexZone *masterZone = &mi5->masterZones[triage->zone]; + unsigned int indexChapter = getDeltaEntryValue(&deltaEntry); + unsigned int rollingChapter = ((indexChapter + - masterZone->virtualChapterLow) + & mi5->chapterMask); + triage->virtualChapter = masterZone->virtualChapterLow + rollingChapter; + if (triage->virtualChapter > masterZone->virtualChapterHigh) { + triage->inSampledChapter = false; + } + } + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Find the master index record associated with a block name + * + * This is always the first routine to be called when dealing with a delta + * master index entry. The fields of the record parameter should be + * examined to determine the state of the record: + * + * If isFound is false, then we did not find an entry for the block + * name. Information is saved in the MasterIndexRecord so that + * putMasterIndexRecord() will insert an entry for that block name at + * the proper place. + * + * If isFound is true, then we did find an entry for the block name. + * Information is saved in the MasterIndexRecord so that the "chapter" + * and "isCollision" fields reflect the entry found. + * Calls to removeMasterIndexRecord() will remove the entry, calls to + * setMasterIndexRecordChapter() can modify the entry, and calls to + * putMasterIndexRecord() can insert a collision record with this + * entry. + * + * @param masterIndex The master index to search + * @param name The chunk name + * @param record Set to the info about the record searched for + * + * @return UDS_SUCCESS or an error code + **/ +static int getMasterIndexRecord_005(MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexRecord *record) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + unsigned int address = extractAddress(mi5, name); + unsigned int deltaListNumber = extractDListNum(mi5, name); + uint64_t flushChapter = mi5->flushChapters[deltaListNumber]; + record->magic = masterIndexRecordMagic; + record->masterIndex = masterIndex; + record->mutex = NULL; + record->name = name; + record->zoneNumber = getDeltaIndexZone(&mi5->deltaIndex, deltaListNumber); + const MasterIndexZone *masterZone = getMasterZone(record); + + int result; + if (flushChapter < masterZone->virtualChapterLow) { + ChapterRange range; + uint64_t flushCount = masterZone->virtualChapterLow - flushChapter; + range.chapterStart = convertVirtualToIndex(mi5, flushChapter); + range.chapterCount = (flushCount > mi5->chapterMask + ? 
mi5->chapterMask + 1 + : flushCount); + result = getMasterIndexEntry(record, deltaListNumber, address, &range); + flushChapter = convertIndexToVirtual(record, range.chapterStart); + if (flushChapter > masterZone->virtualChapterHigh) { + flushChapter = masterZone->virtualChapterHigh; + } + mi5->flushChapters[deltaListNumber] = flushChapter; + } else { + result = getDeltaIndexEntry(&mi5->deltaIndex, deltaListNumber, address, + name->name, false, &record->deltaEntry); + } + if (result != UDS_SUCCESS) { + return result; + } + record->isFound = (!record->deltaEntry.atEnd + && (record->deltaEntry.key == address)); + if (record->isFound) { + unsigned int indexChapter = getDeltaEntryValue(&record->deltaEntry); + record->virtualChapter = convertIndexToVirtual(record, indexChapter); + } + record->isCollision = record->deltaEntry.isCollision; + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Create a new record associated with a block name. + * + * @param record The master index record found by getRecord() + * @param virtualChapter The chapter number where block info is found + * + * @return UDS_SUCCESS or an error code + **/ +int putMasterIndexRecord(MasterIndexRecord *record, uint64_t virtualChapter) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + if (record->magic != masterIndexRecordMagic) { + return logWarningWithStringError(UDS_BAD_STATE, + "bad magic number in master index record"); + } + if (!isVirtualChapterIndexed(record, virtualChapter)) { + const MasterIndexZone *masterZone = getMasterZone(record); + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot put record into chapter number %" + PRIu64 " that is out of the valid range %" + PRIu64 " to %llu", + virtualChapter, + masterZone->virtualChapterLow, + masterZone->virtualChapterHigh); + } + unsigned int address = extractAddress(mi5, record->name); + if (unlikely(record->mutex != NULL)) { + lockMutex(record->mutex); + } + int result = putDeltaIndexEntry(&record->deltaEntry, address, + convertVirtualToIndex(mi5, virtualChapter), + record->isFound ? record->name->name : NULL); + if (unlikely(record->mutex != NULL)) { + unlockMutex(record->mutex); + } + switch (result) { + case UDS_SUCCESS: + record->virtualChapter = virtualChapter; + record->isCollision = record->deltaEntry.isCollision; + record->isFound = true; + break; + case UDS_OVERFLOW: + logRatelimit(logWarningWithStringError, UDS_OVERFLOW, + "Master index entry dropped due to overflow condition"); + logDeltaIndexEntry(&record->deltaEntry); + break; + default: + break; + } + return result; +} + +/**********************************************************************/ +static INLINE int validateRecord(MasterIndexRecord *record) +{ + if (record->magic != masterIndexRecordMagic) { + return logWarningWithStringError( + UDS_BAD_STATE, "bad magic number in master index record"); + } + if (!record->isFound) { + return logWarningWithStringError(UDS_BAD_STATE, + "illegal operation on new record"); + } + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Remove an existing record. 
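+ *
+ * Illustrative sketch of how a caller typically gets here (error handling
+ * omitted; "name" and "openChapter" stand for whatever chunk name and open
+ * virtual chapter the caller is working with):
+ *
+ *   MasterIndexRecord record;
+ *   masterIndex->getMasterIndexRecord(masterIndex, &name, &record);
+ *   if (record.isFound) {
+ *     // Either forget the existing entry ...
+ *     removeMasterIndexRecord(&record);
+ *     // ... or instead update it in place:
+ *     //   setMasterIndexRecordChapter(&record, openChapter);
+ *   } else {
+ *     // Nothing to remove; insert a new entry for the open chapter.
+ *     putMasterIndexRecord(&record, openChapter);
+ *   }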
+ * + * @param record The master index record found by getRecord() + * + * @return UDS_SUCCESS or an error code + **/ +int removeMasterIndexRecord(MasterIndexRecord *record) +{ + int result = validateRecord(record); + if (result != UDS_SUCCESS) { + return result; + } + // Mark the record so that it cannot be used again + record->magic = badMagic; + if (unlikely(record->mutex != NULL)) { + lockMutex(record->mutex); + } + result = removeDeltaIndexEntry(&record->deltaEntry); + if (unlikely(record->mutex != NULL)) { + unlockMutex(record->mutex); + } + return result; +} + +/***********************************************************************/ +/** + * Set the chapter number associated with a block name. + * + * @param record The master index record found by getRecord() + * @param virtualChapter The chapter number where the block info is now found. + * + * @return UDS_SUCCESS or an error code + **/ +int setMasterIndexRecordChapter(MasterIndexRecord *record, + uint64_t virtualChapter) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + int result = validateRecord(record); + if (result != UDS_SUCCESS) { + return result; + } + if (!isVirtualChapterIndexed(record, virtualChapter)) { + const MasterIndexZone *masterZone = getMasterZone(record); + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot set chapter number %" PRIu64 + " that is out of the valid range %" PRIu64 + " to %llu", + virtualChapter, + masterZone->virtualChapterLow, + masterZone->virtualChapterHigh); + } + if (unlikely(record->mutex != NULL)) { + lockMutex(record->mutex); + } + result = setDeltaEntryValue(&record->deltaEntry, + convertVirtualToIndex(mi5, virtualChapter)); + if (unlikely(record->mutex != NULL)) { + unlockMutex(record->mutex); + } + if (result != UDS_SUCCESS) { + return result; + } + record->virtualChapter = virtualChapter; + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Get the number of bytes used for master index entries. + * + * @param masterIndex The master index + * + * @return The number of bytes in use + **/ +static size_t getMasterIndexMemoryUsed_005(const MasterIndex *masterIndex) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + uint64_t bits = getDeltaIndexDlistBitsUsed(&mi5->deltaIndex); + return (bits + CHAR_BIT - 1) / CHAR_BIT; +} + +/***********************************************************************/ +/** + * Return the master index stats. There is only one portion of the master + * index in this implementation, and we call it the dense portion of the + * index. 
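+ *
+ * Illustrative sketch of use (the log message shown is only an example):
+ *
+ *   MasterIndexStats dense, sparse;
+ *   masterIndex->getMasterIndexStats(masterIndex, &dense, &sparse);
+ *   logInfo("master index holds %llu records (%llu collisions)",
+ *           (unsigned long long) dense.recordCount,
+ *           (unsigned long long) dense.collisionCount);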
+ * + * @param masterIndex The master index + * @param dense Stats for the dense portion of the index + * @param sparse Stats for the sparse portion of the index + **/ +static void getMasterIndexStats_005(const MasterIndex *masterIndex, + MasterIndexStats *dense, + MasterIndexStats *sparse) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + DeltaIndexStats dis; + getDeltaIndexStats(&mi5->deltaIndex, &dis); + dense->memoryAllocated = (dis.memoryAllocated + + sizeof(MasterIndex5) + + mi5->numDeltaLists * sizeof(uint64_t) + + mi5->numZones * sizeof(MasterIndexZone)); + dense->rebalanceTime = dis.rebalanceTime; + dense->rebalanceCount = dis.rebalanceCount; + dense->recordCount = dis.recordCount; + dense->collisionCount = dis.collisionCount; + dense->discardCount = dis.discardCount; + dense->overflowCount = dis.overflowCount; + dense->numLists = dis.numLists; + dense->earlyFlushes = 0; + unsigned int z; + for (z = 0; z < mi5->numZones; z++) { + dense->earlyFlushes += mi5->masterZones[z].numEarlyFlushes; + } + memset(sparse, 0, sizeof(MasterIndexStats)); +} + +/***********************************************************************/ +/** + * Determine whether a given chunk name is a hook. + * + * @param masterIndex The master index + * @param name The block name + * + * @return whether to use as sample + **/ +static bool isMasterIndexSample_005(const MasterIndex *masterIndex + __attribute__((unused)), + const UdsChunkName *name + __attribute__((unused))) +{ + return false; +} + +/***********************************************************************/ +typedef struct { + unsigned int addressBits; // Number of bits in address mask + unsigned int chapterBits; // Number of bits in chapter number + unsigned int meanDelta; // The mean delta + unsigned long numDeltaLists; // The number of delta lists + unsigned long numChapters; // Number of chapters used + size_t numBitsPerChapter; // The number of bits per chapter + size_t memorySize; // The number of bytes of delta list memory + size_t targetFreeSize; // The number of free bytes we desire +} Parameters005; + +/***********************************************************************/ +static int computeMasterIndexParameters005(const Configuration *config, + Parameters005 *params) +{ + enum { DELTA_LIST_SIZE = 256 }; + /* + * For a given zone count, setting the the minimum number of delta lists + * to the square of the number of zones ensures that the distribution of + * delta lists over zones doesn't underflow, leaving the last zone with + * an invalid number of delta lists. See the explanation in + * initializeDeltaIndex(). Because we can restart with a different number + * of zones but the number of delta lists is invariant across restart, + * we must use the largest number of zones to compute this minimum. + */ + unsigned long minDeltaLists = (minMasterIndexDeltaLists + ? 
minMasterIndexDeltaLists + : MAX_ZONES * MAX_ZONES); + + Geometry *geometry = config->geometry; + unsigned long recordsPerChapter = geometry->recordsPerChapter; + params->numChapters = geometry->chaptersPerVolume; + unsigned long recordsPerVolume = recordsPerChapter * params->numChapters; + unsigned int numAddresses = config->masterIndexMeanDelta * DELTA_LIST_SIZE; + params->numDeltaLists + = maxUint(recordsPerVolume / DELTA_LIST_SIZE, minDeltaLists); + params->addressBits = computeBits(numAddresses - 1); + params->chapterBits = computeBits(params->numChapters - 1); + + if ((unsigned int) params->numDeltaLists != params->numDeltaLists) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot initialize master index with %lu" + " delta lists", + params->numDeltaLists); + } + if (params->addressBits > 31) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot initialize master index with %u" + " address bits", + params->addressBits); + } + if (geometry->sparseChaptersPerVolume > 0) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot initialize dense master index" + " with %u sparse chapters", + geometry->sparseChaptersPerVolume); + } + if (recordsPerChapter == 0) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot initialize master index with %lu" + " records per chapter", + recordsPerChapter); + } + if (params->numChapters == 0) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot initialize master index with %lu" + " chapters per volume", + params->numChapters); + } + + /* + * We can now compute the probability that a delta list is not touched during + * the writing of an entire chapter. The computation is: + * + * double pNotTouched = pow((double) (params->numDeltaLists - 1) + * / params->numDeltaLists, + * recordsPerChapter); + * + * For the standard index sizes, about 78% of the delta lists are not + * touched, and therefore contain dead index entries that have not been + * eliminated by the lazy LRU processing. We can then compute how many dead + * index entries accumulate over time. The computation is: + * + * double invalidChapters = pNotTouched / (1.0 - pNotTouched); + * + * For the standard index sizes, we will need about 3.5 chapters of space for + * the dead index entries in a 1K chapter index. Since we do not want to do + * that floating point computation, we use 4 chapters per 1K of chapters. + */ + unsigned long invalidChapters = maxUint(params->numChapters / 256, 2); + unsigned long chaptersInMasterIndex = params->numChapters + invalidChapters; + unsigned long entriesInMasterIndex + = recordsPerChapter * chaptersInMasterIndex; + // Compute the mean delta + unsigned long addressSpan = params->numDeltaLists << params->addressBits; + params->meanDelta = addressSpan / entriesInMasterIndex; + // Project how large we expect a chapter to be + params->numBitsPerChapter = getDeltaMemorySize(recordsPerChapter, + params->meanDelta, + params->chapterBits); + // Project how large we expect the index to be + size_t numBitsPerIndex = params->numBitsPerChapter * chaptersInMasterIndex; + size_t expectedIndexSize = numBitsPerIndex / CHAR_BIT; + /* + * Set the total memory to be 6% larger than the expected index size. We + * want this number to be large enough that the we do not do a great many + * rebalances as the list when the list is full. We use MasterIndex_p1 + * to tune this setting. 
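+ *
+ * As a worked example (the numbers are only illustrative): a volume with
+ * 1024 chapters gets invalidChapters = max(1024 / 256, 2) = 4, so the
+ * index is sized to hold 1028 chapters' worth of entries. If the projected
+ * size of that index is 100 MB, then memorySize is set to 106 MB and
+ * targetFreeSize to 5 MB.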
+ */ + params->memorySize = expectedIndexSize * 106 / 100; + // Set the target free size to 5% of the expected index size + params->targetFreeSize = expectedIndexSize / 20; + return UDS_SUCCESS; +} + +/***********************************************************************/ +int computeMasterIndexSaveBytes005(const Configuration *config, + size_t *numBytes) +{ + Parameters005 params = { .addressBits = 0 }; + int result = computeMasterIndexParameters005(config, ¶ms); + if (result != UDS_SUCCESS) { + return result; + } + // Saving a MasterIndex005 needs a header plus one uint64_t per delta + // list plus the delta index. + *numBytes = (sizeof(struct mi005_data) + + params.numDeltaLists * sizeof(uint64_t) + + computeDeltaIndexSaveBytes(params.numDeltaLists, + params.memorySize)); + return UDS_SUCCESS; +} + +/***********************************************************************/ +int makeMasterIndex005(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) +{ + Parameters005 params = { .addressBits = 0 }; + int result = computeMasterIndexParameters005(config, ¶ms); + if (result != UDS_SUCCESS) { + return result; + } + + MasterIndex5 *mi5; + result = ALLOCATE(1, MasterIndex5, "master index", &mi5); + if (result != UDS_SUCCESS) { + *masterIndex = NULL; + return result; + } + + mi5->common.abortRestoringMasterIndex = abortRestoringMasterIndex_005; + mi5->common.abortSavingMasterIndex = abortSavingMasterIndex_005; + mi5->common.finishSavingMasterIndex = finishSavingMasterIndex_005; + mi5->common.freeMasterIndex = freeMasterIndex_005; + mi5->common.getMasterIndexMemoryUsed = getMasterIndexMemoryUsed_005; + mi5->common.getMasterIndexRecord = getMasterIndexRecord_005; + mi5->common.getMasterIndexStats = getMasterIndexStats_005; + mi5->common.getMasterIndexZone = getMasterIndexZone_005; + mi5->common.isMasterIndexSample = isMasterIndexSample_005; + mi5->common.isRestoringMasterIndexDone = isRestoringMasterIndexDone_005; + mi5->common.isSavingMasterIndexDone = isSavingMasterIndexDone_005; + mi5->common.lookupMasterIndexName = lookupMasterIndexName_005; + mi5->common.lookupMasterIndexSampledName = lookupMasterIndexSampledName_005; + mi5->common.restoreDeltaListToMasterIndex = restoreDeltaListToMasterIndex_005; + mi5->common.setMasterIndexOpenChapter = setMasterIndexOpenChapter_005; + mi5->common.setMasterIndexTag = setMasterIndexTag_005; + mi5->common.setMasterIndexZoneOpenChapter = setMasterIndexZoneOpenChapter_005; + mi5->common.startRestoringMasterIndex = startRestoringMasterIndex_005; + mi5->common.startSavingMasterIndex = startSavingMasterIndex_005; + + mi5->addressBits = params.addressBits; + mi5->addressMask = (1u << params.addressBits) - 1; + mi5->chapterBits = params.chapterBits; + mi5->chapterMask = (1u << params.chapterBits) - 1; + mi5->numChapters = params.numChapters; + mi5->numDeltaLists = params.numDeltaLists; + mi5->numZones = numZones; + mi5->chapterZoneBits = params.numBitsPerChapter / numZones; + mi5->volumeNonce = volumeNonce; + + result = initializeDeltaIndex(&mi5->deltaIndex, numZones, + params.numDeltaLists, params.meanDelta, + params.chapterBits, params.memorySize); + if (result == UDS_SUCCESS) { + mi5->maxZoneBits = ((getDeltaIndexDlistBitsAllocated(&mi5->deltaIndex) + - params.targetFreeSize * CHAR_BIT) + / numZones); + } + + // Initialize the chapter flush ranges to be empty. This depends upon + // allocate returning zeroed memory. 
+ if (result == UDS_SUCCESS) { + result = ALLOCATE(params.numDeltaLists, uint64_t, + "first chapter to flush", &mi5->flushChapters); + } + + // Initialize the virtual chapter ranges to start at zero. This depends + // upon allocate returning zeroed memory. + if (result == UDS_SUCCESS) { + result = ALLOCATE(numZones, MasterIndexZone, "master index zones", + &mi5->masterZones); + } + + if (result == UDS_SUCCESS) { + *masterIndex = &mi5->common; + } else { + freeMasterIndex_005(&mi5->common); + *masterIndex = NULL; + } + return result; +} diff --git a/uds/masterIndex005.h b/uds/masterIndex005.h new file mode 100644 index 0000000..5436c7f --- /dev/null +++ b/uds/masterIndex005.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/masterIndex005.h#1 $ + */ + +#ifndef MASTERINDEX005_H +#define MASTERINDEX005_H 1 + +#include "masterIndexOps.h" + +/** + * Make a new master index. + * + * @param config The configuration of the master index + * @param numZones The number of zones + * @param volumeNonce The nonce used to authenticate the index + * @param masterIndex Location to hold new master index ptr + * + * @return error code or UDS_SUCCESS + **/ +int makeMasterIndex005(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) + __attribute__((warn_unused_result)); + +/** + * Compute the number of bytes required to save a master index of a given + * configuration. + * + * @param config The configuration of the master index + * @param numBytes The number of bytes required to save the master index + * + * @return UDS_SUCCESS or an error code. + **/ +int computeMasterIndexSaveBytes005(const Configuration *config, + size_t *numBytes) + __attribute__((warn_unused_result)); + +#endif /* MASTERINDEX005_H */ diff --git a/uds/masterIndex006.c b/uds/masterIndex006.c new file mode 100644 index 0000000..3e1ef00 --- /dev/null +++ b/uds/masterIndex006.c @@ -0,0 +1,791 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/masterIndex006.c#2 $
+ */
+#include "masterIndex006.h"
+
+#include "buffer.h"
+#include "compiler.h"
+#include "errors.h"
+#include "hashUtils.h"
+#include "logger.h"
+#include "masterIndex005.h"
+#include "memoryAlloc.h"
+#include "permassert.h"
+#include "threads.h"
+#include "uds.h"
+
+/*
+ * The master index is kept as a wrapper around 2 master index
+ * implementations, one for dense chapters and one for sparse chapters.
+ * Methods will be routed to one or the other, or both, depending on the
+ * method and data passed in.
+ *
+ * The master index is divided into zones, and in normal operation there is
+ * one thread operating on each zone. Any operation that operates on all
+ * the zones needs to do its operation at a safe point that ensures that
+ * only one thread is operating on the master index.
+ *
+ * The only multithreaded operation supported by the sparse master index is
+ * the lookupMasterIndexName() method. It is called by the thread that
+ * assigns an index request to the proper zone, and needs to do a master
+ * index query for sampled chunk names. The zone mutexes are used to make
+ * this lookup operation safe.
+ */
+
+typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) masterIndexZone {
+  Mutex hookMutex; // Protects the sampled index in this zone
+} MasterIndexZone;
+
+typedef struct {
+  MasterIndex common; // Common master index methods
+  unsigned int sparseSampleRate; // The sparse sample rate
+  unsigned int numZones; // The number of zones
+  MasterIndex *miNonHook; // The non-hook index
+  MasterIndex *miHook; // The hook index == sample index
+  MasterIndexZone *masterZones; // The zones
+} MasterIndex6;
+
+/**
+ * Determine whether a given chunk name is a hook.
+ *
+ * @param masterIndex The master index
+ * @param name The block name
+ *
+ * @return whether to use as sample
+ **/
+static INLINE bool isMasterIndexSample_006(const MasterIndex *masterIndex,
+                                           const UdsChunkName *name)
+{
+  const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6,
+                                               common);
+  return (extractSamplingBytes(name) % mi6->sparseSampleRate) == 0;
+}
+
+/***********************************************************************/
+/**
+ * Get the subindex for the given chunk name
+ *
+ * @param masterIndex The master index
+ * @param name The block name
+ *
+ * @return the subindex
+ **/
+static INLINE MasterIndex *getSubIndex(const MasterIndex *masterIndex,
+                                       const UdsChunkName *name)
+{
+  const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6,
+                                               common);
+  return (isMasterIndexSample_006(masterIndex, name)
+          ? 
mi6->miHook + : mi6->miNonHook); +} + +/***********************************************************************/ +/** + * Terminate and clean up the master index + * + * @param masterIndex The master index to terminate + **/ +static void freeMasterIndex_006(MasterIndex *masterIndex) +{ + if (masterIndex != NULL) { + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + if (mi6->masterZones != NULL) { + unsigned int zone; + for (zone = 0; zone < mi6->numZones; zone++) { + destroyMutex(&mi6->masterZones[zone].hookMutex); + } + FREE(mi6->masterZones); + mi6->masterZones = NULL; + } + if (mi6->miNonHook != NULL) { + freeMasterIndex(mi6->miNonHook); + mi6->miNonHook = NULL; + } + if (mi6->miHook != NULL) { + freeMasterIndex(mi6->miHook); + mi6->miHook = NULL; + } + FREE(masterIndex); + } +} + +/***********************************************************************/ +/** + * Constants and structures for the saved master index file. "MI6" is for + * masterIndex006, and "-XXXX" is a number to increment when the format of + * the data changes. + **/ +enum { MAGIC_SIZE = 8 }; +static const char MAGIC_MI_START[] = "MI6-0001"; + +struct mi006_data { + char magic[MAGIC_SIZE]; // MAGIC_MI_START + unsigned int sparseSampleRate; +}; + +/***********************************************************************/ +/** + * Set the tag value used when saving and/or restoring a master index. + * + * @param masterIndex The master index + * @param tag The tag value + **/ +static void setMasterIndexTag_006(MasterIndex *masterIndex + __attribute__((unused)), + byte tag __attribute__((unused))) +{ +} + +/***********************************************************************/ +__attribute__((warn_unused_result)) +static int encodeMasterIndexHeader(Buffer *buffer, struct mi006_data *header) +{ + int result = putBytes(buffer, MAGIC_SIZE, MAGIC_MI_START); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->sparseSampleRate); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(struct mi006_data), + "%zu bytes of config written, of %zu expected", + contentLength(buffer), sizeof(struct mi006_data)); + return result; +} + +/** + * Start saving a master index to a buffered output stream. 
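+ *
+ * For this combined index, each zone's stream begins with a small header
+ * (the magic number and the sparse sample rate), followed by the non-hook
+ * and then the hook sub-index data for that zone.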
+ * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * @param bufferedWriter The index state component being written + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int startSavingMasterIndex_006(const MasterIndex *masterIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + Buffer *buffer; + int result = makeBuffer(sizeof(struct mi006_data), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + struct mi006_data header; + memset(&header, 0, sizeof(header)); + memcpy(header.magic, MAGIC_MI_START, MAGIC_SIZE); + header.sparseSampleRate = mi6->sparseSampleRate; + result = encodeMasterIndexHeader(buffer, &header); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(bufferedWriter, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "failed to write master index header"); + return result; + } + + result = startSavingMasterIndex(mi6->miNonHook, zoneNumber, bufferedWriter); + if (result != UDS_SUCCESS) { + return result; + } + + result = startSavingMasterIndex(mi6->miHook, zoneNumber, bufferedWriter); + if (result != UDS_SUCCESS) { + return result; + } + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Have all the data been written while saving a master index to an output + * stream? If the answer is yes, it is still necessary to call + * finishSavingMasterIndex(), which will return quickly. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return true if all the data are written + **/ +static bool isSavingMasterIndexDone_006(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + return (isSavingMasterIndexDone(mi6->miNonHook, zoneNumber) + && isSavingMasterIndexDone(mi6->miHook, zoneNumber)); +} + +/***********************************************************************/ +/** + * Finish saving a master index to an output stream. Force the writing of + * all of the remaining data. If an error occurred asynchronously during + * the save operation, it will be returned here. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int finishSavingMasterIndex_006(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + int result = finishSavingMasterIndex(mi6->miNonHook, zoneNumber); + if (result == UDS_SUCCESS) { + result = finishSavingMasterIndex(mi6->miHook, zoneNumber); + } + return result; +} + +/***********************************************************************/ +/** + * Abort saving a master index to an output stream. If an error occurred + * asynchronously during the save operation, it will be dropped. 
+ * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int abortSavingMasterIndex_006(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + int result = abortSavingMasterIndex(mi6->miNonHook, zoneNumber); + int result2 = abortSavingMasterIndex(mi6->miHook, zoneNumber); + if (result == UDS_SUCCESS) { + result = result2; + } + return result; +} + +/***********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeMasterIndexHeader(Buffer *buffer, struct mi006_data *header) +{ + int result = getBytesFromBuffer(buffer, sizeof(header->magic), + &header->magic); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->sparseSampleRate); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer) - contentLength(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + result = UDS_CORRUPT_COMPONENT; + } + return result; +} + +/** + * Start restoring the master index from multiple buffered readers + * + * @param masterIndex The master index to restore into + * @param bufferedReaders The buffered reader to read the master index from + * @param numReaders The number of buffered readers + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int startRestoringMasterIndex_006(MasterIndex *masterIndex, + BufferedReader **bufferedReaders, + int numReaders) +{ + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + int result = ASSERT_WITH_ERROR_CODE(masterIndex != NULL, UDS_BAD_STATE, + "cannot restore to null master index"); + if (result != UDS_SUCCESS) { + return result; + } + + int i; + for (i = 0; i < numReaders; i++) { + Buffer *buffer; + result = makeBuffer(sizeof(struct mi006_data), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(bufferedReaders[i], + getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logWarningWithStringError(result, + "failed to read master index header"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + struct mi006_data header; + result = decodeMasterIndexHeader(buffer, &header); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + if (memcmp(header.magic, MAGIC_MI_START, MAGIC_SIZE) != 0) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "master index file had bad magic" + " number"); + } + if (i == 0) { + mi6->sparseSampleRate = header.sparseSampleRate; + } else if (mi6->sparseSampleRate != header.sparseSampleRate) { + logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "Inconsistent sparse sample rate in delta" + " index zone files: %u vs. 
%u", + mi6->sparseSampleRate, + header.sparseSampleRate); + return UDS_CORRUPT_COMPONENT; + } + } + + result = startRestoringMasterIndex(mi6->miNonHook, bufferedReaders, + numReaders); + if (result != UDS_SUCCESS) { + return result; + } + return startRestoringMasterIndex(mi6->miHook, bufferedReaders, numReaders); +} + +/***********************************************************************/ +/** + * Have all the data been read while restoring a master index from an + * input stream? + * + * @param masterIndex The master index to restore into + * + * @return true if all the data are read + **/ +static bool isRestoringMasterIndexDone_006(const MasterIndex *masterIndex) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + return (isRestoringMasterIndexDone(mi6->miNonHook) + && isRestoringMasterIndexDone(mi6->miHook)); +} + +/***********************************************************************/ +/** + * Restore a saved delta list + * + * @param masterIndex The master index to restore into + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * + * @return error code or UDS_SUCCESS + **/ +static int restoreDeltaListToMasterIndex_006(MasterIndex *masterIndex, + const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) +{ + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + int result = restoreDeltaListToMasterIndex(mi6->miNonHook, dlsi, data); + if (result != UDS_SUCCESS) { + result = restoreDeltaListToMasterIndex(mi6->miHook, dlsi, data); + } + return result; +} + +/***********************************************************************/ +/** + * Abort restoring a master index from an input stream. + * + * @param masterIndex The master index + **/ +static void abortRestoringMasterIndex_006(MasterIndex *masterIndex) +{ + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + abortRestoringMasterIndex(mi6->miNonHook); + abortRestoringMasterIndex(mi6->miHook); +} + +/***********************************************************************/ +/** + * Set the open chapter number on a zone. The master index zone will be + * modified to index the proper number of chapters ending with the new open + * chapter. + * + * @param masterIndex The master index + * @param zoneNumber The zone number + * @param virtualChapter The new open chapter number + **/ +static void setMasterIndexZoneOpenChapter_006(MasterIndex *masterIndex, + unsigned int zoneNumber, + uint64_t virtualChapter) +{ + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + setMasterIndexZoneOpenChapter(mi6->miNonHook, zoneNumber, virtualChapter); + + // We need to prevent a lookupMasterIndexName() happening while we are + // changing the open chapter number + Mutex *mutex = &mi6->masterZones[zoneNumber].hookMutex; + lockMutex(mutex); + setMasterIndexZoneOpenChapter(mi6->miHook, zoneNumber, virtualChapter); + unlockMutex(mutex); +} + +/***********************************************************************/ +/** + * Set the open chapter number. The master index will be modified to index + * the proper number of chapters ending with the new open chapter. 
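+ *
+ * Since this operates on every zone, the threading rules described at the
+ * top of this file apply: it must be done at a safe point where no other
+ * thread is operating on the master index.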
+ * + * @param masterIndex The master index + * @param virtualChapter The new open chapter number + **/ +static void setMasterIndexOpenChapter_006(MasterIndex *masterIndex, + uint64_t virtualChapter) +{ + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + unsigned int zone; + for (zone = 0; zone < mi6->numZones; zone++) { + setMasterIndexZoneOpenChapter_006(masterIndex, zone, virtualChapter); + } +} + +/***********************************************************************/ +/** + * Find the master index zone associated with a chunk name + * + * @param masterIndex The master index + * @param name The chunk name + * + * @return the zone that the chunk name belongs to + **/ +static unsigned int getMasterIndexZone_006(const MasterIndex *masterIndex, + const UdsChunkName *name) +{ + return getMasterIndexZone(getSubIndex(masterIndex, name), name); +} + +/***********************************************************************/ +/** + * Do a quick read-only lookup of the chunk name and return information + * needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name + * + * @return UDS_SUCCESS or an error code + **/ +static int lookupMasterIndexName_006(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + triage->isSample = isMasterIndexSample_006(masterIndex, name); + triage->inSampledChapter = false; + triage->zone = getMasterIndexZone_006(masterIndex, name); + int result = UDS_SUCCESS; + if (triage->isSample) { + Mutex *mutex = &mi6->masterZones[triage->zone].hookMutex; + lockMutex(mutex); + result = lookupMasterIndexSampledName(mi6->miHook, name, triage); + unlockMutex(mutex); + } + return result; +} + +/***********************************************************************/ +/** + * Do a quick read-only lookup of the sampled chunk name and return + * information needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name. The zone and + * isSample fields are already filled in. Set + * inSampledChapter and virtualChapter if the chunk + * name is found in the index. + * + * @return UDS_SUCCESS or an error code + **/ +static int lookupMasterIndexSampledName_006(const MasterIndex *masterIndex + __attribute__((unused)), + const UdsChunkName *name + __attribute__((unused)), + MasterIndexTriage *triage + __attribute__((unused))) +{ + return ASSERT_WITH_ERROR_CODE(false, UDS_BAD_STATE, + "%s should not be called", __func__); +} + +/***********************************************************************/ +/** + * Find the master index record associated with a block name + * + * This is always the first routine to be called when dealing with a delta + * master index entry. The fields of the record parameter should be + * examined to determine the state of the record: + * + * If isFound is false, then we did not find an entry for the block + * name. Information is saved in the MasterIndexRecord so that + * putMasterIndexRecord() will insert an entry for that block name at + * the proper place. + * + * If isFound is true, then we did find an entry for the block name. + * Information is saved in the MasterIndexRecord so that the "chapter" + * and "isCollision" fields reflect the entry found. 
+ * Calls to removeMasterIndexRecord() will remove the entry, calls to + * setMasterIndexRecordChapter() can modify the entry, and calls to + * putMasterIndexRecord() can insert a collision record with this + * entry. + * + * @param masterIndex The master index to search + * @param name The chunk name + * @param record Set to the info about the record searched for + * + * @return UDS_SUCCESS or an error code + **/ +static int getMasterIndexRecord_006(MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexRecord *record) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + int result; + if (isMasterIndexSample_006(masterIndex, name)) { + /* + * We need to prevent a lookupMasterIndexName() happening while we are + * finding the master index record. Remember that because of lazy LRU + * flushing of the master index, getMasterIndexRecord() is not a + * read-only operation. + */ + unsigned int zone = getMasterIndexZone(mi6->miHook, name); + Mutex *mutex = &mi6->masterZones[zone].hookMutex; + lockMutex(mutex); + result = getMasterIndexRecord(mi6->miHook, name, record); + unlockMutex(mutex); + // Remember the mutex so that other operations on the MasterIndexRecord + // can use it + record->mutex = mutex; + } else { + result = getMasterIndexRecord(mi6->miNonHook, name, record); + } + return result; +} + +/***********************************************************************/ +/** + * Get the number of bytes used for master index entries. + * + * @param masterIndex The master index + * + * @return The number of bytes in use + **/ +static size_t getMasterIndexMemoryUsed_006(const MasterIndex *masterIndex) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + return (getMasterIndexMemoryUsed(mi6->miNonHook) + + getMasterIndexMemoryUsed(mi6->miHook)); +} + +/***********************************************************************/ +/** + * Return the master index stats. There is only one portion of the master + * index in this implementation, and we call it the dense portion of the + * index. 
+ * + * @param masterIndex The master index + * @param dense Stats for the dense portion of the index + * @param sparse Stats for the sparse portion of the index + **/ +static void getMasterIndexStats_006(const MasterIndex *masterIndex, + MasterIndexStats *dense, + MasterIndexStats *sparse) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + MasterIndexStats dummyStats; + getMasterIndexStats(mi6->miNonHook, dense, &dummyStats); + getMasterIndexStats(mi6->miHook, sparse, &dummyStats); +} + +/***********************************************************************/ +typedef struct { + Configuration hookConfig; // Describe the hook part of the index + Geometry hookGeometry; + Configuration nonHookConfig; // Describe the non-hook part of the index + Geometry nonHookGeometry; +} SplitConfig; + +/***********************************************************************/ +static int splitConfiguration006(const Configuration *config, + SplitConfig *split) +{ + int result + = ASSERT_WITH_ERROR_CODE(config->geometry->sparseChaptersPerVolume != 0, + UDS_INVALID_ARGUMENT, + "cannot initialize sparse+dense master index" + " with no sparse chapters"); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_WITH_ERROR_CODE(config->sparseSampleRate != 0, + UDS_INVALID_ARGUMENT, + "cannot initialize sparse+dense master" + " index with a sparse sample rate of %u", + config->sparseSampleRate); + if (result != UDS_SUCCESS) { + return result; + } + + // Start with copies of the base configuration + split->hookConfig = *config; + split->hookGeometry = *config->geometry; + split->hookConfig.geometry = &split->hookGeometry; + split->nonHookConfig = *config; + split->nonHookGeometry = *config->geometry; + split->nonHookConfig.geometry = &split->nonHookGeometry; + + uint64_t sampleRate = config->sparseSampleRate; + uint64_t numChapters = config->geometry->chaptersPerVolume; + uint64_t numSparseChapters = config->geometry->sparseChaptersPerVolume; + uint64_t numDenseChapters = numChapters - numSparseChapters; + uint64_t sampleRecords = config->geometry->recordsPerChapter / sampleRate; + + // Adjust the number of records indexed for each chapter + split->hookGeometry.recordsPerChapter = sampleRecords; + split->nonHookGeometry.recordsPerChapter -= sampleRecords; + + // Adjust the number of chapters indexed + split->hookGeometry.sparseChaptersPerVolume = 0; + split->nonHookGeometry.sparseChaptersPerVolume = 0; + split->nonHookGeometry.chaptersPerVolume = numDenseChapters; + return UDS_SUCCESS; +} + +/***********************************************************************/ +int computeMasterIndexSaveBytes006(const Configuration *config, + size_t *numBytes) +{ + SplitConfig split; + int result = splitConfiguration006(config, &split); + if (result != UDS_SUCCESS) { + return result; + } + size_t hookBytes, nonHookBytes; + result = computeMasterIndexSaveBytes005(&split.hookConfig, &hookBytes); + if (result != UDS_SUCCESS) { + return result; + } + result = computeMasterIndexSaveBytes005(&split.nonHookConfig, &nonHookBytes); + if (result != UDS_SUCCESS) { + return result; + } + // Saving a MasterIndex006 needs a header plus the hook index plus the + // non-hook index + *numBytes = sizeof(struct mi006_data) + hookBytes + nonHookBytes; + return UDS_SUCCESS; +} + +/***********************************************************************/ +int makeMasterIndex006(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) +{ + SplitConfig 
split; + int result = splitConfiguration006(config, &split); + if (result != UDS_SUCCESS) { + return result; + } + + MasterIndex6 *mi6; + result = ALLOCATE(1, MasterIndex6, "master index", &mi6); + if (result != UDS_SUCCESS) { + return result; + } + + mi6->common.abortRestoringMasterIndex = abortRestoringMasterIndex_006; + mi6->common.abortSavingMasterIndex = abortSavingMasterIndex_006; + mi6->common.finishSavingMasterIndex = finishSavingMasterIndex_006; + mi6->common.freeMasterIndex = freeMasterIndex_006; + mi6->common.getMasterIndexMemoryUsed = getMasterIndexMemoryUsed_006; + mi6->common.getMasterIndexRecord = getMasterIndexRecord_006; + mi6->common.getMasterIndexStats = getMasterIndexStats_006; + mi6->common.getMasterIndexZone = getMasterIndexZone_006; + mi6->common.isMasterIndexSample = isMasterIndexSample_006; + mi6->common.isRestoringMasterIndexDone = isRestoringMasterIndexDone_006; + mi6->common.isSavingMasterIndexDone = isSavingMasterIndexDone_006; + mi6->common.lookupMasterIndexName = lookupMasterIndexName_006; + mi6->common.lookupMasterIndexSampledName = lookupMasterIndexSampledName_006; + mi6->common.restoreDeltaListToMasterIndex = restoreDeltaListToMasterIndex_006; + mi6->common.setMasterIndexOpenChapter = setMasterIndexOpenChapter_006; + mi6->common.setMasterIndexTag = setMasterIndexTag_006; + mi6->common.setMasterIndexZoneOpenChapter = setMasterIndexZoneOpenChapter_006; + mi6->common.startRestoringMasterIndex = startRestoringMasterIndex_006; + mi6->common.startSavingMasterIndex = startSavingMasterIndex_006; + + mi6->numZones = numZones; + mi6->sparseSampleRate = config->sparseSampleRate; + + result = ALLOCATE(numZones, MasterIndexZone, "master index zones", + &mi6->masterZones); + unsigned int zone; + for (zone = 0; zone < numZones; zone++) { + if (result == UDS_SUCCESS) { + result = initMutex(&mi6->masterZones[zone].hookMutex); + } + } + if (result != UDS_SUCCESS) { + freeMasterIndex_006(&mi6->common); + return result; + } + + result = makeMasterIndex005(&split.nonHookConfig, numZones, volumeNonce, + &mi6->miNonHook); + if (result != UDS_SUCCESS) { + freeMasterIndex_006(&mi6->common); + return logErrorWithStringError(result, + "Error creating non hook master index"); + } + setMasterIndexTag(mi6->miNonHook, 'd'); + + result = makeMasterIndex005(&split.hookConfig, numZones, volumeNonce, + &mi6->miHook); + if (result != UDS_SUCCESS) { + freeMasterIndex_006(&mi6->common); + return logErrorWithStringError(result, + "Error creating hook master index"); + } + setMasterIndexTag(mi6->miHook, 's'); + + *masterIndex = &mi6->common; + return UDS_SUCCESS; +} diff --git a/uds/masterIndex006.h b/uds/masterIndex006.h new file mode 100644 index 0000000..1d3b377 --- /dev/null +++ b/uds/masterIndex006.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/masterIndex006.h#1 $ + */ + +#ifndef MASTERINDEX006_H +#define MASTERINDEX006_H 1 + +#include "masterIndexOps.h" + +/** + * Make a new master index. + * + * @param config The configuration of the master index + * @param numZones The number of zones + * @param volumeNonce The nonce used to authenticate the index + * @param masterIndex Location to hold new master index ptr + * + * @return error code or UDS_SUCCESS + **/ +int makeMasterIndex006(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) + __attribute__((warn_unused_result)); + +/** + * Compute the number of bytes required to save a master index of a given + * configuration. + * + * @param config The configuration of the master index + * @param numBytes The number of bytes required to save the master index + * + * @return UDS_SUCCESS or an error code. + **/ +int computeMasterIndexSaveBytes006(const Configuration *config, + size_t *numBytes) + __attribute__((warn_unused_result)); + +#endif /* MASTERINDEX006_H */ diff --git a/uds/masterIndexOps.c b/uds/masterIndexOps.c new file mode 100644 index 0000000..1cbd10b --- /dev/null +++ b/uds/masterIndexOps.c @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/masterIndexOps.c#4 $ + */ +#include "masterIndexOps.h" + +#include "compiler.h" +#include "errors.h" +#include "indexComponent.h" +#include "logger.h" +#include "masterIndex005.h" +#include "masterIndex006.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "uds.h" +#include "zone.h" + +/**********************************************************************/ +static INLINE bool usesSparse(const Configuration *config) +{ + return config->geometry->sparseChaptersPerVolume > 0; +} + +/**********************************************************************/ +void getMasterIndexCombinedStats(const MasterIndex *masterIndex, + MasterIndexStats *stats) +{ + MasterIndexStats dense, sparse; + getMasterIndexStats(masterIndex, &dense, &sparse); + stats->memoryAllocated = dense.memoryAllocated + sparse.memoryAllocated; + stats->rebalanceTime = dense.rebalanceTime + sparse.rebalanceTime; + stats->rebalanceCount = dense.rebalanceCount + sparse.rebalanceCount; + stats->recordCount = dense.recordCount + sparse.recordCount; + stats->collisionCount = dense.collisionCount + sparse.collisionCount; + stats->discardCount = dense.discardCount + sparse.discardCount; + stats->overflowCount = dense.overflowCount + sparse.overflowCount; + stats->numLists = dense.numLists + sparse.numLists; + stats->earlyFlushes = dense.earlyFlushes + sparse.earlyFlushes; +} + +/**********************************************************************/ +int makeMasterIndex(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) +{ + if (usesSparse(config)) { + return makeMasterIndex006(config, numZones, volumeNonce, masterIndex); + } else { + return makeMasterIndex005(config, numZones, volumeNonce, masterIndex); + } +} + +/**********************************************************************/ +int computeMasterIndexSaveBlocks(const Configuration *config, + size_t blockSize, uint64_t *blockCount) +{ + size_t numBytes; + int result = (usesSparse(config) + ? 
computeMasterIndexSaveBytes006(config, &numBytes) + : computeMasterIndexSaveBytes005(config, &numBytes)); + if (result != UDS_SUCCESS) { + return result; + } + numBytes += sizeof(DeltaListSaveInfo); + *blockCount = (numBytes + blockSize - 1) / blockSize + MAX_ZONES; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int readMasterIndex(ReadPortal *portal) +{ + MasterIndex *masterIndex = indexComponentContext(portal->component); + unsigned int numZones = portal->zones; + if (numZones > MAX_ZONES) { + return logErrorWithStringError(UDS_BAD_STATE, + "zone count %u must not exceed MAX_ZONES", + numZones); + } + + BufferedReader *readers[MAX_ZONES]; + unsigned int z; + for (z = 0; z < numZones; ++z) { + int result = getBufferedReaderForPortal(portal, z, &readers[z]); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot read component for zone %u", z); + } + } + return restoreMasterIndex(readers, numZones, masterIndex); +} + +/**********************************************************************/ +static int writeMasterIndex(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone, + IncrementalWriterCommand command, + bool *completed) +{ + MasterIndex *masterIndex = indexComponentContext(component); + bool isComplete = false; + + int result = UDS_SUCCESS; + + switch (command) { + case IWC_START: + result = startSavingMasterIndex(masterIndex, zone, writer); + isComplete = result != UDS_SUCCESS; + break; + case IWC_CONTINUE: + isComplete = isSavingMasterIndexDone(masterIndex, zone); + break; + case IWC_FINISH: + result = finishSavingMasterIndex(masterIndex, zone); + if (result == UDS_SUCCESS) { + result = writeGuardDeltaList(writer); + } + isComplete = true; + break; + case IWC_ABORT: + result = abortSavingMasterIndex(masterIndex, zone); + isComplete = true; + break; + default: + result = logWarningWithStringError(UDS_INVALID_ARGUMENT, + "Invalid writer command"); + break; + } + if (completed != NULL) { + *completed = isComplete; + } + return result; +} + +/**********************************************************************/ + +static const IndexComponentInfo MASTER_INDEX_INFO_DATA = { + .kind = RL_KIND_MASTER_INDEX, + .name = "master index", + .saveOnly = false, + .chapterSync = false, + .multiZone = true, + .ioStorage = true, + .loader = readMasterIndex, + .saver = NULL, + .incremental = writeMasterIndex, +}; +const IndexComponentInfo *const MASTER_INDEX_INFO = &MASTER_INDEX_INFO_DATA; + +/**********************************************************************/ +static int restoreMasterIndexBody(BufferedReader **bufferedReaders, + unsigned int numReaders, + MasterIndex *masterIndex, + byte dlData[DELTA_LIST_MAX_BYTE_COUNT]) +{ + // Start by reading the "header" section of the stream + int result = startRestoringMasterIndex(masterIndex, bufferedReaders, + numReaders); + if (result != UDS_SUCCESS) { + return result; + } + // Loop to read the delta lists, stopping when they have all been processed. 
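+  // (The saver ends each zone's data with a guard delta list; the loop
+  // below stops when readSavedDeltaList() reports UDS_END_OF_FILE.)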
+ unsigned int z; + for (z = 0; z < numReaders; z++) { + for (;;) { + DeltaListSaveInfo dlsi; + result = readSavedDeltaList(&dlsi, dlData, bufferedReaders[z]); + if (result == UDS_END_OF_FILE) { + break; + } else if (result != UDS_SUCCESS) { + abortRestoringMasterIndex(masterIndex); + return result; + } + result = restoreDeltaListToMasterIndex(masterIndex, &dlsi, dlData); + if (result != UDS_SUCCESS) { + abortRestoringMasterIndex(masterIndex); + return result; + } + } + } + if (!isRestoringMasterIndexDone(masterIndex)) { + abortRestoringMasterIndex(masterIndex); + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "incomplete delta list data"); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int restoreMasterIndex(BufferedReader **bufferedReaders, + unsigned int numReaders, + MasterIndex *masterIndex) +{ + byte *dlData; + int result = ALLOCATE(DELTA_LIST_MAX_BYTE_COUNT, byte, __func__, &dlData); + if (result != UDS_SUCCESS) { + return result; + } + result = restoreMasterIndexBody(bufferedReaders, numReaders, masterIndex, + dlData); + FREE(dlData); + return result; +} diff --git a/uds/masterIndexOps.h b/uds/masterIndexOps.h new file mode 100644 index 0000000..90802ac --- /dev/null +++ b/uds/masterIndexOps.h @@ -0,0 +1,527 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/masterIndexOps.h#1 $ + */ + +#ifndef MASTERINDEXOPS_H +#define MASTERINDEXOPS_H 1 + +#include "compiler.h" +#include "deltaIndex.h" +#include "indexComponent.h" +#include "indexConfig.h" +#include "threads.h" +#include "uds.h" + +extern const IndexComponentInfo *const MASTER_INDEX_INFO; +extern unsigned int minMasterIndexDeltaLists; + +typedef struct masterIndex MasterIndex; + +typedef struct { + size_t memoryAllocated; // Number of bytes allocated + RelTime rebalanceTime; // The number of seconds spent rebalancing + int rebalanceCount; // Number of memory rebalances + long recordCount; // The number of records in the index + long collisionCount; // The number of collision records + long discardCount; // The number of records removed + long overflowCount; // The number of UDS_OVERFLOWs detected + unsigned int numLists; // The number of delta lists + long earlyFlushes; // Number of early flushes +} MasterIndexStats; + +/* + * The MasterIndexTriage structure is used by lookupMasterIndexName(), + * which is a read-only operation that looks at the chunk name and returns + * some information used by the index to select the thread/queue/code_path + * that will process the chunk. 
+ */ +typedef struct { + uint64_t virtualChapter; // If inSampledChapter is true, then this is the + // chapter containing the entry for the chunk name + unsigned int zone; // The zone containing the chunk name + bool isSample; // If true, this chunk name belongs to the + // sampled index + bool inSampledChapter; // If true, this chunk already has an entry in the + // sampled index and virtualChapter is valid +} MasterIndexTriage; + +/* + * The MasterIndexRecord structure is used for normal index read-write + * processing of a chunk name. The first call must be to + * getMasterIndexRecord() to find the master index record for a chunk name. + * This call can be followed by putMasterIndexRecord() to add a master + * index record, or by setMasterIndexRecordChapter() to associate the chunk + * name with a different chapter, or by removeMasterIndexRecord() to delete + * a master index record. + */ +typedef struct { + // Public fields + uint64_t virtualChapter; // Chapter where the block info is found + bool isCollision; // This record is a collision + bool isFound; // This record is the block searched for + + // Private fields + unsigned char magic; // The magic number for valid records + unsigned int zoneNumber; // Zone that contains this block + MasterIndex *masterIndex; // The master index + Mutex *mutex; // Mutex that must be held while accessing + // this delta index entry; used only for + // a sampled index; otherwise is NULL + const UdsChunkName *name; // The blockname to which this record refers + DeltaIndexEntry deltaEntry; // The delta index entry for this record +} MasterIndexRecord; + +struct masterIndex { + void (*abortRestoringMasterIndex)(MasterIndex *masterIndex); + int (*abortSavingMasterIndex)(const MasterIndex *masterIndex, + unsigned int zoneNumber); + int (*finishSavingMasterIndex)(const MasterIndex *masterIndex, + unsigned int zoneNumber); + void (*freeMasterIndex)(MasterIndex *masterIndex); + size_t (*getMasterIndexMemoryUsed)(const MasterIndex *masterIndex); + int (*getMasterIndexRecord)(MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexRecord *record); + void (*getMasterIndexStats)(const MasterIndex *masterIndex, + MasterIndexStats *dense, + MasterIndexStats *sparse); + unsigned int (*getMasterIndexZone)(const MasterIndex *masterIndex, + const UdsChunkName *name); + bool (*isMasterIndexSample)(const MasterIndex *masterIndex, + const UdsChunkName *name); + bool (*isRestoringMasterIndexDone)(const MasterIndex *masterIndex); + bool (*isSavingMasterIndexDone)(const MasterIndex *masterIndex, + unsigned int zoneNumber); + int (*lookupMasterIndexName)(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage); + int (*lookupMasterIndexSampledName)(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage); + int (*restoreDeltaListToMasterIndex)(MasterIndex *masterIndex, + const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]); + void (*setMasterIndexOpenChapter)(MasterIndex *masterIndex, + uint64_t virtualChapter); + void (*setMasterIndexTag)(MasterIndex *masterIndex, byte tag); + void (*setMasterIndexZoneOpenChapter)(MasterIndex *masterIndex, + unsigned int zoneNumber, + uint64_t virtualChapter); + int (*startRestoringMasterIndex)(MasterIndex *masterIndex, + BufferedReader **bufferedReaders, + int numReaders); + int (*startSavingMasterIndex)(const MasterIndex *masterIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter); +}; + +/** + * Return the combined master 
index stats. + * + * @param masterIndex The master index + * @param stats Combined stats for the index + **/ +void getMasterIndexCombinedStats(const MasterIndex *masterIndex, + MasterIndexStats *stats); + +/** + * Make a new master index. + * + * @param config The configuration of the master index + * @param numZones The number of zones + * @param volumeNonce The nonce used to store the index + * @param masterIndex Location to hold new master index ptr + * + * @return error code or UDS_SUCCESS + **/ +int makeMasterIndex(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) + __attribute__((warn_unused_result)); + +/** + * Compute the number of blocks required to save a master index of a given + * configuration. + * + * @param [in] config The configuration of a master index + * @param [in] blockSize The size of a block in bytes. + * @param [out] blockCount The resulting number of blocks. + * + * @return UDS_SUCCESS or an error code. + **/ +int computeMasterIndexSaveBlocks(const Configuration *config, + size_t blockSize, + uint64_t *blockCount) + __attribute__((warn_unused_result)); + +/** + * Restore a master index. This is exposed for unit tests. + * + * @param readers The readers to read from. + * @param numReaders The number of readers. + * @param masterIndex The master index + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int restoreMasterIndex(BufferedReader **readers, + unsigned int numReaders, + MasterIndex *masterIndex) + __attribute__((warn_unused_result)); + +/** + * Abort restoring a master index from an input stream. + * + * @param masterIndex The master index + **/ +static INLINE void abortRestoringMasterIndex(MasterIndex *masterIndex) +{ + masterIndex->abortRestoringMasterIndex(masterIndex); +} + +/** + * Abort saving a master index to an output stream. If an error occurred + * asynchronously during the save operation, it will be dropped. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static INLINE int abortSavingMasterIndex(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + return masterIndex->abortSavingMasterIndex(masterIndex, zoneNumber); +} + +/** + * Finish saving a master index to an output stream. Force the writing of + * all of the remaining data. If an error occurred asynchronously during + * the save operation, it will be returned here. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static INLINE int finishSavingMasterIndex(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + return masterIndex->finishSavingMasterIndex(masterIndex, zoneNumber); +} + +/** + * Terminate and clean up the master index + * + * @param masterIndex The master index to terminate + **/ +static INLINE void freeMasterIndex(MasterIndex *masterIndex) +{ + masterIndex->freeMasterIndex(masterIndex); +} + +/** + * Get the number of bytes used for master index entries. 
+ * + * @param masterIndex The master index + * + * @return The number of bytes in use + **/ +static INLINE size_t getMasterIndexMemoryUsed(const MasterIndex *masterIndex) +{ + return masterIndex->getMasterIndexMemoryUsed(masterIndex); +} + +/** + * Find the master index record associated with a block name + * + * This is always the first routine to be called when dealing with a delta + * master index entry. The fields of the record parameter should be + * examined to determine the state of the record: + * + * If isFound is false, then we did not find an entry for the block name. + * Information is saved in the MasterIndexRecord so that + * putMasterIndexRecord() will insert an entry for that block name at the + * proper place. + * + * If isFound is true, then we did find an entry for the block name. + * Information is saved in the MasterIndexRecord so that the "chapter" and + * "isCollision" fields reflect the entry found. Calls to + * removeMasterIndexRecord() will remove the entry, calls to + * setMasterIndexRecordChapter() can modify the entry, and calls to + * putMasterIndexRecord() can insert a collision record with this entry. + * + * @param masterIndex The master index to search + * @param name The chunk name + * @param record Set to the info about the record searched for + * + * @return UDS_SUCCESS or an error code + **/ +static INLINE int getMasterIndexRecord(MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexRecord *record) +{ + return masterIndex->getMasterIndexRecord(masterIndex, name, record); +} + +/** + * Return the master index stats. + * + * @param masterIndex The master index + * @param dense Stats for the dense portion of the index + * @param sparse Stats for the sparse portion of the index + **/ +static INLINE void getMasterIndexStats(const MasterIndex *masterIndex, + MasterIndexStats *dense, + MasterIndexStats *sparse) +{ + masterIndex->getMasterIndexStats(masterIndex, dense, sparse); +} + +/** + * Find the master index zone associated with a chunk name + * + * @param masterIndex The master index + * @param name The chunk name + * + * @return the zone that the chunk name belongs to + **/ +static INLINE unsigned int getMasterIndexZone(const MasterIndex *masterIndex, + const UdsChunkName *name) +{ + return masterIndex->getMasterIndexZone(masterIndex, name); +} + +/** + * Determine whether a given chunk name is a hook. + * + * @param masterIndex The master index + * @param name The block name + * + * @return whether to use as sample + **/ +static INLINE bool isMasterIndexSample(const MasterIndex *masterIndex, + const UdsChunkName *name) +{ + return masterIndex->isMasterIndexSample(masterIndex, name); +} + +/** + * Have all the data been read while restoring a master index from an input + * stream? + * + * @param masterIndex The master index to restore into + * + * @return true if all the data are read + **/ +static INLINE bool isRestoringMasterIndexDone(const MasterIndex *masterIndex) +{ + return masterIndex->isRestoringMasterIndexDone(masterIndex); +} + +/** + * Have all the data been written while saving a master index to an + * output stream? If the answer is yes, it is still necessary to call + * finishSavingMasterIndex(), which will return quickly. 
+ * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return true if all the data are written + **/ +static INLINE bool isSavingMasterIndexDone(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + return masterIndex->isSavingMasterIndexDone(masterIndex, zoneNumber); +} + +/** + * Do a quick read-only lookup of the chunk name and return information + * needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name + * + * @return UDS_SUCCESS or an error code + **/ +static INLINE int lookupMasterIndexName(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage) +{ + return masterIndex->lookupMasterIndexName(masterIndex, name, triage); +} + +/** + * Do a quick read-only lookup of the sampled chunk name and return + * information needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name. The zone and + * isSample fields are already filled in. Set + * inSampledChapter and virtualChapter if the chunk + * name is found in the index. + * + * @return UDS_SUCCESS or an error code + **/ +static INLINE int lookupMasterIndexSampledName(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage) +{ + return masterIndex->lookupMasterIndexSampledName(masterIndex, name, triage); +} + +/** + * Create a new record associated with a block name. + * + * @param record The master index record found by getRecord() + * @param virtualChapter The chapter number where block info is found + * + * @return UDS_SUCCESS or an error code + **/ +int putMasterIndexRecord(MasterIndexRecord *record, uint64_t virtualChapter) + __attribute__((warn_unused_result)); + +/** + * Remove an existing record. + * + * @param record The master index record found by getRecord() + * + * @return UDS_SUCCESS or an error code + **/ +int removeMasterIndexRecord(MasterIndexRecord *record) + __attribute__((warn_unused_result)); + +/** + * Restore a saved delta list + * + * @param masterIndex The master index to restore into + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * + * @return error code or UDS_SUCCESS + **/ +static INLINE int restoreDeltaListToMasterIndex(MasterIndex *masterIndex, + const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) +{ + return masterIndex->restoreDeltaListToMasterIndex(masterIndex, dlsi, data); +} + +/** + * Set the open chapter number. The master index will be modified to index + * the proper number of chapters ending with the new open chapter. + * + * In normal operation, the virtual chapter number will be the next chapter + * following the currently open chapter. We will advance the master index + * one chapter forward in the virtual chapter space, invalidating the + * oldest chapter in the index and be prepared to add index entries for the + * newly opened chapter. + * + * In abnormal operation we make a potentially large change to the range of + * chapters being indexed. This happens when we are replaying chapters or + * rebuilding an entire index. If we move the open chapter forward, we + * will invalidate many chapters (potentially the entire index). 
If we + * move the open chapter backward, we invalidate any entry in the newly + * open chapter and any higher numbered chapter (potentially the entire + * index). + * + * @param masterIndex The master index + * @param virtualChapter The new open chapter number + **/ +static INLINE void setMasterIndexOpenChapter(MasterIndex *masterIndex, + uint64_t virtualChapter) +{ + masterIndex->setMasterIndexOpenChapter(masterIndex, virtualChapter); +} + +/** + * Set the chapter number associated with a block name. + * + * @param record The master index record found by getRecord() + * @param virtualChapter The chapter number where block info is now found. + * + * @return UDS_SUCCESS or an error code + **/ +int setMasterIndexRecordChapter(MasterIndexRecord *record, uint64_t chapter) + __attribute__((warn_unused_result)); + +/** + * Set the tag value used when saving and/or restoring a master index. + * + * @param masterIndex The master index + * @param tag The tag value + **/ +static INLINE void setMasterIndexTag(MasterIndex *masterIndex, byte tag) +{ + masterIndex->setMasterIndexTag(masterIndex, tag); +} + +/** + * Set the open chapter number on a zone. The master index zone will be + * modified to index the proper number of chapters ending with the new open + * chapter. + * + * @param masterIndex The master index + * @param zoneNumber The zone number + * @param virtualChapter The new open chapter number + **/ +static INLINE void setMasterIndexZoneOpenChapter(MasterIndex *masterIndex, + unsigned int zoneNumber, + uint64_t virtualChapter) +{ + masterIndex->setMasterIndexZoneOpenChapter(masterIndex, zoneNumber, + virtualChapter); +} + +/** + * Start restoring the master index from multiple buffered readers + * + * @param masterIndex The master index to restore into + * @param bufferedReaders The buffered reader to read the master index from + * @param numReaders The number of buffered readers + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static INLINE int startRestoringMasterIndex(MasterIndex *masterIndex, + BufferedReader **bufferedReaders, + int numReaders) +{ + return masterIndex->startRestoringMasterIndex(masterIndex, bufferedReaders, + numReaders); +} + +/** + * Start saving a master index to a buffered output stream. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * @param bufferedWriter The index state component being written + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static INLINE int startSavingMasterIndex(const MasterIndex *masterIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter) +{ + return masterIndex->startSavingMasterIndex(masterIndex, zoneNumber, + bufferedWriter); +} + +#endif /* MASTERINDEXOPS_H */ diff --git a/uds/memoryAlloc.c b/uds/memoryAlloc.c new file mode 100644 index 0000000..e47494c --- /dev/null +++ b/uds/memoryAlloc.c @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/memoryAlloc.c#1 $ + */ + +#include "memoryAlloc.h" + +#include "stringUtils.h" + +/**********************************************************************/ +int duplicateString(const char *string, const char *what, char **newString) +{ + return memdup(string, strlen(string) + 1, what, newString); +} + +/**********************************************************************/ +int memdup(const void *buffer, size_t size, const char *what, void *dupPtr) +{ + byte *dup; + int result = ALLOCATE(size, byte, what, &dup); + if (result != UDS_SUCCESS) { + return result; + } + + memcpy(dup, buffer, size); + *((void **) dupPtr) = dup; + return UDS_SUCCESS; +} diff --git a/uds/memoryAlloc.h b/uds/memoryAlloc.h new file mode 100644 index 0000000..c669e2b --- /dev/null +++ b/uds/memoryAlloc.h @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/memoryAlloc.h#2 $ + */ + +#ifndef MEMORY_ALLOC_H +#define MEMORY_ALLOC_H 1 + +#include + +#include "compiler.h" +#include "cpu.h" +#include "memoryDefs.h" +#include "permassert.h" + +/** + * Allocate storage based on memory size and alignment, logging an error if + * the allocation fails. The memory will be zeroed. + * + * @param size The size of an object + * @param align The required alignment + * @param what What is being allocated (for error logging) + * @param ptr A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +int allocateMemory(size_t size, size_t align, const char *what, void *ptr) + __attribute__((warn_unused_result)); + +/** + * Free storage + * + * @param ptr The memory to be freed + **/ +void freeMemory(void *ptr); + +/** + * Allocate storage based on element counts, sizes, and alignment. + * + * This is a generalized form of our allocation use case: It allocates + * an array of objects, optionally preceded by one object of another + * type (i.e., a struct with trailing variable-length array), with the + * alignment indicated. + * + * Why is this inline? The sizes and alignment will always be + * constant, when invoked through the macros below, and often the + * count will be a compile-time constant 1 or the number of extra + * bytes will be a compile-time constant 0. So at least some of the + * arithmetic can usually be optimized away, and the run-time + * selection between allocation functions always can. In many cases, + * it'll boil down to just a function call with a constant size. 
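+ *
+ * For example, the ALLOCATE() macro below expands to
+ * doAllocation(COUNT, sizeof(TYPE), 0, __alignof__(TYPE), WHAT, PTR), and
+ * ALLOCATE_EXTENDED() expands to doAllocation(COUNT, sizeof(TYPE2),
+ * sizeof(TYPE1), __alignof__(TYPE1), WHAT, PTR), i.e. one leading object
+ * followed by a trailing array.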
+ * + * @param count The number of objects to allocate + * @param size The size of an object + * @param extra The number of additional bytes to allocate + * @param align The required alignment + * @param what What is being allocated (for error logging) + * @param ptr A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +static INLINE int doAllocation(size_t count, + size_t size, + size_t extra, + size_t align, + const char *what, + void *ptr) +{ + size_t totalSize = count * size + extra; + // Overflow check: + if ((size > 0) && (count > ((SIZE_MAX - extra) / size))) { + /* + * This is kind of a hack: We rely on the fact that SIZE_MAX would + * cover the entire address space (minus one byte) and thus the + * system can never allocate that much and the call will always + * fail. So we can report an overflow as "out of memory" by asking + * for "merely" SIZE_MAX bytes. + */ + totalSize = SIZE_MAX; + } + + return allocateMemory(totalSize, align, what, ptr); +} + +/** + * Reallocate dynamically allocated memory. There are no alignment guarantees + * for the reallocated memory. + * + * @param ptr The memory to reallocate. + * @param oldSize The old size of the memory + * @param size The new size to allocate + * @param what What is being allocated (for error logging) + * @param newPtr A pointer to hold the reallocated pointer + * + * @return UDS_SUCCESS or an error code + **/ +int reallocateMemory(void *ptr, + size_t oldSize, + size_t size, + const char *what, + void *newPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate one or more elements of the indicated type, logging an + * error if the allocation fails. The memory will be zeroed. + * + * @param COUNT The number of objects to allocate + * @param TYPE The type of objects to allocate. This type determines the + * alignment of the allocated memory. + * @param WHAT What is being allocated (for error logging) + * @param PTR A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +#define ALLOCATE(COUNT, TYPE, WHAT, PTR) \ + doAllocation(COUNT, sizeof(TYPE), 0, __alignof__(TYPE), WHAT, PTR) + +/** + * Allocate one object of an indicated type, followed by one or more + * elements of a second type, logging an error if the allocation + * fails. The memory will be zeroed. + * + * @param TYPE1 The type of the primary object to allocate. This type + * determines the alignment of the allocated memory. + * @param COUNT The number of objects to allocate + * @param TYPE2 The type of array objects to allocate + * @param WHAT What is being allocated (for error logging) + * @param PTR A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +#define ALLOCATE_EXTENDED(TYPE1, COUNT, TYPE2, WHAT, PTR) \ + __extension__ ({ \ + TYPE1 **_ptr = (PTR); \ + STATIC_ASSERT(__alignof__(TYPE1) >= __alignof__(TYPE2)); \ + int _result = doAllocation(COUNT, sizeof(TYPE2), sizeof(TYPE1), \ + __alignof__(TYPE1), WHAT, _ptr); \ + _result; \ + }) + +/** + * Free memory allocated with ALLOCATE(). + * + * @param ptr Pointer to the memory to free + **/ +static INLINE void FREE(void *ptr) +{ + freeMemory(ptr); +} + +/** + * Allocate memory starting on a cache line boundary, logging an error if the + * allocation fails. The memory will be zeroed. 
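+ *
+ * A minimal illustrative call (the size and the name are arbitrary):
+ *
+ *   byte *buffer;
+ *   int result = allocateCacheAligned(4096, "aligned buffer", &buffer);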
+ * + * @param size The number of bytes to allocate + * @param what What is being allocated (for error logging) + * @param ptr A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static INLINE int allocateCacheAligned(size_t size, + const char *what, + void *ptr) +{ + return allocateMemory(size, CACHE_LINE_BYTES, what, ptr); +} + +/** + * Duplicate a string. + * + * @param string The string to duplicate + * @param what What is being allocated (for error logging) + * @param newString A pointer to hold the duplicated string + * + * @return UDS_SUCCESS or an error code + **/ +int duplicateString(const char *string, const char *what, char **newString) + __attribute__((warn_unused_result)); + +/** + * Duplicate a buffer, logging an error if the allocation fails. + * + * @param ptr The buffer to copy + * @param size The size of the buffer + * @param what What is being duplicated (for error logging) + * @param dupPtr A pointer to hold the allocated array + * + * @return UDS_SUCCESS or ENOMEM + **/ +int memdup(const void *ptr, size_t size, const char *what, void *dupPtr) + __attribute__((warn_unused_result)); + +/** + * Wrapper which permits freeing a const pointer. + * + * @param pointer the pointer to be freed + **/ +static INLINE void freeConst(const void *pointer) +{ + union { + const void *constP; + void *notConst; + } u = { .constP = pointer }; + FREE(u.notConst); +} + +/** + * Wrapper which permits freeing a volatile pointer. + * + * @param pointer the pointer to be freed + **/ +static INLINE void freeVolatile(volatile void *pointer) +{ + union { + volatile void *volP; + void *notVol; + } u = { .volP = pointer }; + FREE(u.notVol); +} + +#endif /* MEMORY_ALLOC_H */ diff --git a/uds/memoryDefs.h b/uds/memoryDefs.h new file mode 100644 index 0000000..3f8041e --- /dev/null +++ b/uds/memoryDefs.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/memoryDefs.h#2 $ + */ + +#ifndef LINUX_KERNEL_MEMORY_DEFS_H +#define LINUX_KERNEL_MEMORY_DEFS_H 1 + +#include // for PAGE_SIZE + +#include "compiler.h" +#include "threadRegistry.h" +#include "typeDefs.h" + +/** + * Allocate one or more elements of the indicated type, aligning them + * on the boundary that will allow them to be used in io, logging an + * error if the allocation fails. The memory will be zeroed. 
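+ *
+ * For example, with a hypothetical PageData type:
+ *
+ *   PageData *pages;
+ *   int result = ALLOCATE_IO_ALIGNED(8, PageData, "page data", &pages);
+ *
+ * allocates eight zeroed PageData elements aligned to PAGE_SIZE.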
+ * + * @param COUNT The number of objects to allocate + * @param TYPE The type of objects to allocate + * @param WHAT What is being allocated (for error logging) + * @param PTR A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +#define ALLOCATE_IO_ALIGNED(COUNT, TYPE, WHAT, PTR) \ + doAllocation(COUNT, sizeof(TYPE), 0, PAGE_SIZE, WHAT, PTR) + +/** + * Allocate one element of the indicated type immediately, failing if the + * required memory is not immediately available. + * + * @param TYPE The type of objects to allocate + * @param WHAT What is being allocated (for error logging) + * + * @return pointer to the memory, or NULL if the memory is not available. + **/ +#define ALLOCATE_NOWAIT(TYPE, WHAT) allocateMemoryNowait(sizeof(TYPE), WHAT) + +/** + * Perform termination of the memory allocation subsystem. + **/ +void memoryExit(void); + +/** + * Perform initialization of the memory allocation subsystem. + **/ +void memoryInit(void); + +/** + * Allocate storage based on memory size, failing immediately if the required + * memory is not available. The memory will be zeroed. + * + * @param size The size of an object. + * @param what What is being allocated (for error logging) + * + * @return pointer to the allocated memory, or NULL if the required space is + * not available. + **/ +void *allocateMemoryNowait(size_t size, const char *what) + __attribute__((warn_unused_result)); + + +/** + * Register the current thread as an allocating thread. + * + * An optional flag location can be supplied indicating whether, at + * any given point in time, the threads associated with that flag + * should be allocating storage. If the flag is false, a message will + * be logged. + * + * If no flag is supplied, the thread is always allowed to allocate + * storage without complaint. + * + * @param newThread RegisteredThread structure to use for the current thread + * @param flagPtr Location of the allocation-allowed flag + **/ +void registerAllocatingThread(RegisteredThread *newThread, + const bool *flagPtr); + +/** + * Unregister the current thread as an allocating thread. + **/ +void unregisterAllocatingThread(void); + +/** + * Get the memory statistics. + * + * @param bytesUsed A pointer to hold the number of bytes in use + * @param peakBytesUsed A pointer to hold the maximum value bytesUsed has + * attained + **/ +void getMemoryStats(uint64_t *bytesUsed, uint64_t *peakBytesUsed); + +/** + * Report stats on any allocated memory that we're tracking. + * + * Not all allocation types are guaranteed to be tracked in bytes + * (e.g., bios). + **/ +void reportMemoryUsage(void); + + +#endif /* LINUX_KERNEL_MEMORY_DEFS_H */ diff --git a/uds/memoryLinuxKernel.c b/uds/memoryLinuxKernel.c new file mode 100644 index 0000000..5a42583 --- /dev/null +++ b/uds/memoryLinuxKernel.c @@ -0,0 +1,426 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/memoryLinuxKernel.c#6 $ + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "compilerDefs.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + + +/* + ****************************************************************************** + * Production: UDS and VDO keep track of which threads are allowed to allocate + * memory freely, and which threads must be careful to not do a memory + * allocation that does an I/O request. The allocatingThreads ThreadsRegistry + * and its associated methods implement this tracking. + */ + +static ThreadRegistry allocatingThreads; + +/*****************************************************************************/ +static bool allocationsAllowed(void) +{ + const bool *pointer = lookupThread(&allocatingThreads); + return pointer != NULL ? *pointer : false; +} + +/*****************************************************************************/ +void registerAllocatingThread(RegisteredThread *newThread, const bool *flagPtr) +{ + if (flagPtr == NULL) { + static const bool allocationAlwaysAllowed = true; + flagPtr = &allocationAlwaysAllowed; + } + registerThread(&allocatingThreads, newThread, flagPtr); +} + +/*****************************************************************************/ +void unregisterAllocatingThread(void) +{ + unregisterThread(&allocatingThreads); +} + +/* + ****************************************************************************** + * Production: We track how much memory has been allocated and freed. When we + * unload the UDS module, we log an error if we have not freed all the memory + * that we allocated. Nearly all memory allocation and freeing is done using + * this module. + * + * We do not use kernel functions like the kvasprintf() method, which allocate + * memory indirectly using kmalloc. + * + * These data structures and methods are used to track the amount of memory + * used. + */ + +// We allocate very few large objects, and allocation/deallocation isn't done +// in a performance-critical stage for us, so a linked list should be fine. 
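+//
+// Every allocateMemory() call is expected to be balanced by a freeMemory()
+// call before memoryExit() runs, since memoryExit() complains (via
+// ASSERT_LOG_ONLY) if either byte counter is still nonzero. Callers can
+// also sample the counters at any time, for example:
+//
+//   uint64_t bytesUsed, peakBytesUsed;
+//   getMemoryStats(&bytesUsed, &peakBytesUsed);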
+typedef struct vmallocBlockInfo { + void *ptr; + size_t size; + struct vmallocBlockInfo *next; +} VmallocBlockInfo; + +static struct { + spinlock_t lock; + size_t kmallocBlocks; + size_t kmallocBytes; + size_t vmallocBlocks; + size_t vmallocBytes; + size_t peakBytes; + VmallocBlockInfo *vmallocList; +} memoryStats __cacheline_aligned; + +/*****************************************************************************/ +static void updatePeakUsage(void) +{ + size_t totalBytes = memoryStats.kmallocBytes + memoryStats.vmallocBytes; + if (totalBytes > memoryStats.peakBytes) { + memoryStats.peakBytes = totalBytes; + } +} + +/*****************************************************************************/ +static void addKmallocBlock(size_t size) +{ + unsigned long flags; + spin_lock_irqsave(&memoryStats.lock, flags); + memoryStats.kmallocBlocks++; + memoryStats.kmallocBytes += size; + updatePeakUsage(); + spin_unlock_irqrestore(&memoryStats.lock, flags); +} + +/*****************************************************************************/ +static void removeKmallocBlock(size_t size) +{ + unsigned long flags; + spin_lock_irqsave(&memoryStats.lock, flags); + memoryStats.kmallocBlocks--; + memoryStats.kmallocBytes -= size; + spin_unlock_irqrestore(&memoryStats.lock, flags); +} + +/*****************************************************************************/ +static void addVmallocBlock(VmallocBlockInfo *block) +{ + unsigned long flags; + spin_lock_irqsave(&memoryStats.lock, flags); + block->next = memoryStats.vmallocList; + memoryStats.vmallocList = block; + memoryStats.vmallocBlocks++; + memoryStats.vmallocBytes += block->size; + updatePeakUsage(); + spin_unlock_irqrestore(&memoryStats.lock, flags); +} + +/*****************************************************************************/ +static void removeVmallocBlock(void *ptr) +{ + VmallocBlockInfo *block, **blockPtr; + unsigned long flags; + spin_lock_irqsave(&memoryStats.lock, flags); + for (blockPtr = &memoryStats.vmallocList; + (block = *blockPtr) != NULL; + blockPtr = &block->next) { + if (block->ptr == ptr) { + *blockPtr = block->next; + memoryStats.vmallocBlocks--; + memoryStats.vmallocBytes -= block->size; + break; + } + } + spin_unlock_irqrestore(&memoryStats.lock, flags); + if (block != NULL) { + FREE(block); + } else { + logInfo("attempting to remove ptr %" PRIptr " not found in vmalloc list", + ptr); + } +} + + + +/** + * Determine whether allocating a memory block should use kmalloc or vmalloc. + * + * vmalloc can allocate any integral number of pages. + * + * kmalloc can allocate any number of bytes up to a configured limit, which + * defaults to 8 megabytes on some of our systems. kmalloc is especially good + * when memory is being both allocated and freed, and it does this efficiently + * in a multi CPU environment. + * + * kmalloc usually rounds the size of the block up to the next power of two. + * So when the requested block is bigger than PAGE_SIZE / 2 bytes, kmalloc will + * never give you less space than the corresponding vmalloc allocation. + * Sometimes vmalloc will use less overhead than kmalloc. + * + * The advantages of kmalloc do not help out UDS or VDO, because we allocate + * all our memory up front and do not free and reallocate it. Sometimes we + * have problems using kmalloc, because the Linux memory page map can become so + * fragmented that kmalloc will not give us a 32KB chunk. We have used vmalloc + * as a backup to kmalloc in the past, and a followup vmalloc of 32KB will + * work. 
But there is no strong case to be made for using kmalloc over vmalloc + * for these size chunks. + * + * The kmalloc/vmalloc boundary is set at 4KB, and kmalloc gets the 4KB + * requests. There is no strong reason for favoring either kmalloc or vmalloc + * for 4KB requests, except that the keeping of vmalloc statistics uses a + * linked list implementation. Using a simple test, this choice of boundary + * results in 132 vmalloc calls. Using vmalloc for requests of exactly 4KB + * results in an additional 6374 vmalloc calls, which will require a change to + * the code that tracks vmalloc statistics. + * + * @param size How many bytes to allocate + **/ +static INLINE bool useKmalloc(size_t size) +{ + return size <= PAGE_SIZE; +} + +/*****************************************************************************/ +int allocateMemory(size_t size, size_t align, const char *what, void *ptr) +{ + if (ptr == NULL) { + return UDS_INVALID_ARGUMENT; + } + if (size == 0) { + *((void **) ptr) = NULL; + return UDS_SUCCESS; + } + + + /* + * The __GFP_RETRY_MAYFAIL means: The VM implementation will retry memory + * reclaim procedures that have previously failed if there is some indication + * that progress has been made else where. It can wait for other tasks to + * attempt high level approaches to freeing memory such as compaction (which + * removes fragmentation) and page-out. There is still a definite limit to + * the number of retries, but it is a larger limit than with __GFP_NORETRY. + * Allocations with this flag may fail, but only when there is genuinely + * little unused memory. While these allocations do not directly trigger the + * OOM killer, their failure indicates that the system is likely to need to + * use the OOM killer soon. The caller must handle failure, but can + * reasonably do so by failing a higher-level request, or completing it only + * in a much less efficient manner. + */ + const gfp_t gfpFlags = GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL; + + bool allocationsRestricted = !allocationsAllowed(); + unsigned int noioFlags; + if (allocationsRestricted) { + noioFlags = memalloc_noio_save(); + } + + unsigned long startTime = jiffies; + void *p = NULL; + if (useKmalloc(size) && (align < PAGE_SIZE)) { + p = kmalloc(size, gfpFlags | __GFP_NOWARN); + if (p == NULL) { + /* + * If we had just done kmalloc(size, gfpFlags) it is possible that the + * allocation would fail (see VDO-3688). The kernel log would then + * contain a long report about the failure. Although the failure occurs + * because there is no page available to allocate, by the time it logs + * the available space, there is a page available. So hopefully a short + * sleep will allow the page reclaimer to free a single page, which is + * all that we need. + */ + msleep(1); + p = kmalloc(size, gfpFlags); + } + if (p != NULL) { + addKmallocBlock(ksize(p)); + } + } else { + VmallocBlockInfo *block; + if (ALLOCATE(1, VmallocBlockInfo, __func__, &block) == UDS_SUCCESS) { + /* + * If we just do __vmalloc(size, gfpFlags, PAGE_KERNEL) it is possible + * that the allocation will fail (see VDO-3661). The kernel log will + * then contain a long report about the failure. Although the failure + * occurs because there are not enough pages available to allocate, by + * the time it logs the available space, there may enough pages available + * for smaller allocations. So hopefully a short sleep will allow the + * page reclaimer to free enough pages for us. 
+ * + * For larger allocations, the kernel page_alloc code is racing against + * the page reclaimer. If the page reclaimer can stay ahead of + * page_alloc, the __vmalloc will succeed. But if page_alloc overtakes + * the page reclaimer, the allocation fails. It is possible that more + * retries will succeed. + */ + for (;;) { + p = __vmalloc(size, gfpFlags | __GFP_NOWARN, PAGE_KERNEL); + // Try again unless we succeeded or more than 1 second has elapsed. + if ((p != NULL) || (jiffies_to_msecs(jiffies - startTime) > 1000)) { + break; + } + msleep(1); + } + if (p == NULL) { + // Try one more time, logging a failure for this call. + p = __vmalloc(size, gfpFlags, PAGE_KERNEL); + } + if (p == NULL) { + FREE(block); + } else { + block->ptr = p; + block->size = PAGE_ALIGN(size); + addVmallocBlock(block); + } + } + } + + if (allocationsRestricted) { + memalloc_noio_restore(noioFlags); + } + + if (p == NULL) { + unsigned int duration = jiffies_to_msecs(jiffies - startTime); + logError("Could not allocate %zu bytes for %s in %u msecs", + size, what, duration); + return ENOMEM; + } + *((void **) ptr) = p; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +void *allocateMemoryNowait(size_t size, + const char *what __attribute__((unused))) +{ + void *p = kmalloc(size, GFP_NOWAIT | __GFP_ZERO); + if (p != NULL) { + addKmallocBlock(ksize(p)); + } + return p; +} + +/*****************************************************************************/ +void freeMemory(void *ptr) +{ + if (ptr != NULL) { + if (is_vmalloc_addr(ptr)) { + removeVmallocBlock(ptr); + vfree(ptr); + } else { + removeKmallocBlock(ksize(ptr)); + kfree(ptr); + } + } +} + +/*****************************************************************************/ +int reallocateMemory(void *ptr, + size_t oldSize, + size_t size, + const char *what, + void *newPtr) +{ + // Handle special case of zero sized result + if (size == 0) { + FREE(ptr); + *(void **)newPtr = NULL; + return UDS_SUCCESS; + } + + int result = ALLOCATE(size, char, what, newPtr); + if (result != UDS_SUCCESS) { + return result; + } + + if (ptr != NULL) { + if (oldSize < size) { + size = oldSize; + } + memcpy(*((void **) newPtr), ptr, size); + FREE(ptr); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +void memoryInit(void) +{ + + spin_lock_init(&memoryStats.lock); + initializeThreadRegistry(&allocatingThreads); +} + + +/*****************************************************************************/ +void memoryExit(void) +{ + + ASSERT_LOG_ONLY(memoryStats.kmallocBytes == 0, + "kmalloc memory used (%zd bytes in %zd blocks)" + " is returned to the kernel", + memoryStats.kmallocBytes, memoryStats.kmallocBlocks); + ASSERT_LOG_ONLY(memoryStats.vmallocBytes == 0, + "vmalloc memory used (%zd bytes in %zd blocks)" + " is returned to the kernel", + memoryStats.vmallocBytes, memoryStats.vmallocBlocks); + logDebug("%s peak usage %zd bytes", THIS_MODULE->name, + memoryStats.peakBytes); +} + +/**********************************************************************/ +void getMemoryStats(uint64_t *bytesUsed, uint64_t *peakBytesUsed) +{ + unsigned long flags; + spin_lock_irqsave(&memoryStats.lock, flags); + *bytesUsed = memoryStats.kmallocBytes + memoryStats.vmallocBytes; + *peakBytesUsed = memoryStats.peakBytes; + spin_unlock_irqrestore(&memoryStats.lock, flags); +} + +/**********************************************************************/ +void reportMemoryUsage() +{ + unsigned long 
flags; + spin_lock_irqsave(&memoryStats.lock, flags); + uint64_t kmallocBlocks = memoryStats.kmallocBlocks; + uint64_t kmallocBytes = memoryStats.kmallocBytes; + uint64_t vmallocBlocks = memoryStats.vmallocBlocks; + uint64_t vmallocBytes = memoryStats.vmallocBytes; + uint64_t peakUsage = memoryStats.peakBytes; + spin_unlock_irqrestore(&memoryStats.lock, flags); + uint64_t totalBytes = kmallocBytes + vmallocBytes; + logInfo("current module memory tracking" + " (actual allocation sizes, not requested):"); + logInfo(" %llu bytes in %llu kmalloc blocks", + kmallocBytes, kmallocBlocks); + logInfo(" %llu bytes in %llu vmalloc blocks", + vmallocBytes, vmallocBlocks); + logInfo(" total %llu bytes, peak usage %llu bytes", + totalBytes, peakUsage); +} diff --git a/uds/murmur/MurmurHash3.c b/uds/murmur/MurmurHash3.c new file mode 100644 index 0000000..42af11a --- /dev/null +++ b/uds/murmur/MurmurHash3.c @@ -0,0 +1,379 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. + +#include "MurmurHash3.h" + +#include "cpu.h" + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#if __GNUC__ >= 7 +#pragma GCC diagnostic warning "-Wimplicit-fallthrough=0" +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +static inline uint32_t rotl32 ( uint32_t x, int8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +static inline uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +static FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return p[i]; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return __builtin_bswap32(p[i]); +#else +#error "can't figure out byte order" +#endif +} + +static FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return p[i]; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return __builtin_bswap64(p[i]); +#else +#error "can't figure out byte order" +#endif +} + +// Block write +static FORCE_INLINE void putblock (uint32_t *p, int i, uint32_t value) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + p[i] = value; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + p[i] = __builtin_bswap32(value); +#else +#error "can't figure out byte order" +#endif +} + +static FORCE_INLINE void putblock64 (uint64_t *p, int i, uint64_t value) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + p[i] = value; +#elif 
__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + p[i] = __builtin_bswap64(value); +#else +#error "can't figure out byte order" +#endif +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +static FORCE_INLINE uint32_t fmix32 ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +static FORCE_INLINE uint64_t fmix64 ( uint64_t k ) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + uint32_t c1 = 0xcc9e2d51; + uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + int i; + for(i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + default: break; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix32(h1); + + putblock(out, 0, h1); +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + uint32_t c1 = 0x239b961b; + uint32_t c2 = 0xab0e9789; + uint32_t c3 = 0x38b34ae5; + uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + int i; + for(i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i*4+0); + uint32_t k2 = getblock(blocks,i*4+1); + uint32_t k3 = getblock(blocks,i*4+2); + uint32_t k4 = getblock(blocks,i*4+3); + + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + + h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) + { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + case 12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = 
ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + default: break; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix32(h1); + h2 = fmix32(h2); + h3 = fmix32(h3); + h4 = fmix32(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + putblock((uint32_t*)out, 0, h1); + putblock((uint32_t*)out, 1, h2); + putblock((uint32_t*)out, 2, h3); + putblock((uint32_t*)out, 3, h4); +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const int len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + int i; + for(i = 0; i < nblocks; i++) + { + uint64_t k1 = getblock64(blocks,i*2+0); + uint64_t k2 = getblock64(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) + { + case 15: k2 ^= ((uint64_t)tail[14]) << 48; + case 14: k2 ^= ((uint64_t)tail[13]) << 40; + case 13: k2 ^= ((uint64_t)tail[12]) << 32; + case 12: k2 ^= ((uint64_t)tail[11]) << 24; + case 11: k2 ^= ((uint64_t)tail[10]) << 16; + case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; + case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; + case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; + case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; + case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; + case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; + case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; + case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; + case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + default: break; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + putblock64((uint64_t*)out, 0, h1); + putblock64((uint64_t*)out, 1, h2); +} diff --git a/uds/murmur/MurmurHash3.h b/uds/murmur/MurmurHash3.h new file mode 100644 index 0000000..bebb8fa --- /dev/null +++ b/uds/murmur/MurmurHash3.h @@ -0,0 +1,44 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. 
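+//
+// A minimal illustrative call of the 128-bit x64 variant declared below;
+// the input buffer, its length, and the seed are placeholders:
+//
+//   uint8_t digest[16];
+//   MurmurHash3_x64_128(buffer, bufferLength, 0, digest);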
+ +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Linux kernel + +#ifdef __KERNEL__ +# include + +// Microsoft Visual Studio + +#else // defined(__KERNEL__) +# if defined(_MSC_VER) + + typedef unsigned char uint8_t; + typedef unsigned long uint32_t; + typedef unsigned __int64 uint64_t; + +// Other compilers + +# else // defined(_MSC_VER) + +# include + +# endif // !defined(_MSC_VER) +#endif // !defined(__KERNEL__) + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); + +//----------------------------------------------------------------------------- + +#endif // _MURMURHASH3_H_ diff --git a/uds/nonce.c b/uds/nonce.c new file mode 100644 index 0000000..43b0f80 --- /dev/null +++ b/uds/nonce.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/nonce.c#3 $ + */ + +#include "nonce.h" + +#include "murmur/MurmurHash3.h" +#include "numeric.h" +#include "random.h" +#include "stringUtils.h" +#include "timeUtils.h" + +/*****************************************************************************/ +static uint64_t hashStuff(uint64_t start, const void *data, size_t len) +{ + uint32_t seed = start ^ (start >> 27); + byte hashBuffer[16]; + MurmurHash3_x64_128(data, len, seed, hashBuffer); + return getUInt64LE(hashBuffer + 4); +} + +/*****************************************************************************/ +static void *memput(void *buf, void *end, const void *data, size_t len) +{ + byte *bp = buf; + byte *be = end; + + size_t chunk = minSizeT(len, be - bp); + memcpy(bp, data, chunk); + return bp + chunk; +} + +/*****************************************************************************/ +size_t createUniqueNonceData(byte *buffer, size_t length) +{ + AbsTime now = currentTime(CLOCK_REALTIME); + + byte *be = buffer + length; + byte *bp = memput(buffer, be, &now, sizeof(now)); + + uint32_t rand = randomInRange(1, (1<<30) - 1); + + bp = memput(bp, be, &rand, sizeof(rand)); + + while (bp < be) { + size_t n = minSizeT(be - bp, bp - buffer); + memcpy(bp, buffer, n); + bp += n; + } + + return bp - buffer; +} + +/*****************************************************************************/ +uint64_t generateMasterNonce(const void *data, size_t len) +{ + return hashStuff(0xa1b1e0fc, data, len); +} + +/*****************************************************************************/ +uint64_t generateSecondaryNonce(uint64_t nonce, + const void *data, + size_t len) +{ + return hashStuff(nonce + 1, data, len); +} diff --git a/uds/nonce.h b/uds/nonce.h new file mode 100644 index 0000000..43f2054 --- /dev/null +++ b/uds/nonce.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/nonce.h#1 $ + */ + +#ifndef NONCE_H +#define NONCE_H + +#include "typeDefs.h" + +/** + * Create unique data for the master nonce, using system-specific + * methods such as the current time and a random number. + * + * @param buffer A buffer of length specified next. + * @param length Length of the buffer. + * + * @return the amount of the buffer that has been filled with unique data + **/ +size_t createUniqueNonceData(byte *buffer, size_t length); + +/** + * Generate a master nonce, using the specified data. + * + * @param data Some arbitrary information. + * @param len The length of the information. + * + * @return a number which will be fairly unique + **/ +uint64_t generateMasterNonce(const void *data, size_t len); + +/** + * Deterministically generate a secondary nonce based on an existing + * nonce and some arbitrary data. 
Effectively hashes the nonce and + * the data to produce a new nonce which is deterministic. + * + * @param nonce An existing nonce which is well known. + * @param data Some data related to the creation of this nonce. + * @param len The length of the data. + * + * @return a number which will be fairly unique and depend solely on + * the nonce and the data. + **/ +uint64_t generateSecondaryNonce(uint64_t nonce, + const void *data, + size_t len); + +#endif // NONCE_H diff --git a/uds/numeric.c b/uds/numeric.c new file mode 100644 index 0000000..4bc1e2d --- /dev/null +++ b/uds/numeric.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/numeric.c#2 $ + */ + +#include "numeric.h" +#include "permassert.h" + +#define STATIC_ASSERT_ALIGNOF(type, expectedAlignment) \ + STATIC_ASSERT(__alignof__(type) == (expectedAlignment)) + +/**********************************************************************/ +bool multiplyWouldOverflow(uint64_t a, uint64_t b) +{ + return b != 0 && a > UINT64_MAX / b; +} + +/**********************************************************************/ +void numericCompileTimeAssertions(void) +{ + STATIC_ASSERT_SIZEOF(uint64_t, 8); + STATIC_ASSERT_SIZEOF(uint32_t, 4); + STATIC_ASSERT_SIZEOF(uint16_t, 2); + + STATIC_ASSERT_SIZEOF(UNALIGNED_WRAPPER(uint64_t), 8); + STATIC_ASSERT_SIZEOF(UNALIGNED_WRAPPER(uint32_t), 4); + STATIC_ASSERT_SIZEOF(UNALIGNED_WRAPPER(uint16_t), 2); + + STATIC_ASSERT_ALIGNOF(UNALIGNED_WRAPPER(uint64_t), 1); + STATIC_ASSERT_ALIGNOF(UNALIGNED_WRAPPER(uint32_t), 1); + STATIC_ASSERT_ALIGNOF(UNALIGNED_WRAPPER(uint16_t), 1); +} diff --git a/uds/numeric.h b/uds/numeric.h new file mode 100644 index 0000000..06d7eee --- /dev/null +++ b/uds/numeric.h @@ -0,0 +1,721 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/numeric.h#2 $ + */ + +#ifndef NUMERIC_H +#define NUMERIC_H 1 + +#include "compiler.h" +#include "numericDefs.h" +#include "typeDefs.h" + +#if !defined(__ORDER_LITTLE_ENDIAN__) || !defined(__ORDER_BIG_ENDIAN__) \ + || !defined(__BYTE_ORDER__) +#error "GCC byte order macros not defined?" +#endif + +/* + * Define a type describing an integer value that is only byte-aligned + * and may explicitly alias other types. GCC keeps getting better + * about type-based alias analysis (both for optimization and for + * warnings), so simply casting a pointer to pointer-to-uintXX_t isn't + * good enough. + * + * C is okay with defining the structures directly in a cast, but + * C++ is not, and we use this header in some C++ code internally. + */ +#define UNALIGNED_WRAPPER(TYPE) \ + unaligned_wrap_##TYPE +#define UNALIGNED_WRAPPER_DEF(TYPE) \ + typedef struct __attribute__((packed, may_alias)) { TYPE value; } \ + UNALIGNED_WRAPPER(TYPE) +UNALIGNED_WRAPPER_DEF(int64_t); +UNALIGNED_WRAPPER_DEF(uint64_t); +UNALIGNED_WRAPPER_DEF(int32_t); +UNALIGNED_WRAPPER_DEF(uint32_t); +UNALIGNED_WRAPPER_DEF(uint16_t); + +#define GET_UNALIGNED(TYPE,ADDR) \ + (((const UNALIGNED_WRAPPER(TYPE) *)(ADDR))->value) +#define PUT_UNALIGNED(TYPE,ADDR,VALUE) \ + (((UNALIGNED_WRAPPER(TYPE) *)(ADDR))->value = (VALUE)) + +/** + * Find the minimum of two ints. + * + * @param a The first int + * @param b The second int + * + * @return The lesser of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE int minInt(int a, int b) +{ + return ((a < b) ? a : b); +} + +/** + * Find the maximum of two ints. + * + * @param a The first int + * @param b The second int + * + * @return The greater of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE int maxInt(int a, int b) +{ + return ((a > b) ? a : b); +} + +/** + * Find the maximum of two unsigned ints. + * + * @param a The first value + * @param b The second value + * + * @return The greater of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE unsigned int maxUInt(unsigned int a, unsigned int b) +{ + return ((a > b) ? a : b); +} + +/** + * Find the maximum of two signed longs. + * + * @param a The first int + * @param b The second int + * + * @return The greater of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE long maxLong(long a, long b) +{ + return ((a > b) ? a : b); +} + +/** + * Find the maximum of two unsigned longs. + * + * @param a The first int + * @param b The second int + * + * @return The greater of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE unsigned long maxULong(unsigned long a, unsigned long b) +{ + return ((a > b) ? a : b); +} + +/** + * Find the minimum of two size_ts. + * + * @param a The first size_t + * @param b The second size_t + * + * @return The lesser of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE size_t minSizeT(size_t a, size_t b) +{ + return ((a < b) ? a : b); +} + +/** + * Find the maximum of two size_ts. + * + * @param a The first size_t + * @param b The second size_t + * + * @return The greater of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE size_t maxSizeT(size_t a, size_t b) +{ + return ((a > b) ? a : b); +} + +/** + * Find the minimum of two uint64_ts. + * + * @param a The first uint64_t + * @param b The second uint64_t + * + * @return The lesser of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE uint64_t minUInt64(uint64_t a, uint64_t b) +{ + return ((a < b) ? 
a : b); +} + +/** + * Multiply two uint64_t and check for overflow. Does division. + **/ +bool multiplyWouldOverflow(uint64_t a, uint64_t b); + +/** + * Extract a 64 bit unsigned number from a buffer stored in + * big-endian representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint64_t getUInt64BE(const byte* data) +{ + uint64_t num = GET_UNALIGNED(uint64_t, data); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + return num; +} + +/** + * Extract a 64 bit unsigned big-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt64BE(const byte *buffer, + size_t *offset, + uint64_t *decoded) +{ + *decoded = getUInt64BE(buffer + *offset); + *offset += sizeof(uint64_t); +} + +/** + * Store a 64 bit unsigned number in a buffer in + * big-endian representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt64BE(byte* data, uint64_t num) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + PUT_UNALIGNED(uint64_t, data, num); +} + +/** + * Encode a 64 bit unsigned number into a buffer at a given offset + * using a big-endian representation. The offset will be advanced to + * first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt64BE(byte *data, + size_t *offset, + uint64_t toEncode) +{ + storeUInt64BE(data + *offset, toEncode); + *offset += sizeof(uint64_t); +} + +/** + * Extract a 32 bit unsigned number from a buffer stored in big-endian + * representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint32_t getUInt32BE(const byte* data) +{ + uint32_t num = GET_UNALIGNED(uint32_t, data); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + return num; +} + +/** + * Extract a 32 bit unsigned big-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt32BE(const byte *buffer, + size_t *offset, + uint32_t *decoded) +{ + *decoded = getUInt32BE(buffer + *offset); + *offset += sizeof(uint32_t); +} + +/** + * Store a 32 bit number in a buffer in + * big-endian representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt32BE(byte* data, uint32_t num) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + PUT_UNALIGNED(uint32_t, data, num); +} + +/** + * Encode a 32 bit number into a buffer at a given offset using a + * big-endian representation. 
The offset will be advanced to first byte + * after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt32BE(byte *data, + size_t *offset, + uint32_t toEncode) +{ + storeUInt32BE(data + *offset, toEncode); + *offset += sizeof(uint32_t); +} + +/** + * Extract a 16 bit number from a buffer stored in + * big-endian representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint16_t getUInt16BE(const byte* data) +{ + uint16_t num = GET_UNALIGNED(uint16_t, data); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = bswap_16(num); +#endif + return num; +} + +/** + * Extract a 16 bit, big-endian number from a buffer at a specified offset. + * The offset will be advanced to the first byte after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to + * extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt16BE(const byte *buffer, + size_t *offset, + uint16_t *decoded) +{ + *decoded = getUInt16BE(buffer + *offset); + *offset += sizeof(uint16_t); +} + +/** + * Store a 16 bit number in a buffer in + * big-endian representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt16BE(byte* data, uint16_t num) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = bswap_16(num); +#endif + PUT_UNALIGNED(uint16_t, data, num); +} + +/** + * Encode a 16 bit number into a buffer at a given offset using a + * big-endian representation. The offset will be advanced to first byte + * after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt16BE(byte *data, + size_t *offset, + uint16_t toEncode) +{ + storeUInt16BE(data + *offset, toEncode); + *offset += sizeof(uint16_t); +} + +/** + * Extract a 64 bit signed number from a buffer stored in + * little-endian representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE int64_t getInt64LE(const byte* data) +{ + int64_t num = GET_UNALIGNED(int64_t, data); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + return num; +} + +/** + * Extract a 64 bit signed little-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeInt64LE(const byte *buffer, + size_t *offset, + int64_t *decoded) +{ + *decoded = getInt64LE(buffer + *offset); + *offset += sizeof(int64_t); +} + +/** + * Store a signed 64 bit number in a buffer in little-endian + * representation. 
+ * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeInt64LE(byte* data, int64_t num) +{ +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + PUT_UNALIGNED(int64_t, data, num); +} + +/** + * Encode a 64 bit signed number into a buffer at a given offset using + * a little-endian representation. The offset will be advanced to + * first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeInt64LE(byte *data, + size_t *offset, + int64_t toEncode) +{ + storeInt64LE(data + *offset, toEncode); + *offset += sizeof(int64_t); +} + +/** + * Extract a 64 bit number from a buffer stored in + * little-endian representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint64_t getUInt64LE(const byte* data) +{ + uint64_t num = GET_UNALIGNED(uint64_t, data); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + return num; +} + +/** + * Extract a 64 bit unsigned little-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt64LE(const byte *buffer, + size_t *offset, + uint64_t *decoded) +{ + *decoded = getUInt64LE(buffer + *offset); + *offset += sizeof(uint64_t); +} + +/** + * Store a 64 bit unsigned number in a buffer in little-endian + * representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt64LE(byte* data, uint64_t num) +{ +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + PUT_UNALIGNED(uint64_t, data, num); +} + +/** + * Encode a 64 bit unsigned number into a buffer at a given offset + * using a little-endian representation. The offset will be advanced + * to first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt64LE(byte *data, + size_t *offset, + uint64_t toEncode) +{ + storeUInt64LE(data + *offset, toEncode); + *offset += sizeof(uint64_t); +} + +/** + * Extract a 32 bit signed number from a buffer stored in + * little-endian representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE int32_t getInt32LE(const byte* data) +{ + int32_t num = GET_UNALIGNED(int32_t, data); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + return num; +} + +/** + * Extract a 32 bit signed little-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. 
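+ *
+ * These helpers are typically chained so that one shared offset advances
+ * through a serialized buffer; the field names here are hypothetical:
+ *
+ *   uint64_t nonce;
+ *   int32_t version;
+ *   size_t offset = 0;
+ *   decodeUInt64LE(buffer, &offset, &nonce);
+ *   decodeInt32LE(buffer, &offset, &version);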
+ * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeInt32LE(const byte *buffer, + size_t *offset, + int32_t *decoded) +{ + *decoded = getInt32LE(buffer + *offset); + *offset += sizeof(int32_t); +} + +/** + * Store a signed 32 bit number in a buffer in little-endian + * representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeInt32LE(byte* data, int32_t num) +{ +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + PUT_UNALIGNED(int32_t, data, num); +} + +/** + * Encode a 32 bit signed number into a buffer at a given offset using + * a little-endian representation. The offset will be advanced to + * first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeInt32LE(byte *data, + size_t *offset, + int32_t toEncode) +{ + storeInt32LE(data + *offset, toEncode); + *offset += sizeof(int32_t); +} + +/** + * Extract a 32 bit unsigned number from a buffer stored in + * little-endian representation. + + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint32_t getUInt32LE(const byte* data) +{ + uint32_t num = GET_UNALIGNED(uint32_t, data); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + return num; +} + +/** + * Extract a 32 bit unsigned little-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt32LE(const byte *buffer, + size_t *offset, + uint32_t *decoded) +{ + *decoded = getUInt32LE(buffer + *offset); + *offset += sizeof(uint32_t); +} + +/** + * Store a 32 bit unsigned number in a buffer in little-endian + * representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt32LE(byte* data, uint32_t num) +{ +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + PUT_UNALIGNED(uint32_t, data, num); +} + +/** + * Encode a 32 bit unsigned number into a buffer at a given offset + * using a little-endian representation. The offset will be advanced + * to first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt32LE(byte *data, + size_t *offset, + uint32_t toEncode) +{ + storeUInt32LE(data + *offset, toEncode); + *offset += sizeof(uint32_t); +} + +/** + * Extract a 16 bit number from a buffer stored in + * little-endian representation. 
+ * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint16_t getUInt16LE(const byte* data) +{ + uint16_t num = GET_UNALIGNED(uint16_t, data); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = bswap_16(num); +#endif + return num; +} + +/** + * Extract a 16 bit unsigned little-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to + * extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt16LE(const byte *buffer, + size_t *offset, + uint16_t *decoded) +{ + *decoded = getUInt16LE(buffer + *offset); + *offset += sizeof(uint16_t); +} + +/** + * Store a 16 bit number in a buffer in little-endian representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt16LE(byte* data, uint16_t num) +{ +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = bswap_16(num); +#endif + PUT_UNALIGNED(uint16_t, data, num); +} + +/** + * Encode a 16 bit unsigned number into a buffer at a given offset + * using a little-endian representation. The offset will be advanced + * to first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt16LE(byte *data, + size_t *offset, + uint16_t toEncode) +{ + storeUInt16LE(data + *offset, toEncode); + *offset += sizeof(uint16_t); +} + +/** + * Special function wrapper required for compile-time assertions. This + * function will fail to compile if any of the uint*_t types are not of the + * size we expect. This function should never be called. + **/ +void numericCompileTimeAssertions(void); + +#endif /* NUMERIC_H */ diff --git a/uds/numericDefs.h b/uds/numericDefs.h new file mode 100644 index 0000000..c8795a1 --- /dev/null +++ b/uds/numericDefs.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/numericDefs.h#1 $ + */ + +#ifndef LINUX_KERNEL_NUMERIC_DEFS_H +#define LINUX_KERNEL_NUMERIC_DEFS_H 1 + +#ifdef __x86_64__ +/* + * __builtin_bswap16 should work fine here too, but check for a + * performance impact before changing it, just to be safe. 
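+ *
+ * Either path is a plain 16-bit byte swap; for example (illustrative),
+ * bswap_16(0x1234) evaluates to 0x3412.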
+ */ +#define bswap_16(x) \ + (__extension__ \ + ({ register unsigned short int __v, __x = (unsigned short int) (x); \ + __asm__ ("rorw $8, %w0" \ + : "=r" (__v) \ + : "0" (__x) \ + : "cc"); \ + __v; })) +#else +#define bswap_16(x) __builtin_bswap16(x) +#endif + +#endif /* LINUX_KERNEL_NUMERIC_DEFS_H */ diff --git a/uds/opaqueTypes.h b/uds/opaqueTypes.h new file mode 100644 index 0000000..478631a --- /dev/null +++ b/uds/opaqueTypes.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/opaqueTypes.h#3 $ + */ + +#ifndef OPAQUE_TYPES_H +#define OPAQUE_TYPES_H + +/* + * This file contains typedefs of structures internal to the UDS library + * for which many users of those structures do need to know the details + * of the structures themselves. + */ +typedef struct indexRouter IndexRouter; +typedef struct internalRequest Request; +typedef struct requestQueue RequestQueue; + +#endif /* OPAQUE_TYPES_H */ diff --git a/uds/openChapter.c b/uds/openChapter.c new file mode 100644 index 0000000..7a8a613 --- /dev/null +++ b/uds/openChapter.c @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/openChapter.c#4 $ + */ + +#include "openChapter.h" + +#include "compiler.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "zone.h" + +static int readOpenChapters(ReadPortal *portal); +static int writeOpenChapters(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone); + +const IndexComponentInfo OPEN_CHAPTER_INFO = { + .kind = RL_KIND_OPEN_CHAPTER, + .name = "open chapter", + .saveOnly = true, + .chapterSync = false, + .multiZone = false, + .ioStorage = true, + .loader = readOpenChapters, + .saver = writeOpenChapters, + .incremental = NULL, +}; + +static const byte OPEN_CHAPTER_MAGIC[] = "ALBOC"; +static const byte OPEN_CHAPTER_VERSION[] = "02.00"; + +enum { + OPEN_CHAPTER_MAGIC_LENGTH = sizeof(OPEN_CHAPTER_MAGIC) - 1, + OPEN_CHAPTER_VERSION_LENGTH = sizeof(OPEN_CHAPTER_VERSION) - 1 +}; + +/**********************************************************************/ +static int fillDeltaChapterIndex(OpenChapterZone **chapterZones, + unsigned int zoneCount, + OpenChapterIndex *index, + UdsChunkRecord *collatedRecords) +{ + // Find a record to replace any deleted records, and fill the chapter if + // it was closed early. The last record in any filled zone is guaranteed + // to not have been deleted in this chapter, so use one of those. + OpenChapterZone *fillChapterZone = NULL; + UdsChunkRecord *fillRecord = NULL; + unsigned int z; + for (z = 0; z < zoneCount; ++z) { + fillChapterZone = chapterZones[z]; + if (fillChapterZone->size == fillChapterZone->capacity) { + fillRecord = &fillChapterZone->records[fillChapterZone->size]; + break; + } + } + int result = ASSERT((fillRecord != NULL), + "some open chapter zone filled"); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT(!fillChapterZone->slots[fillChapterZone->size].recordDeleted, + "chapter fill record not deleted"); + if (result != UDS_SUCCESS) { + return result; + } + + const Geometry *geometry = index->geometry; + unsigned int pagesPerChapter = geometry->recordPagesPerChapter; + unsigned int recordsPerPage = geometry->recordsPerPage; + int overflowCount = 0; + unsigned int recordsAdded = 0; + unsigned int zone = 0; + + unsigned int page; + for (page = 0; page < pagesPerChapter; page++) { + unsigned int i; + for (i = 0; + i < recordsPerPage; + i++, recordsAdded++, zone = (zone + 1) % zoneCount) { + + // The record arrays are 1-based. + unsigned int recordNumber = 1 + (recordsAdded / zoneCount); + + // If the zone has been exhausted, or the record was deleted, + // add the fill record to the chapter. 
+ if (recordNumber > chapterZones[zone]->size + || chapterZones[zone]->slots[recordNumber].recordDeleted) { + collatedRecords[1 + recordsAdded] = *fillRecord; + continue; + } + + UdsChunkRecord *nextRecord = &chapterZones[zone]->records[recordNumber]; + collatedRecords[1 + recordsAdded] = *nextRecord; + + int result = putOpenChapterIndexRecord(index, &nextRecord->name, page); + switch (result) { + case UDS_SUCCESS: + break; + case UDS_OVERFLOW: + overflowCount++; + break; + default: + logErrorWithStringError(result, "failed to build open chapter index"); + return result; + } + } + } + if (overflowCount > 0) { + logWarning("Failed to add %d entries to chapter index", overflowCount); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int closeOpenChapter(OpenChapterZone **chapterZones, + unsigned int zoneCount, + Volume *volume, + OpenChapterIndex *chapterIndex, + UdsChunkRecord *collatedRecords, + uint64_t virtualChapterNumber) +{ + // Empty the delta chapter index, and prepare it for the new virtual chapter. + emptyOpenChapterIndex(chapterIndex, virtualChapterNumber); + + // Map each non-deleted record name to its record page number in the delta + // chapter index. + int result = fillDeltaChapterIndex(chapterZones, zoneCount, chapterIndex, + collatedRecords); + if (result != UDS_SUCCESS) { + return result; + } + + // Pass the populated chapter index and the records to the volume, which + // will generate and write the index and record pages for the chapter. + return writeChapter(volume, chapterIndex, collatedRecords); +} + +/**********************************************************************/ +int saveOpenChapters(Index *index, BufferedWriter *writer) +{ + int result = writeToBufferedWriter(writer, OPEN_CHAPTER_MAGIC, + OPEN_CHAPTER_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + result = writeToBufferedWriter(writer, OPEN_CHAPTER_VERSION, + OPEN_CHAPTER_VERSION_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + uint32_t totalRecords = 0; + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + totalRecords += openChapterSize(index->zones[i]->openChapter); + } + + // Store the record count in little-endian order. + byte totalRecordData[sizeof(totalRecords)]; + storeUInt32LE(totalRecordData, totalRecords); + + result = writeToBufferedWriter(writer, totalRecordData, + sizeof(totalRecordData)); + if (result != UDS_SUCCESS) { + return result; + } + + // Only write out the records that have been added and not deleted. 
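+  // The loop below emits record 1 from every zone, then record 2 from every
+  // zone, and so on, so the saved stream preserves the interleaved, roughly
+  // temporal ordering of records across zones.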
+ uint32_t recordsAdded = 0; + unsigned int recordIndex = 1; + while(recordsAdded < totalRecords) { + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + if (recordIndex > index->zones[i]->openChapter->size) { + continue; + } + if (index->zones[i]->openChapter->slots[recordIndex].recordDeleted) { + continue; + } + UdsChunkRecord *record + = &index->zones[i]->openChapter->records[recordIndex]; + result = writeToBufferedWriter(writer, record, sizeof(UdsChunkRecord)); + if (result != UDS_SUCCESS) { + return result; + } + recordsAdded++; + } + recordIndex++; + } + + return flushBufferedWriter(writer); +} + +/**********************************************************************/ +uint64_t computeSavedOpenChapterSize(Geometry *geometry) +{ + return OPEN_CHAPTER_MAGIC_LENGTH + OPEN_CHAPTER_VERSION_LENGTH + + sizeof(uint32_t) + geometry->recordsPerChapter * sizeof(UdsChunkRecord); +} + +/**********************************************************************/ +static int writeOpenChapters(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone) +{ + int result = ASSERT((zone == 0), "open chapter write not zoned"); + if (result != UDS_SUCCESS) { + return result; + } + + Index *index = indexComponentData(component); + return saveOpenChapters(index, writer); +} + +/** + * Read the version field from a buffered reader, checking whether it is a + * supported version. Returns (via a pointer parameter) the matching + * version constant, which can be used by comparing to the version + * constants using simple pointer equality. + * + * @param [in] reader A buffered reader. + * @param [out] version The version constant that was matched. + * + * @return UDS_SUCCESS or an error code if the file could not be read or + * the version is invalid or unsupported + **/ +static int readVersion(BufferedReader *reader, const byte **version) +{ + byte buffer[OPEN_CHAPTER_VERSION_LENGTH]; + int result = readFromBufferedReader(reader, buffer, sizeof(buffer)); + if (result != UDS_SUCCESS) { + return result; + } + if (memcmp(OPEN_CHAPTER_VERSION, buffer, sizeof(buffer)) != 0) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "Invalid open chapter version: %.*s", + (int) sizeof(buffer), buffer); + } + *version = OPEN_CHAPTER_VERSION; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int loadVersion20(Index *index, BufferedReader *reader) +{ + byte numRecordsData[sizeof(uint32_t)]; + int result + = readFromBufferedReader(reader, &numRecordsData, sizeof(numRecordsData)); + if (result != UDS_SUCCESS) { + return result; + } + uint32_t numRecords = getUInt32LE(numRecordsData); + + // Keep track of which zones cannot accept any more records. + bool fullFlags[MAX_ZONES] = { false, }; + + // Assign records to the correct zones. + UdsChunkRecord record; + uint32_t records; + for (records = 0; records < numRecords; records++) { + result = readFromBufferedReader(reader, &record, sizeof(UdsChunkRecord)); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int zone = 0; + if (index->zoneCount > 1) { + // A read-only index has no master index, but it also has only one zone. + zone = getMasterIndexZone(index->masterIndex, &record.name); + } + // Add records until the open chapter zone almost runs out of space. + // The chapter can't be closed here, so don't add the last record. 
+ if (!fullFlags[zone]) { + unsigned int remaining; + result = putOpenChapter(index->zones[zone]->openChapter, + &record.name, &record.data, &remaining); + fullFlags[zone] = (remaining <= 1); + if (result != UDS_SUCCESS) { + return result; + } + } + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int loadOpenChapters(Index *index, BufferedReader *reader) +{ + // Read and check the magic number. + int result = + verifyBufferedData(reader, OPEN_CHAPTER_MAGIC, OPEN_CHAPTER_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + // Read and check the version. + const byte *version = NULL; + result = readVersion(reader, &version); + if (result != UDS_SUCCESS) { + return result; + } + + return loadVersion20(index, reader); +} + +/**********************************************************************/ +int readOpenChapters(ReadPortal *portal) +{ + Index *index = indexComponentData(portal->component); + + BufferedReader *reader; + int result = getBufferedReaderForPortal(portal, 0, &reader); + if (result != UDS_SUCCESS) { + return result; + } + return loadOpenChapters(index, reader); +} diff --git a/uds/openChapter.h b/uds/openChapter.h new file mode 100644 index 0000000..381badd --- /dev/null +++ b/uds/openChapter.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/openChapter.h#1 $ + */ + +#ifndef OPENCHAPTER_H +#define OPENCHAPTER_H 1 + +#include "common.h" +#include "geometry.h" +#include "index.h" +#include "indexComponent.h" + +extern const IndexComponentInfo OPEN_CHAPTER_INFO; + +/** + * OpenChapter handles writing the open chapter records to the volume. It also + * manages the open chapter index component, and all the tools to generate and + * parse the open chapter file. The open chapter file interleaves records from + * each openChapterZone structure. + * + *
Once each open chapter zone is filled, the records are interleaved to + * preserve temporal locality, the index pages are generated through a + * delta chapter index, and the record pages are derived by sorting each + * page-sized batch of records by their names. + * + *
Upon index shutdown, the open chapter zone records are again + * interleaved, and the records are stored as a single array. The hash + * slots are not preserved, since the records may be reassigned to new + * zones at load time. + **/ + +/** + * Close the open chapter and write it to disk. + * + * @param chapterZones The zones of the chapter to close + * @param zoneCount The number of zones + * @param volume The volume to which to write the chapter + * @param chapterIndex The OpenChapterIndex to use while writing + * @param collatedRecords Collated records array to use while writing + * @param virtualChapterNumber The virtual chapter number of the open chapter + * + * @return UDS_SUCCESS or an error code + **/ +int closeOpenChapter(OpenChapterZone **chapterZones, + unsigned int zoneCount, + Volume *volume, + OpenChapterIndex *chapterIndex, + UdsChunkRecord *collatedRecords, + uint64_t virtualChapterNumber) + __attribute__((warn_unused_result)); + +/** + * Write out a partially filled chapter to a file. + * + * @param index the index to save the data from + * @param writer the writer to write out the chapters + * + * @return UDS_SUCCESS on success + **/ +int saveOpenChapters(Index *index, BufferedWriter *writer) + __attribute__((warn_unused_result)); + +/** + * Read a partially filled chapter from a file. + * + * @param index the index to load the data into + * @param reader the buffered reader to read from + * + * @return UDS_SUCCESS on success + **/ +int loadOpenChapters(Index *index, BufferedReader *reader) + __attribute__((warn_unused_result)); + +/** + * Compute the size of the maximum open chapter save image. + * + * @param geometry the index geometry + * + * @return the number of bytes of the largest possible open chapter save + * image + **/ +uint64_t computeSavedOpenChapterSize(Geometry *geometry); + +#endif /* OPENCHAPTER_H */ diff --git a/uds/openChapterZone.c b/uds/openChapterZone.c new file mode 100644 index 0000000..f346409 --- /dev/null +++ b/uds/openChapterZone.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/openChapterZone.c#2 $ + */ + +#include "openChapterZone.h" + +#include "compiler.h" +#include "hashUtils.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +/**********************************************************************/ +static INLINE size_t recordsSize(const OpenChapterZone *openChapter) +{ + return (sizeof(UdsChunkRecord) * (1 + openChapter->capacity)); +} + +/**********************************************************************/ +static INLINE size_t slotsSize(size_t slotCount) +{ + return (sizeof(Slot) * slotCount); +} + +/** + * Round up to the first power of two greater than or equal + * to the supplied number. 
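+ *
+ * For example, nextPowerOfTwo(5) is 8 and nextPowerOfTwo(8) is 8. The open
+ * chapter uses this to round its hash slot count up to a power of two.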
+ * + * @param val the number to round up + * + * @return the first power of two not smaller than val for any + * val <= 2^63 + **/ +static INLINE size_t nextPowerOfTwo(size_t val) +{ + if (val == 0) { + return 1; + } + return (1 << computeBits(val - 1)); +} + +/**********************************************************************/ +int makeOpenChapter(const Geometry *geometry, + unsigned int zoneCount, + OpenChapterZone **openChapterPtr) +{ + int result = ASSERT(zoneCount > 0, "zone count must be > 0"); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_WITH_ERROR_CODE(geometry->openChapterLoadRatio > 1, + UDS_BAD_STATE, + "Open chapter hash table is too small"); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_WITH_ERROR_CODE((geometry->recordsPerChapter + <= OPEN_CHAPTER_MAX_RECORD_NUMBER), + UDS_BAD_STATE, + "Too many records (%u) for a single chapter", + geometry->recordsPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + + if (geometry->recordsPerChapter < zoneCount) { + return logUnrecoverable( + UDS_INVALID_ARGUMENT, + "zone count: %u is larger than the records per chapter %u", + zoneCount, geometry->recordsPerChapter); + } + size_t capacity = geometry->recordsPerChapter / zoneCount; + + // The slot count must be at least one greater than the capacity. + // Using a power of two slot count guarantees that hash insertion + // will never fail if the hash table is not full. + size_t slotCount = nextPowerOfTwo(capacity * geometry->openChapterLoadRatio); + OpenChapterZone *openChapter; + result = ALLOCATE_EXTENDED(OpenChapterZone, slotCount, Slot, + "open chapter", &openChapter); + if (result != UDS_SUCCESS) { + return result; + } + openChapter->slotCount = slotCount; + openChapter->capacity = capacity; + result = allocateCacheAligned(recordsSize(openChapter), "record pages", + &openChapter->records); + if (result != UDS_SUCCESS) { + freeOpenChapter(openChapter); + return result; + } + + *openChapterPtr = openChapter; + return UDS_SUCCESS; +} + +/**********************************************************************/ +size_t openChapterSize(const OpenChapterZone *openChapter) +{ + return openChapter->size - openChapter->deleted; +} + +/**********************************************************************/ +void resetOpenChapter(OpenChapterZone *openChapter) +{ + openChapter->size = 0; + openChapter->deleted = 0; + + memset(openChapter->records, 0, recordsSize(openChapter)); + memset(openChapter->slots, 0, slotsSize(openChapter->slotCount)); +} + +/**********************************************************************/ +static UdsChunkRecord *probeChapterSlots(OpenChapterZone *openChapter, + const UdsChunkName *name, + unsigned int *slotPtr, + unsigned int *recordNumberPtr) +{ + unsigned int slots = openChapter->slotCount; + unsigned int probe = nameToHashSlot(name, slots); + unsigned int firstSlot = 0; + + UdsChunkRecord *record; + unsigned int probeSlot; + unsigned int recordNumber; + unsigned int probeAttempts; + + for (probeAttempts = 1; ; ++probeAttempts) { + probeSlot = firstSlot + probe; + recordNumber = openChapter->slots[probeSlot].recordNumber; + + // If the hash slot is empty, we've reached the end of a chain without + // finding the record and should terminate the search. + if (recordNumber == 0) { + record = NULL; + break; + } + + // If the name of the record referenced by the slot matches and has not + // been deleted, then we've found the requested name. 
+ record = &openChapter->records[recordNumber]; + if ((memcmp(&record->name, name, UDS_CHUNK_NAME_SIZE) == 0) + && !openChapter->slots[recordNumber].recordDeleted) { + break; + } + + // Quadratic probing: advance the probe by 1, 2, 3, etc. and try again. + // This performs better than linear probing and works best for 2^N slots. + probe += probeAttempts; + if (probe >= slots) { + probe = probe % slots; + } + } + + // These NULL checks will be optimized away in callers who don't care about + // the values when this function is inlined. + if (slotPtr != NULL) { + *slotPtr = probeSlot; + } + if (recordNumberPtr != NULL) { + *recordNumberPtr = recordNumber; + } + + return record; +} + +/**********************************************************************/ +void searchOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + UdsChunkData *metadata, + bool *found) +{ + UdsChunkRecord *record = probeChapterSlots(openChapter, name, NULL, NULL); + + if (record == NULL) { + *found = false; + } else { + *found = true; + if (metadata != NULL) { + *metadata = record->data; + } + } +} + +/**********************************************************************/ +int putOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + const UdsChunkData *metadata, + unsigned int *remaining) +{ + unsigned int slot; + UdsChunkRecord *record = probeChapterSlots(openChapter, name, &slot, NULL); + + if (record != NULL) { + record->data = *metadata; + *remaining = openChapter->capacity - openChapter->size; + return UDS_SUCCESS; + } + + if (openChapter->size >= openChapter->capacity) { + return makeUnrecoverable(UDS_VOLUME_OVERFLOW); + } + + unsigned int recordNumber = ++openChapter->size; + openChapter->slots[slot].recordNumber = recordNumber; + record = &openChapter->records[recordNumber]; + record->name = *name; + record->data = *metadata; + + *remaining = openChapter->capacity - openChapter->size; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void removeFromOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + bool *removed) +{ + unsigned int recordNumber; + UdsChunkRecord *record + = probeChapterSlots(openChapter, name, NULL, &recordNumber); + + if (record == NULL) { + *removed = false; + return; + } + + // Set the deleted flag on the recordNumber in the slot array so search + // won't find it and close won't index it. + openChapter->slots[recordNumber].recordDeleted = true; + openChapter->deleted += 1; + *removed = true; +} + +/**********************************************************************/ +void freeOpenChapter(OpenChapterZone *openChapter) +{ + if (openChapter != NULL) { + FREE(openChapter->records); + FREE(openChapter); + } +} diff --git a/uds/openChapterZone.h b/uds/openChapterZone.h new file mode 100644 index 0000000..cecee4b --- /dev/null +++ b/uds/openChapterZone.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/openChapterZone.h#1 $ + */ + +#ifndef OPEN_CHAPTER_ZONE_H +#define OPEN_CHAPTER_ZONE_H 1 + +#include "common.h" +#include "geometry.h" +#include "typeDefs.h" + +/** + * OpenChapterZone is the mutable, in-memory representation of one zone's + * section of an Albireo index chapter. + * + *
In addition to providing the same access to records as an on-disk + * chapter, the open chapter zone must allow records to be added or + * modified. It must provide a way to generate the on-disk representation + * without excessive work. It does that by accumulating records in the order + * they are added (maintaining temporal locality), and referencing them (as + * record numbers) from hash slots selected from the name. If the metadata for + * a name changes, the record field is just modified in place. + * + *
Storage for the records (names and metadata) is allocated when the zone + * is created. It keeps no references to the data passed to it, and performs + * no additional allocation when adding records. Opening a new chapter simply + * marks it as being empty. + * + *
Records are stored in a flat array. To allow a value of zero in a + * hash slot to indicate that the slot is empty, records are numbered starting + * at one (1-based). Since C arrays are 0-based, the records array contains + * enough space for N+1 records, and the record that starts at array index + * zero is never used or referenced. + * + *
The array of hash slots is actually two arrays, superimposed: an + * array of record numbers, indexed by hash value, and an array of deleted + * flags, indexed by record number. This overlay is possible because the + * number of hash slots always exceeds the number of records, and is done + * simply to save on memory. + **/ + +enum { + OPEN_CHAPTER_RECORD_NUMBER_BITS = 23, + OPEN_CHAPTER_MAX_RECORD_NUMBER = (1 << OPEN_CHAPTER_RECORD_NUMBER_BITS) - 1 +}; + +typedef struct { + /** If non-zero, the record number addressed by this hash slot */ + unsigned int recordNumber : OPEN_CHAPTER_RECORD_NUMBER_BITS; + /** If true, the record at the index of this hash slot was deleted */ + bool recordDeleted : 1; +} __attribute__((packed)) Slot; + +typedef struct openChapterZone { + /** Maximum number of records that can be stored */ + unsigned int capacity; + /** Number of records stored */ + unsigned int size; + /** Number of deleted records */ + unsigned int deleted; + /** Record data, stored as (name, metadata), 1-based */ + UdsChunkRecord *records; + /** The number of slots in the chapter zone hash table. */ + unsigned int slotCount; + /** Hash table, referencing virtual record numbers */ + Slot slots[]; +} OpenChapterZone; + +/** + * Allocate an open chapter zone. + * + * @param geometry the geometry of the volume + * @param zoneCount the total number of open chapter zones + * @param openChapterPtr a pointer to hold the new open chapter + * + * @return UDS_SUCCESS or an error code + **/ +int makeOpenChapter(const Geometry *geometry, + unsigned int zoneCount, + OpenChapterZone **openChapterPtr) + __attribute__((warn_unused_result)); + +/** + * Return the number of records in the open chapter zone that have not been + * deleted. + * + * @return the number of non-deleted records + **/ +size_t openChapterSize(const OpenChapterZone *openChapter) + __attribute__((warn_unused_result)); + +/** + * Open a chapter by marking it empty. + * + * @param openChapter The chapter to open + **/ +void resetOpenChapter(OpenChapterZone *openChapter); + +/** + * Search the open chapter for a chunk name. + * + * @param openChapter The chapter to search + * @param name The name of the desired chunk + * @param metadata The holder for the metadata associated with the + * chunk, if found (or NULL) + * @param found A pointer which will be set to true if the chunk + * name was found + **/ +void searchOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + UdsChunkData *metadata, + bool *found); + +/** + * Put a record into the open chapter. + * + * @param openChapter The chapter into which to put the record + * @param name The name of the record + * @param metadata The record data + * @param remaining Pointer to an integer set to the number of additional + * records that can be added to this chapter + * + * @return UDS_SUCCESS or an error code + **/ +int putOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + const UdsChunkData *metadata, + unsigned int *remaining) + __attribute__((warn_unused_result)); + +/** + * Remove a record from the open chapter. + * + * @param openChapter The chapter from which to remove the record + * @param name The name of the record + * @param removed Pointer to bool set to true if the + * record was found + **/ +void removeFromOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + bool *removed); + +/** + * Clean up an open chapter and its memory. 
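+ *
+ * A sketch of the typical zone lifecycle, with error checking omitted (the
+ * geometry, name, and metadata values are stand-ins):
+ *
+ *   OpenChapterZone *zone;
+ *   makeOpenChapter(geometry, zoneCount, &zone);
+ *
+ *   unsigned int remaining;
+ *   putOpenChapter(zone, &name, &metadata, &remaining);
+ *
+ *   bool found;
+ *   UdsChunkData data;
+ *   searchOpenChapter(zone, &name, &data, &found);
+ *
+ *   resetOpenChapter(zone);   // reopen the zone, empty, for a new chapter
+ *   freeOpenChapter(zone);    // release the records and the zone itself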
+ * + * @param openChapter the chapter to destroy + **/ +void freeOpenChapter(OpenChapterZone *openChapter); + +#endif /* OPEN_CHAPTER_ZONE_H */ diff --git a/uds/pageCache.c b/uds/pageCache.c new file mode 100644 index 0000000..b2db9a5 --- /dev/null +++ b/uds/pageCache.c @@ -0,0 +1,719 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/pageCache.c#6 $ + */ + +#include "pageCache.h" + +#include "atomicDefs.h" +#include "cacheCounters.h" +#include "chapterIndex.h" +#include "compiler.h" +#include "errors.h" +#include "geometry.h" +#include "hashUtils.h" +#include "indexConfig.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "recordPage.h" +#include "stringUtils.h" +#include "threads.h" +#include "zone.h" + +/**********************************************************************/ +int assertPageInCache(PageCache *cache, CachedPage *page) +{ + int result = ASSERT((page->cp_physicalPage < cache->numIndexEntries), + "physicalPage %u is valid (< %u)", + page->cp_physicalPage, cache->numIndexEntries); + if (result != UDS_SUCCESS) { + return result; + } + + uint16_t pageIndex = cache->index[page->cp_physicalPage]; + return ASSERT((pageIndex < cache->numCacheEntries) + && (&cache->cache[pageIndex] == page), + "page is at expected location in cache"); +} + +/** + * Clear a cache page. Note: this does not clear readPending - a read could + * still be pending and the read thread needs to be able to proceed and restart + * the requests regardless. This page will still be marked invalid, but it + * won't get reused (see getLeastRecentPage()) until the readPending flag + * is cleared. This is a valid case, e.g. the chapter gets forgotten and + * replaced with a new one in LRU. Restarting the requests will lead them to + * not find the records in the MI. + * + * @param cache the cache + * @param page the cached page to clear + * + **/ +static void clearPage(PageCache *cache, CachedPage *page) +{ + page->cp_physicalPage = cache->numIndexEntries; + WRITE_ONCE(page->cp_lastUsed, 0); +} + +/** + * Get a page from the cache, but with no stats + * + * @param cache the cache + * @param physicalPage the physical page to get + * @param queueIndex the index of the page in the read queue if + * queued, -1 otherwise + * @param pagePtr a pointer to hold the page + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int getPageNoStats(PageCache *cache, + unsigned int physicalPage, + int *queueIndex, + CachedPage **pagePtr) +{ + /* + * ASSERTION: We are either a zone thread holding a searchPendingCounter, + * or we are any thread holding the readThreadsMutex. + * + * Holding only a searchPendingCounter is the most frequent case. 
+ */ + + int result = ASSERT((physicalPage < cache->numIndexEntries), + "physical page %u is invalid", physicalPage); + if (result != UDS_SUCCESS) { + return result; + } + + /* + * It would be unlikely that the compiler turns the usage of indexValue into + * two reads of cache->index, but it would be possible and very bad if those + * reads did not return the same bits. + */ + uint16_t indexValue = READ_ONCE(cache->index[physicalPage]); + bool queued = (indexValue & VOLUME_CACHE_QUEUED_FLAG) != 0; + uint16_t index = indexValue & ~VOLUME_CACHE_QUEUED_FLAG; + + if (!queued && (index < cache->numCacheEntries)) { + *pagePtr = &cache->cache[index]; + /* + * We have acquired access to the cached page, but unless we hold the + * readThreadsMutex, we need a read memory barrier now. The corresponding + * write memory barrier is in putPageInCache. + */ + smp_rmb(); + } else { + *pagePtr = NULL; + } + if (queueIndex != NULL) { + *queueIndex = queued ? index : -1; + } + return UDS_SUCCESS; +} + +/** + * Wait for all pending searches on a page in the cache to complete + * + * @param cache the page cache + * @param physicalPage the page to check searches on + **/ +static void waitForPendingSearches(PageCache *cache, unsigned int physicalPage) +{ + /* + * We hold the readThreadsMutex. We are waiting for threads that do not hold + * the readThreadsMutex. Those threads have "locked" their targeted page by + * setting the searchPendingCounter. The corresponding write memory barrier + * is in beginPendingSearch. + */ + smp_mb(); + + InvalidateCounter initialCounters[MAX_ZONES]; + unsigned int i; + for (i = 0; i < cache->zoneCount; i++) { + initialCounters[i] = getInvalidateCounter(cache, i); + } + for (i = 0; i < cache->zoneCount; i++) { + if (searchPending(initialCounters[i]) + && (pageBeingSearched(initialCounters[i]) == physicalPage)) { + // There is an active search using the physical page. + // We need to wait for the search to finish. + while (initialCounters[i] == getInvalidateCounter(cache, i)) { + yieldScheduler(); + } + } + } +} + +/** + * Invalidate a cache page + * + * @param cache the cache + * @param page the cached page + * @param reason the reason for invalidation, for stats + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int invalidatePageInCache(PageCache *cache, + CachedPage *page, + InvalidationReason reason) +{ + // We hold the readThreadsMutex. + if (page == NULL) { + return UDS_SUCCESS; + } + + if (page->cp_physicalPage != cache->numIndexEntries) { + switch (reason) { + case INVALIDATION_EVICT: + cache->counters.evictions++; + break; + case INVALIDATION_EXPIRE: + cache->counters.expirations++; + break; + default: + break; + } + + if (reason != INVALIDATION_ERROR) { + int result = assertPageInCache(cache, page); + if (result != UDS_SUCCESS) { + return result; + } + } + + WRITE_ONCE(cache->index[page->cp_physicalPage], cache->numCacheEntries); + waitForPendingSearches(cache, page->cp_physicalPage); + } + + clearPage(cache, page); + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int findInvalidateAndMakeLeastRecent(PageCache *cache, + unsigned int physicalPage, + QueuedRead *readQueue, + InvalidationReason reason, + bool mustFind) +{ + // We hold the readThreadsMutex. + if (cache == NULL) { + return UDS_SUCCESS; + } + + CachedPage *page; + int queuedIndex = -1; + int result + = getPageNoStats(cache, physicalPage, + ((readQueue != NULL) ? 
&queuedIndex : NULL), &page); + if (result != UDS_SUCCESS) { + return result; + } + + if (page == NULL) { + result = ASSERT(!mustFind, "found page"); + if (result != UDS_SUCCESS) { + return result; + } + + if (queuedIndex > -1) { + logDebug("setting pending read to invalid"); + readQueue[queuedIndex].invalid = true; + } + return UDS_SUCCESS; + } + + // Invalidate the page and unmap it from the cache. + result = invalidatePageInCache(cache, page, reason); + if (result != UDS_SUCCESS) { + return result; + } + + // Move the cached page to the least recently used end of the list + // so it will be replaced before any page with valid data. + WRITE_ONCE(page->cp_lastUsed, 0); + + return UDS_SUCCESS; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int initializePageCache(PageCache *cache, + const Geometry *geometry, + unsigned int chaptersInCache, + unsigned int readQueueMaxSize, + unsigned int zoneCount) +{ + cache->geometry = geometry; + cache->numIndexEntries = geometry->pagesPerVolume + 1; + cache->numCacheEntries = chaptersInCache * geometry->recordPagesPerChapter; + cache->readQueueMaxSize = readQueueMaxSize; + cache->zoneCount = zoneCount; + atomic64_set(&cache->clock, 1); + + int result = ALLOCATE(readQueueMaxSize, QueuedRead, + "volume read queue", &cache->readQueue); + if (result != UDS_SUCCESS) { + return result; + } + + result = ALLOCATE(cache->zoneCount, SearchPendingCounter, + "Volume Cache Zones", &cache->searchPendingCounters); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT((cache->numCacheEntries <= VOLUME_CACHE_MAX_ENTRIES), + "requested cache size, %u, within limit %u", + cache->numCacheEntries, VOLUME_CACHE_MAX_ENTRIES); + if (result != UDS_SUCCESS) { + return result; + } + + result = ALLOCATE(cache->numIndexEntries, uint16_t, "page cache index", + &cache->index); + if (result != UDS_SUCCESS) { + return result; + } + + // Initialize index values to invalid values. 
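+  // (An index entry equal to numCacheEntries is the sentinel meaning the
+  // physical page is neither cached nor queued for a read.)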
+ unsigned int i; + for (i = 0; i < cache->numIndexEntries; i++) { + cache->index[i] = cache->numCacheEntries; + } + + result = ALLOCATE(cache->numCacheEntries, CachedPage, + "page cache cache", &cache->cache); + if (result != UDS_SUCCESS) { + return result; + } + + for (i = 0; i < cache->numCacheEntries; i++) { + CachedPage *page = &cache->cache[i]; + result = initializeVolumePage(geometry, &page->cp_pageData); + if (result != UDS_SUCCESS) { + return result; + } + clearPage(cache, page); + } + + return UDS_SUCCESS; +} + +/*********************************************************************/ +int makePageCache(const Geometry *geometry, + unsigned int chaptersInCache, + unsigned int readQueueMaxSize, + unsigned int zoneCount, + PageCache **cachePtr) +{ + if (chaptersInCache < 1) { + return logWarningWithStringError(UDS_BAD_STATE, + "cache size must be" + " at least one chapter"); + } + if (readQueueMaxSize <= 0) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "read queue max size must be" + " greater than 0"); + } + if (zoneCount < 1) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cache must have at least one zone"); + } + + PageCache *cache; + int result = ALLOCATE(1, PageCache, "volume cache", &cache); + if (result != UDS_SUCCESS) { + return result; + } + + result = initializePageCache(cache, geometry, chaptersInCache, + readQueueMaxSize, zoneCount); + if (result != UDS_SUCCESS) { + freePageCache(cache); + return result; + } + + *cachePtr = cache; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freePageCache(PageCache *cache) +{ + if (cache == NULL) { + return; + } + if (cache->cache != NULL) { + unsigned int i; + for (i = 0; i < cache->numCacheEntries; i++) { + destroyVolumePage(&cache->cache[i].cp_pageData); + } + } + FREE(cache->index); + FREE(cache->cache); + FREE(cache->searchPendingCounters); + FREE(cache->readQueue); + FREE(cache); +} + +/**********************************************************************/ +int invalidatePageCacheForChapter(PageCache *cache, + unsigned int chapter, + unsigned int pagesPerChapter, + InvalidationReason reason) +{ + // We hold the readThreadsMutex. + if ((cache == NULL) || (cache->cache == NULL)) { + return UDS_SUCCESS; + } + + int result; + unsigned int i; + for (i = 0; i < pagesPerChapter; i++) { + unsigned int physicalPage = 1 + (pagesPerChapter * chapter) + i; + result = findInvalidateAndMakeLeastRecent(cache, physicalPage, + cache->readQueue, reason, false); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/*********************************************************************/ +void makePageMostRecent(PageCache *cache, CachedPage *page) +{ + // ASSERTION: We are either a zone thread holding a searchPendingCounter, + // or we are any thread holding the readThreadsMutex. + if (atomic64_read(&cache->clock) != READ_ONCE(page->cp_lastUsed)) { + WRITE_ONCE(page->cp_lastUsed, atomic64_inc_return(&cache->clock)); + } +} + +/** + * Get the least recent valid page from the cache. + * + * @param cache the cache + * @param pagePtr a pointer to hold the new page (will be set to NULL + * if the page was not found) + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int getLeastRecentPage(PageCache *cache, CachedPage **pagePtr) +{ + // We hold the readThreadsMutex. + int oldestIndex = 0; + // Our first candidate is any page that does have a pending read. 
We ensure + // above that there are more entries than read threads, so there must be one. + unsigned int i; + for (i = 0;; i++) { + if (i >= cache->numCacheEntries) { + // This should never happen. + return ASSERT(false, "oldest page is not NULL"); + } + if (!cache->cache[i].cp_readPending) { + oldestIndex = i; + break; + } + } + // Now find the least recently used page that does not have a pending read. + for (i = 0; i < cache->numCacheEntries; i++) { + if (!cache->cache[i].cp_readPending + && (READ_ONCE(cache->cache[i].cp_lastUsed) + <= READ_ONCE(cache->cache[oldestIndex].cp_lastUsed))) { + oldestIndex = i; + } + } + *pagePtr = &cache->cache[oldestIndex]; + return UDS_SUCCESS; +} + +/***********************************************************************/ +int getPageFromCache(PageCache *cache, + unsigned int physicalPage, + int probeType, + CachedPage **pagePtr) +{ + // ASSERTION: We are in a zone thread. + // ASSERTION: We holding a searchPendingCounter or the readThreadsMutex. + if (cache == NULL) { + return logWarningWithStringError(UDS_BAD_STATE, + "cannot get page with NULL cache"); + } + + // Get the cache page from the index + CachedPage *page; + int queueIndex = -1; + int result = getPageNoStats(cache, physicalPage, &queueIndex, &page); + if (result != UDS_SUCCESS) { + return result; + } + + CacheResultKind cacheResult = ((page != NULL) + ? CACHE_RESULT_HIT + : ((queueIndex != -1) + ? CACHE_RESULT_QUEUED + : CACHE_RESULT_MISS)); + incrementCacheCounter(&cache->counters, probeType, cacheResult); + + if (pagePtr != NULL) { + *pagePtr = page; + } + return UDS_SUCCESS; +} + +/***********************************************************************/ +int enqueueRead(PageCache *cache, Request *request, unsigned int physicalPage) +{ + // We hold the readThreadsMutex. + uint16_t first = cache->readQueueFirst; + uint16_t last = cache->readQueueLast; + uint16_t next = (last + 1) % cache->readQueueMaxSize; + uint16_t readQueuePos; + + if ((cache->index[physicalPage] & VOLUME_CACHE_QUEUED_FLAG) == 0) { + /* Not seen before, add this to the read queue and mark it as queued */ + if (next == first) { + /* queue is full */ + return UDS_SUCCESS; + } + /* fill the read queue entry */ + cache->readQueue[last].physicalPage = physicalPage; + cache->readQueue[last].invalid = false; + + /* point the cache index to it */ + readQueuePos = last; + WRITE_ONCE(cache->index[physicalPage], + readQueuePos | VOLUME_CACHE_QUEUED_FLAG); + cache->readQueue[readQueuePos].requestList.first = NULL; + cache->readQueue[readQueuePos].requestList.last = NULL; + /* bump the last pointer */ + cache->readQueueLast = next; + } else { + /* It's already queued, just add on to it */ + readQueuePos = cache->index[physicalPage] & ~VOLUME_CACHE_QUEUED_FLAG; + } + + int result = ASSERT((readQueuePos < cache->readQueueMaxSize), + "queue is not overfull"); + if (result != UDS_SUCCESS) { + return result; + } + + request->nextRequest = NULL; + if (cache->readQueue[readQueuePos].requestList.first == NULL) { + cache->readQueue[readQueuePos].requestList.first = request; + } else { + cache->readQueue[readQueuePos].requestList.last->nextRequest = request; + } + cache->readQueue[readQueuePos].requestList.last = request; + return UDS_QUEUED; +} + +/***********************************************************************/ +bool reserveReadQueueEntry(PageCache *cache, + unsigned int *queuePos, + Request **firstRequest, + unsigned int *physicalPage, + bool *invalid) +{ + // We hold the readThreadsMutex. 
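+  // Entries from readQueueFirst up to readQueueLastRead have already been
+  // claimed by read threads; entries from readQueueLastRead up to
+  // readQueueLast are still waiting to be claimed here.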
+ uint16_t lastRead = cache->readQueueLastRead; + + // No items to dequeue + if (lastRead == cache->readQueueLast) { + return false; + } + + unsigned int pageNo = cache->readQueue[lastRead].physicalPage; + bool isInvalid = cache->readQueue[lastRead].invalid; + + uint16_t indexValue = cache->index[pageNo]; + bool queued = (indexValue & VOLUME_CACHE_QUEUED_FLAG) != 0; + + // ALB-1429 ... need to check to see if its still queued before resetting + if (isInvalid && queued) { + // invalidate cache index slot + WRITE_ONCE(cache->index[pageNo], cache->numCacheEntries); + } + + // If a sync read has taken this page, set invalid to true so we don't + // overwrite, we simply just requeue requests. + if (!queued) { + isInvalid = true; + } + + cache->readQueue[lastRead].reserved = true; + + *queuePos = lastRead; + *firstRequest = cache->readQueue[lastRead].requestList.first; + *physicalPage = pageNo; + *invalid = isInvalid; + cache->readQueueLastRead = (lastRead + 1) % cache->readQueueMaxSize; + + return true; +} + +/************************************************************************/ +void releaseReadQueueEntry(PageCache *cache, unsigned int queuePos) +{ + // We hold the readThreadsMutex. + cache->readQueue[queuePos].reserved = false; + + uint16_t lastRead = cache->readQueueLastRead; + + // Move the readQueueFirst pointer along when we can + while ((cache->readQueueFirst != lastRead) + && (!cache->readQueue[cache->readQueueFirst].reserved)) { + cache->readQueueFirst = + (cache->readQueueFirst + 1) % cache->readQueueMaxSize; + } +} + +/***********************************************************************/ +int selectVictimInCache(PageCache *cache, + CachedPage **pagePtr) +{ + // We hold the readThreadsMutex. + if (cache == NULL) { + return logWarningWithStringError(UDS_BAD_STATE, + "cannot put page in NULL cache"); + } + + CachedPage *page = NULL; + int result = getLeastRecentPage(cache, &page); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT((page != NULL), "least recent page was not NULL"); + if (result != UDS_SUCCESS) { + return result; + } + + // If the page is currently being pointed to by the page map, clear + // it from the page map, and update cache stats + if (page->cp_physicalPage != cache->numIndexEntries) { + cache->counters.evictions++; + WRITE_ONCE(cache->index[page->cp_physicalPage], cache->numCacheEntries); + waitForPendingSearches(cache, page->cp_physicalPage); + } + + page->cp_readPending = true; + + *pagePtr = page; + + return UDS_SUCCESS; +} + +/***********************************************************************/ +int putPageInCache(PageCache *cache, + unsigned int physicalPage, + CachedPage *page) +{ + // We hold the readThreadsMutex. 
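+  // The page passed in is expected to be the victim previously returned by
+  // selectVictimInCache, which set its readPending flag.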
+ if (cache == NULL) { + return logWarningWithStringError(UDS_BAD_STATE, + "cannot complete page in NULL cache"); + } + + int result = ASSERT((page != NULL), "page to install exists"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT((page->cp_readPending), + "page to install has a pending read"); + if (result != UDS_SUCCESS) { + return result; + } + + clearPage(cache, page); + + page->cp_physicalPage = physicalPage; + + // Figure out the index into the cache array using pointer arithmetic + uint16_t value = page - cache->cache; + result = ASSERT((value < cache->numCacheEntries), "cache index is valid"); + if (result != UDS_SUCCESS) { + return result; + } + + makePageMostRecent(cache, page); + + page->cp_readPending = false; + + /* + * We hold the readThreadsMutex, but we must have a write memory barrier + * before making the CachedPage available to the readers that do not hold the + * mutex. The corresponding read memory barrier is in getPageNoStats. + */ + smp_wmb(); + + // Point the page map to the new page. Will clear queued flag + WRITE_ONCE(cache->index[physicalPage], value); + + return UDS_SUCCESS; +} + +/***********************************************************************/ +void cancelPageInCache(PageCache *cache, + unsigned int physicalPage, + CachedPage *page) +{ + // We hold the readThreadsMutex. + if (cache == NULL) { + logWarning("cannot cancel page in NULL cache"); + return; + } + + int result = ASSERT((page != NULL), "page to install exists"); + if (result != UDS_SUCCESS) { + return; + } + + result = ASSERT((page->cp_readPending), + "page to install has a pending read"); + if (result != UDS_SUCCESS) { + return; + } + + clearPage(cache, page); + page->cp_readPending = false; + + // Clear the page map for the new page. Will clear queued flag + WRITE_ONCE(cache->index[physicalPage], cache->numCacheEntries); +} + +/**********************************************************************/ +size_t getPageCacheSize(PageCache *cache) +{ + if (cache == NULL) { + return 0; + } + return sizeof(DeltaIndexPage) * cache->numCacheEntries; +} + diff --git a/uds/pageCache.h b/uds/pageCache.h new file mode 100644 index 0000000..d639b4a --- /dev/null +++ b/uds/pageCache.h @@ -0,0 +1,504 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/pageCache.h#5 $ + */ + +#ifndef PAGE_CACHE_H +#define PAGE_CACHE_H + +#include "atomicDefs.h" +#include "cacheCounters.h" +#include "chapterIndex.h" +#include "common.h" +#include "compiler.h" +#include "indexConfig.h" +#include "opaqueTypes.h" +#include "permassert.h" +#include "request.h" +#include "volumeStore.h" + +typedef struct requestList { + Request *first; + Request *last; +} RequestList; + +typedef struct cachedPage { + /* whether this page is currently being read asynchronously */ + bool cp_readPending; + /* if equal to numCacheEntries, the page is invalid */ + unsigned int cp_physicalPage; + /* the value of the volume clock when this page was last used */ + int64_t cp_lastUsed; + /* the cache page data */ + struct volume_page cp_pageData; + /* the chapter index page. This is here, even for record pages */ + DeltaIndexPage cp_indexPage; +} CachedPage; + +enum { + VOLUME_CACHE_MAX_ENTRIES = (UINT16_MAX >> 1), + VOLUME_CACHE_QUEUED_FLAG = (1 << 15), + VOLUME_CACHE_DEFAULT_MAX_QUEUED_READS = 4096 +}; + +typedef struct queuedRead { + /* whether this queue entry is invalid */ + bool invalid; + /* whether this queue entry has a pending read on it */ + bool reserved; + /* physical page to read */ + unsigned int physicalPage; + /* list of requests waiting on a queued read */ + RequestList requestList; +} QueuedRead; + +// Reason for invalidating a cache entry, used for gathering statistics +typedef enum invalidationReason { + INVALIDATION_EVICT, // cache is full, goodbye + INVALIDATION_EXPIRE, // your chapter is being overwritten + INVALIDATION_ERROR, // error happened; don't try to use data + INVALIDATION_INIT_SHUTDOWN +} InvalidationReason; + +/* + * Value stored atomically in a SearchPendingCounter. The low order 32 bits is + * the physical page number of the cached page being read. The high order 32 + * bits is a sequence number. + * + * An InvalidateCounter is only written by its zone thread by calling the + * beginPendingSearch or endPendingSearch methods. + * + * Any other thread that is accessing an InvalidateCounter is reading the value + * in the waitForPendingSearches method. + */ +typedef int64_t InvalidateCounter; +// Fields of InvalidateCounter. +// These must be 64 bit, so an enum cannot be not used. +#define PAGE_FIELD ((long)UINT_MAX) // The page number field +#define COUNTER_LSB (PAGE_FIELD + 1L) // The LSB of the counter field + +typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) { + atomic64_t atomicValue; +} SearchPendingCounter; + +typedef struct pageCache { + // Geometry governing the volume + const Geometry *geometry; + // The number of zones + unsigned int zoneCount; + // The number of index entries + unsigned int numIndexEntries; + // The max number of cached entries + uint16_t numCacheEntries; + // The index used to quickly access page in cache - top bit is a 'queued' + // flag + uint16_t *index; + // The cache + CachedPage *cache; + // A counter for each zone to keep track of when a search is occurring + // within that zone. + SearchPendingCounter *searchPendingCounters; + // Queued reads, as a circular array, with first and last indexes + QueuedRead *readQueue; + // Cache counters for stats. This is the first field of a PageCache that is + // not constant after the struct is initialized. + CacheCounters counters; + /** + * Entries are enqueued at readQueueLast. + * To 'reserve' entries, we get the entry pointed to by readQueueLastRead + * and increment last read. 
This is done with a lock so if another reader + * thread reserves a read, it will grab the next one. After every read + * is completed, the reader thread calls releaseReadQueueEntry which + * increments readQueueFirst until it is equal to readQueueLastRead, but only + * if the value pointed to by readQueueFirst is no longer pending. + * This means that if n reads are outstanding, readQueueFirst may not + * be incremented until the last of the reads finishes. + * + * First Last + * || | | | | | || + * LR (1) (2) + * + * Read thread 1 increments last read (1), then read thread 2 increments it + * (2). When each read completes, it checks to see if it can increment first, + * when all concurrent reads have completed, readQueueFirst should equal + * readQueueLastRead. + **/ + uint16_t readQueueFirst; + uint16_t readQueueLastRead; + uint16_t readQueueLast; + // The size of the read queue + unsigned int readQueueMaxSize; + // Page access counter + atomic64_t clock; +} PageCache; + +/** + * Allocate a cache for a volume. + * + * @param geometry The geometry governing the volume + * @param chaptersInCache The size (in chapters) of the page cache + * @param readQueueMaxSize The maximum size of the read queue + * @param zoneCount The number of zones in the index + * @param cachePtr A pointer to hold the new page cache + * + * @return UDS_SUCCESS or an error code + **/ +int makePageCache(const Geometry *geometry, + unsigned int chaptersInCache, + unsigned int readQueueMaxSize, + unsigned int zoneCount, + PageCache **cachePtr) + __attribute__((warn_unused_result)); + +/** + * Clean up a volume's cache + * + * @param cache the volumecache + **/ +void freePageCache(PageCache *cache); + +/** + * Invalidates a page cache for a particular chapter + * + * @param cache the page cache + * @param chapter the chapter + * @param pagesPerChapter the number of pages per chapter + * @param reason the reason for invalidation + * + * @return UDS_SUCCESS or an error code + **/ +int invalidatePageCacheForChapter(PageCache *cache, + unsigned int chapter, + unsigned int pagesPerChapter, + InvalidationReason reason) + __attribute__((warn_unused_result)); + +/** + * Find a page, invalidate it, and make its memory the least recent. This + * method is only exposed for the use of unit tests. + * + * @param cache The cache containing the page + * @param physicalPage The id of the page to invalidate + * @param readQueue The queue of pending reads (may be NULL) + * @param reason The reason for the invalidation, for stats + * @param mustFind If true, it is an error if the page + * can't be found + * + * @return UDS_SUCCESS or an error code + **/ +int findInvalidateAndMakeLeastRecent(PageCache *cache, + unsigned int physicalPage, + QueuedRead *readQueue, + InvalidationReason reason, + bool mustFind); + +/** + * Make the page the most recent in the cache + * + * @param cache the page cache + * @param pagePtr the page to make most recent + * + * @return UDS_SUCCESS or an error code + **/ +void makePageMostRecent(PageCache *cache, CachedPage *pagePtr); + +/** + * Verifies that a page is in the cache. This method is only exposed for the + * use of unit tests. + * + * @param cache the cache to verify + * @param page the page to find + * + * @return UDS_SUCCESS or an error code + **/ +int assertPageInCache(PageCache *cache, CachedPage *page) + __attribute__((warn_unused_result)); + +/** + * Gets a page from the cache. 
+ * + * @param [in] cache the page cache + * @param [in] physicalPage the page number + * @param [in] probeType the type of cache access being done (CacheProbeType + * optionally OR'ed with CACHE_PROBE_IGNORE_FAILURE) + * @param [out] pagePtr the found page + * + * @return UDS_SUCCESS or an error code + **/ +int getPageFromCache(PageCache *cache, + unsigned int physicalPage, + int probeType, + CachedPage **pagePtr) + __attribute__((warn_unused_result)); + +/** + * Enqueue a read request + * + * @param cache the page cache + * @param request the request that depends on the read + * @param physicalPage the physicalPage for the request + * + * @return UDS_QUEUED if the page was queued + * UDS_SUCCESS if the queue was full + * an error code if there was an error + **/ +int enqueueRead(PageCache *cache, Request *request, unsigned int physicalPage) + __attribute__((warn_unused_result)); + +/** + * Reserves a queued read for future dequeuing, but does not remove it from + * the queue. Must call releaseReadQueueEntry to complete the process + * + * @param cache the page cache + * @param queuePos the position in the read queue for this pending read + * @param firstRequests list of requests for the pending read + * @param physicalPage the physicalPage for the requests + * @param invalid whether or not this entry is invalid + * + * @return UDS_SUCCESS or an error code + **/ +bool reserveReadQueueEntry(PageCache *cache, + unsigned int *queuePos, + Request **firstRequests, + unsigned int *physicalPage, + bool *invalid); + +/** + * Releases a read from the queue, allowing it to be reused by future + * enqueues + * + * @param cache the page cache + * @param queuePos queue entry position + * + * @return UDS_SUCCESS or an error code + **/ +void releaseReadQueueEntry(PageCache *cache, + unsigned int queuePos); + +/** + * Check for the page cache read queue being empty. + * + * @param cache the page cache for which to check the read queue. + * + * @return true if the read queue for cache is empty, false otherwise. + **/ +static INLINE bool readQueueIsEmpty(PageCache *cache) +{ + return (cache->readQueueFirst == cache->readQueueLast); +} + +/** + * Check for the page cache read queue being full. + * + * @param cache the page cache for which to check the read queue. + * + * @return true if the read queue for cache is full, false otherwise. + **/ +static INLINE bool readQueueIsFull(PageCache *cache) +{ + return (cache->readQueueFirst == + (cache->readQueueLast + 1) % cache->readQueueMaxSize); +} + +/** + * Selects a page in the cache to be used for a read. + * + * This will clear the pointer in the page map and + * set readPending to true on the cache page + * + * @param cache the page cache + * @param pagePtr the page to add + * + * @return UDS_SUCCESS or an error code + **/ +int selectVictimInCache(PageCache *cache, + CachedPage **pagePtr) + __attribute__((warn_unused_result)); + +/** + * Completes an async page read in the cache, so that + * the page can now be used for incoming requests. + * + * This will invalidate the old cache entry and point + * the page map for the new page to this entry + * + * @param cache the page cache + * @param physicalPage the page number + * @param page the page to complete processing on + * + * @return UDS_SUCCESS or an error code + **/ +int putPageInCache(PageCache *cache, + unsigned int physicalPage, + CachedPage *page) + __attribute__((warn_unused_result)); + +/** + * Cancels an async page read in the cache, so that + * the page can now be used for incoming requests. 
+ *
+ * This will invalidate the old cache entry and clear
+ * the read queued flag on the page map entry, if it
+ * was set.
+ *
+ * @param cache         the page cache
+ * @param physicalPage  the page number to clear the queued read flag on
+ * @param page          the page to cancel processing on
+ **/
+void cancelPageInCache(PageCache   *cache,
+                       unsigned int physicalPage,
+                       CachedPage  *page);
+
+/**
+ * Get the page cache size
+ *
+ * @param cache the page cache
+ *
+ * @return the size of the page cache
+ **/
+size_t getPageCacheSize(PageCache *cache)
+  __attribute__((warn_unused_result));
+
+
+/**
+ * Read the InvalidateCounter for the given zone.
+ *
+ * @param cache       the page cache
+ * @param zoneNumber  the zone number
+ *
+ * @return the InvalidateCounter value
+ **/
+static INLINE InvalidateCounter getInvalidateCounter(PageCache    *cache,
+                                                     unsigned int  zoneNumber)
+{
+  return atomic64_read(&cache->searchPendingCounters[zoneNumber].atomicValue);
+}
+
+/**
+ * Write the InvalidateCounter for the given zone.
+ *
+ * @param cache              the page cache
+ * @param zoneNumber         the zone number
+ * @param invalidateCounter  the InvalidateCounter value to write
+ **/
+static INLINE void setInvalidateCounter(PageCache         *cache,
+                                        unsigned int       zoneNumber,
+                                        InvalidateCounter  invalidateCounter)
+{
+  atomic64_set(&cache->searchPendingCounters[zoneNumber].atomicValue,
+               invalidateCounter);
+}
+
+/**
+ * Return the physical page number of the page being searched. The return
+ * value is only valid if searchPending indicates that a search is in progress.
+ *
+ * @param counter  the InvalidateCounter value to check
+ *
+ * @return the page that the zone is searching
+ **/
+static INLINE unsigned int pageBeingSearched(InvalidateCounter counter)
+{
+  return counter & PAGE_FIELD;
+}
+
+/**
+ * Determines whether a given value indicates that a search is occurring.
+ *
+ * @param invalidateCounter  the InvalidateCounter value to check
+ *
+ * @return true if a search is pending, false otherwise
+ **/
+static INLINE bool searchPending(InvalidateCounter invalidateCounter)
+{
+  return (invalidateCounter & COUNTER_LSB) != 0;
+}
+
+/**
+ * Determines whether there is a search occurring for the given zone.
+ *
+ * @param cache       the page cache
+ * @param zoneNumber  the zone number
+ *
+ * @return true if a search is pending, false otherwise
+ **/
+static INLINE bool isSearchPending(PageCache    *cache,
+                                   unsigned int  zoneNumber)
+{
+  return searchPending(getInvalidateCounter(cache, zoneNumber));
+}
+
+/**
+ * Increment the counter for the specified zone to signal that a search has
+ * begun. Also set which page is being searched. The searchPendingCounters
+ * are protecting read access to pages indexed by the cache. This is the
+ * "lock" action.
+ *
+ * @param cache         the page cache
+ * @param physicalPage  the page that the zone is searching
+ * @param zoneNumber    the zone number
+ **/
+static INLINE void beginPendingSearch(PageCache    *cache,
+                                      unsigned int  physicalPage,
+                                      unsigned int  zoneNumber)
+{
+  InvalidateCounter invalidateCounter = getInvalidateCounter(cache,
+                                                             zoneNumber);
+  invalidateCounter &= ~PAGE_FIELD;
+  invalidateCounter |= physicalPage;
+  invalidateCounter += COUNTER_LSB;
+  setInvalidateCounter(cache, zoneNumber, invalidateCounter);
+  ASSERT_LOG_ONLY(searchPending(invalidateCounter),
+                  "Search is pending for zone %u", zoneNumber);
+  /*
+   * This memory barrier ensures that the write to the invalidate counter is
+   * seen by other threads before this thread accesses the cached page. 
The + * corresponding read memory barrier is in waitForPendingSearches. + */ + smp_mb(); +} + +/** + * Increment the counter for the specified zone to signal that a search has + * finished. We do not need to reset the page since we only should ever look + * at the page value if the counter indicates a search is ongoing. The + * searchPendingCounters are protecting read access to pages indexed by the + * cache. This is the "unlock" action. + * + * @param cache the page cache + * @param zoneNumber the zone number + **/ +static INLINE void endPendingSearch(PageCache *cache, + unsigned int zoneNumber) +{ + // This memory barrier ensures that this thread completes reads of the + // cached page before other threads see the write to the invalidate counter. + smp_mb(); + + InvalidateCounter invalidateCounter = getInvalidateCounter(cache, + zoneNumber); + ASSERT_LOG_ONLY(searchPending(invalidateCounter), + "Search is pending for zone %u", zoneNumber); + invalidateCounter += COUNTER_LSB; + setInvalidateCounter(cache, zoneNumber, invalidateCounter); +} + +#endif /* PAGE_CACHE_H */ diff --git a/uds/permassert.c b/uds/permassert.c new file mode 100644 index 0000000..0c8afeb --- /dev/null +++ b/uds/permassert.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/permassert.c#1 $ + */ + +#include "permassert.h" +#include "permassertInternals.h" + +#include "errors.h" + +/*****************************************************************************/ +int assertionFailed(const char *expressionString, + int code, + const char *fileName, + int lineNumber, + const char *format, + ...) +{ + va_list args; + va_start(args, format); + handleAssertionFailure(expressionString, fileName, lineNumber, format, args); + va_end(args); + + return code; +} + +/*****************************************************************************/ +int assertionFailedLogOnly(const char *expressionString, + const char *fileName, + int lineNumber, + const char *format, + ...) +{ + va_list args; + va_start(args, format); + handleAssertionFailure(expressionString, fileName, lineNumber, format, args); + va_end(args); + + return UDS_ASSERTION_FAILED; +} diff --git a/uds/permassert.h b/uds/permassert.h new file mode 100644 index 0000000..d04336b --- /dev/null +++ b/uds/permassert.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/permassert.h#1 $ + */ + +#ifndef PERMASSERT_H +#define PERMASSERT_H + +#include "compiler.h" +#include "errors.h" +#include "uds-error.h" + +#define STRINGIFY(X) #X +#define STRINGIFY_VALUE(X) STRINGIFY(X) + +/* + * A hack to apply the "warn if unused" attribute to an integral expression. + * + * Since GCC doesn't propagate the warn_unused_result attribute to + * conditional expressions incorporating calls to functions with that + * attribute, this function can be used to wrap such an expression. + * With optimization enabled, this function contributes no additional + * instructions, but the warn_unused_result attribute still applies to + * the code calling it. + * + * @param value The value to return + * + * @return The supplied value + */ +__attribute__((warn_unused_result)) +static INLINE int mustUse(int value) +{ + return value; +} + +/* + * A replacement for assert() from assert.h. + * + * @param expr The boolean expression being asserted + * @param code The error code to return on non-fatal assertion + * failure + * @param format A printf() style format for the message to log on + * assertion failure + * @param arguments Any additional arguments required by the format + * + * @return UDS_SUCCESS If expr is true, code if expr is false and + * exitOnAssertionFailure is false. When exitOnAssertionFailure + * is true and expr is false, the program will exit from within + * this macro. + */ +#define ASSERT_WITH_ERROR_CODE(expr, code, ...) \ + mustUse(__builtin_expect(!!(expr), 1) \ + ? UDS_SUCCESS \ + : assertionFailed(STRINGIFY(expr), code, __FILE__, __LINE__, \ + __VA_ARGS__)) + +/* + * A replacement for assert() from assert.h. + * + * @param expr The boolean expression being asserted + * @param format A printf() style format for the message to log on + * assertion failure + * @param arguments Any additional arguments required by the format + * + * @return UDS_SUCCESS If expr is true, UDS_ASSERTION_FAILED if expr is + * false and exitOnAssertionFailure is false. When + * exitOnAssertionFailure is true and expr is false, the + * program will exit from within this macro. + */ +#define ASSERT(expr, ...) \ + ASSERT_WITH_ERROR_CODE(expr, UDS_ASSERTION_FAILED, __VA_ARGS__) + +/* + * A replacement for assert() which logs on failure, but does not return an + * error code. This should be used sparingly. If the expression is false and + * exitOnAssertionFailure is true, the program will exit from within this macro. + * + * @param expr The boolean expression being asserted + * @param format A printf() syle format for the message to log on + * assertion failure + * @param arguments Any additional arguments required by the format + */ +#define ASSERT_LOG_ONLY(expr, ...) \ + (__builtin_expect(!!(expr), 1) \ + ? UDS_SUCCESS \ + : assertionFailedLogOnly(STRINGIFY(expr), __FILE__, __LINE__, __VA_ARGS__)) + +/* + * This macro is a convenient wrapper for ASSERT(false, ...). + */ +#define ASSERT_FALSE(...) 
\ + ASSERT(false, __VA_ARGS__) + +#define STATIC_ASSERT(expr) \ + do { \ + switch (0) { \ + case 0: \ + case expr: \ + ; \ + default: \ + ; \ + } \ + } while(0) + +#define STATIC_ASSERT_SIZEOF(type, expectedSize) \ + STATIC_ASSERT(sizeof(type) == (expectedSize)) + +/** + * Set whether or not to exit on an assertion failure. + * + * @param shouldExit If true assertion failures will cause + * the program to exit + * + * @return The previous setting + **/ +bool setExitOnAssertionFailure(bool shouldExit); + +/** + * Log an assertion failure. + * + * @param expressionString The assertion + * @param errorCode The error code to return + * @param fileName The file in which the assertion appears + * @param lineNumber The line number on which the assertion + * appears + * @param format A printf() style format describing the + * assertion + * + * @return The supplied errorCode unless exitOnAssertionFailure is + * true, in which case the process will be aborted + **/ +int assertionFailed(const char *expressionString, + int errorCode, + const char *fileName, + int lineNumber, + const char *format, + ...) + __attribute__((format(printf, 5, 6), warn_unused_result)); + +/** + * Log an assertion failure. This function is different from + * assertionFailed() in that its return value may be ignored, and so should + * only be used in cases where the return value will be ignored. + * + * @param expressionString The assertion + * @param fileName The file in which the assertion appears + * @param lineNumber The line number on which the assertion + * appears + * @param format A printf() style format describing the + * assertion + * + * @return UDS_ASSERTION_FAILED unless exitOnAssertionFailure is + * true, in which case the process will be aborted + **/ +int assertionFailedLogOnly(const char *expressionString, + const char *fileName, + int lineNumber, + const char *format, + ...) + __attribute__((format(printf, 4, 5))); + +#endif /* PERMASSERT_H */ diff --git a/uds/permassertInternals.h b/uds/permassertInternals.h new file mode 100644 index 0000000..f0a3b95 --- /dev/null +++ b/uds/permassertInternals.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/permassertInternals.h#1 $
+ */
+
+#ifndef PERMASSERT_INTERNALS_H
+#define PERMASSERT_INTERNALS_H
+
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void handleAssertionFailure(const char *expressionString,
+                            const char *fileName,
+                            int         lineNumber,
+                            const char *format,
+                            va_list     args)
+  __attribute__((format(printf, 4, 0)));
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* PERMASSERT_INTERNALS_H */
diff --git a/uds/permassertLinuxKernel.c b/uds/permassertLinuxKernel.c
new file mode 100644
index 0000000..67f66d9
--- /dev/null
+++ b/uds/permassertLinuxKernel.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/kernelLinux/uds/permassertLinuxKernel.c#1 $
+ */
+
+#include "logger.h"
+#include "permassert.h"
+#include "permassertInternals.h"
+
+/**********************************************************************/
+__attribute__((format(printf, 4, 0)))
+void handleAssertionFailure(const char *expressionString,
+                            const char *fileName,
+                            int         lineNumber,
+                            const char *format,
+                            va_list     args)
+{
+  logEmbeddedMessage(LOG_ERR, "assertion \"", format, args,
+                     "\" (%s) failed at %s:%d",
+                     expressionString, fileName, lineNumber);
+  logBacktrace(LOG_ERR);
+}
diff --git a/uds/random.c b/uds/random.c
new file mode 100644
index 0000000..acad146
--- /dev/null
+++ b/uds/random.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/random.c#2 $
+ */
+
+#include "random.h"
+
+#include "permassert.h"
+
+/*****************************************************************************/
+unsigned int randomInRange(unsigned int lo, unsigned int hi)
+{
+  return lo + random() % (hi - lo + 1);
+}
+
+/*****************************************************************************/
+void randomCompileTimeAssertions(void)
+{
+  STATIC_ASSERT((((uint64_t) RAND_MAX + 1) & RAND_MAX) == 0);
+}
+
+#ifndef __KERNEL__
+/*****************************************************************************/
+void fillRandomly(void *ptr, size_t len)
+{
+  uint64_t randNum  = 0;
+  uint64_t randMask = 0;
+  const uint64_t multiplier = (uint64_t) RAND_MAX + 1;
+
+  byte *bp = ptr;
+  for (size_t i = 0; i < len; ++i) {
+    if (randMask < 0xff) {
+      randNum  = randNum * multiplier + random();
+      randMask = randMask * multiplier + RAND_MAX;
+    }
+    bp[i] = randNum & 0xff;
+    randNum  >>= 8;
+    randMask >>= 8;
+  }
+}
+#endif
diff --git a/uds/random.h b/uds/random.h
new file mode 100644
index 0000000..f5d2f49
--- /dev/null
+++ b/uds/random.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/random.h#2 $
+ */
+
+#ifndef RANDOM_H
+#define RANDOM_H
+
+#ifdef __KERNEL__
+#include <linux/random.h>
+#else
+#include <stdlib.h>
+#endif
+
+#include "compiler.h"
+#include "typeDefs.h"
+
+/**
+ * Get a random unsigned integer in a given range
+ *
+ * @param lo  Minimum unsigned integer value
+ * @param hi  Maximum unsigned integer value
+ *
+ * @return unsigned integer in the interval [lo,hi]
+ **/
+unsigned int randomInRange(unsigned int lo, unsigned int hi);
+
+/**
+ * Special function wrapper required for compile-time assertions. This
+ * function will fail to compile if RAND_MAX is not of the form 2^n - 1.
+ **/
+void randomCompileTimeAssertions(void);
+
+/**
+ * Fill bytes with random data.
+ *
+ * @param ptr  where to store bytes
+ * @param len  number of bytes to write
+ **/
+#ifdef __KERNEL__
+static INLINE void fillRandomly(void *ptr, size_t len)
+{
+  prandom_bytes(ptr, len);
+}
+#else
+void fillRandomly(void *ptr, size_t len);
+#endif
+
+#ifdef __KERNEL__
+#define RAND_MAX 2147483647
+
+/**
+ * Random number generator
+ *
+ * @return a random number in the range 0 to RAND_MAX
+ **/
+static INLINE long random(void)
+{
+  long value;
+  fillRandomly(&value, sizeof(value));
+  return value & RAND_MAX;
+}
+#endif
+
+#endif /* RANDOM_H */
diff --git a/uds/recordPage.c b/uds/recordPage.c
new file mode 100644
index 0000000..f4c2572
--- /dev/null
+++ b/uds/recordPage.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/recordPage.c#3 $ + */ + +#include "recordPage.h" + +#include "permassert.h" + +/**********************************************************************/ +static unsigned int encodeTree(byte recordPage[], + const UdsChunkRecord *sortedPointers[], + unsigned int nextRecord, + unsigned int node, + unsigned int nodeCount) +{ + if (node < nodeCount) { + unsigned int child = (2 * node) + 1; + nextRecord = encodeTree(recordPage, sortedPointers, nextRecord, + child, nodeCount); + + // In-order traversal: copy the contents of the next record + // into the page at the node offset. + memcpy(&recordPage[node * BYTES_PER_RECORD], + sortedPointers[nextRecord], + BYTES_PER_RECORD); + ++nextRecord; + + nextRecord = encodeTree(recordPage, sortedPointers, nextRecord, + child + 1, nodeCount); + } + return nextRecord; +} + +/**********************************************************************/ +int encodeRecordPage(const Volume *volume, + const UdsChunkRecord records[], + byte recordPage[]) +{ + unsigned int recordsPerPage = volume->geometry->recordsPerPage; + const UdsChunkRecord **recordPointers = volume->recordPointers; + + // Build an array of record pointers. We'll sort the pointers by the block + // names in the records, which is less work than sorting the record values. + unsigned int i; + for (i = 0; i < recordsPerPage; i++) { + recordPointers[i] = &records[i]; + } + + STATIC_ASSERT(offsetof(UdsChunkRecord, name) == 0); + int result = radixSort(volume->radixSorter, (const byte **) recordPointers, + recordsPerPage, UDS_CHUNK_NAME_SIZE); + if (result != UDS_SUCCESS) { + return result; + } + + // Use the sorted pointers to copy the records from the chapter to the + // record page in tree order. + encodeTree(recordPage, recordPointers, 0, 0, recordsPerPage); + return UDS_SUCCESS; +} + +/**********************************************************************/ +bool searchRecordPage(const byte recordPage[], + const UdsChunkName *name, + const Geometry *geometry, + UdsChunkData *metadata) +{ + // The record page is just an array of chunk records. + const UdsChunkRecord *records = (const UdsChunkRecord *) recordPage; + + // The array of records is sorted by name and stored as a binary tree in + // heap order, so the root of the tree is the first array element. + unsigned int node = 0; + while (node < geometry->recordsPerPage) { + const UdsChunkRecord *record = &records[node]; + int result = memcmp(name, &record->name, UDS_CHUNK_NAME_SIZE); + if (result == 0) { + if (metadata != NULL) { + *metadata = record->data; + } + return true; + } + // The children of node N are in the heap at indexes 2N+1 and 2N+2. + node = ((2 * node) + ((result < 0) ? 
1 : 2)); + } + return false; +} diff --git a/uds/recordPage.h b/uds/recordPage.h new file mode 100644 index 0000000..ecf9ddc --- /dev/null +++ b/uds/recordPage.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/recordPage.h#2 $ + */ + +#ifndef RECORDPAGE_H +#define RECORDPAGE_H 1 + +#include "common.h" +#include "volume.h" + +/** + * Generate the on-disk encoding of a record page from the list of records + * in the open chapter representation. + * + * @param volume The volume + * @param records The records to be encoded + * @param recordPage The record page + * + * @return UDS_SUCCESS or an error code + **/ +int encodeRecordPage(const Volume *volume, + const UdsChunkRecord records[], + byte recordPage[]); + +/** + * Find the metadata for a given block name in this page. + * + * @param recordPage The record page + * @param name The block name to look for + * @param geometry The geometry of the volume + * @param metadata an array in which to place the metadata of the + * record, if one was found + * + * @return true if the record was found + **/ +bool searchRecordPage(const byte recordPage[], + const UdsChunkName *name, + const Geometry *geometry, + UdsChunkData *metadata); + +#endif /* RECORDPAGE_H */ diff --git a/uds/regionIdentifiers.h b/uds/regionIdentifiers.h new file mode 100644 index 0000000..ff72b19 --- /dev/null +++ b/uds/regionIdentifiers.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/regionIdentifiers.h#1 $ + */ + +#ifndef REGION_IDENTIFIERS_H +#define REGION_IDENTIFIERS_H + +enum { + RH_TYPE_FREE = 0, // unused + RH_TYPE_SUPER = 1, + RH_TYPE_SAVE = 2, + RH_TYPE_CHECKPOINT = 3, + RH_TYPE_UNSAVED = 4, + + RL_KIND_SCRATCH = 0, // uninitialized or scrapped + RL_KIND_HEADER = 1, // for self-referential items + RL_KIND_CONFIG = 100, + RL_KIND_INDEX = 101, + RL_KIND_SEAL = 102, + RL_KIND_VOLUME = 201, + RL_KIND_SAVE = 202, + RL_KIND_INDEX_PAGE_MAP = 301, + RL_KIND_MASTER_INDEX = 302, + RL_KIND_OPEN_CHAPTER = 303, + RL_KIND_INDEX_STATE = 401, // not saved as region + + RL_SOLE_INSTANCE = 65535, +}; + +typedef unsigned int RegionType; +typedef unsigned int RegionKind; + +#endif // REGION_IDENTIFIERS_H diff --git a/uds/request.c b/uds/request.c new file mode 100644 index 0000000..c994181 --- /dev/null +++ b/uds/request.c @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/request.c#6 $ + */ + +#include "request.h" + +#include "indexRouter.h" +#include "indexSession.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "requestQueue.h" + +/**********************************************************************/ +int udsStartChunkOperation(UdsRequest *udsRequest) +{ + if (udsRequest->callback == NULL) { + return UDS_CALLBACK_REQUIRED; + } + switch (udsRequest->type) { + case UDS_DELETE: + case UDS_POST: + case UDS_QUERY: + case UDS_UPDATE: + break; + default: + return UDS_INVALID_OPERATION_TYPE; + } + memset(udsRequest->private, 0, sizeof(udsRequest->private)); + Request *request = (Request *)udsRequest; + + int result = getIndexSession(request->session); + if (result != UDS_SUCCESS) { + return sansUnrecoverable(result); + } + + request->found = false; + request->action = (RequestAction) request->type; + request->isControlMessage = false; + request->unbatched = false; + request->router = request->session->router; + + enqueueRequest(request, STAGE_TRIAGE); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int launchZoneControlMessage(RequestAction action, + ZoneMessage message, + unsigned int zone, + IndexRouter *router) +{ + Request *request; + int result = ALLOCATE(1, Request, __func__, &request); + if (result != UDS_SUCCESS) { + return result; + } + + request->router = router; + request->isControlMessage = true; + request->unbatched = true; + request->action = action; + request->zoneNumber = zone; + request->zoneMessage = message; + + enqueueRequest(request, STAGE_INDEX); + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeRequest(Request *request) +{ + if (request != NULL) { + FREE(request); + } +} + 
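+/*
+ * Illustrative sketch only (not part of this change): one way a client of the
+ * public API might drive the pipeline started by udsStartChunkOperation()
+ * above. The names "mySession", "myCallback", and "name" are hypothetical
+ * placeholders, not symbols defined by UDS.
+ *
+ *   UdsRequest request = {
+ *     .chunkName = name,        // the chunk's hash
+ *     .callback  = myCallback,  // required; otherwise UDS_CALLBACK_REQUIRED
+ *     .session   = mySession,   // an open index session
+ *     .type      = UDS_POST,    // or UDS_UPDATE, UDS_QUERY, UDS_DELETE
+ *   };
+ *   int result = udsStartChunkOperation(&request);
+ *   // On UDS_SUCCESS the request enters STAGE_TRIAGE and its completion is
+ *   // reported asynchronously on the callback thread via myCallback.
+ */
+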
+/**********************************************************************/ +static RequestQueue *getNextStageQueue(Request *request, + RequestStage nextStage) +{ + if (nextStage == STAGE_CALLBACK) { + return request->session->callbackQueue; + } + + // Local and remote index routers handle the rest of the pipeline + // differently, so delegate the choice of queue to the router. + return selectIndexRouterQueue(request->router, request, nextStage); +} + +/**********************************************************************/ +static void handleRequestErrors(Request *request) +{ + // XXX Use the router's callback function to hand back the error + // and clean up the request? (Possible thread issues doing that.) + + freeRequest(request); +} + +/**********************************************************************/ +void enqueueRequest(Request *request, RequestStage nextStage) +{ + RequestQueue *nextQueue = getNextStageQueue(request, nextStage); + if (nextQueue == NULL) { + handleRequestErrors(request); + return; + } + + requestQueueEnqueue(nextQueue, request); +} + +/* + * This function pointer allows unit test code to intercept the slow-lane + * requeuing of a request. + */ +static RequestRestarter requestRestarter = NULL; + +/**********************************************************************/ +void restartRequest(Request *request) +{ + request->requeued = true; + if (requestRestarter == NULL) { + enqueueRequest(request, STAGE_INDEX); + } else { + requestRestarter(request); + } +} + +/**********************************************************************/ +void setRequestRestarter(RequestRestarter restarter) +{ + requestRestarter = restarter; +} + +/**********************************************************************/ +static INLINE void increment_once(uint64_t *countPtr) +{ + WRITE_ONCE(*countPtr, READ_ONCE(*countPtr) + 1); +} + +/**********************************************************************/ +void updateRequestContextStats(Request *request) +{ + /* + * We don't need any synchronization since the context stats are only + * modified from the single callback thread. + * + * We increment either 2 or 3 counters in this method. + * + * XXX We always increment the "requests" counter. But there is no code + * that uses the value stored in this counter. + * + * We always increment exactly one of these counters (unless there is an + * error in the code, which never happens): + * postsFound postsNotFound + * updatesFound updatesNotFound + * deletionsFound deletionsNotFound + * queriesFound queriesNotFound + * + * XXX In the case of post request that were found in the index, we increment + * exactly one of these counters. But there is no code that uses the + * value stored in these counters. 
+ * inMemoryPostsFound + * densePostsFound + * sparsePostsFound + */ + + SessionStats *sessionStats = &request->session->stats; + + increment_once(&sessionStats->requests); + bool found = (request->location != LOC_UNAVAILABLE); + + switch (request->action) { + case REQUEST_INDEX: + if (found) { + increment_once(&sessionStats->postsFound); + + if (request->location == LOC_IN_OPEN_CHAPTER) { + increment_once(&sessionStats->postsFoundOpenChapter); + } else if (request->location == LOC_IN_DENSE) { + increment_once(&sessionStats->postsFoundDense); + } else if (request->location == LOC_IN_SPARSE) { + increment_once(&sessionStats->postsFoundSparse); + } + } else { + increment_once(&sessionStats->postsNotFound); + } + break; + + case REQUEST_UPDATE: + if (found) { + increment_once(&sessionStats->updatesFound); + } else { + increment_once(&sessionStats->updatesNotFound); + } + break; + + case REQUEST_DELETE: + if (found) { + increment_once(&sessionStats->deletionsFound); + } else { + increment_once(&sessionStats->deletionsNotFound); + } + break; + + case REQUEST_QUERY: + if (found) { + increment_once(&sessionStats->queriesFound); + } else { + increment_once(&sessionStats->queriesNotFound); + } + break; + + default: + request->status = ASSERT(false, "unknown next action in request: %d", + request->action); + } +} + +/**********************************************************************/ +void enterCallbackStage(Request *request) +{ + if (!request->isControlMessage) { + if (isUnrecoverable(request->status)) { + // Unrecoverable errors must disable the index session + disableIndexSession(request->session); + // The unrecoverable state is internal and must not sent to the client. + request->status = sansUnrecoverable(request->status); + } + + // Handle asynchronous client callbacks in the designated thread. + enqueueRequest(request, STAGE_CALLBACK); + } else { + /* + * Asynchronous control messages are complete when they are executed. + * There should be nothing they need to do on the callback thread. The + * message has been completely processed, so just free it. + */ + freeRequest(request); + } +} diff --git a/uds/request.h b/uds/request.h new file mode 100644 index 0000000..fb6250e --- /dev/null +++ b/uds/request.h @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/request.h#7 $ + */ + +#ifndef REQUEST_H +#define REQUEST_H + +#include "cacheCounters.h" +#include "common.h" +#include "compiler.h" +#include "opaqueTypes.h" +#include "threads.h" +#include "timeUtils.h" +#include "uds.h" +#include "util/funnelQueue.h" + +/** + * RequestAction values indicate what action, command, or query is to be + * performed when processing a Request instance. 
+ **/ +typedef enum { + // Map the API's UdsCallbackType values directly to a corresponding action. + REQUEST_INDEX = UDS_POST, + REQUEST_UPDATE = UDS_UPDATE, + REQUEST_DELETE = UDS_DELETE, + REQUEST_QUERY = UDS_QUERY, + + REQUEST_CONTROL, + + // REQUEST_SPARSE_CACHE_BARRIER is the action for the control request used + // by localIndexRouter. + REQUEST_SPARSE_CACHE_BARRIER, + + // REQUEST_ANNOUNCE_CHAPTER_CLOSED is the action for the control + // request used by an indexZone to signal the other zones that it + // has closed the current open chapter. + REQUEST_ANNOUNCE_CHAPTER_CLOSED, +} RequestAction; + +/** + * The block's rough location in the index, if any. + **/ +typedef enum { + /* the block doesn't exist or the location isn't available */ + LOC_UNAVAILABLE, + /* if the block was found in the open chapter */ + LOC_IN_OPEN_CHAPTER, + /* if the block was found in the dense part of the index */ + LOC_IN_DENSE, + /* if the block was found in the sparse part of the index */ + LOC_IN_SPARSE +} IndexRegion; + +/** + * Abstract request pipeline stages, which can also be viewed as stages in the + * life-cycle of a request. + **/ +typedef enum { + STAGE_TRIAGE, + STAGE_INDEX, + STAGE_CALLBACK, +} RequestStage; + +/** + * Control message fields for the barrier messages used to coordinate the + * addition of a chapter to the sparse chapter index cache. + **/ +typedef struct barrierMessageData { + /** virtual chapter number of the chapter index to add to the sparse cache */ + uint64_t virtualChapter; +} BarrierMessageData; + +/** + * Control message fields for the chapter closed messages used to inform + * lagging zones of the first zone to close a given open chapter. + **/ +typedef struct chapterClosedMessageData { + /** virtual chapter number of the chapter which was closed */ + uint64_t virtualChapter; +} ChapterClosedMessageData; + +/** + * Union of the all the zone control message fields. The RequestAction field + * (or launch function argument) selects which of the members is valid. + **/ +typedef union zoneMessageData { + BarrierMessageData barrier; // for REQUEST_SPARSE_CACHE_BARRIER + ChapterClosedMessageData chapterClosed; // for REQUEST_ANNOUNCE_CHAPTER_CLOSED +} ZoneMessageData; + +typedef struct zoneMessage { + /** the index to which the message is directed */ + struct index *index; + /** the message specific data */ + ZoneMessageData data; +} ZoneMessage; + +/** + * Request context for queuing throughout the uds pipeline + * + * XXX Note that the typedef for this struct defines "Request", and that this + * should therefore be "struct request". However, this conflicts with the + * Linux kernel which also has a "struct request". This is a workaround so + * that we can make upstreaming progress. The real solution is to expose + * this structure as the true "struct uds_request" and do a lot of + * renaming. + **/ +struct internalRequest { + /* + * The first part of this structure must be exactly parallel to the + * UdsRequest structure, which is part of the public UDS API. 
+ */ + UdsChunkName chunkName; // hash value + UdsChunkData oldMetadata; // metadata from index + UdsChunkData newMetadata; // metadata from request + UdsChunkCallback *callback; // callback method when complete + struct uds_index_session *session; // The public index session + UdsCallbackType type; // the type of request + int status; // success or error code for this request + bool found; // True if the block was found in index + bool update; // move record to newest chapter if found + + /* + * The remainder of this structure is private to the UDS implementation. + */ + FunnelQueueEntry requestQueueLink; // for lock-free request queue + Request *nextRequest; + IndexRouter *router; + + // Data for control message requests + ZoneMessage zoneMessage; + bool isControlMessage; + + bool unbatched; // if true, must wake worker when enqueued + bool requeued; + RequestAction action; // the action for the index to perform + unsigned int zoneNumber; // the zone for this request to use + IndexRegion location; // if and where the block was found + + bool slLocationKnown; // slow lane has determined a location + IndexRegion slLocation; // location determined by slowlane +}; + +typedef void (*RequestRestarter)(Request *); + +/** + * Make an asynchronous control message for an index zone and enqueue it for + * processing. + * + * @param action The control action to perform + * @param message The message to send + * @param zone The zone number of the zone to receive the message + * @param router The index router responsible for handling the message + * + * @return UDS_SUCCESS or an error code + **/ +int launchZoneControlMessage(RequestAction action, + ZoneMessage message, + unsigned int zone, + IndexRouter *router) + __attribute__((warn_unused_result)); + +/** + * Free an index request. + * + * @param request The request to free + **/ +void freeRequest(Request *request); + +/** + * Enqueue a request for the next stage of the pipeline. If there is more than + * one possible queue for a stage, this function uses the request to decide + * which queue should handle it. + * + * @param request The request to enqueue + * @param nextStage The next stage of the pipeline to process the request + **/ +void enqueueRequest(Request *request, RequestStage nextStage); + +/** + * A method to restart delayed requests. + * + * @param request The request to restart + **/ +void restartRequest(Request *request); + +/** + * Set the function pointer which is used to restart requests. + * This is needed by albserver code and is used as a test hook by the unit + * tests. + * + * @param restarter The function to call to restart requests. + **/ +void setRequestRestarter(RequestRestarter restarter); + +/** + * Enter the callback stage of processing for a request, notifying the waiting + * thread if the request is synchronous, freeing the request if it is an + * asynchronous control message, or placing it on the callback queue if it is + * an asynchronous client request. + * + * @param request the request which has completed execution + **/ +void enterCallbackStage(Request *request); + +/** + * Update the context statistics to reflect the successful completion of a + * client request. + * + * @param request a client request that has successfully completed execution + **/ +void updateRequestContextStats(Request *request); + +/** + * Compute the CacheProbeType value reflecting the request and page type. 
+ * + * @param request The request being processed, or NULL + * @param isIndexPage Whether the cache probe will be for an index page + * + * @return the cache probe type enumeration + **/ +static INLINE CacheProbeType cacheProbeType(Request *request, + bool isIndexPage) +{ + if ((request != NULL) && request->requeued) { + return isIndexPage ? CACHE_PROBE_INDEX_RETRY : CACHE_PROBE_RECORD_RETRY; + } else { + return isIndexPage ? CACHE_PROBE_INDEX_FIRST : CACHE_PROBE_RECORD_FIRST; + } +} +#endif /* REQUEST_H */ diff --git a/uds/requestQueue.h b/uds/requestQueue.h new file mode 100644 index 0000000..5bf7ef6 --- /dev/null +++ b/uds/requestQueue.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/requestQueue.h#1 $ + */ + +#ifndef REQUEST_QUEUE_H +#define REQUEST_QUEUE_H + +#include "opaqueTypes.h" +#include "typeDefs.h" + +/* void return value because this function will process its own errors */ +typedef void RequestQueueProcessor(Request *); + +/** + * Allocate a new request processing queue and start a worker thread to + * consume and service requests in the queue. + * + * @param queueName the name of the queue and the worker thread + * @param processOne the function the worker will invoke on each request + * @param queuePtr a pointer to receive the new queue + * + * @return UDS_SUCCESS or an error code + **/ +int makeRequestQueue(const char *queueName, + RequestQueueProcessor *processOne, + RequestQueue **queuePtr) + __attribute__((warn_unused_result)); + +/** + * Add a request to the end of the queue for processing by the worker thread. + * If the requeued flag is set on the request, it will be processed before + * any non-requeued requests under most circumstances. + * + * @param queue the request queue that should process the request + * @param request the request to be processed on the queue's worker thread + **/ +void requestQueueEnqueue(RequestQueue *queue, Request *request); + +/** + * Shut down the request queue worker thread, then destroy and free the queue. + * + * @param queue the queue to shut down and free + **/ +void requestQueueFinish(RequestQueue *queue); + +#endif /* REQUEST_QUEUE_H */ diff --git a/uds/requestQueueKernel.c b/uds/requestQueueKernel.c new file mode 100644 index 0000000..a53ff12 --- /dev/null +++ b/uds/requestQueueKernel.c @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/kernelLinux/uds/requestQueueKernel.c#3 $
+ */
+
+#include "requestQueue.h"
+
+#include <linux/wait.h>
+
+#include "atomicDefs.h"
+#include "compiler.h"
+#include "logger.h"
+#include "request.h"
+#include "memoryAlloc.h"
+#include "threads.h"
+#include "util/funnelQueue.h"
+
+/*
+ * Ordering:
+ *
+ * Multiple retry requests or multiple non-retry requests enqueued from
+ * a single producer thread will be processed in the order enqueued.
+ *
+ * Retry requests will generally be processed before normal requests.
+ *
+ * HOWEVER, a producer thread can enqueue a retry request (generally given
+ * higher priority) and then enqueue a normal request, and they can get
+ * processed in the reverse order. The checking of the two internal queues is
+ * very simple and there's a potential race with the producer regarding the
+ * "priority" handling. If an ordering guarantee is needed, it can be added
+ * without much difficulty; it just makes the code a bit more complicated.
+ *
+ * If requests are enqueued while the processing of another request is
+ * happening, and the enqueuing operations complete while the request
+ * processing is still in progress, then the retry request(s) *will*
+ * get processed next. (This is used for testing.)
+ */
+
+/**
+ * Time constants, all in units of nanoseconds.
+ **/
+enum {
+  ONE_NANOSECOND    =    1,
+  ONE_MICROSECOND   = 1000 * ONE_NANOSECOND,
+  ONE_MILLISECOND   = 1000 * ONE_MICROSECOND,
+  ONE_SECOND        = 1000 * ONE_MILLISECOND,
+
+  /** The initial time to wait after waiting with no timeout */
+  DEFAULT_WAIT_TIME = 20 * ONE_MICROSECOND,
+
+  /** The minimum time to wait when waiting with a timeout */
+  MINIMUM_WAIT_TIME = DEFAULT_WAIT_TIME / 2,
+
+  /** The maximum time to wait when waiting with a timeout */
+  MAXIMUM_WAIT_TIME = ONE_MILLISECOND
+};
+
+/**
+ * Batch size tuning constants. These are compared to the number of requests
+ * that have been processed since the worker thread last woke up.
+ **/
+enum {
+  MINIMUM_BATCH = 32, // wait time increases if batches are smaller than this
+  MAXIMUM_BATCH = 64  // wait time decreases if batches are larger than this
+};
+
+struct requestQueue {
+  /* Wait queue for synchronizing producers and consumer */
+  struct wait_queue_head wqhead;
+  /* function to process 1 request */
+  RequestQueueProcessor *processOne;
+  /* new incoming requests */
+  FunnelQueue *mainQueue;
+  /* old requests to retry first */
+  FunnelQueue *retryQueue;
+  /* thread id of the worker thread */
+  Thread thread;
+  /* true if the worker was started */
+  bool started;
+  /* when true, requests can be enqueued */
+  bool alive;
+  /* A flag set when the worker is waiting without a timeout */
+  atomic_t dormant;
+};
+
+/*****************************************************************************/
+/**
+ * Poll the underlying lock-free queues for a request to process. Must only be
+ * called by the worker thread. 
+ * + * @param queue the RequestQueue being serviced + * + * @return a dequeued request, or NULL if no request was available + **/ +static INLINE Request *pollQueues(RequestQueue *queue) +{ + // The retry queue has higher priority. + FunnelQueueEntry *entry = funnelQueuePoll(queue->retryQueue); + if (entry != NULL) { + return container_of(entry, Request, requestQueueLink); + } + + // The main queue has lower priority. + entry = funnelQueuePoll(queue->mainQueue); + if (entry != NULL) { + return container_of(entry, Request, requestQueueLink); + } + + // No entry found. + return NULL; +} + +/*****************************************************************************/ +/** + * Check if the underlying lock-free queues appear not just not to have any + * requests available right now, but also not to be in the intermediate state + * of getting requests added. Must only be called by the worker thread. + * + * @param queue the RequestQueue being serviced + * + * @return true iff both funnel queues are idle + **/ +static INLINE bool areQueuesIdle(RequestQueue *queue) +{ + return (isFunnelQueueIdle(queue->retryQueue) && + isFunnelQueueIdle(queue->mainQueue)); +} + +/*****************************************************************************/ +/** + * Remove the next request to be processed from the queue. Must only be called + * by the worker thread. + * + * @param queue the queue from which to remove an entry + * @param requestPtr the next request is returned here, or a NULL pointer to + * indicate that there will be no more requests + * @param waitedPtr return a boolean to indicate that we need to wait + * + * @return True when there is a next request, or when we know that there will + * never be another request. False when we must wait for a request. + **/ +static INLINE bool dequeueRequest(RequestQueue *queue, + Request **requestPtr, + bool *waitedPtr) +{ + // Because of batching, we expect this to be the most common code path. + Request *request = pollQueues(queue); + if (request != NULL) { + // Return because we found a request + *requestPtr = request; + return true; + } + + if (!READ_ONCE(queue->alive)) { + // Return because we see that shutdown is happening + *requestPtr = NULL; + return true; + } + + // Return indicating that we need to wait. + *requestPtr = NULL; + *waitedPtr = true; + return false; +} + +/*****************************************************************************/ +static void requestQueueWorker(void *arg) +{ + RequestQueue *queue = (RequestQueue *) arg; + unsigned long timeBatch = DEFAULT_WAIT_TIME; + bool dormant = atomic_read(&queue->dormant); + long currentBatch = 0; + + for (;;) { + Request *request; + bool waited = false; + if (dormant) { + /* + * Sleep/wakeup protocol: + * + * The enqueue operation updates "newest" in the + * funnel queue via xchg which is a memory barrier, + * and later checks "dormant" to decide whether to do + * a wakeup of the worker thread. + * + * The worker thread, when deciding to go to sleep, + * sets "dormant" and then examines "newest" to decide + * if the funnel queue is idle. In dormant mode, the + * last examination of "newest" before going to sleep + * is done inside the wait_event_interruptible macro, + * after a point where (one or more) memory barriers + * have been issued. (Preparing to sleep uses spin + * locks.) Even if the "next" field update isn't + * visible yet to make the entry accessible, its + * existence will kick the worker thread out of + * dormant mode and back into timer-based mode. 
+ * + * So the two threads should agree on the ordering of + * the updating of the two fields. + */ + wait_event_interruptible(queue->wqhead, + dequeueRequest(queue, &request, &waited) || + !areQueuesIdle(queue)); + } else { + wait_event_interruptible_hrtimeout(queue->wqhead, + dequeueRequest(queue, &request, + &waited), + ns_to_ktime(timeBatch)); + } + + if (likely(request != NULL)) { + // We got a request. + currentBatch++; + queue->processOne(request); + } else if (!READ_ONCE(queue->alive)) { + // We got no request and we know we are shutting down. + break; + } + + if (dormant) { + // We've been roused from dormancy. Clear the flag so enqueuers can stop + // broadcasting (no fence needed for this transition). + atomic_set(&queue->dormant, false); + dormant = false; + // Reset the timeout back to the default since we don't know how long + // we've been asleep and we also want to be responsive to a new burst. + timeBatch = DEFAULT_WAIT_TIME; + } else if (waited) { + // We waited for this request to show up. Adjust the wait time if the + // last batch of requests was too small or too large.. + if (currentBatch < MINIMUM_BATCH) { + // Adjust the wait time if the last batch of requests was too small. + timeBatch += timeBatch / 4; + if (timeBatch >= MAXIMUM_WAIT_TIME) { + // The timeout is getting long enough that we need to switch into + // dormant mode. + atomic_set(&queue->dormant, true); + dormant = true; + } + } else if (currentBatch > MAXIMUM_BATCH) { + // Adjust the wait time if the last batch of requests was too large. + timeBatch -= timeBatch / 4; + if (timeBatch < MINIMUM_WAIT_TIME) { + // But if the producer is very fast or the scheduler doesn't wake up + // up promptly, waiting for very short times won't make the batches + // smaller. + timeBatch = MINIMUM_WAIT_TIME; + } + } + // And we must now start a new batch count + currentBatch = 0; + } + } + + /* + * Ensure that we see any requests that were guaranteed to have been fully + * enqueued before shutdown was flagged. The corresponding write barrier + * is in requestQueueFinish. + */ + smp_rmb(); + + // Process every request that is still in the queue, and never wait for any + // new requests to show up. 
+ for (;;) { + Request *request = pollQueues(queue); + if (request == NULL) { + break; + } + queue->processOne(request); + } +} + +/**********************************************************************/ +int makeRequestQueue(const char *queueName, + RequestQueueProcessor *processOne, + RequestQueue **queuePtr) +{ + RequestQueue *queue; + int result = ALLOCATE(1, RequestQueue, __func__, &queue); + if (result != UDS_SUCCESS) { + return result; + } + queue->processOne = processOne; + queue->alive = true; + atomic_set(&queue->dormant, false); + init_waitqueue_head(&queue->wqhead); + + result = makeFunnelQueue(&queue->mainQueue); + if (result != UDS_SUCCESS) { + requestQueueFinish(queue); + return result; + } + + result = makeFunnelQueue(&queue->retryQueue); + if (result != UDS_SUCCESS) { + requestQueueFinish(queue); + return result; + } + + result = createThread(requestQueueWorker, queue, queueName, &queue->thread); + if (result != UDS_SUCCESS) { + requestQueueFinish(queue); + return result; + } + + queue->started = true; + smp_mb(); + *queuePtr = queue; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static INLINE void wakeUpWorker(RequestQueue *queue) +{ + // This is the code sequence recommended in + smp_mb(); + if (waitqueue_active(&queue->wqhead)) { + wake_up(&queue->wqhead); + } +} + +/**********************************************************************/ +void requestQueueEnqueue(RequestQueue *queue, Request *request) +{ + bool unbatched = request->unbatched; + funnelQueuePut(request->requeued ? queue->retryQueue : queue->mainQueue, + &request->requestQueueLink); + + /* + * We must wake the worker thread when it is dormant (waiting with no + * timeout). An atomic load (read fence) isn't needed here since we know the + * queue operation acts as one. + */ + if (atomic_read(&queue->dormant) || unbatched) { + wakeUpWorker(queue); + } +} + +/**********************************************************************/ +void requestQueueFinish(RequestQueue *queue) +{ + if (queue == NULL) { + return; + } + + /* + * This memory barrier ensures that any requests we queued will be seen. The + * point is that when dequeueRequest sees the following update to the alive + * flag, it will also be able to see any change we made to a next field in + * the FunnelQueue entry. The corresponding read barrier is in + * requestQueueWorker. + */ + smp_wmb(); + + // Mark the queue as dead. + WRITE_ONCE(queue->alive, false); + + if (queue->started) { + // Wake the worker so it notices that it should exit. + wakeUpWorker(queue); + + // Wait for the worker thread to finish processing any additional pending + // work and exit. + int result = joinThreads(queue->thread); + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "Failed to join worker thread"); + } + } + + freeFunnelQueue(queue->mainQueue); + freeFunnelQueue(queue->retryQueue); + FREE(queue); +} diff --git a/uds/searchList.c b/uds/searchList.c new file mode 100644 index 0000000..ec2ef70 --- /dev/null +++ b/uds/searchList.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/searchList.c#2 $ + */ + +#include "searchList.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" + +/**********************************************************************/ +int makeSearchList(unsigned int capacity, + SearchList **listPtr) +{ + if (capacity == 0) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "search list must have entries"); + } + if (capacity > UINT8_MAX) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "search list capacity must fit in 8 bits"); + } + + // We need three temporary entry arrays for purgeSearchList(). Allocate them + // contiguously with the main array. + unsigned int bytes = (sizeof(SearchList) + (4 * capacity * sizeof(uint8_t))); + SearchList *list; + int result = allocateCacheAligned(bytes, "search list", &list); + if (result != UDS_SUCCESS) { + return result; + } + + list->capacity = capacity; + list->firstDeadEntry = 0; + + // Fill in the indexes of the chapter index cache entries. These will be + // only ever be permuted as the search list is used. + uint8_t i; + for (i = 0; i < capacity; i++) { + list->entries[i] = i; + } + + *listPtr = list; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeSearchList(SearchList **listPtr) +{ + FREE(*listPtr); + *listPtr = NULL; +} + +/**********************************************************************/ +void purgeSearchList(SearchList *searchList, + const CachedChapterIndex chapters[], + uint64_t oldestVirtualChapter) +{ + if (searchList->firstDeadEntry == 0) { + // There are no live entries in the list to purge. + return; + } + + /* + * Partition the previously-alive entries in the list into three temporary + * lists, keeping the current LRU search order within each list. The element + * array was allocated with enough space for all four lists. + */ + uint8_t *entries = &searchList->entries[0]; + uint8_t *alive = &entries[searchList->capacity]; + uint8_t *skipped = &alive[searchList->capacity]; + uint8_t *dead = &skipped[searchList->capacity]; + unsigned int nextAlive = 0; + unsigned int nextSkipped = 0; + unsigned int nextDead = 0; + + int i; + for (i = 0; i < searchList->firstDeadEntry; i++) { + uint8_t entry = entries[i]; + const CachedChapterIndex *chapter = &chapters[entry]; + if ((chapter->virtualChapter < oldestVirtualChapter) + || (chapter->virtualChapter == UINT64_MAX)) { + dead[nextDead++] = entry; + } else if (chapter->skipSearch) { + skipped[nextSkipped++] = entry; + } else { + alive[nextAlive++] = entry; + } + } + + // Copy the temporary lists back to the search list so we wind up with + // [ alive, alive, skippable, new-dead, new-dead, old-dead, old-dead ] + memcpy(entries, alive, nextAlive); + entries += nextAlive; + + memcpy(entries, skipped, nextSkipped); + entries += nextSkipped; + + memcpy(entries, dead, nextDead); + // The first dead entry is now the start of the copied dead list. 
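+  // For example (illustrative): purging [ 3 1 4 0 | 2 ], where "|" marks
+  // firstDeadEntry, with entry 1 expired and entry 4 marked skipSearch
+  // yields [ 3 0 4 | 1 2 ], and firstDeadEntry drops from 4 to 3.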
+ searchList->firstDeadEntry = (nextAlive + nextSkipped); +} diff --git a/uds/searchList.h b/uds/searchList.h new file mode 100644 index 0000000..25d99e9 --- /dev/null +++ b/uds/searchList.h @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/searchList.h#1 $ + */ + +#ifndef SEARCH_LIST_H +#define SEARCH_LIST_H + +#include "cachedChapterIndex.h" +#include "compiler.h" +#include "stringUtils.h" +#include "typeDefs.h" + +/** + * A SearchList represents the permutations of the sparse chapter index cache + * entry array. Those permutations express an ordering on the chapter indexes, + * from most recently accessed to least recently accessed, which is the order + * in which the indexes should be searched and the reverse order in which they + * should be evicted from the cache (LRU cache replacement policy). + * + * Cache entries that are dead (virtualChapter == UINT64_MAX) are kept as a + * suffix of the list, avoiding the need to even iterate over them to search, + * and ensuring that dead entries are replaced before any live entries are + * evicted. + * + * The search list is intended to be instantated for each zone thread, + * avoiding any need for synchronization. The structure is allocated on a + * cache boundary to avoid false sharing of memory cache lines between zone + * threads. + **/ +typedef struct searchList { + /** The number of cached chapter indexes and search list entries */ + uint8_t capacity; + + /** The index in the entries array of the first dead cache entry */ + uint8_t firstDeadEntry; + + /** The chapter array indexes representing the chapter search order */ + uint8_t entries[]; +} SearchList; + +/** + * SearchListIterator captures the fields needed to iterate over the live + * entries in a search list and return the CachedChapterIndex pointers that + * the search code actually wants to deal with. + **/ +typedef struct { + /** The search list defining the chapter search iteration order */ + SearchList *list; + + /** The index of the next entry to return from the search list */ + unsigned int nextEntry; + + /** The cached chapters that are referenced by the search list */ + CachedChapterIndex *chapters; +} SearchListIterator; + +/** + * Allocate and initialize a new chapter cache search list with the same + * capacity as the cache. The index of each entry in the cache will appear + * exactly once in the array. All the chapters in the cache are assumed to be + * initially dead, so firstDeadEntry will be zero and no chapters will be + * returned when the search list is iterated. 
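+ *
+ * Typical use (an illustrative sketch, with error handling elided):
+ *
+ *   SearchList *list;
+ *   int result = makeSearchList(capacity, &list);
+ *   // ... search via iterateSearchList()/getNextChapter(),
+ *   // reorder via rotateSearchList(), then eventually:
+ *   freeSearchList(&list);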
+ * + * @param [in] capacity the number of entries in the search list + * @param [out] listPtr a pointer in which to return the new search list + **/ +int makeSearchList(unsigned int capacity, + SearchList **listPtr) + __attribute__((warn_unused_result)); + +/** + * Free a search list and null out the reference to it. + * + * @param listPtr the reference to the search list to free + **/ +void freeSearchList(SearchList **listPtr); + +/** + * Copy the contents of one search list to another. + * + * @param source the list to copy + * @param target the list to replace + **/ +static INLINE void copySearchList(const SearchList *source, + SearchList *target) +{ + *target = *source; + memcpy(target->entries, source->entries, source->capacity); +} + +/** + * Prepare to iterate over the live cache entries a search list. + * + * @param list the list defining the live chapters and the search order + * @param chapters the chapter index entries to return from getNextChapter() + * + * @return an iterator positioned at the start of the search list + **/ +static INLINE SearchListIterator +iterateSearchList(SearchList *list, CachedChapterIndex chapters[]) +{ + SearchListIterator iterator = { + .list = list, + .nextEntry = 0, + .chapters = chapters, + }; + return iterator; +} + +/** + * Check if the search list iterator has another entry to return. + * + * @param iterator the search list iterator + * + * @return true if getNextChapter() may be called + **/ +static INLINE bool hasNextChapter(const SearchListIterator *iterator) +{ + return (iterator->nextEntry < iterator->list->firstDeadEntry); +} + +/** + * Return a pointer to the next live chapter in the search list iteration and + * advance the iterator. This must only be called when hasNextChapter() + * returns true. + * + * @param iterator the search list iterator + * + * @return a pointer to the next live chapter index in the search list order + **/ +static INLINE CachedChapterIndex *getNextChapter(SearchListIterator *iterator) +{ + return &iterator->chapters[iterator->list->entries[iterator->nextEntry++]]; +} + +/** + * Rotate the pointers in a prefix of a search list downwards by one item, + * pushing elements deeper into the list and moving a new chapter to the start + * of the search list. This is the "make most recent" operation on the search + * list. + * + * If the search list provided is [ 0 1 2 3 4 ] and the prefix + * length is 4, then 3 is being moved to the front. + * The search list after the call will be [ 3 0 1 2 4 ] and the + * function will return 3. + * + * @param searchList the chapter index search list to rotate + * @param prefixLength the length of the prefix of the list to rotate + * + * @return the array index of the chapter cache entry that is now at the front + * of the search list + **/ +static INLINE uint8_t rotateSearchList(SearchList *searchList, + uint8_t prefixLength) +{ + // Grab the value of the last entry in the list prefix. + uint8_t mostRecent = searchList->entries[prefixLength - 1]; + + if (prefixLength > 1) { + // Push the first N-1 entries down by one entry, overwriting the entry + // we just grabbed. + memmove(&searchList->entries[1], + &searchList->entries[0], + prefixLength - 1); + + // We now have a hole at the front of the list in which we can place the + // rotated entry. + searchList->entries[0] = mostRecent; + } + + // This function is also used to move a dead chapter to the front of the + // list, in which case the suffix of dead chapters was pushed down too. 
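+  // In that case the entry now at the front came from the dead suffix and
+  // each dead entry above it was pushed down one slot, so the dead region
+  // now begins one position later.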
+ if (searchList->firstDeadEntry < prefixLength) { + searchList->firstDeadEntry += 1; + } + + return mostRecent; +} + +/** + * Purge invalid cache entries, marking them as dead and moving them to the + * end of the search list, then push any chapters that have skipSearch set + * down so they follow all the remaining live, valid chapters in the search + * list. This effectively sorts the search list into three regions--active, + * skippable, and dead--while maintaining the LRU ordering that already + * existed (a stable sort). + * + * This operation must only be called during the critical section in + * updateSparseCache() since it effectively changes cache membership. + * + * @param searchList the chapter index search list to purge + * @param chapters the chapter index cache entries + * @param oldestVirtualChapter the oldest virtual chapter + **/ +void purgeSearchList(SearchList *searchList, + const CachedChapterIndex chapters[], + uint64_t oldestVirtualChapter); + +#endif /* SEARCH_LIST_H */ diff --git a/uds/sparseCache.c b/uds/sparseCache.c new file mode 100644 index 0000000..f816d12 --- /dev/null +++ b/uds/sparseCache.c @@ -0,0 +1,535 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/sparseCache.c#3 $ + */ + +/** + * The sparse chapter index cache is implemented as a simple array of cache + * entries. Since the cache is small (seven chapters by default), searching + * for a specific virtual chapter is implemented as a linear search. The cache + * replacement policy is least-recently-used (LRU). Again, size of the cache + * allows the LRU order to be maintained by shifting entries in an array list. + * + * The most important property of this cache is the absence of synchronization + * for read operations. Safe concurrent access to the cache by the zone + * threads is controlled by the triage queue and the barrier requests it + * issues to the zone queues. The set of cached chapters does not and must not + * change between the carefully coordinated calls to updateSparseCache() from + * the zone threads. + * + * The critical invariant for that coordination is the cache membership must + * not change between those updates; the calls to sparseCacheContains() from + * the zone threads must all receive the same results for any virtual chapter + * number. To ensure that critical invariant, state changes such as "that + * virtual chapter is no longer in the volume" and "skip searching that + * chapter because it has had too many cache misses" are represented + * separately from the cache membership information (the virtual chapter + * number). 
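+ * (In this implementation those states are, respectively, the comparison of
+ * the entry's virtual chapter number against the zone's oldest virtual
+ * chapter, and the per-entry skipSearch flag.)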
+ * + * As a result of this invariant, we have the guarantee that every zone thread + * will call updateSparseCache() once and exactly once to request a chapter + * that is not in the cache, and the serialization of the barrier requests + * from the triage queue ensures they will all request the same chapter + * number. This means the only synchronization we need can be provided by a + * pair of thread barriers used only in the updateSparseCache() call, + * providing a critical section where a single zone thread can drive the cache + * update while all the other zone threads are known to be blocked, waiting in + * the second barrier. Outside that critical section, all the zone threads + * implicitly hold a shared lock. Inside it, the "captain" (the thread that + * was uniquely flagged when passing through the first barrier) holds an + * exclusive lock. No other threads may access or modify the cache, except for + * accessing cache statistics and similar queries. + * + * Cache statistics must only be modified by a single thread, conventionally + * the zone zero thread. All fields that might be frequently updated by that + * thread are kept in separate cache-aligned structures so they will not cause + * cache contention via "false sharing" with the fields that are frequently + * accessed by all of the zone threads. + * + * LRU order is kept independently by each zone thread, and each zone uses its + * own list for searching and cache membership queries. The zone zero list is + * used to decide which chapter to evict when the cache is updated, and its + * search list is copied to the other threads at that time. + * + * The virtual chapter number field of the cache entry is the single field + * indicating whether a chapter is a member of the cache or not. The value + * UINT64_MAX is used to represent a null, undefined, or wildcard + * chapter number. When present in the virtual chapter number field + * CachedChapterIndex, it indicates that the cache entry is dead, and all + * the other fields of that entry (other than immutable pointers to cache + * memory) are undefined and irrelevant. Any cache entry that is not marked as + * dead is fully defined and a member of the cache--sparseCacheContains() + * must always return true for any virtual chapter number that appears in any + * of the cache entries. + * + * A chapter index that is a member of the cache may be marked for different + * treatment (disabling search) between calls to updateSparseCache() in two + * different ways. When a chapter falls off the end of the volume, its virtual + * chapter number will be less that the oldest virtual chapter number. Since + * that chapter is no longer part of the volume, there's no point in continuing + * to search that chapter index. Once invalidated, that virtual chapter will + * still be considered a member of the cache, but it will no longer be searched + * for matching chunk names. + * + * The second mechanism for disabling search is the heuristic based on keeping + * track of the number of consecutive search misses in a given chapter index. + * Once that count exceeds a threshold, the skipSearch flag will be set to + * true, causing the chapter to be skipped in the fallback search of the + * entire cache, but still allowing it to be found when searching for a hook + * in that specific chapter. Finding a hook will clear the skipSearch flag, + * once again allowing the non-hook searches to use the cache entry. 
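+ * (The consecutive-miss count is kept per chapter and is checked only by
+ * zone zero, so SKIP_SEARCH_THRESHOLD is divided by the zone count when the
+ * cache is initialized.)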
Again, + * regardless of the state of the skipSearch flag, the virtual chapter must + * still considered to be a member of the cache for sparseCacheContains(). + * + * Barrier requests and the sparse chapter index cache are also described in + * + * https://intranet.permabit.com/wiki/Chapter_Index_Cache_supports_concurrent_access + * + * and in a message to the albireo mailing list on 5/28/2011 titled "true + * barriers with a hook resolution queue". + **/ + +#include "sparseCache.h" + +#include "cachedChapterIndex.h" +#include "chapterIndex.h" +#include "common.h" +#include "index.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "searchList.h" +#include "threads.h" +#include "zone.h" + +enum { + /** The number of consecutive search misses that will disable searching */ + SKIP_SEARCH_THRESHOLD = 20000, + + /** a named constant to use when identifying zone zero */ + ZONE_ZERO = 0 +}; + +/** + * These counter values are essentially fields of the SparseCache, but are + * segregated into this structure because they are frequently modified. We + * group them and align them to keep them on different cache lines from the + * cache fields that are accessed far more often than they are updated. + **/ +typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) sparseCacheCounters { + /** the total number of virtual chapter probes that succeeded */ + uint64_t chapterHits; + + /** the total number of virtual chapter probes that failed */ + uint64_t chapterMisses; + + /** the total number of cache searches that found a possible match */ + uint64_t searchHits; + + /** the total number of cache searches that found no matches */ + uint64_t searchMisses; + + /** the number of cache entries that fell off the end of the volume */ + uint64_t invalidations; + + /** the number of cache entries that were evicted while still valid */ + uint64_t evictions; +} SparseCacheCounters; + +/** + * This is the private structure definition of a SparseCache. + **/ +struct sparseCache { + /** the number of cache entries, which is the size of the chapters array */ + unsigned int capacity; + + /** the number of zone threads using the cache */ + unsigned int zoneCount; + + /** the geometry governing the volume */ + const Geometry *geometry; + + /** the number of search misses in zone zero that will disable searching */ + unsigned int skipSearchThreshold; + + /** pointers to the cache-aligned chapter search order for each zone */ + SearchList *searchLists[MAX_ZONES]; + + /** the thread barriers used to synchronize the zone threads for update */ + Barrier beginCacheUpdate; + Barrier endCacheUpdate; + + /** frequently-updated counter fields (cache-aligned) */ + SparseCacheCounters counters; + + /** the counted array of chapter index cache entries (cache-aligned) */ + CachedChapterIndex chapters[]; +}; + +/** + * Initialize a sparse chapter index cache. 
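+ * The caller must have allocated the structure with room for "capacity"
+ * trailing CachedChapterIndex entries (as makeSparseCache does), and should
+ * free the cache with freeSparseCache() if this function fails.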
+ * + * @param cache the sparse cache to initialize + * @param geometry the geometry governing the volume + * @param capacity the number of chapters the cache will hold + * @param zoneCount the number of zone threads using the cache + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int initializeSparseCache(SparseCache *cache, + const Geometry *geometry, + unsigned int capacity, + unsigned int zoneCount) +{ + cache->geometry = geometry; + cache->capacity = capacity; + cache->zoneCount = zoneCount; + + // Scale down the skip threshold by the number of zones since we count the + // chapter search misses only in zone zero. + cache->skipSearchThreshold = (SKIP_SEARCH_THRESHOLD / zoneCount); + + int result = initializeBarrier(&cache->beginCacheUpdate, zoneCount); + if (result != UDS_SUCCESS) { + return result; + } + result = initializeBarrier(&cache->endCacheUpdate, zoneCount); + if (result != UDS_SUCCESS) { + return result; + } + unsigned int i; + for (i = 0; i < capacity; i++) { + result = initializeCachedChapterIndex(&cache->chapters[i], geometry); + if (result != UDS_SUCCESS) { + return result; + } + } + + // Allocate each zone's independent LRU order. + for (i = 0; i < zoneCount; i++) { + result = makeSearchList(capacity, &cache->searchLists[i]); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int makeSparseCache(const Geometry *geometry, + unsigned int capacity, + unsigned int zoneCount, + SparseCache **cachePtr) +{ + unsigned int bytes + = (sizeof(SparseCache) + (capacity * sizeof(CachedChapterIndex))); + + SparseCache *cache; + int result = allocateCacheAligned(bytes, "sparse cache", &cache); + if (result != UDS_SUCCESS) { + return result; + } + + result = initializeSparseCache(cache, geometry, capacity, zoneCount); + if (result != UDS_SUCCESS) { + freeSparseCache(cache); + return result; + } + + *cachePtr = cache; + return UDS_SUCCESS; +} + +/**********************************************************************/ +size_t getSparseCacheMemorySize(const SparseCache *cache) +{ + // Count the DeltaIndexPage as cache memory, but ignore all other overhead. + size_t pageSize = (sizeof(DeltaIndexPage) + cache->geometry->bytesPerPage); + size_t chapterSize = (pageSize * cache->geometry->indexPagesPerChapter); + return (cache->capacity * chapterSize); +} + +/** + * Update counters to reflect a chapter access hit and clear the skipSearch + * flag on the chapter, if set. + * + * @param cache the cache to update + * @param chapter the cache entry to update + **/ +static void scoreChapterHit(SparseCache *cache, + CachedChapterIndex *chapter) +{ + cache->counters.chapterHits += 1; + setSkipSearch(chapter, false); +} + +/** + * Update counters to reflect a chapter access miss. + * + * @param cache the cache to update + **/ +static void scoreChapterMiss(SparseCache *cache) +{ + cache->counters.chapterMisses += 1; +} + +/** + * Check if the cache entry that is about to be replaced is already dead, and + * if it's not, add to tally of evicted or invalidated cache entries. 
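+ * A live entry whose chapter has fallen off the end of the volume is counted
+ * as an invalidation; a live entry whose chapter is still in the volume is
+ * counted as an eviction.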
+ * + * @param zone the zone used to find the oldest chapter + * @param cache the cache to update + * @param chapter the cache entry about to be replaced + **/ +static void scoreEviction(IndexZone *zone, + SparseCache *cache, + CachedChapterIndex *chapter) +{ + if (chapter->virtualChapter == UINT64_MAX) { + return; + } + if (chapter->virtualChapter < zone->oldestVirtualChapter) { + cache->counters.invalidations += 1; + } else { + cache->counters.evictions += 1; + } +} + +/** + * Update counters to reflect a cache search hit. This bumps the hit + * count, clears the miss count, and clears the skipSearch flag. + * + * @param cache the cache to update + * @param chapter the cache entry to update + **/ +static void scoreSearchHit(SparseCache *cache, + CachedChapterIndex *chapter) +{ + cache->counters.searchHits += 1; + chapter->counters.searchHits += 1; + chapter->counters.consecutiveMisses = 0; + setSkipSearch(chapter, false); +} + +/** + * Update counters to reflect a cache search miss. This bumps the consecutive + * miss count, and if it goes over skipSearchThreshold, sets the skipSearch + * flag on the chapter. + * + * @param cache the cache to update + * @param chapter the cache entry to update + **/ +static void scoreSearchMiss(SparseCache *cache, + CachedChapterIndex *chapter) +{ + cache->counters.searchMisses += 1; + chapter->counters.searchMisses += 1; + chapter->counters.consecutiveMisses += 1; + if (chapter->counters.consecutiveMisses > cache->skipSearchThreshold) { + setSkipSearch(chapter, true); + } +} + +/**********************************************************************/ +void freeSparseCache(SparseCache *cache) +{ + if (cache == NULL) { + return; + } + + unsigned int i; + for (i = 0; i < cache->zoneCount; i++) { + freeSearchList(&cache->searchLists[i]); + } + + for (i = 0; i < cache->capacity; i++) { + CachedChapterIndex *chapter = &cache->chapters[i]; + destroyCachedChapterIndex(chapter); + } + + destroyBarrier(&cache->beginCacheUpdate); + destroyBarrier(&cache->endCacheUpdate); + FREE(cache); +} + + +/**********************************************************************/ +bool sparseCacheContains(SparseCache *cache, + uint64_t virtualChapter, + unsigned int zoneNumber) +{ + /* + * The correctness of the barriers depends on the invariant that between + * calls to updateSparseCache(), the answers this function returns must + * never vary--the result for a given chapter must be identical across + * zones. That invariant must be maintained even if the chapter falls off + * the end of the volume, or if searching it is disabled because of too many + * search misses. + */ + + // Get the chapter search order for this zone thread. + SearchListIterator iterator + = iterateSearchList(cache->searchLists[zoneNumber], cache->chapters); + while (hasNextChapter(&iterator)) { + CachedChapterIndex *chapter = getNextChapter(&iterator); + if (virtualChapter == chapter->virtualChapter) { + if (zoneNumber == ZONE_ZERO) { + scoreChapterHit(cache, chapter); + } + + // Move the chapter to the front of the search list. + rotateSearchList(iterator.list, iterator.nextEntry); + return true; + } + } + + // The specified virtual chapter isn't cached. 
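+  // Only zone zero updates the counters, keeping them single-writer.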
+ if (zoneNumber == ZONE_ZERO) { + scoreChapterMiss(cache); + } + return false; +} + +/**********************************************************************/ +int updateSparseCache(IndexZone *zone, uint64_t virtualChapter) +{ + const Index *index = zone->index; + SparseCache *cache = index->volume->sparseCache; + + // If the chapter is already in the cache, we don't need to do a thing + // except update the search list order, which this check does. + if (sparseCacheContains(cache, virtualChapter, zone->id)) { + return UDS_SUCCESS; + } + + // Wait for every zone thread to have reached its corresponding barrier + // request and invoked this function before starting to modify the cache. + enterBarrier(&cache->beginCacheUpdate, NULL); + + /* + * This is the start of the critical section: the zone zero thread is + * captain, effectively holding an exclusive lock on the sparse cache. All + * the other zone threads must do nothing between the two barriers. They + * will wait at the endCacheUpdate barrier for the captain to finish the + * update. + */ + + int result = UDS_SUCCESS; + if (zone->id == ZONE_ZERO) { + // Purge invalid chapters from the LRU search list. + SearchList *zoneZeroList = cache->searchLists[ZONE_ZERO]; + purgeSearchList(zoneZeroList, cache->chapters, zone->oldestVirtualChapter); + + // First check that the desired chapter is still in the volume. If it's + // not, the hook fell out of the index and there's nothing to do for it. + if (virtualChapter >= index->oldestVirtualChapter) { + // Evict the least recently used live chapter, or replace a dead cache + // entry, all by rotating the the last list entry to the front. + CachedChapterIndex *victim + = &cache->chapters[rotateSearchList(zoneZeroList, cache->capacity)]; + + // Check if the victim is already dead, and if it's not, add to the + // tally of evicted or invalidated cache entries. + scoreEviction(zone, cache, victim); + + // Read the index page bytes and initialize the page array. + result = cacheChapterIndex(victim, virtualChapter, index->volume); + } + + // Copy the new search list state to all the other zone threads so they'll + // get the result of pruning and see the new chapter. + unsigned int z; + for (z = 1; z < cache->zoneCount; z++) { + copySearchList(zoneZeroList, cache->searchLists[z]); + } + } + + // This is the end of the critical section. All cache invariants must have + // been restored--it will be shared/read-only again beyond the barrier. + + enterBarrier(&cache->endCacheUpdate, NULL); + return result; +} + + +/**********************************************************************/ +int searchSparseCache(IndexZone *zone, + const UdsChunkName *name, + uint64_t *virtualChapterPtr, + int *recordPagePtr) +{ + Volume *volume = zone->index->volume; + SparseCache *cache = volume->sparseCache; + unsigned int zoneNumber = zone->id; + // If the caller did not specify a virtual chapter, search the entire cache. + bool searchAll = (*virtualChapterPtr == UINT64_MAX); + unsigned int chaptersSearched = 0; + + // Get the chapter search order for this zone thread, searching the chapters + // from most recently hit to least recently hit. + SearchListIterator iterator + = iterateSearchList(cache->searchLists[zoneNumber], cache->chapters); + while (hasNextChapter(&iterator)) { + CachedChapterIndex *chapter = getNextChapter(&iterator); + + // Skip chapters no longer cached, or that have too many search misses. 
+ if (shouldSkipChapterIndex(zone, chapter, *virtualChapterPtr)) { + continue; + } + + int result = searchCachedChapterIndex(chapter, cache->geometry, + volume->indexPageMap, name, + recordPagePtr); + if (result != UDS_SUCCESS) { + return result; + } + chaptersSearched += 1; + + // Did we find an index entry for the name? + if (*recordPagePtr != NO_CHAPTER_INDEX_ENTRY) { + if (zoneNumber == ZONE_ZERO) { + scoreSearchHit(cache, chapter); + } + + // Move the chapter to the front of the search list. + rotateSearchList(iterator.list, iterator.nextEntry); + + // Return a matching entry as soon as it is found. It might be a false + // collision that has a true match in another chapter, but that's a very + // rare case and not worth the extra search cost or complexity. + *virtualChapterPtr = chapter->virtualChapter; + return UDS_SUCCESS; + } + + if (zoneNumber == ZONE_ZERO) { + scoreSearchMiss(cache, chapter); + } + + if (!searchAll) { + // We just searched the virtual chapter the caller specified and there + // was no match, so we're done. + break; + } + } + + // The name was not found in the cache. + *recordPagePtr = NO_CHAPTER_INDEX_ENTRY; + return UDS_SUCCESS; +} diff --git a/uds/sparseCache.h b/uds/sparseCache.h new file mode 100644 index 0000000..09c4a1c --- /dev/null +++ b/uds/sparseCache.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/sparseCache.h#1 $ + */ + +#ifndef SPARSE_CACHE_H +#define SPARSE_CACHE_H + +#include "cacheCounters.h" +#include "geometry.h" +#include "indexZone.h" +#include "typeDefs.h" + +/** + * SparseCache is the cache of entire chapter indexes from sparse chapters + * used for searching for chunks after all other search paths have failed. It + * contains only complete chapter indexes; record pages from sparse chapters + * and single index pages used for resolving hooks are kept in the volume page + * cache. + * + * Searching the cache is an unsynchronized operation. Changing the contents + * of the cache is a coordinated process requiring the coordinated + * participation of all zone threads via the careful use of barrier messages + * sent to all the index zones by the triage queue worker thread. + **/ +typedef struct sparseCache SparseCache; + +// Bare declaration to avoid include dependency loops. +struct index; + +/** + * Allocate and initialize a sparse chapter index cache. 
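+ *
+ * Typical use (an illustrative sketch, with error handling elided):
+ *
+ *   SparseCache *cache;
+ *   int result = makeSparseCache(geometry, capacity, zoneCount, &cache);
+ *   // ... sparseCacheContains()/updateSparseCache()/searchSparseCache() ...
+ *   freeSparseCache(cache);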
+ * + * @param [in] geometry the geometry governing the volume + * @param [in] capacity the number of chapters the cache will hold + * @param [in] zoneCount the number of zone threads using the cache + * @param [out] cachePtr a pointer in which to return the new cache + * + * @return UDS_SUCCESS or an error code + **/ +int makeSparseCache(const Geometry *geometry, + unsigned int capacity, + unsigned int zoneCount, + SparseCache **cachePtr) + __attribute__((warn_unused_result)); + +/** + * Destroy and free a sparse chapter index cache. + * + * @param cache the cache to free + **/ +void freeSparseCache(SparseCache *cache); + +/** + * Get the number of bytes of memory used by a sparse chapter cache. + * + * @param cache the cache to measure + **/ +size_t getSparseCacheMemorySize(const SparseCache *cache); + + +/** + * Check whether a sparse chapter index is present in the chapter cache. This + * is only intended for use by the zone threads. + * + * @param cache the cache to search for the virtual chapter + * @param virtualChapter the virtual chapter number of the chapter index + * @param zoneNumber the zone number of the calling thread + * + * @return true iff the sparse chapter index is cached + **/ +bool sparseCacheContains(SparseCache *cache, + uint64_t virtualChapter, + unsigned int zoneNumber); + +/** + * Update the sparse cache to contain a chapter index. + * + * This function must be called by all the zone threads with the same chapter + * numbers to correctly enter the thread barriers used to synchronize the + * cache updates. + * + * @param zone the index zone + * @param virtualChapter the virtual chapter number of the chapter index + * + * @return UDS_SUCCESS or an error code if the chapter index could not be + * read or decoded + **/ +int updateSparseCache(IndexZone *zone, uint64_t virtualChapter) + __attribute__((warn_unused_result)); + + +/** + * Search the cached sparse chapter indexes for a chunk name, returning a + * virtual chapter number and record page number that may contain the name. + * + * @param [in] zone the zone containing the volume, sparse + * chapter index cache and the index page + * number map + * @param [in] name the chunk name to search for + * @param [in,out] virtualChapterPtr If UINT64_MAX on input, + * search all cached chapters, else search + * the specified virtual chapter, if cached. + * On output, if a match was found, set to + * the virtual chapter number of the match, + * otherwise set to UINT64_MAX on a miss. + * @param [out] recordPagePtr the record page number of a match, else + * NO_CHAPTER_INDEX_ENTRY if nothing matched + * + * @return UDS_SUCCESS or an error code + **/ +int searchSparseCache(IndexZone *zone, + const UdsChunkName *name, + uint64_t *virtualChapterPtr, + int *recordPagePtr) + __attribute__((warn_unused_result)); + +#endif /* SPARSE_CACHE_H */ diff --git a/uds/stringLinuxKernel.c b/uds/stringLinuxKernel.c new file mode 100644 index 0000000..bf0a255 --- /dev/null +++ b/uds/stringLinuxKernel.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/stringLinuxKernel.c#1 $ + */ + +#include + +#include "errors.h" +#include "logger.h" +#include "stringUtils.h" + +/**********************************************************************/ +int stringToSignedLong(const char *nptr, long *num) +{ + while (*nptr == ' ') { + nptr++; + } + return kstrtol(nptr, 10, num) ? UDS_INVALID_ARGUMENT : UDS_SUCCESS; +} + +/**********************************************************************/ +int stringToUnsignedLong(const char *nptr, unsigned long *num) +{ + while (*nptr == ' ') { + nptr++; + } + if (*nptr == '+') { + nptr++; + } + return kstrtoul(nptr, 10, num) ? UDS_INVALID_ARGUMENT : UDS_SUCCESS; +} + +/*****************************************************************************/ +char *nextToken(char *str, const char *delims, char **state) +{ + char *sp = str ? str : *state; + while (*sp && strchr(delims, *sp)) { + ++sp; + } + if (!*sp) { + return NULL; + } + char *ep = sp; + while (*ep && !strchr(delims, *ep)) { + ++ep; + } + if (*ep) { + *ep++ = '\0'; + } + *state = ep; + return sp; +} + +/*****************************************************************************/ +int parseUint64(const char *str, uint64_t *num) +{ + unsigned long value = *num; + int result = stringToUnsignedLong(str, &value); + *num = value; + return result; +} diff --git a/uds/stringUtils.c b/uds/stringUtils.c new file mode 100644 index 0000000..93d7da1 --- /dev/null +++ b/uds/stringUtils.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/stringUtils.c#2 $ + */ + +#include "stringUtils.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "uds.h" + +/*****************************************************************************/ +int allocSprintf(const char *what, char **strp, const char *fmt, ...) +{ + if (strp == NULL) { + return UDS_INVALID_ARGUMENT; + } + va_list args; +#ifdef __KERNEL__ + // We want the memory allocation to use our own ALLOCATE/FREE wrappers. + va_start(args, fmt); + int count = vsnprintf(NULL, 0, fmt, args) + 1; + va_end(args); + int result = ALLOCATE(count, char, what, strp); + if (result == UDS_SUCCESS) { + va_start(args, fmt); + vsnprintf(*strp, count, fmt, args); + va_end(args); + } +#else + va_start(args, fmt); + int result = vasprintf(strp, fmt, args) == -1 ? 
ENOMEM : UDS_SUCCESS; + va_end(args); +#endif + if ((result != UDS_SUCCESS) && (what != NULL)) { + logError("cannot allocate %s", what); + } + return result; +} + +/*****************************************************************************/ +int wrapVsnprintf(const char *what, char *buf, size_t bufSize, + int error, const char *fmt, va_list ap, size_t *needed) +{ + if (buf == NULL) { + static char nobuf[1]; + buf = nobuf; + bufSize = 0; + } + int n = vsnprintf(buf, bufSize, fmt, ap); + if (n < 0) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "%s: vsnprintf failed", what); + } + if (needed) { + *needed = n; + } + if (((size_t) n >= bufSize) && (buf != NULL) && (error != UDS_SUCCESS)) { + return logErrorWithStringError(error, "%s: string too long", what); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int fixedSprintf(const char *what, + char *buf, + size_t bufSize, + int error, + const char *fmt, + ...) +{ + if (buf == NULL) { + return UDS_INVALID_ARGUMENT; + } + va_list args; + va_start(args, fmt); + int result = wrapVsnprintf(what, buf, bufSize, error, fmt, args, NULL); + va_end(args); + return result; +} + +/*****************************************************************************/ +char *vAppendToBuffer(char *buffer, + char *bufEnd, + const char *fmt, + va_list args) +{ + size_t n = vsnprintf(buffer, bufEnd - buffer, fmt, args); + if (n >= (size_t) (bufEnd - buffer)) { + buffer = bufEnd; + } else { + buffer += n; + } + return buffer; +} + +/*****************************************************************************/ +char *appendToBuffer(char *buffer, char *bufEnd, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + char *pos = vAppendToBuffer(buffer, bufEnd, fmt, ap); + va_end(ap); + return pos; +} + +/*****************************************************************************/ +int stringToSignedInt(const char *nptr, int *num) +{ + long value; + int result = stringToSignedLong(nptr, &value); + if (result != UDS_SUCCESS) { + return result; + } + if ((value < INT_MIN) || (value > INT_MAX)) { + return ERANGE; + } + *num = (int) value; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int stringToUnsignedInt(const char *nptr, unsigned int *num) +{ + unsigned long value; + int result = stringToUnsignedLong(nptr, &value); + if (result != UDS_SUCCESS) { + return result; + } + if (value > UINT_MAX) { + return ERANGE; + } + *num = (unsigned int) value; + return UDS_SUCCESS; +} diff --git a/uds/stringUtils.h b/uds/stringUtils.h new file mode 100644 index 0000000..bd685bb --- /dev/null +++ b/uds/stringUtils.h @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/stringUtils.h#2 $ + */ + +#ifndef STRING_UTILS_H +#define STRING_UTILS_H + +#include +#ifdef __KERNEL__ +#include +#include +#else +#include // for vsnprintf +#include // for strtol +#include +#include +#endif + +#include "compiler.h" +#include "typeDefs.h" + +/** + * Convert a boolean value to its corresponding "true" or "false" string. + * + * @param value The boolean value to convert + * + * @return "true" if value is true, "false" otherwise. + **/ +static INLINE const char *boolToString(bool value) +{ + return (value ? "true" : "false"); +} + +/** + * Allocate a string built according to format (our version of asprintf). + * + * @param [in] what A description of what is being allocated, for error + * logging; if NULL doesn't log anything. + * @param [out] strp The pointer in which to store the allocated string. + * @param [in] fmt The sprintf format parameter. + * + * @return UDS_SUCCESS, or the appropriately translated asprintf error + **/ +int allocSprintf(const char *what, char **strp, const char *fmt, ...) + __attribute__((format(printf, 3, 4), warn_unused_result)); + +/** + * Write a printf-style string into a fixed-size buffer, returning + * errors if it would not fit. (our version of snprintf) + * + * @param [in] what A description of what is being written, for error + * logging; if NULL doesn't log anything. + * @param [out] buf The target buffer + * @param [in] bufSize The size of buf + * @param [in] error Error code to return on overflow + * @param [in] fmt The sprintf format parameter. + * @return UDS_SUCCESS or error + **/ +int fixedSprintf(const char *what, char *buf, size_t bufSize, + int error, const char *fmt, ...) + __attribute__((format(printf, 5, 6), warn_unused_result)); + +/** + * Write printf-style string into an existing buffer, returning a specified + * error code if it would not fit, and setting ``needed`` to the amount of + * space that would be required. + * + * @param [in] what A description of what is being written, for logging. + * @param [in] buf The buffer in which to write the string, or NULL to + * merely determine the required space. + * @param [in] bufSize The size of buf. + * @param [in] error The error code to return for exceeding the specified + * space, UDS_SUCCESS if no logging required. + * @param [in] fmt The sprintf format specification. + * @param [in] ap The variable argument pointer (see ). + * @param [out] needed If non-NULL, the actual amount of string space + * required, which may be smaller or larger than bufSize. + * + * @return UDS_SUCCESS if the string fits, the value of the error parameter if + * the string does not fit and a buffer was supplied, or + * UDS_UNEXPECTED_RESULT if vsnprintf fails in some other undocumented + * way. + **/ +int wrapVsnprintf(const char *what, char *buf, size_t bufSize, + int error, const char *fmt, va_list ap, size_t *needed) + __attribute__((format(printf, 5, 0), warn_unused_result)); + +/** + * Helper to append a string to a buffer. + * + * @param buffer the place at which to append the string + * @param bufEnd pointer to the end of the buffer + * @param fmt a printf format string + * + * @return the updated buffer position after the append + * + * if insufficient space is available, the contents are silently truncated + **/ +char *appendToBuffer(char *buffer, char *bufEnd, const char *fmt, ...) + __attribute__((format(printf, 3, 4))); + +/** + * Variable-arglist helper to append a string to a buffer. 
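+ * The fixed-arglist form, appendToBuffer(), is usually more convenient; an
+ * illustrative sketch (the buffer and counter variables are hypothetical):
+ *
+ *   char buf[64];
+ *   char *end = buf + sizeof(buf);
+ *   char *pos = buf;
+ *   pos = appendToBuffer(pos, end, "hits=%u ", hits);
+ *   pos = appendToBuffer(pos, end, "misses=%u", misses);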
+ * + * @param buffer the place at which to append the string + * @param bufEnd pointer to the end of the buffer + * @param fmt a printf format string + * @param args printf arguments + * + * @return the updated buffer position after the append + * + * if insufficient space is available, the contents are silently truncated + **/ +char *vAppendToBuffer(char *buffer, + char *bufEnd, + const char *fmt, + va_list args) + __attribute__((format(printf, 3, 0))); + +/** + * Our version of strtok_r, since some platforma apparently don't define it. + * + * @param str On first call, the string to tokenize. On subsequent + * calls, NULL. + * @param delims The set of delimiter characters. + * @param statePtr The address of a variable which holds the state of + * the tokenization between calls to nextToken. + * + * @return the next token if any, or NULL + **/ +char *nextToken(char *str, const char *delims, char **statePtr); + +/** + * Parse a string representing a decimal uint64_t. + * + * @param str The string. + * @param num Where to put the number. + * + * @return UDS_SUCCESS or the error UDS_INVALID_ARGUMENT if the string + * is not in the correct format. + **/ +int parseUint64(const char *str, uint64_t *num) + __attribute__((warn_unused_result)); + +/** + * Attempt to convert a string to an integer (base 10) + * + * @param nptr Pointer to string to convert + * @param num The resulting integer + * + * @return UDS_SUCCESS or an error code + **/ +int stringToSignedInt(const char *nptr, int *num) + __attribute__((warn_unused_result)); + +/** + * Attempt to convert a string to a long integer (base 10) + * + * @param nptr Pointer to string to convert + * @param num The resulting long integer + * + * @return UDS_SUCCESS or an error code + **/ +int stringToSignedLong(const char *nptr, long *num) + __attribute__((warn_unused_result)); + +/** + * Attempt to convert a string to an unsigned integer (base 10). + * + * @param nptr Pointer to string to convert + * @param num The resulting unsigned integer + * + * @return UDS_SUCCESS or an error code + **/ +int stringToUnsignedInt(const char *nptr, unsigned int *num) + __attribute__((warn_unused_result)); + +/** + * Attempt to convert a string to an unsigned long integer (base 10). + * + * @param nptr Pointer to string to convert + * @param num The resulting long unsigned integer + * + * @return UDS_SUCCESS or an error code + **/ +int stringToUnsignedLong(const char *nptr, unsigned long *num) + __attribute__((warn_unused_result)); + +#endif /* STRING_UTILS_H */ diff --git a/uds/sysfs.c b/uds/sysfs.c new file mode 100644 index 0000000..b2009d7 --- /dev/null +++ b/uds/sysfs.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/sysfs.c#4 $ + */ + +#include "sysfs.h" + +#include +#include +#include + +#include "logger.h" +#include "memoryAlloc.h" +#include "stringUtils.h" +#include "uds.h" + +static struct { + struct kobject kobj; // /sys/uds + struct kobject parameterKobj; // /sys/uds/parameter + // These flags are used to ensure a clean shutdown + bool flag; // /sys/uds + bool parameterFlag; // /sys/uds/parameter +} objectRoot; + +/**********************************************************************/ +static char *bufferToString(const char *buf, size_t length) +{ + char *string; + if (ALLOCATE(length + 1, char, __func__, &string) != UDS_SUCCESS) { + return NULL; + } + memcpy(string, buf, length); + string[length] = '\0'; + if (string[length - 1] == '\n') { + string[length - 1] = '\0'; + } + return string; +} + +/**********************************************************************/ +// This is the code for a directory in the /sys/ tree that +// contains no regular files (only subdirectories). +/**********************************************************************/ + +/**********************************************************************/ +static void emptyRelease(struct kobject *kobj) +{ + // Many of our sysfs share this release function that does nothing. +} + +/**********************************************************************/ +static ssize_t emptyShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return 0; +} + +/**********************************************************************/ +static ssize_t emptyStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + return length; +} + +static struct sysfs_ops emptyOps = { + .show = emptyShow, + .store = emptyStore, +}; + +static struct attribute *emptyAttrs[] = { + NULL, +}; + +static struct kobj_type emptyObjectType = { + .release = emptyRelease, + .sysfs_ops = &emptyOps, + .default_attrs = emptyAttrs, +}; + + +/**********************************************************************/ +// This is the the code for the /sys//parameter directory. +// +//

/log_level UDS_LOG_LEVEL +// +/**********************************************************************/ + +typedef struct { + struct attribute attr; + const char *(*showString)(void); + void (*storeString)(const char *); +} ParameterAttribute; + +/**********************************************************************/ +static ssize_t parameterShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + ParameterAttribute *pa = container_of(attr, ParameterAttribute, attr); + if (pa->showString != NULL) { + return sprintf(buf, "%s\n", pa->showString()); + } else { + return -EINVAL; + } +} + +/**********************************************************************/ +static ssize_t parameterStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + ParameterAttribute *pa = container_of(attr, ParameterAttribute, attr); + char *string = bufferToString(buf, length); + if (string == NULL) { + return -ENOMEM; + } + int result = UDS_SUCCESS; + if (pa->storeString != NULL) { + pa->storeString(string); + } else { + return -EINVAL; + } + FREE(string); + return result == UDS_SUCCESS ? length : result; +} + +/**********************************************************************/ + +static const char *parameterShowLogLevel(void) +{ + return priorityToString(getLogLevel()); +} + +/**********************************************************************/ + +static void parameterStoreLogLevel(const char *string) +{ + setLogLevel(stringToPriority(string)); +} + +/**********************************************************************/ + +static ParameterAttribute logLevelAttr = { + .attr = { .name = "log_level", .mode = 0600 }, + .showString = parameterShowLogLevel, + .storeString = parameterStoreLogLevel, +}; + +static struct attribute *parameterAttrs[] = { + &logLevelAttr.attr, + NULL, +}; + +static struct sysfs_ops parameterOps = { + .show = parameterShow, + .store = parameterStore, +}; + +static struct kobj_type parameterObjectType = { + .release = emptyRelease, + .sysfs_ops = ¶meterOps, + .default_attrs = parameterAttrs, +}; + +/**********************************************************************/ +int initSysfs(void) +{ + memset(&objectRoot, 0, sizeof(objectRoot)); + kobject_init(&objectRoot.kobj, &emptyObjectType); + int result = kobject_add(&objectRoot.kobj, NULL, THIS_MODULE->name); + if (result == 0) { + objectRoot.flag = true; + kobject_init(&objectRoot.parameterKobj, ¶meterObjectType); + result = kobject_add(&objectRoot.parameterKobj, &objectRoot.kobj, + "parameter"); + if (result == 0) { + objectRoot.parameterFlag = true; + } + } + if (result != 0) { + putSysfs(); + } + return result; +} + +/**********************************************************************/ +void putSysfs() +{ + if (objectRoot.parameterFlag) { + kobject_put(&objectRoot.parameterKobj); + } + if (objectRoot.flag) { + kobject_put(&objectRoot.kobj); + } +} diff --git a/uds/sysfs.h b/uds/sysfs.h new file mode 100644 index 0000000..d5f9ccf --- /dev/null +++ b/uds/sysfs.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/sysfs.h#1 $ + */ + +#ifndef SYSFS_H +#define SYSFS_H + +/** + * Called when the module is loaded to initialize the /sys/\ + * tree. + * + * @return 0 on success, or non-zero on error + **/ +int initSysfs(void); + +/** + * Called when the module is being unloaded to terminate the + * /sys/\ tree. + **/ +void putSysfs(void); + +#endif /* SYSFS_H */ diff --git a/uds/threadCondVarLinuxKernel.c b/uds/threadCondVarLinuxKernel.c new file mode 100644 index 0000000..e3c1517 --- /dev/null +++ b/uds/threadCondVarLinuxKernel.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/threadCondVarLinuxKernel.c#2 $ + */ + +#include "threads.h" +#include "timeUtils.h" +#include "uds-error.h" + +/**********************************************************************/ +int initCond(CondVar *cv) +{ + cv->eventCount = NULL; + return makeEventCount(&cv->eventCount); +} + +/**********************************************************************/ +int signalCond(CondVar *cv) +{ + eventCountBroadcast(cv->eventCount); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int broadcastCond(CondVar *cv) +{ + eventCountBroadcast(cv->eventCount); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int waitCond(CondVar *cv, Mutex *mutex) +{ + EventToken token = eventCountPrepare(cv->eventCount); + unlockMutex(mutex); + eventCountWait(cv->eventCount, token, NULL); + lockMutex(mutex); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int timedWaitCond(CondVar *cv, Mutex *mutex, RelTime timeout) +{ + EventToken token = eventCountPrepare(cv->eventCount); + unlockMutex(mutex); + bool happened = eventCountWait(cv->eventCount, token, &timeout); + lockMutex(mutex); + return happened ? UDS_SUCCESS : ETIMEDOUT; +} + +/**********************************************************************/ +int destroyCond(CondVar *cv) +{ + freeEventCount(cv->eventCount); + cv->eventCount = NULL; + return UDS_SUCCESS; +} diff --git a/uds/threadOnce.c b/uds/threadOnce.c new file mode 100644 index 0000000..62149ca --- /dev/null +++ b/uds/threadOnce.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/threadOnce.c#1 $ + */ + +#include "errors.h" +#include "threads.h" + +enum { + ONCE_NOT_DONE = 0, + ONCE_IN_PROGRESS = 1, + ONCE_COMPLETE = 2, +}; + +/*****************************************************************************/ +int performOnce(OnceState *once, void (*function)(void)) +{ + for (;;) { + switch (atomic_cmpxchg(once, ONCE_NOT_DONE, ONCE_IN_PROGRESS)) { + case ONCE_NOT_DONE: + function(); + atomic_set_release(once, ONCE_COMPLETE); + return UDS_SUCCESS; + case ONCE_IN_PROGRESS: + yieldScheduler(); + break; + case ONCE_COMPLETE: + return UDS_SUCCESS; + default: + return UDS_BAD_STATE; + } + } +} diff --git a/uds/threadOnce.h b/uds/threadOnce.h new file mode 100644 index 0000000..58b6da3 --- /dev/null +++ b/uds/threadOnce.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/threadOnce.h#1 $ + */ + +#ifndef THREAD_ONCE_H +#define THREAD_ONCE_H + +#include "atomicDefs.h" + +#define ONCE_STATE_INITIALIZER ATOMIC_INIT(0) + +typedef atomic_t OnceState; + +/** + * Thread safe once only initialization. + * + * @param onceState pointer to object to record that initialization + * has been performed + * @param initFunction called if onceState does not indicate + * initialization has been performed + * + * @return UDS_SUCCESS or error code + * + * @note Generally the following declaration of onceState is performed in + * at file scope: + * + * static OnceState onceState = ONCE_STATE_INITIALIZER; + **/ +int performOnce(OnceState *onceState, void (*initFunction) (void)); + +#endif /* THREAD_ONCE_H */ diff --git a/uds/threadRegistry.c b/uds/threadRegistry.c new file mode 100644 index 0000000..c37e77a --- /dev/null +++ b/uds/threadRegistry.c @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/threadRegistry.c#1 $ + */ + +#include "threadRegistry.h" + +#include +#include + +#include "permassert.h" + +/* + * We need to be careful when using other facilities that may use + * threadRegistry functions in their normal operation. For example, + * we do not want to invoke the logger while holding a lock. + */ + +/*****************************************************************************/ +void registerThread(ThreadRegistry *registry, + RegisteredThread *newThread, + const void *pointer) +{ + INIT_LIST_HEAD(&newThread->links); + newThread->pointer = pointer; + newThread->task = current; + + bool foundIt = false; + RegisteredThread *thread; + write_lock(®istry->lock); + list_for_each_entry(thread, ®istry->links, links) { + if (thread->task == current) { + // This should not have been there. + // We'll complain after releasing the lock. + list_del_init(&thread->links); + foundIt = true; + break; + } + } + list_add_tail(&newThread->links, ®istry->links); + write_unlock(®istry->lock); + ASSERT_LOG_ONLY(!foundIt, "new thread not already in registry"); +} + +/*****************************************************************************/ +void unregisterThread(ThreadRegistry *registry) +{ + bool foundIt = false; + RegisteredThread *thread; + write_lock(®istry->lock); + list_for_each_entry(thread, ®istry->links, links) { + if (thread->task == current) { + list_del_init(&thread->links); + foundIt = true; + break; + } + } + write_unlock(®istry->lock); + ASSERT_LOG_ONLY(foundIt, "thread found in registry"); +} + +/*****************************************************************************/ +void initializeThreadRegistry(ThreadRegistry *registry) +{ + INIT_LIST_HEAD(®istry->links); + rwlock_init(®istry->lock); +} + +/*****************************************************************************/ +const void *lookupThread(ThreadRegistry *registry) +{ + const void *result = NULL; + read_lock(®istry->lock); + RegisteredThread *thread; + list_for_each_entry(thread, ®istry->links, links) { + if (thread->task == current) { + result = thread->pointer; + break; + } + } + read_unlock(®istry->lock); + return result; +} diff --git a/uds/threadRegistry.h b/uds/threadRegistry.h new file mode 100644 index 0000000..ec1832d --- /dev/null +++ b/uds/threadRegistry.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/threadRegistry.h#1 $ + */ + +#ifndef THREAD_REGISTRY_H +#define THREAD_REGISTRY_H 1 + +#include +#include + +/* + * We don't expect this set to ever get really large, so a linked list + * is adequate. + */ + +typedef struct threadRegistry { + struct list_head links; + rwlock_t lock; +} ThreadRegistry; + +typedef struct registeredThread { + struct list_head links; + const void *pointer; + struct task_struct *task; +} RegisteredThread; + +/*****************************************************************************/ + +/** + * Initialize a registry of threads and associated data pointers. + * + * @param registry The registry to initialize + **/ +void initializeThreadRegistry(ThreadRegistry *registry); + +/** + * Register the current thread and associate it with a data pointer. + * + * This call will log messages if the thread is already registered. + * + * @param registry The thread registry + * @param newThread RegisteredThread structure to use for the current thread + * @param pointer The value to associated with the current thread + **/ +void registerThread(ThreadRegistry *registry, + RegisteredThread *newThread, + const void *pointer); + +/** + * Remove the registration for the current thread. + * + * A message may be logged if the thread was not registered. + * + * @param registry The thread registry + **/ +void unregisterThread(ThreadRegistry *registry); + +/** + * Fetch a pointer that may have been registered for the current + * thread. If the thread is not registered, a null pointer is + * returned. + * + * @param registry The thread registry + * + * @return the registered pointer, if any, or NULL + **/ +const void *lookupThread(ThreadRegistry *registry); + +#endif /* THREAD_REGISTRY_H */ diff --git a/uds/threads.h b/uds/threads.h new file mode 100644 index 0000000..793355c --- /dev/null +++ b/uds/threads.h @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/threads.h#4 $ + */ + +#ifndef THREADS_H +#define THREADS_H + +#include "compiler.h" +#include "threadOnce.h" +#include "timeUtils.h" +#include "uds-error.h" + +#ifdef __KERNEL__ +#include +#include +#include +#include "util/eventCount.h" +#else +#include +#include +#include +#endif + +#ifdef __KERNEL__ +typedef struct { EventCount *eventCount; } CondVar; +typedef struct mutex Mutex; +typedef struct semaphore Semaphore; +typedef struct kernelThread *Thread; +typedef pid_t ThreadId; + +typedef struct { + Semaphore mutex; // Mutex for this barrier object + Semaphore wait; // Semaphore for threads waiting at the barrier + int arrived; // Number of threads which have arrived + int threadCount; // Total number of threads using this barrier +} Barrier; +#else +typedef pthread_barrier_t Barrier; +typedef pthread_cond_t CondVar; +typedef pthread_mutex_t Mutex; +typedef sem_t Semaphore; +typedef pthread_t Thread; +typedef pid_t ThreadId; + +#ifndef NDEBUG +#define MUTEX_INITIALIZER PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP +#else +#define MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER +#endif + +extern const bool DO_ASSERTIONS; +#endif + +#ifdef __KERNEL__ +/** + * Apply a function to every thread that we have created. + * + * @param applyFunc The function to apply + * @param argument The first argument to applyFunc + * + **/ +void applyToThreads(void applyFunc(void *, struct task_struct *), + void *argument); +#endif + +/** + * Create a thread, logging any cause of failure. + * + * @param threadFunc function to run in new thread + * @param threadData private data for new thread + * @param name name of the new thread + * @param newThread where to store the new thread id + * + * @return success or failure indication + **/ +int createThread(void (*threadFunc)(void *), + void *threadData, + const char *name, + Thread *newThread) + __attribute__((warn_unused_result)); + +/** + * Retrieve the current numbers of cores. + * + * This is either the total number or the number of cores that this + * process has been limited to. + * + * @return number of cores + **/ +unsigned int getNumCores(void); + +/** + * Return the id of the current thread. + * + * @return the thread id + **/ +ThreadId getThreadId(void) __attribute__((warn_unused_result)); + +#ifndef __KERNEL__ +/** + * Get the name of the current thread. + * + * @param name a buffer of size at least 16 to write the name to + **/ +void getThreadName(char *name); +#endif + +/** + * Wait for termination of another thread. + * + * + * @param th The thread for which to wait. + * + * @return UDS_SUCCESS or error code + **/ +int joinThreads(Thread th); + +#ifdef __KERNEL__ +/** + * Exit the current thread. This is a kernel-only function that is intended to + * be an alternative to using BUG() or BUG_ON(). + **/ +__attribute__((noreturn)) +void exitThread(void); +#endif + +/** + * Initialize a thread synchronization barrier (also known as a rendezvous). + * + * @param barrier the barrier to initialize + * @param threadCount the number of threads that must enter the barrier before + * any threads are permitted to leave it + * + * @return UDS_SUCCESS or an error code + **/ +int initializeBarrier(Barrier *barrier, unsigned int threadCount) + __attribute__((warn_unused_result)); + +/** + * Destroy a thread synchronization barrier. 
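+ *
+ * As an illustrative sketch only (the worker function runPhase() is
+ * hypothetical and error handling is omitted), the barrier calls are
+ * typically paired like this:
+ *
+ *   Barrier barrier;
+ *   initializeBarrier(&barrier, threadCount);
+ *   // ... each of the threadCount worker threads then does:
+ *   runPhase();
+ *   enterBarrier(&barrier, NULL);  // block until every thread has arrived
+ *   // ... and once no thread can touch the barrier again:
+ *   destroyBarrier(&barrier);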
+ * + * @param barrier the barrier to destroy + * + * @return UDS_SUCCESS or an error code + **/ +int destroyBarrier(Barrier *barrier); + +/** + * Enter a thread synchronization barrier, waiting for the configured number + * of threads to have entered before exiting the barrier. Exactly one thread + * will be arbitrarily selected to be flagged as the "winner" of a barrier. + * + * @param barrier the barrier to enter + * @param winner if non-NULL, a pointer to the flag indicating whether the + * calling thread was the unique winner + * + * @return UDS_SUCCESS or an error code + **/ +int enterBarrier(Barrier *barrier, bool *winner); + +/** + * Initialize a condition variable with default attributes. + * + * @param cond condition variable to init + * + * @return UDS_SUCCESS or error code + **/ +int initCond(CondVar *cond) __attribute__((warn_unused_result)); + +/** + * Signal a condition variable. + * + * @param cond condition variable to signal + * + * @return UDS_SUCCESS or error code + **/ +int signalCond(CondVar *cond); + +/** + * Broadcast a condition variable. + * + * @param cond condition variable to broadcast + * + * @return UDS_SUCCESS or error code + **/ +int broadcastCond(CondVar *cond); + +/** + * Wait on a condition variable. + * + * @param cond condition variable to wait on + * @param mutex mutex to release while waiting + * + * @return UDS_SUCCESS or error code + **/ +int waitCond(CondVar *cond, Mutex *mutex); + +/** + * Wait on a condition variable with a timeout. + * + * @param cond condition variable to wait on + * @param mutex mutex to release while waiting + * @param timeout the relative time until the timeout expires + * + * @return error code (ETIMEDOUT if the deadline is hit) + **/ +int timedWaitCond(CondVar *cond, Mutex *mutex, RelTime timeout); + +/** + * Destroy a condition variable. + * + * @param cond condition variable to destroy + * + * @return UDS_SUCCESS or error code + **/ +int destroyCond(CondVar *cond); + +#ifndef __KERNEL__ +/** + * Initialize a mutex, optionally asserting if the mutex initialization fails. + * This function should only be called directly in places where making + * assertions is not safe. + * + * @param mutex the mutex to initialize + * @param assertOnError if true, an error initializing the + * mutex will make an assertion + * + * @return UDS_SUCCESS or an error code + **/ +int initializeMutex(Mutex *mutex, bool assertOnError); +#endif + +/** + * Initialize the default type (error-checking during development) mutex. + * + * @param mutex the mutex to initialize + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +#ifdef __KERNEL__ +static INLINE int initMutex(Mutex *mutex) +{ + mutex_init(mutex); + return UDS_SUCCESS; +} +#else +int initMutex(Mutex *mutex); +#endif + +/** + * Destroy a mutex (with error checking during development). + * + * @param mutex mutex to destroy + * + * @return UDS_SUCCESS or error code + **/ +#ifdef __KERNEL__ +static INLINE int destroyMutex(Mutex *mutex) +{ + return UDS_SUCCESS; +} +#else +int destroyMutex(Mutex *mutex); +#endif + +/** + * Lock a mutex, with optional error checking during development. + * + * @param mutex mutex to lock + **/ +#ifdef __KERNEL__ +static INLINE void lockMutex(Mutex *mutex) +{ + mutex_lock(mutex); +} +#else +void lockMutex(Mutex *mutex); +#endif + +/** + * Unlock a mutex, with optional error checking during development. 
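+ *
+ * As an illustrative sketch only (stateReady and useSharedState() are
+ * hypothetical), a mutex normally brackets a critical section, often
+ * together with a condition variable:
+ *
+ *   lockMutex(&mutex);
+ *   while (!stateReady) {
+ *     waitCond(&cond, &mutex);  // releases the mutex while waiting,
+ *                               // reacquires it before returning
+ *   }
+ *   useSharedState();
+ *   unlockMutex(&mutex);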
+ * + * @param mutex mutex to unlock + **/ +#ifdef __KERNEL__ +static INLINE void unlockMutex(Mutex *mutex) +{ + mutex_unlock(mutex); +} +#else +void unlockMutex(Mutex *mutex); +#endif + +/** + * Initialize a semaphore used among threads in the same process. + * + * @param semaphore the semaphore to initialize + * @param value the initial value of the semaphore + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +#ifdef __KERNEL__ +static INLINE int initializeSemaphore(Semaphore *semaphore, unsigned int value) +{ + sema_init(semaphore, value); + return UDS_SUCCESS; +} +#else +int initializeSemaphore(Semaphore *semaphore, unsigned int value); +#endif + +/** + * Destroy a semaphore used among threads in the same process. + * + * @param semaphore the semaphore to destroy + * + * @return UDS_SUCCESS or an error code + **/ +#ifdef __KERNEL__ +static INLINE int destroySemaphore(Semaphore *semaphore) +{ + return UDS_SUCCESS; +} +#else +int destroySemaphore(Semaphore *semaphore); +#endif + +/** + * Acquire a permit from a semaphore, waiting if none are currently available. + * + * @param semaphore the semaphore to acquire + **/ +#ifdef __KERNEL__ +static INLINE void acquireSemaphore(Semaphore *semaphore) +{ + // Do not use down(semaphore). Instead use down_interruptible so that we do + // not get 120 second stall messages in kern.log. + while (down_interruptible(semaphore) != 0) { + } +} +#else +void acquireSemaphore(Semaphore *semaphore); +#endif + +/** + * Attempt to acquire a permit from a semaphore. + * + * If a permit is available, it is claimed and the function immediately + * returns true. If a timeout is zero or negative, the function immediately + * returns false. Otherwise, this will wait either a permit to become + * available (returning true) or the relative timeout to expire (returning + * false). + * + * @param semaphore the semaphore to decrement + * @param timeout the relative time until the timeout expires + * + * @return true if a permit was acquired, otherwise false + **/ +__attribute__((warn_unused_result)) +#ifdef __KERNEL__ +static INLINE bool attemptSemaphore(Semaphore *semaphore, RelTime timeout) +{ + if (timeout <= 0) { + // No timeout, just try to grab the semaphore. + return down_trylock(semaphore) == 0; + } else { + unsigned int jiffies = usecs_to_jiffies(relTimeToMicroseconds(timeout)); + return down_timeout(semaphore, jiffies) == 0; + } +} +#else +bool attemptSemaphore(Semaphore *semaphore, RelTime timeout); +#endif + +/** + * Release a semaphore, incrementing the number of available permits. + * + * @param semaphore the semaphore to increment + **/ +#ifdef __KERNEL__ +static INLINE void releaseSemaphore(Semaphore *semaphore) +{ + up(semaphore); +} +#else +void releaseSemaphore(Semaphore *semaphore); +#endif + +/** + * Yield the time slice in the given thread. + * + * @return UDS_SUCCESS or an error code + **/ +int yieldScheduler(void); + +#ifndef __KERNEL__ +/** + * Allocate a thread specific key for thread specific data. + * + * @param key points to location for new key + * @param destr_function destructor function called when thread exits + * + * @return UDS_SUCCESS or error code + **/ +int createThreadKey(pthread_key_t *key, void (*destr_function) (void *)); + +/** + * Delete a thread specific key for thread specific data. + * + * @param key key to delete + * + * @return UDS_SUCCESS or error code + **/ +int deleteThreadKey(pthread_key_t key); + +/** + * Set pointer for thread specific data. 
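+ *
+ * As an illustrative sketch only (freeContext() and makeContext() are
+ * hypothetical and error handling is omitted), these user-space
+ * thread-specific data calls are used together like this:
+ *
+ *   static pthread_key_t contextKey;
+ *   createThreadKey(&contextKey, freeContext);      // once, at startup
+ *   setThreadSpecific(contextKey, makeContext());   // in each thread
+ *   void *context = getThreadSpecific(contextKey);  // on later lookups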
+ * + * @param key key to be associated with pointer + * @param pointer data associated with key + * + * @return UDS_SUCCESS or error code + **/ +int setThreadSpecific(pthread_key_t key, const void *pointer); + +/** + * Get pointer for thread specific data. + * + * @param key key identifying the thread specific data + **/ +void *getThreadSpecific(pthread_key_t key); +#endif + +#endif /* THREADS_H */ diff --git a/uds/threadsLinuxKernel.c b/uds/threadsLinuxKernel.c new file mode 100644 index 0000000..7ac972d --- /dev/null +++ b/uds/threadsLinuxKernel.c @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/threadsLinuxKernel.c#4 $ + */ + +#include +#include +#include + +#include "memoryAlloc.h" +#include "logger.h" +#include "threads.h" +#include "uds-error.h" + +static struct hlist_head kernelThreadList; +static struct mutex kernelThreadMutex; +static OnceState kernelThreadOnce; + +typedef struct kernelThread { + void (*threadFunc)(void *); + void *threadData; + struct hlist_node threadLinks; + struct task_struct *threadTask; + struct completion threadDone; +} KernelThread; + +/**********************************************************************/ +static void kernelThreadInit(void) +{ + mutex_init(&kernelThreadMutex); +} + +/**********************************************************************/ +static int threadStarter(void *arg) +{ + KernelThread *kt = arg; + kt->threadTask = current; + performOnce(&kernelThreadOnce, kernelThreadInit); + mutex_lock(&kernelThreadMutex); + hlist_add_head(&kt->threadLinks, &kernelThreadList); + mutex_unlock(&kernelThreadMutex); + RegisteredThread allocatingThread; + registerAllocatingThread(&allocatingThread, NULL); + kt->threadFunc(kt->threadData); + unregisterAllocatingThread(); + complete(&kt->threadDone); + return 0; +} + +/**********************************************************************/ +int createThread(void (*threadFunc)(void *), + void *threadData, + const char *name, + Thread *newThread) +{ + char *nameColon = strchr(name, ':'); + char *myNameColon = strchr(current->comm, ':'); + KernelThread *kt; + int result = ALLOCATE(1, KernelThread, __func__, &kt); + if (result != UDS_SUCCESS) { + logWarning("Error allocating memory for %s", name); + return result; + } + kt->threadFunc = threadFunc; + kt->threadData = threadData; + init_completion(&kt->threadDone); + struct task_struct *thread; + /* + * Start the thread, with an appropriate thread name. + * + * If the name supplied contains a colon character, use that name. This + * causes uds module threads to have names like "uds:callbackW" and the main + * test runner thread to be named "zub:runtest". 
+ * + * Otherwise if the current thread has a name containing a colon character, + * prefix the name supplied with the name of the current thread up to (and + * including) the colon character. Thus when the "kvdo0:dedupeQ" thread + * opens an index session, all the threads associated with that index will + * have names like "kvdo0:foo". + * + * Otherwise just use the name supplied. This should be a rare occurrence. + */ + if ((nameColon == NULL) && (myNameColon != NULL)) { + thread = kthread_run(threadStarter, kt, "%.*s:%s", + (int) (myNameColon - current->comm), current->comm, + name); + } else { + thread = kthread_run(threadStarter, kt, "%s", name); + } + if (IS_ERR(thread)) { + FREE(kt); + return UDS_ENOTHREADS; + } + *newThread = kt; + return UDS_SUCCESS; +} +/**********************************************************************/ +int joinThreads(Thread kt) +{ + while (wait_for_completion_interruptible(&kt->threadDone) != 0) { + } + mutex_lock(&kernelThreadMutex); + hlist_del(&kt->threadLinks); + mutex_unlock(&kernelThreadMutex); + FREE(kt); + return UDS_SUCCESS; +} + +/**********************************************************************/ +void applyToThreads(void applyFunc(void *, struct task_struct *), + void *argument) +{ + KernelThread *kt; + performOnce(&kernelThreadOnce, kernelThreadInit); + mutex_lock(&kernelThreadMutex); + hlist_for_each_entry(kt, &kernelThreadList, threadLinks) { + applyFunc(argument, kt->threadTask); + } + mutex_unlock(&kernelThreadMutex); +} + +/**********************************************************************/ +void exitThread(void) +{ + KernelThread *kt; + struct completion *completion = NULL; + performOnce(&kernelThreadOnce, kernelThreadInit); + mutex_lock(&kernelThreadMutex); + hlist_for_each_entry(kt, &kernelThreadList, threadLinks) { + if (kt->threadTask == current) { + completion = &kt->threadDone; + break; + } + } + mutex_unlock(&kernelThreadMutex); + unregisterAllocatingThread(); + complete_and_exit(completion, 1); +} + +/**********************************************************************/ +ThreadId getThreadId(void) +{ + return current->pid; +} + +/**********************************************************************/ +unsigned int getNumCores(void) +{ + return num_online_cpus(); +} + +/**********************************************************************/ +int initializeBarrier(Barrier *barrier, unsigned int threadCount) +{ + barrier->arrived = 0; + barrier->threadCount = threadCount; + int result = initializeSemaphore(&barrier->mutex, 1); + if (result != UDS_SUCCESS) { + return result; + } + return initializeSemaphore(&barrier->wait, 0); +} + +/**********************************************************************/ +int destroyBarrier(Barrier *barrier) +{ + int result = destroySemaphore(&barrier->mutex); + if (result != UDS_SUCCESS) { + return result; + } + return destroySemaphore(&barrier->wait); +} + +/**********************************************************************/ +int enterBarrier(Barrier *barrier, bool *winner) +{ + acquireSemaphore(&barrier->mutex); + bool lastThread = ++barrier->arrived == barrier->threadCount; + if (lastThread) { + // This is the last thread to arrive, so wake up the others + int i; + for (i = 1; i < barrier->threadCount; i++) { + releaseSemaphore(&barrier->wait); + } + // Then reinitialize for the next cycle + barrier->arrived = 0; + releaseSemaphore(&barrier->mutex); + } else { + // This is NOT the last thread to arrive, so just wait + releaseSemaphore(&barrier->mutex); + 
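+    // The mutex must be released before blocking so that the last thread
+    // to arrive can acquire it and post the wait semaphore; this thread
+    // then sleeps below until that release happens.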
acquireSemaphore(&barrier->wait); + } + if (winner != NULL) { + *winner = lastThread; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int yieldScheduler(void) +{ + yield(); + return UDS_SUCCESS; +} diff --git a/uds/timeUtils.c b/uds/timeUtils.c new file mode 100644 index 0000000..ddf3b2b --- /dev/null +++ b/uds/timeUtils.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/timeUtils.c#4 $ + */ + +#include "stringUtils.h" +#include "timeUtils.h" + +#ifdef __KERNEL__ +#include +#include // for getnstimeofday on Vivid +#else +#include +#endif + +#ifndef __KERNEL__ +static const struct timespec invalidTime = { + .tv_sec = -1, + .tv_nsec = LONG_MAX +}; + +static const long BILLION = 1000 * 1000 * 1000; +#endif + +#ifndef __KERNEL__ +/*****************************************************************************/ +AbsTime currentTime(clockid_t clock) +{ + struct timespec ts; + if (clock_gettime(clock, &ts) != 0) { + ts = invalidTime; + } + return ts; +} +#endif + +#ifndef __KERNEL__ +/*****************************************************************************/ +/** + * Return a time offset from the specified time. + * + * @param time A time. + * @param reltime The relative time + * + * @return the sum of the time and the offset, possibly rounded up to the + * next representable instant. + * + * @note timeDifference(a, deltaTime(a, n)) may only be approx == -n + * depending on the system-specific time resolution + **/ +static AbsTime deltaTime(AbsTime time, RelTime reltime) +{ + if (!isValidTime(time)) { + return time; + } + if ((reltime >= 0) && (reltime < 10 * BILLION)) { + reltime += time.tv_nsec; + while (reltime >= BILLION) { + reltime -= BILLION; + time.tv_sec++; + } + time.tv_nsec = reltime; + return time; + } + // may not be accurate for times before the Epoch... + // (is the ns time positive or negative for negative time_t?) 
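+  // Fall back to flat signed-nanosecond arithmetic, rejecting operands
+  // anywhere near the int64_t range so the addition below cannot overflow.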
+ int64_t ns = time.tv_sec * BILLION + time.tv_nsec; + if ((ns < INT64_MIN / 2) || + (ns > INT64_MAX / 2) || + (reltime < INT64_MIN / 2) || + (reltime > INT64_MAX / 2)) { + return invalidTime; + } + ns += reltime; + return (AbsTime) { .tv_sec = ns / BILLION, .tv_nsec = ns % BILLION }; +} +#endif + +#ifndef __KERNEL__ +/*****************************************************************************/ +AbsTime futureTime(clockid_t clock, RelTime reltime) +{ + return deltaTime(currentTime(clock), reltime); +} +#endif + +#ifndef __KERNEL__ +/*****************************************************************************/ +bool isValidTime(AbsTime time) +{ + if (time.tv_nsec < 0 || time.tv_nsec >= BILLION) { + return false; + } + return true; +} +#endif + +/*****************************************************************************/ +uint64_t nowUsec(void) +{ +#ifdef __KERNEL__ + static const AbsTime epoch = 0; +#else + static const AbsTime epoch = { 0, 0 }; +#endif + return relTimeToMicroseconds(timeDifference(currentTime(CLOCK_REALTIME), + epoch)); +} + + + +#ifndef __KERNEL__ +/*****************************************************************************/ +RelTime timeDifference(AbsTime a, AbsTime b) +{ + if (isValidTime(a) && isValidTime(b)) { + int64_t ans = a.tv_sec * BILLION + a.tv_nsec; + int64_t bns = b.tv_sec * BILLION + b.tv_nsec; + return ans - bns; + } else if (isValidTime(a)) { + return INT64_MAX; + } else if (isValidTime(b)) { + return INT64_MIN; + } else { + return 0; + } +} +#endif diff --git a/uds/timeUtils.h b/uds/timeUtils.h new file mode 100644 index 0000000..8d159f4 --- /dev/null +++ b/uds/timeUtils.h @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/timeUtils.h#5 $ + */ + +#ifndef TIME_UTILS_H +#define TIME_UTILS_H + +#include "compiler.h" +#include "typeDefs.h" + +#ifdef __KERNEL__ +#include +#include +#else +#include +#include +#endif + +// Absolute time. +#ifdef __KERNEL__ +typedef int64_t AbsTime; +#else +typedef struct timespec AbsTime; +#endif + +// Relative time, the length of a time interval, or the difference between +// two times. A signed 64-bit number of nanoseconds. +typedef int64_t RelTime; + +#ifndef __KERNEL__ +/** + * Return true if the time is valid. + * + * @param time a time + * + * @return true if the time is valid + * + * @note an invalid time is generally returned from a failed attempt + * to get the time from the system + **/ +bool isValidTime(AbsTime time); +#endif + +/** + * Return the current time according to the specified clock type. 
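+ *
+ * As an illustrative sketch only (doSomeWork() is hypothetical), elapsed
+ * time is typically measured against the monotonic clock:
+ *
+ *   AbsTime start = currentTime(CLOCK_MONOTONIC);
+ *   doSomeWork();
+ *   RelTime elapsed = timeDifference(currentTime(CLOCK_MONOTONIC), start);
+ *   int64_t millis = relTimeToMilliseconds(elapsed);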
+ * + * @param clock Either CLOCK_REALTIME or CLOCK_MONOTONIC + * + * @return the current time according to the clock in question + * + * @note the precision of the clock is system specific + **/ +#ifdef __KERNEL__ +static INLINE AbsTime currentTime(clockid_t clock) +{ + // clock is always a constant, so gcc reduces this to a single call + return clock == CLOCK_MONOTONIC ? ktime_get_ns() : ktime_get_real_ns(); +} +#else +AbsTime currentTime(clockid_t clock); +#endif + +#ifndef __KERNEL__ +/** + * Return the timestamp a certain number of nanoseconds in the future. + * + * @param clock Either CLOCK_REALTIME or CLOCK_MONOTONIC + * @param reltime The relative time to the clock value + * + * @return the timestamp for that time (potentially rounded to the next + * representable instant for the system in question) + **/ +AbsTime futureTime(clockid_t clock, RelTime reltime); +#endif + +/** + * Return the difference between two timestamps. + * + * @param a A time + * @param b Another time, based on the same clock as a. + * + * @return the relative time between the two timestamps + **/ +#ifdef __KERNEL__ +static INLINE RelTime timeDifference(AbsTime a, AbsTime b) +{ + return a - b; +} +#else +RelTime timeDifference(AbsTime a, AbsTime b); +#endif + + + +/** + * Convert seconds to a RelTime value + * + * @param seconds A number of seconds + * + * @return the equivalent number of seconds as a RelTime + **/ +static INLINE RelTime secondsToRelTime(int64_t seconds) +{ + return (RelTime) seconds * (1000 * 1000 * 1000); +} + +/** + * Convert milliseconds to a RelTime value + * + * @param milliseconds A number of milliseconds + * + * @return the equivalent number of milliseconds as a RelTime + **/ +static INLINE RelTime millisecondsToRelTime(int64_t milliseconds) +{ + return (RelTime) milliseconds * (1000 * 1000); +} + +/** + * Convert microseconds to a RelTime value + * + * @param microseconds A number of microseconds + * + * @return the equivalent number of microseconds as a RelTime + **/ +static INLINE RelTime microsecondsToRelTime(int64_t microseconds) +{ + return (RelTime) microseconds * 1000; +} + +/** + * Convert nanoseconds to a RelTime value + * + * @param nanoseconds A number of nanoseconds + * + * @return the equivalent number of nanoseconds as a RelTime + **/ +static INLINE RelTime nanosecondsToRelTime(int64_t nanoseconds) +{ + return (RelTime) nanoseconds; +} + +/** + * Convert a RelTime value to milliseconds + * + * @param reltime The relative time + * + * @return the equivalent number of milliseconds + **/ +static INLINE int64_t relTimeToSeconds(RelTime reltime) +{ + return reltime / (1000 * 1000 * 1000); +} + +/** + * Convert a RelTime value to milliseconds + * + * @param reltime The relative time + * + * @return the equivalent number of milliseconds + **/ +static INLINE int64_t relTimeToMilliseconds(RelTime reltime) +{ + return reltime / (1000 * 1000); +} + +/** + * Convert a RelTime value to microseconds + * + * @param reltime The relative time + * + * @return the equivalent number of microseconds + **/ +static INLINE int64_t relTimeToMicroseconds(RelTime reltime) +{ + return reltime / 1000; +} + +/** + * Convert a RelTime value to nanoseconds + * + * @param reltime The relative time + * + * @return the equivalent number of nanoseconds + **/ +static INLINE int64_t relTimeToNanoseconds(RelTime reltime) +{ + return reltime; +} + +/** + * Return the wall clock time in microseconds. 
The actual value is time + * since the epoch (see "man gettimeofday"), but the typical use is to call + * this twice and compute the difference, giving the elapsed time between + * the two calls. + * + * @return the time in microseconds + **/ +uint64_t nowUsec(void) __attribute__((warn_unused_result)); + +/** + * Convert from an AbsTime to a time_t + * + * @param time an AbsTime time + * + * @return a time_t time + **/ +static INLINE time_t asTimeT(AbsTime time) +{ +#ifdef __KERNEL__ + return time / 1000000000; +#else + return time.tv_sec; +#endif +} + +/** + * Convert from a time_t to an AbsTime, + * + * @param time a time_t time + * + * @return an AbsTime time + **/ +static INLINE AbsTime fromTimeT(time_t time) +{ +#ifdef __KERNEL__ + return time * 1000000000; +#else + AbsTime abs; + abs.tv_sec = time; + abs.tv_nsec = 0; + return abs; +#endif +} + +#ifndef __KERNEL__ +/** + * Convert from an AbsTime to a struct timespec + * + * @param time an AbsTime time + * + * @return a time_t time + **/ +static INLINE struct timespec asTimeSpec(AbsTime time) +{ + return time; +} +#endif + +#ifndef __KERNEL__ +/** + * Convert from an AbsTime to a struct timeval + * + * @param time an AbsTime time + * + * @return a time_t time + **/ +static INLINE struct timeval asTimeVal(AbsTime time) +{ + struct timeval tv = { time.tv_sec, time.tv_nsec / 1000 }; + return tv; +} +#endif + +#endif /* TIME_UTILS_H */ diff --git a/uds/typeDefs.h b/uds/typeDefs.h new file mode 100644 index 0000000..927bd23 --- /dev/null +++ b/uds/typeDefs.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/typeDefs.h#1 $ + */ + +#ifndef LINUX_KERNEL_TYPE_DEFS_H +#define LINUX_KERNEL_TYPE_DEFS_H + +/* + * General system type definitions. This file is parallel to the other + * typeDefs.h files in this project. We pick up what we can from the system + * include files, and explicitly define the other things we need. + */ + +#include +#include +#include + +#define CHAR_BIT 8 + +#define INT64_MAX (9223372036854775807L) +#define UCHAR_MAX ((unsigned char)~0ul) +#define UINT8_MAX ((uint8_t)~0ul) +#define UINT16_MAX ((uint16_t)~0ul) +#define UINT64_MAX ((uint64_t)~0ul) + +// Some recent versions of define this for us +#ifndef SIZE_MAX +#define SIZE_MAX ((size_t)~0ul) +#endif + +#define PRId64 "lld" +#define PRIu16 "u" +#define PRIu32 "u" +#define PRIu64 "llu" + +typedef unsigned long uintmax_t; +#define PRIuMAX "lu" + +typedef unsigned char byte; + +#endif /* LINUX_KERNEL_TYPE_DEFS_H */ diff --git a/uds/uds-block.h b/uds/uds-block.h new file mode 100644 index 0000000..e1b8e61 --- /dev/null +++ b/uds/uds-block.h @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/uds-block.h#1 $ + */ + +/** + * @file + * @brief Definitions for the UDS block interface + **/ +#ifndef UDS_BLOCK_H +#define UDS_BLOCK_H + +#include "uds.h" + +/** General UDS block constants. */ +enum { + /** The maximum metadata size for a block. */ + UDS_MAX_BLOCK_DATA_SIZE = UDS_MAX_METADATA_SIZE +}; + +/** + * Metadata to associate with a blockName. + **/ +struct udsChunkData { + unsigned char data[UDS_MAX_BLOCK_DATA_SIZE]; +}; + +/** + * Represents a block address on disk. + * + * #UdsBlockAddress objects allow the Application Software and UDS + * to refer to specific disk blocks. It might be, for instance, the + * logical block address divided by the block size. + * + * These objects are stored persistently in the index and are also cached. + * Therefore, make every effort to ensure that these objects are as small as + * possible. + **/ +typedef void *UdsBlockAddress; + +/** @{ */ +/** @name Deduplication */ + +typedef struct udsRequest UdsRequest; + +/** + * Callback function invoked to inform the Application Software that an + * operation started by #udsStartChunkOperation has completed. + * + * @param [in] request The operation that finished. When the callback + * function is called, this UdsRequest structure can be + * reused or freed. + **/ +typedef void UdsChunkCallback(UdsRequest *request); + +/** + * Request structure passed to #udsStartChunkOperation to begin an operation, + * and returned to the Application Software when the callback function is + * invoked. + **/ +struct udsRequest { + /* + * The name of the block. + * Set before starting an operation. + * Unchanged at time of callback. + */ + UdsChunkName chunkName; + /* + * The metadata found in the index that was associated with the block + * (sometimes called the canonical address). + * Set before the callback. + */ + struct udsChunkData oldMetadata; + /* + * The new metadata to associate with the name of the block (sometimes called + * the duplicate address). + * Set before starting a #UDS_POST or #UDS_QUERY operation. + * Unchanged at time of callback. + */ + struct udsChunkData newMetadata; + /* + * The callback method to be invoked when the operation finishes. + * Set before starting an operation. + * Unchanged at time of callback. + */ + UdsChunkCallback *callback; + /* + * The index session. + * Set before starting an operation. + * Unchanged at time of callback. + */ + struct uds_index_session *session; + /* + * The operation type, which is one of #UDS_DELETE, #UDS_POST, #UDS_QUERY or + * #UDS_UPDATE. + * Set before starting an operation. + * Unchanged at time of callback. + */ + UdsCallbackType type; + /* + * The operation status, which is either #UDS_SUCCESS or an error code. + * Set before the callback. 
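+   * (Error codes are typically drawn from the udsStatusCodes enumeration
+   * defined in uds-error.h.)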
+ */ + int status; + /* + * If true, the name of the block was found in the index. + * Set before the callback. + */ + bool found; + /* + * If true, move the entry to the end of the deduplication window. + * Set before starting a #UDS_QUERY operation. + * Unchanged at time of callback. + */ + bool update; + long private[25]; +}; + +/** + * Start a UDS index chunk operation. The request type field must + * be set to the type of operation. This is an asynchronous interface to the + * block-oriented UDS API. The callback is invoked upon completion. + * + * The #UDS_DELETE operation type deletes the mapping for a particular block. + * #UDS_DELETE is typically used when UDS provides invalid advice. + * + * The #UDS_POST operation type indexes a block name and associates it with a + * particular address. The caller provides the block's name. UDS then checks + * this name against its index. + *
    + *
+ *   - If the block is new, it is stored in the index.
+ *   - If the block is a duplicate of an indexed block, UDS returns the
+ *     canonical block address via the callback.
+ *
+ * The #UDS_QUERY operation type checks to see if a block name exists in the
+ * index. The caller provides the block's name. UDS then checks
+ * this name against its index.
+ *
+ *   - If the block is new, no action is taken.
+ *   - If the block is a duplicate of an indexed block, UDS returns the
+ *     canonical block address via the callback. If the update
+ *     field is set, the entry is moved to the end of the deduplication
+ *     window.
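+ *
+ * As an illustrative sketch only (the name, metadata, myCallback, and
+ * session values are hypothetical and error handling is omitted), a
+ * #UDS_POST request is typically issued as:
+ *
+ *   UdsRequest request = {
+ *     .chunkName   = name,
+ *     .newMetadata = metadata,
+ *     .callback    = myCallback,
+ *     .session     = session,
+ *     .type        = UDS_POST,
+ *   };
+ *   int result = udsStartChunkOperation(&request);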
+ * + * The #UDS_UPDATE operation type updates the mapping for a particular block. + * #UDS_UPDATE is typically used if the callback function provides invalid + * advice. + * + * @param [in] request The operation. The type, + * chunkName, newMetadata, + * context, callback, and + * update fields must be set. At callback + * time, the oldMetadata, + * status, and found fields will + * be set. + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsStartChunkOperation(UdsRequest *request); +/** @} */ + +#endif /* UDS_BLOCK_H */ diff --git a/uds/uds-error.h b/uds/uds-error.h new file mode 100644 index 0000000..7658982 --- /dev/null +++ b/uds/uds-error.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/uds-error.h#3 $ + */ + +/** + * @file + * @brief UDS error code definitions + **/ +#ifndef UDS_ERROR_H +#define UDS_ERROR_H + + +/** + * Valid return status codes for API routines. + **/ +enum udsStatusCodes { + /** Successful return */ + UDS_SUCCESS = 0, + + /** Used as a base value for reporting errors */ + UDS_ERROR_CODE_BASE = 1024, + /** The UDS library is not initialized */ + UDS_UNINITIALIZED = UDS_ERROR_CODE_BASE + 0, + /** The UDS library is shutting down */ + UDS_SHUTTINGDOWN = UDS_ERROR_CODE_BASE + 1, + /** Could not load scanner modules */ + UDS_EMODULE_LOAD = UDS_ERROR_CODE_BASE + 2, + /** Could not create a new thread */ + UDS_ENOTHREADS = UDS_ERROR_CODE_BASE + 3, + /** Could not find the specified library context */ + UDS_NOCONTEXT = UDS_ERROR_CODE_BASE + 4, + /** The specified library context is disabled */ + UDS_DISABLED = UDS_ERROR_CODE_BASE + 5, + /** Some saved index component is corrupt */ + UDS_CORRUPT_COMPONENT = UDS_ERROR_CODE_BASE + 6, + UDS_CORRUPT_FILE = UDS_CORRUPT_COMPONENT, + /** Unknown error */ + UDS_UNKNOWN_ERROR = UDS_ERROR_CODE_BASE + 7, + /** Unused */ + UDS_UNUSED_CODE_8 = UDS_ERROR_CODE_BASE + 8, + /** Unused */ + UDS_UNUSED_CODE_9 = UDS_ERROR_CODE_BASE + 9, + /** The index configuration or volume format is no longer supported */ + UDS_UNSUPPORTED_VERSION = UDS_ERROR_CODE_BASE + 10, + /** Index session not available */ + UDS_NO_INDEXSESSION = UDS_ERROR_CODE_BASE + 11, + /** Index data in memory is corrupt */ + UDS_CORRUPT_DATA = UDS_ERROR_CODE_BASE + 12, + /** Short read due to truncated file */ + UDS_SHORT_READ = UDS_ERROR_CODE_BASE + 13, + /** Unused */ + UDS_UNUSED_CODE_14 = UDS_ERROR_CODE_BASE + 14, + /** Internal resource limits exceeded */ + UDS_RESOURCE_LIMIT_EXCEEDED = UDS_ERROR_CODE_BASE + 15, + /** Memory overflow due to storage failure */ + UDS_VOLUME_OVERFLOW = UDS_ERROR_CODE_BASE + 16, + /** Unused */ + UDS_UNUSED_CODE_17 = UDS_ERROR_CODE_BASE + 17, + /** Unused */ + UDS_UNUSED_CODE_18 = UDS_ERROR_CODE_BASE + 18, + /** Unused 
*/ + UDS_UNUSED_CODE_19 = UDS_ERROR_CODE_BASE + 19, + /** Configuration pointer required */ + UDS_CONF_PTR_REQUIRED = UDS_ERROR_CODE_BASE + 20, + /** Index stats pointer required */ + UDS_INDEX_STATS_PTR_REQUIRED = UDS_ERROR_CODE_BASE + 21, + /** Context stats pointer required */ + UDS_CONTEXT_STATS_PTR_REQUIRED = UDS_ERROR_CODE_BASE + 22, + /** Unused */ + UDS_UNUSED_CODE_23 = UDS_ERROR_CODE_BASE + 23, + /** Unused */ + UDS_UNUSED_CODE_24 = UDS_ERROR_CODE_BASE + 24, + /** Unused */ + UDS_UNUSED_CODE_25 = UDS_ERROR_CODE_BASE + 25, + /** Unused */ + UDS_UNUSED_CODE_26 = UDS_ERROR_CODE_BASE + 26, + /** Unused */ + UDS_UNUSED_CODE_27 = UDS_ERROR_CODE_BASE + 27, + /** Memory configuration not supported */ + UDS_INVALID_MEMORY_SIZE = UDS_ERROR_CODE_BASE + 28, + /** Unused */ + UDS_UNUSED_CODE_29 = UDS_ERROR_CODE_BASE + 29, + /** Index name required */ + UDS_INDEX_NAME_REQUIRED = UDS_ERROR_CODE_BASE + 30, + /** Configuration required */ + UDS_CONF_REQUIRED = UDS_ERROR_CODE_BASE + 31, + /** Unused */ + UDS_UNUSED_CODE_32 = UDS_ERROR_CODE_BASE + 32, + /** Unused */ + UDS_UNUSED_CODE_33 = UDS_ERROR_CODE_BASE + 33, + /** Unused */ + UDS_UNUSED_CODE_34 = UDS_ERROR_CODE_BASE + 34, + /** Unused */ + UDS_UNUSED_CODE_35 = UDS_ERROR_CODE_BASE + 35, + /** Unused */ + UDS_UNUSED_CODE_36 = UDS_ERROR_CODE_BASE + 36, + /** Essential files for index not found */ + UDS_NO_INDEX = UDS_ERROR_CODE_BASE + 37, + /** Checkpoint frequency out of range */ + UDS_BAD_CHECKPOINT_FREQUENCY = UDS_ERROR_CODE_BASE + 38, + /** Wrong type of index configuration */ + UDS_WRONG_INDEX_CONFIG = UDS_ERROR_CODE_BASE + 39, + /** Unused */ + UDS_UNUSED_CODE_40 = UDS_ERROR_CODE_BASE + 40, + /** Unused */ + UDS_UNUSED_CODE_41 = UDS_ERROR_CODE_BASE + 41, + /** Unused */ + UDS_UNUSED_CODE_42 = UDS_ERROR_CODE_BASE + 42, + /** Unused */ + UDS_UNUSED_CODE_43 = UDS_ERROR_CODE_BASE + 43, + /** Premature end of file in scanned file */ + UDS_END_OF_FILE = UDS_ERROR_CODE_BASE + 44, + /** Attempt to access unsaved index */ + UDS_INDEX_NOT_SAVED_CLEANLY = UDS_ERROR_CODE_BASE + 45, + /** Unused */ + UDS_UNUSED_CODE_46 = UDS_ERROR_CODE_BASE + 46, + /** There is not sufficient space to create the index */ + UDS_INSUFFICIENT_INDEX_SPACE = UDS_ERROR_CODE_BASE + 47, + /** Unused */ + UDS_UNUSED_CODE_48 = UDS_ERROR_CODE_BASE + 48, + /** Unused */ + UDS_UNUSED_CODE_49 = UDS_ERROR_CODE_BASE + 49, + /** Index is suspended */ + UDS_SUSPENDED = UDS_ERROR_CODE_BASE + 50, + /** Unused */ + UDS_UNUSED_CODE_51 = UDS_ERROR_CODE_BASE + 51, + /** Index session is already initialized */ + UDS_INDEXSESSION_IN_USE = UDS_ERROR_CODE_BASE + 52, + /** Callback required */ + UDS_CALLBACK_REQUIRED = UDS_ERROR_CODE_BASE + 53, + /** Wrong operation type */ + UDS_INVALID_OPERATION_TYPE = UDS_ERROR_CODE_BASE + 54, + /** One more than the last UDS_ERROR_CODE */ + UDS_ERROR_CODE_LAST, + /** One more than this block can use */ + UDS_ERROR_CODE_BLOCK_END = UDS_ERROR_CODE_BASE + 1024 +}; + +#endif /* UDS_ERROR_H */ diff --git a/uds/uds-platform.h b/uds/uds-platform.h new file mode 100644 index 0000000..0df39ef --- /dev/null +++ b/uds/uds-platform.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/uds-platform.h#1 $ + */ + +/** + * @file + * @brief Platform definitions for albireo + **/ +#ifndef UDS_PLATFORM_H +#define UDS_PLATFORM_H + + +#ifdef __KERNEL__ +#include +#else +#include +#include +#include +#include +#include +#endif + +#endif /* UDS_PLATFORM_H */ diff --git a/uds/uds.h b/uds/uds.h new file mode 100644 index 0000000..42e2863 --- /dev/null +++ b/uds/uds.h @@ -0,0 +1,528 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/uds.h#2 $ + */ + +/** + * @mainpage UDS API Reference + *
Copyright (c) 2020 Red Hat, Inc.
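+ *
+ * As a quick orientation, a minimal caller of this API follows the life
+ * cycle sketched below. This is a hedged illustration only: error handling
+ * is omitted and the index name "/dev/vdo_index" is hypothetical.
+ *
+ *   struct uds_index_session *session;
+ *   UdsConfiguration conf;
+ *
+ *   if (udsCreateIndexSession(&session) != UDS_SUCCESS) {
+ *     return;
+ *   }
+ *   if (udsInitializeConfiguration(&conf, 1) == UDS_SUCCESS) {  // 1 GB index
+ *     if (udsOpenIndex(UDS_CREATE, "/dev/vdo_index", NULL, conf,
+ *                      session) == UDS_SUCCESS) {
+ *       // ... issue requests, e.g. with udsStartChunkOperation() ...
+ *       udsCloseIndex(session);
+ *     }
+ *     udsFreeConfiguration(conf);
+ *   }
+ *   udsDestroyIndexSession(session);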
+ **/ + +/** + * @file + * @brief General UDS definitions + **/ +#ifndef UDS_H +#define UDS_H + +#include "uds-platform.h" + +#ifdef UDS_DISABLE_ATTR_WARN_UNUSED_RESULT +#define UDS_ATTR_WARN_UNUSED_RESULT +#else +#define UDS_ATTR_WARN_UNUSED_RESULT __attribute__((warn_unused_result)) +#endif + +/** + * Valid request types as described in callbacks. + **/ +typedef enum { + /** + * Callback type for operations that post mappings to the UDS + * index. When the chunk-hash being added already exists, the + * existing metadata is not overwritten. Regardless, the + * recency of the chunk is updated. + **/ + UDS_POST, + + /** + * Callback type for operations that update mappings in the UDS + * index. If the indicated entry does not have any mapping in the + * index, one is created. In either case, the recency of + * the chunk is updated. + **/ + UDS_UPDATE, + + /** + * Callback type for operations that delete mappings from the + * UDS index. */ + UDS_DELETE, + + /** + * Callback type for operations that query mappings in the UDS + * index. When a mapping is found, the recency of the mapping + * is updated unless it's the no-update call. + **/ + UDS_QUERY +} UdsCallbackType; + +/** + * Valid types for opening an index. + **/ +typedef enum { + /** + * Load an existing index. If the index was not saved cleanly, try to + * recover and rebuild the index. + **/ + UDS_LOAD = 0, + + /** + * Create a new index. + **/ + UDS_CREATE = 1, + + /** + * Load an existing index, but only if it was cleanly saved. + **/ + UDS_NO_REBUILD = 2, +} UdsOpenIndexType; + +/** General UDS constants. */ +enum { + /** The chunk name size in bytes (128 bits = 16 bytes). */ + UDS_CHUNK_NAME_SIZE = 16, + /** The maximum metadata size in bytes. */ + UDS_MAX_METADATA_SIZE = 16, +}; + +/** + * Type representing memory configuration which is either a positive + * integer number of gigabytes or one of the three special constants + * for configurations which are smaller than 1 gigabyte. + **/ +typedef unsigned int UdsMemoryConfigSize; + +extern const UdsMemoryConfigSize UDS_MEMORY_CONFIG_256MB; +extern const UdsMemoryConfigSize UDS_MEMORY_CONFIG_512MB; +extern const UdsMemoryConfigSize UDS_MEMORY_CONFIG_768MB; + +/** + * The maximum configurable amount of memory. + **/ +extern const UdsMemoryConfigSize UDS_MEMORY_CONFIG_MAX; + +/** The name (hash) of a chunk. */ +typedef struct udsChunkName { + /** The name (hash) of a chunk. */ + unsigned char name[UDS_CHUNK_NAME_SIZE]; +} UdsChunkName; + +/** + * An active index session. + **/ +struct uds_index_session; + +/** + * The data used to configure a new index. + **/ +typedef struct udsConfiguration *UdsConfiguration; +typedef uint64_t UdsNonce; + +/** + * The data used to configure a new index session. + **/ +struct uds_parameters { + // Tne number of threads used to process index requests. + int zone_count; + // The number of threads used to read volume pages. + int read_threads; + // The number of chapters to write between checkpoints. + int checkpoint_frequency; +}; +#define UDS_PARAMETERS_INITIALIZER { \ + .zone_count = 0, \ + .read_threads = 2, \ + .checkpoint_frequency = 0, \ + } + +/** + * Index statistics + * + * These statistics capture the current index characteristics, + * including resource usage. 
+ **/ +typedef struct udsIndexStats { + /** The total number of chunk names stored in the index */ + uint64_t entriesIndexed; + /** An estimate of the index's memory usage */ + uint64_t memoryUsed; + /** The number of collisions recorded in the master index */ + uint64_t collisions; + /** The number of entries discarded from the index since index startup */ + uint64_t entriesDiscarded; + /** The number of checkpoints done this session */ + uint64_t checkpoints; +} UdsIndexStats; + +/** + * Context statistics + * + * These statistics capture a library context's characteristics either since + * it was initialized or since its statistics were last reset, whichever + * is more recent. + **/ +typedef struct udsContextStats { + /** The time at which context statistics were last fetched */ + time_t currentTime; + /** + * The number of post calls since context statistics were last reset that + * found an existing entry + **/ + uint64_t postsFound; + /** + * The number of post calls since context statistics were last reset that + * added an entry + **/ + uint64_t postsNotFound; + /** + * The number of post calls since context statistics were last reset that + * found an existing entry is current enough to only exist in memory and not + * have been commited to disk yet. + **/ + uint64_t inMemoryPostsFound; + /** + * The number of post calls since context statistics were last reset that + * found an existing entry in the dense portion of the index. + **/ + uint64_t densePostsFound; + /** + * The number of post calls since context statistics were last reset that + * found an existing entry in the sparse portion of the index (if one + * exists). + **/ + uint64_t sparsePostsFound; + /** + * The number of update calls since context statistics were last reset that + * updated an existing entry + **/ + uint64_t updatesFound; + /** + * The number of update calls since context statistics were last reset that + * added a new entry + **/ + uint64_t updatesNotFound; + /** + * The number of delete requests since context statistics were last reset + * that deleted an existing entry + **/ + uint64_t deletionsFound; + /** + * The number of delete requests since context statistics were last reset + * that did nothing. + **/ + uint64_t deletionsNotFound; + /** + * The number of query calls since context statistics were last reset that + * found existing entry + **/ + uint64_t queriesFound; + /** + * The number of query calls since context statistics were last reset that + * did not find an entry + **/ + uint64_t queriesNotFound; + /** + * The total number of library requests (the sum of posts, updates, + * deletions, and queries) since context + * statistics were last reset + **/ + uint64_t requests; +} UdsContextStats; + +/** + * Initializes an index configuration. + * + * @param [out] conf The new configuration + * @param [in] memGB The maximum memory allocation, in GB + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsInitializeConfiguration(UdsConfiguration *conf, + UdsMemoryConfigSize memGB); + +/** + * Sets or clears an index configuration's sparse indexing settings. + * + * @param [in,out] conf The configuration to change + * @param [in] sparse If true, request a sparse + * index; if false, request + * a default index. + * + **/ +void udsConfigurationSetSparse(UdsConfiguration conf, bool sparse); + +/** + * Tests whether an index configuration specifies sparse indexing. 
+ * + * @param [in] conf The configuration to check + * + * @return Returns true if the configuration + * is sparse, or false if not + **/ +UDS_ATTR_WARN_UNUSED_RESULT +bool udsConfigurationGetSparse(UdsConfiguration conf); + +/** + * Sets an index configuration's nonce. + * + * @param [in,out] conf The configuration to change + * @param [in] nonce The 64 bit nonce. + * + **/ +void udsConfigurationSetNonce(UdsConfiguration conf, UdsNonce nonce); + +/** + * Gets an index configuration's nonce. + * + * @param [in] conf The configuration to check + * + * @return The 64 bit nonce. + **/ +UDS_ATTR_WARN_UNUSED_RESULT +UdsNonce udsConfigurationGetNonce(UdsConfiguration conf); + +/** + * Fetches a configuration's maximum memory allocation. + * + * @param [in] conf The configuration to check + * + * @return The amount of memory allocated, in GB + **/ +UDS_ATTR_WARN_UNUSED_RESULT +UdsMemoryConfigSize udsConfigurationGetMemory(UdsConfiguration conf); + +/** + * Fetches a configuration's chapters per volume value. + * + * @param [in] conf The configuration to check + * + * @return The number of chapters per volume + **/ +UDS_ATTR_WARN_UNUSED_RESULT +unsigned int udsConfigurationGetChaptersPerVolume(UdsConfiguration conf); + +/** + * Frees memory used by a configuration. + * + * @param [in,out] conf The configuration for which memory is being freed + **/ +void udsFreeConfiguration(UdsConfiguration conf); + +/** + * Compute the size required to store the index on persistent storage. This + * size is valid for any index stored in a single file or on a single block + * device. This size should be used when configuring a block device on which + * to store an index. + * + * @param [in] config A UdsConfiguration for an index. + * @param [in] numCheckpoints The maximum number of checkpoints. + * @param [out] indexSize The number of bytes required to store + * the index. + * + * @return UDS_SUCCESS or an error code. + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsComputeIndexSize(const UdsConfiguration config, + unsigned int numCheckpoints, + uint64_t *indexSize); + +/** + * Opens an index session. + * + * Creates a session for an index. #udsOpenIndex must be called before + * the index can be used. + * + * Destroy the session with #udsDestroyIndexSession. + * + * @param [out] session A pointer to the new session + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsCreateIndexSession(struct uds_index_session **session); + +/** + * Fetches the UDS library version. + * + * @return The library version + **/ +UDS_ATTR_WARN_UNUSED_RESULT +const char *udsGetVersion(void); + +#ifdef __KERNEL__ +/** + * The name argument to #udsOpenIndex is a text string that names the index. + * The name should have the form "path", where path is the name of the block + * device. The path should not contain white space. The names can optionally + * contain size and/or offset options which give the number of bytes in the + * index and the byte offset to the start of the index. For example, the name + * "/dev/sda8 offset=409600 size=2048000000" is an index that is stored in + * 2040000000 bytes of /dev/sda8 starting at byte 409600. + **/ +#else +/** + * The name argument to #udsOpenIndex is a text string that names the index. + * The name should have the form "path", where path is the name of the file or + * block device. The path should not contain white space. 
The name can + * optionally contain size and/or offset options which give the number of bytes + * in the index and the byte offset to the start of the index. For example, + * the name "/dev/sda8 offset=409600 size=2048000000" is an index that is + * stored in 2040000000 bytes of /dev/sda8 starting at byte 409600. + **/ +#endif + +/** + * Opens an index with an existing session. This operation will fail if the + * index session is suspended, or if there is already an open index. + * + * The index should be closed with #udsCloseIndex. + * + * @param openType The type of open, which is one of #UDS_LOAD, #UDS_CREATE, + * or #UDS_NO_REBUILD. + * @param name The name of the index + * @param params The index session parameters. If NULL, the default + * session parameters will be used. + * @param conf The index configuration + * @param session The index session + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsOpenIndex(UdsOpenIndexType openType, + const char *name, + const struct uds_parameters *params, + UdsConfiguration conf, + struct uds_index_session *session); + +/** + * Waits until all callbacks for index operations are complete, and prevents + * new index operations from starting. Index operations will return + * UDS_SUSPENDED until #udsResumeIndexSession is called. Optionally saves all + * index data before returning. + * + * @param session The session to suspend + * @param save Whether to save index data + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsSuspendIndexSession(struct uds_index_session *session, bool save); + +/** + * Allows new index operations for an index, whether it was suspended or not. + * + * @param session The session to resume + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsResumeIndexSession(struct uds_index_session *session); + +/** + * Waits until all callbacks for index operations are complete. + * + * @param [in] session The session to flush + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsFlushIndexSession(struct uds_index_session *session); + +/** + * Closes an index. This operation will fail if the index session is + * suspended. + * + * Saves changes to the index so that #udsOpenIndex can re-open it. + * + * @param [in] session The session containing the index to close + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsCloseIndex(struct uds_index_session *session); + +/** + * Destroys an index session. + * + * Saves changes to the index and closes the index if one is open. + * Use #udsDestroyIndexSession for index sessions created by + * #udsCreateIndexSession. + * + * @param [in] session The session to destroy + * + * @return Either #UDS_SUCCESS or an error code + **/ +int udsDestroyIndexSession(struct uds_index_session *session); + +/** + * Returns the configuration for the given index session. + * + * @param [in] session The session + * @param [out] conf The index configuration + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsGetIndexConfiguration(struct uds_index_session *session, + UdsConfiguration *conf); + +/** + * Fetches index statistics for the given index session. 
+ * + * @param [in] session The session + * @param [out] stats The index statistics structure to fill + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsGetIndexStats(struct uds_index_session *session, UdsIndexStats *stats); + +/** + * Fetches index session statistics for the given index session. + * + * @param [in] session The session + * @param [out] stats The context statistics structure to fill + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsGetIndexSessionStats(struct uds_index_session *session, + UdsContextStats *stats); + +/** + * Convert an error code to a string. + * + * @param errnum The error code + * @param buf The buffer to hold the error string + * @param buflen The length of the buffer + * + * @return A pointer to buf + **/ +UDS_ATTR_WARN_UNUSED_RESULT +const char *udsStringError(int errnum, char *buf, size_t buflen); + +/** + * Suggested buffer size for udsStringError. + **/ +enum { + UDS_STRING_ERROR_BUFSIZE = 128 +}; + +#endif /* UDS_H */ diff --git a/uds/udsMain.c b/uds/udsMain.c new file mode 100644 index 0000000..8d4f411 --- /dev/null +++ b/uds/udsMain.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/udsMain.c#12 $ + */ + +#include "uds.h" + +#include "config.h" +#include "geometry.h" +#include "indexLayout.h" +#include "indexRouter.h" +#include "indexSession.h" +#include "loadType.h" +#include "logger.h" +#include "memoryAlloc.h" + +const UdsMemoryConfigSize UDS_MEMORY_CONFIG_MAX = 1024; +const UdsMemoryConfigSize UDS_MEMORY_CONFIG_256MB = (UdsMemoryConfigSize) -256; +const UdsMemoryConfigSize UDS_MEMORY_CONFIG_512MB = (UdsMemoryConfigSize) -512; +const UdsMemoryConfigSize UDS_MEMORY_CONFIG_768MB = (UdsMemoryConfigSize) -768; + +/* + * =========================================================================== + * UDS system management + * =========================================================================== + */ + +/**********************************************************************/ +int udsInitializeConfiguration(UdsConfiguration *userConfig, + UdsMemoryConfigSize memGB) +{ + if (userConfig == NULL) { + return logErrorWithStringError(UDS_CONF_PTR_REQUIRED, + "received a NULL config pointer"); + } + + /* Set the configuration parameters that change with memory size. If you + * change these values, you should also: + * + * Change Configuration_x1, which tests these values and expects to see them + * + * Bump the index configuration version number. This bump ensures that + * the test infrastructure will be forced to test the new configuration. 
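+   *
+   * For orientation, the mapping implemented below works out as follows: the
+   * three sub-gigabyte sizes keep DEFAULT_CHAPTERS_PER_VOLUME and scale the
+   * record pages per chapter to 1x, 2x, or 3x SMALL_RECORD_PAGES_PER_CHAPTER
+   * for 256 MB, 512 MB, and 768 MB respectively, while an N-gigabyte
+   * configuration keeps DEFAULT_RECORD_PAGES_PER_CHAPTER and uses
+   * N * DEFAULT_CHAPTERS_PER_VOLUME chapters.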
+ */ + + unsigned int chaptersPerVolume, recordPagesPerChapter; + if (memGB == UDS_MEMORY_CONFIG_256MB) { + chaptersPerVolume = DEFAULT_CHAPTERS_PER_VOLUME; + recordPagesPerChapter = SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (memGB == UDS_MEMORY_CONFIG_512MB) { + chaptersPerVolume = DEFAULT_CHAPTERS_PER_VOLUME; + recordPagesPerChapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (memGB == UDS_MEMORY_CONFIG_768MB) { + chaptersPerVolume = DEFAULT_CHAPTERS_PER_VOLUME; + recordPagesPerChapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (memGB == 1) { + chaptersPerVolume = DEFAULT_CHAPTERS_PER_VOLUME; + recordPagesPerChapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; + } else if ((memGB > 1) && (memGB <= UDS_MEMORY_CONFIG_MAX)) { + chaptersPerVolume = memGB * DEFAULT_CHAPTERS_PER_VOLUME; + recordPagesPerChapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; + } else { + return UDS_INVALID_MEMORY_SIZE; + } + + int result = ALLOCATE(1, struct udsConfiguration, "udsConfiguration", + userConfig); + if (result != UDS_SUCCESS) { + return result; + } + + (*userConfig)->recordPagesPerChapter = recordPagesPerChapter; + (*userConfig)->chaptersPerVolume = chaptersPerVolume; + (*userConfig)->sparseChaptersPerVolume = DEFAULT_SPARSE_CHAPTERS_PER_VOLUME; + (*userConfig)->cacheChapters = DEFAULT_CACHE_CHAPTERS; + (*userConfig)->checkpointFrequency = DEFAULT_CHECKPOINT_FREQUENCY; + (*userConfig)->masterIndexMeanDelta = DEFAULT_MASTER_INDEX_MEAN_DELTA; + (*userConfig)->bytesPerPage = DEFAULT_BYTES_PER_PAGE; + (*userConfig)->sparseSampleRate = DEFAULT_SPARSE_SAMPLE_RATE; + (*userConfig)->nonce = 0; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void udsConfigurationSetSparse(UdsConfiguration userConfig, bool sparse) +{ + bool prevSparse = (userConfig->sparseChaptersPerVolume != 0); + if (sparse == prevSparse) { + // nothing to do + return; + } + + unsigned int prevChaptersPerVolume = userConfig->chaptersPerVolume; + if (sparse) { + // Index 10TB with 4K blocks, 95% sparse, fit in dense (1TB) footprint + userConfig->chaptersPerVolume = 10 * prevChaptersPerVolume; + userConfig->sparseChaptersPerVolume = 9 * prevChaptersPerVolume + + prevChaptersPerVolume / 2; + userConfig->sparseSampleRate = 32; + } else { + userConfig->chaptersPerVolume = prevChaptersPerVolume / 10; + userConfig->sparseChaptersPerVolume = 0; + userConfig->sparseSampleRate = 0; + } +} + +/**********************************************************************/ +bool udsConfigurationGetSparse(UdsConfiguration userConfig) +{ + return userConfig->sparseChaptersPerVolume > 0; +} + +/**********************************************************************/ +void udsConfigurationSetNonce(UdsConfiguration userConfig, UdsNonce nonce) +{ + userConfig->nonce = nonce; +} + +/**********************************************************************/ +UdsNonce udsConfigurationGetNonce(UdsConfiguration userConfig) +{ + return userConfig->nonce; +} + +/**********************************************************************/ +unsigned int udsConfigurationGetMemory(UdsConfiguration userConfig) +{ + enum { + CHAPTERS = DEFAULT_CHAPTERS_PER_VOLUME, + SMALL_PAGES = CHAPTERS * SMALL_RECORD_PAGES_PER_CHAPTER, + LARGE_PAGES = CHAPTERS * DEFAULT_RECORD_PAGES_PER_CHAPTER + }; + unsigned int pages = (userConfig->chaptersPerVolume + * userConfig->recordPagesPerChapter); + if (userConfig->sparseChaptersPerVolume != 0) { + pages /= 10; + } + switch (pages) { + case SMALL_PAGES: return UDS_MEMORY_CONFIG_256MB; + case 2 * SMALL_PAGES: 
return UDS_MEMORY_CONFIG_512MB; + case 3 * SMALL_PAGES: return UDS_MEMORY_CONFIG_768MB; + default: return pages / LARGE_PAGES; + } +} + +/**********************************************************************/ +unsigned int +udsConfigurationGetChaptersPerVolume(UdsConfiguration userConfig) +{ + return userConfig->chaptersPerVolume; +} + +/**********************************************************************/ +void udsFreeConfiguration(UdsConfiguration userConfig) +{ + FREE(userConfig); +} + +/**********************************************************************/ +int udsCreateIndexSession(struct uds_index_session **session) +{ + if (session == NULL) { + return UDS_NO_INDEXSESSION; + } + + struct uds_index_session *indexSession = NULL; + int result = makeEmptyIndexSession(&indexSession); + if (result != UDS_SUCCESS) { + return result; + } + + *session = indexSession; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static +int initializeIndexSessionWithLayout(struct uds_index_session *indexSession, + IndexLayout *layout, + const struct uds_parameters *userParams, + LoadType loadType) +{ + int result = ((loadType == LOAD_CREATE) + ? writeIndexConfig(layout, &indexSession->userConfig) + : verifyIndexConfig(layout, &indexSession->userConfig)); + if (result != UDS_SUCCESS) { + return result; + } + + Configuration *indexConfig; + result = makeConfiguration(&indexSession->userConfig, &indexConfig); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Failed to allocate config"); + return result; + } + + // Zero the stats for the new index. + memset(&indexSession->stats, 0, sizeof(indexSession->stats)); + + result = makeIndexRouter(layout, indexConfig, userParams, loadType, + &indexSession->loadContext, enterCallbackStage, + &indexSession->router); + freeConfiguration(indexConfig); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Failed to make router"); + return result; + } + + logUdsConfiguration(&indexSession->userConfig); + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int initializeIndexSession(struct uds_index_session *indexSession, + const char *name, + const struct uds_parameters *userParams, + LoadType loadType) +{ + IndexLayout *layout; + int result = makeIndexLayout(name, loadType == LOAD_CREATE, + &indexSession->userConfig, &layout); + if (result != UDS_SUCCESS) { + return result; + } + + result = initializeIndexSessionWithLayout(indexSession, layout, userParams, + loadType); + putIndexLayout(&layout); + return result; +} + +/**********************************************************************/ +int udsOpenIndex(UdsOpenIndexType openType, + const char *name, + const struct uds_parameters *userParams, + UdsConfiguration userConfig, + struct uds_index_session *session) +{ + if (name == NULL) { + return UDS_INDEX_NAME_REQUIRED; + } + if (userConfig == NULL) { + return UDS_CONF_REQUIRED; + } + if (session == NULL) { + return UDS_NO_INDEXSESSION; + } + + int result = startLoadingIndexSession(session); + if (result != UDS_SUCCESS) { + return result; + } + + session->userConfig = *userConfig; + + // Map the external openType to the internal loadType + LoadType loadType = openType == UDS_CREATE ? LOAD_CREATE + : openType == UDS_NO_REBUILD ? 
LOAD_LOAD + : LOAD_REBUILD; + logNotice("%s: %s", getLoadType(loadType), name); + + result = initializeIndexSession(session, name, userParams, loadType); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Failed %s", getLoadType(loadType)); + saveAndFreeIndex(session); + } + + finishLoadingIndexSession(session, result); + return sansUnrecoverable(result); +} + +/**********************************************************************/ +const char *udsGetVersion(void) +{ +#ifdef UDS_VERSION + return UDS_VERSION; +#else + return "internal version"; +#endif +} + +/**********************************************************************/ +const char *udsStringError(int errnum, char *buf, size_t buflen) +{ + if (buf == NULL) { + return NULL; + } + + return stringError(errnum, buf, buflen); +} diff --git a/uds/udsModule.c b/uds/udsModule.c new file mode 100644 index 0000000..007f1a8 --- /dev/null +++ b/uds/udsModule.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/udsModule.c#32 $ + */ + +#include + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "murmur/MurmurHash3.h" +#include "sysfs.h" +#include "timeUtils.h" +#include "uds.h" +#include "uds-block.h" +#include "util/funnelQueue.h" + +/**********************************************************************/ +static int __init dedupeInit(void) +{ + memoryInit(); + logInfo("loaded version %s", UDS_VERSION); + initSysfs(); + return 0; +} + +/**********************************************************************/ +static void __exit dedupeExit(void) +{ + putSysfs(); + memoryExit(); + logInfo("unloaded version %s", UDS_VERSION); +} + +/**********************************************************************/ +module_init(dedupeInit); +module_exit(dedupeExit); + +EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_256MB); +EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_512MB); +EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_768MB); +EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_MAX); +EXPORT_SYMBOL_GPL(udsInitializeConfiguration); +EXPORT_SYMBOL_GPL(udsComputeIndexSize); +EXPORT_SYMBOL_GPL(udsConfigurationSetNonce); +EXPORT_SYMBOL_GPL(udsConfigurationGetNonce); +EXPORT_SYMBOL_GPL(udsConfigurationSetSparse); +EXPORT_SYMBOL_GPL(udsConfigurationGetSparse); +EXPORT_SYMBOL_GPL(udsConfigurationGetMemory); +EXPORT_SYMBOL_GPL(udsConfigurationGetChaptersPerVolume); +EXPORT_SYMBOL_GPL(udsFreeConfiguration); +EXPORT_SYMBOL_GPL(udsGetVersion); +EXPORT_SYMBOL_GPL(udsCreateIndexSession); +EXPORT_SYMBOL_GPL(udsOpenIndex); +EXPORT_SYMBOL_GPL(udsSuspendIndexSession); +EXPORT_SYMBOL_GPL(udsResumeIndexSession); +EXPORT_SYMBOL_GPL(udsCloseIndex); +EXPORT_SYMBOL_GPL(udsDestroyIndexSession); +EXPORT_SYMBOL_GPL(udsFlushIndexSession); +EXPORT_SYMBOL_GPL(udsGetIndexConfiguration); 
+EXPORT_SYMBOL_GPL(udsGetIndexStats); +EXPORT_SYMBOL_GPL(udsGetIndexSessionStats); +EXPORT_SYMBOL_GPL(udsStringError); +EXPORT_SYMBOL_GPL(udsStartChunkOperation); + +EXPORT_SYMBOL_GPL(allocSprintf); +EXPORT_SYMBOL_GPL(allocateMemory); +EXPORT_SYMBOL_GPL(allocateMemoryNowait); +EXPORT_SYMBOL_GPL(assertionFailed); +EXPORT_SYMBOL_GPL(assertionFailedLogOnly); +EXPORT_SYMBOL_GPL(availableSpace); +EXPORT_SYMBOL_GPL(bufferLength); +EXPORT_SYMBOL_GPL(bufferUsed); +EXPORT_SYMBOL_GPL(clearBuffer); +EXPORT_SYMBOL_GPL(compactBuffer); +EXPORT_SYMBOL_GPL(contentLength); +EXPORT_SYMBOL_GPL(copyBytes); +EXPORT_SYMBOL_GPL(currentTime); +EXPORT_SYMBOL_GPL(duplicateString); +EXPORT_SYMBOL_GPL(ensureAvailableSpace); +EXPORT_SYMBOL_GPL(equalBuffers); +EXPORT_SYMBOL_GPL(fixedSprintf); +EXPORT_SYMBOL_GPL(freeBuffer); +EXPORT_SYMBOL_GPL(freeFunnelQueue); +EXPORT_SYMBOL_GPL(freeMemory); +EXPORT_SYMBOL_GPL(funnelQueuePoll); +EXPORT_SYMBOL_GPL(getBoolean); +EXPORT_SYMBOL_GPL(getBufferContents); +EXPORT_SYMBOL_GPL(getByte); +EXPORT_SYMBOL_GPL(getBytesFromBuffer); +EXPORT_SYMBOL_GPL(getMemoryStats); +EXPORT_SYMBOL_GPL(getUInt16BEFromBuffer); +EXPORT_SYMBOL_GPL(getUInt16LEFromBuffer); +EXPORT_SYMBOL_GPL(getUInt16LEsFromBuffer); +EXPORT_SYMBOL_GPL(getUInt32BEFromBuffer); +EXPORT_SYMBOL_GPL(getUInt32BEsFromBuffer); +EXPORT_SYMBOL_GPL(getUInt32LEFromBuffer); +EXPORT_SYMBOL_GPL(getUInt64BEsFromBuffer); +EXPORT_SYMBOL_GPL(getUInt64LEFromBuffer); +EXPORT_SYMBOL_GPL(getUInt64LEsFromBuffer); +EXPORT_SYMBOL_GPL(growBuffer); +EXPORT_SYMBOL_GPL(hasSameBytes); +EXPORT_SYMBOL_GPL(isFunnelQueueEmpty); +EXPORT_SYMBOL_GPL(makeBuffer); +EXPORT_SYMBOL_GPL(makeFunnelQueue); +EXPORT_SYMBOL_GPL(MurmurHash3_x64_128); +EXPORT_SYMBOL_GPL(nowUsec); +EXPORT_SYMBOL_GPL(peekByte); +EXPORT_SYMBOL_GPL(putBoolean); +EXPORT_SYMBOL_GPL(putBuffer); +EXPORT_SYMBOL_GPL(putByte); +EXPORT_SYMBOL_GPL(putBytes); +EXPORT_SYMBOL_GPL(putInt64LEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt16BEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt16LEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt16LEsIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt32BEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt32BEsIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt32LEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt64BEsIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt64LEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt64LEsIntoBuffer); +EXPORT_SYMBOL_GPL(reallocateMemory); +EXPORT_SYMBOL_GPL(registerAllocatingThread); +EXPORT_SYMBOL_GPL(reportMemoryUsage); +EXPORT_SYMBOL_GPL(resetBufferEnd); +EXPORT_SYMBOL_GPL(rewindBuffer); +EXPORT_SYMBOL_GPL(skipForward); +EXPORT_SYMBOL_GPL(uncompactedAmount); +EXPORT_SYMBOL_GPL(unregisterAllocatingThread); +EXPORT_SYMBOL_GPL(wrapBuffer); +EXPORT_SYMBOL_GPL(zeroBytes); + +/**********************************************************************/ + + +/**********************************************************************/ + +MODULE_DESCRIPTION("deduplication engine"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); +MODULE_VERSION(UDS_VERSION); diff --git a/uds/util/eventCount.c b/uds/util/eventCount.c new file mode 100644 index 0000000..7efeac6 --- /dev/null +++ b/uds/util/eventCount.c @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/eventCount.c#2 $ + */ + +/** + * This EventCount implementation uses a posix semaphore for portability, + * although a futex would be slightly superior to use and easy to substitute. + * It is designed to make signalling as cheap as possible, since that is the + * code path likely triggered on most updates to a lock-free data structure. + * Waiters are likely going to sleep, so optimizing for that case isn't + * necessary. + * + * The critical field is the state, which is really two fields that can be + * atomically updated in unison: an event counter and a waiter count. Every + * call to eventCountPrepare() issues a wait token by atomically incrementing + * the waiter count. The key invariant is a strict accounting of the number of + * tokens issued. Every token returned by eventCountPrepare() is a contract + * that the caller will call acquireSemaphore() and a signaller will call + * releaseSemaphore(), each exactly once. Atomic updates to the state field + * ensure that each token is counted once and that tokens are not lost. + * Cancelling a token attempts to take a fast-path by simply decrementing the + * waiters field, but if the token has already been claimed by a signaller, + * the canceller must still wait on the semaphore to consume the transferred + * token. + * + * The state field is 64 bits, partitioned into a 16-bit waiter field and a + * 48-bit counter. We are unlikely to have 2^16 threads, much less 2^16 + * threads waiting on any single event transition. 2^48 microseconds is + * several years, so a token holder would have to wait that long for the + * counter to wrap around, and then call eventCountWait() at the exact right + * time to see the re-used counter, in order to lose a wakeup due to counter + * wrap-around. Using a 32-bit state field would greatly increase that chance, + * but if forced to do so, the implementation could likely tolerate it since + * callers are supposed to hold tokens for miniscule periods of time. + * Fortunately, x64 has 64-bit compare-and-swap, and the performance of + * interlocked 64-bit operations appears to be about the same as for 32-bit + * ones, so being paranoid and using 64 bits costs us nothing. + * + * Here are some sequences of calls and state transitions: + * + * action postcondition + * counter waiters semaphore + * initialized 0 0 0 + * prepare 0 1 0 + * wait (blocks) 0 1 0 + * signal 1 0 1 + * wait (unblocks) 1 0 0 + * + * signal (fast-path) 1 0 0 + * signal (fast-path) 1 0 0 + * + * prepare A 1 1 0 + * prepare B 1 2 0 + * signal 2 0 2 + * wait B (fast-path) 2 0 1 + * wait A (fast-path) 2 0 0 + * + * prepare 2 1 0 + * cancel (fast-path) 2 0 0 + * + * prepare 2 1 0 + * signal 3 0 1 + * cancel (must wait) 3 0 0 + * + * The EventCount structure is aligned, sized, and allocated to cache line + * boundaries to avoid any false sharing between the EventCount and other + * shared state. The state field and semaphore should fit on a single cache + * line. 
The instrumentation counters increase the size of the structure so it + * rounds up to use two (64-byte x86) cache lines. + * + * XXX Need interface to access or display instrumentation counters. + **/ + +#include "eventCount.h" + +#include "atomicDefs.h" +#include "common.h" +#include "compiler.h" +#include "cpu.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "threads.h" + +enum { + ONE_WAITER = 1, // value used to increment the waiters field + ONE_EVENT = (1 << 16), // value used to increment the event counter + WAITERS_MASK = (ONE_EVENT - 1), // bit mask to access the waiters field + EVENTS_MASK = ~WAITERS_MASK, // bit mask to access the event counter +}; + +struct eventCount { + // Atomically mutable state: + // low 16 bits: the number of wait tokens not posted to the semaphore + // high 48 bits: current event counter + atomic64_t state; + + // Semaphore used to block threads when waiting is required. + Semaphore semaphore; + + // Instrumentation counters. + + // Declare alignment so we don't share a cache line. +} __attribute__((aligned(CACHE_LINE_BYTES))); + +/** + * Test the event field in two tokens for equality. + * + * @return true iff the tokens contain the same event field value + **/ +static INLINE bool sameEvent(EventToken token1, EventToken token2) +{ + return ((token1 & EVENTS_MASK) == (token2 & EVENTS_MASK)); +} + +/**********************************************************************/ +void eventCountBroadcast(EventCount *ec) +{ + + // Even if there are no waiters (yet), we will need a memory barrier. + smp_mb(); + + uint64_t waiters; + uint64_t state = atomic64_read(&ec->state); + uint64_t oldState = state; + do { + // Check if there are any tokens that have not yet been been transferred + // to the semaphore. This is the fast no-waiters path. + waiters = (state & WAITERS_MASK); + if (waiters == 0) { + // Fast path first time through--no need to signal or post if there are + // no observers. + return; + } + + /* + * Attempt to atomically claim all the wait tokens and bump the event count + * using an atomic compare-and-swap. This operation contains a memory + * barrier. + */ + EventToken newState = ((state & ~WAITERS_MASK) + ONE_EVENT); + oldState = state; + state = atomic64_cmpxchg(&ec->state, oldState, newState); + // The cmpxchg fails when we lose a race with a new waiter or another + // signaller, so try again. + } while (unlikely(state != oldState)); + + + /* + * Wake the waiters by posting to the semaphore. This effectively transfers + * the wait tokens to the semaphore. There's sadly no bulk post for posix + * semaphores, so we've got to loop to do them all. + */ + while (waiters-- > 0) { + releaseSemaphore(&ec->semaphore); + } +} + +/** + * Attempt to cancel a prepared wait token by decrementing the + * number of waiters in the current state. This can only be done + * safely if the event count hasn't been bumped. + * + * @param ec the event count on which the wait token was issued + * @param token the wait to cancel + * + * @return true if the wait was cancelled, false if the caller must + * still wait on the semaphore + **/ +static INLINE bool fastCancel(EventCount *ec, EventToken token) +{ + EventToken currentToken = atomic64_read(&ec->state); + while (sameEvent(currentToken, token)) { + // Try to decrement the waiter count via compare-and-swap as if we had + // never prepared to wait. 
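+    //
+    // Subtracting one touches only the low-order waiters field: sameEvent()
+    // succeeding means no signaller has claimed our token by bumping the
+    // event counter, so the waiters field still counts us and is non-zero,
+    // and the subtraction cannot borrow into the counter bits.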
+ EventToken et = atomic64_cmpxchg(&ec->state, currentToken, + currentToken - 1); + if (et == currentToken) { + return true; + } + currentToken = et; + } + return false; +} + +/** + * Consume a token from the semaphore, waiting (with an optional timeout) if + * one is not currently available. Also attempts to count the number of times + * we'll actually have to wait because there are no tokens (permits) available + * in the semaphore, and the number of times the wait times out. + * + * @param ec the event count instance + * @param timeout an optional timeout value to pass to attemptSemaphore() + * + * @return true if a token was consumed, otherwise false only if a timeout + * was specified and we timed out + **/ +static bool consumeWaitToken(EventCount *ec, const RelTime *timeout) +{ + // Try to grab a token without waiting. + if (attemptSemaphore(&ec->semaphore, 0)) { + return true; + } + + + if (timeout == NULL) { + acquireSemaphore(&ec->semaphore); + } else if (!attemptSemaphore(&ec->semaphore, *timeout)) { + return false; + } + return true; +} + +/**********************************************************************/ +int makeEventCount(EventCount **ecPtr) +{ + // The event count will be allocated on a cache line boundary so there will + // not be false sharing of the line with any other data structure. + EventCount *ec = NULL; + int result = ALLOCATE(1, EventCount, "event count", &ec); + if (result != UDS_SUCCESS) { + return result; + } + + atomic64_set(&ec->state, 0); + result = initializeSemaphore(&ec->semaphore, 0); + if (result != UDS_SUCCESS) { + FREE(ec); + return result; + } + + *ecPtr = ec; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeEventCount(EventCount *ec) +{ + if (ec == NULL) { + return; + } + destroySemaphore(&ec->semaphore); + FREE(ec); +} + +/**********************************************************************/ +EventToken eventCountPrepare(EventCount *ec) +{ + return atomic64_add_return(ONE_WAITER, &ec->state); +} + +/**********************************************************************/ +void eventCountCancel(EventCount *ec, EventToken token) +{ + // Decrement the waiter count if the event hasn't been signalled. + if (fastCancel(ec, token)) { + return; + } + // A signaller has already transferred (or promised to transfer) our token + // to the semaphore, so we must consume it from the semaphore by waiting. + eventCountWait(ec, token, NULL); +} + +/**********************************************************************/ +bool eventCountWait(EventCount *ec, EventToken token, const RelTime *timeout) +{ + + for (;;) { + // Wait for a signaller to transfer our wait token to the semaphore. + if (!consumeWaitToken(ec, timeout)) { + // The wait timed out, so we must cancel the token instead. Try to + // decrement the waiter count if the event hasn't been signalled. + if (fastCancel(ec, token)) { + return false; + } + /* + * We timed out, but a signaller came in before we could cancel the + * wait. We have no choice but to wait for the semaphore to be posted. + * Since signaller has promised to do it, the wait will be short. The + * timeout and the signal happened at about the same time, so either + * outcome could be returned. It's simpler to ignore the timeout. + */ + timeout = NULL; + continue; + } + + // A wait token has now been consumed from the semaphore. + + // Stop waiting if the count has changed since the token was acquired. 
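+    //
+    // A different event field means eventCountBroadcast() ran after this
+    // token was prepared, so the condition the caller is polling may have
+    // changed and the wait is over.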
+ if (!sameEvent(token, atomic64_read(&ec->state))) { + return true; + } + + // We consumed someone else's wait token. Put it back in the semaphore, + // which will wake another waiter, hopefully one who can stop waiting. + releaseSemaphore(&ec->semaphore); + + // Attempt to give an earlier waiter a shot at the semaphore. + yieldScheduler(); + } +} diff --git a/uds/util/eventCount.h b/uds/util/eventCount.h new file mode 100644 index 0000000..e3f2a33 --- /dev/null +++ b/uds/util/eventCount.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/eventCount.h#1 $ + */ + +#ifndef EVENT_COUNT_H +#define EVENT_COUNT_H + +#include "timeUtils.h" +#include "typeDefs.h" + +/** + * An EventCount is a lock-free equivalent of a condition variable. + * + * Using an EventCount, a lock-free producer/consumer can wait for a state + * change (adding an item to an empty queue, for example) without spinning or + * falling back on the use of mutex-based locks. Signalling is cheap when + * there are no waiters (a memory fence), and preparing to wait is + * also inexpensive (an atomic add instruction). + * + * A lock-free producer should call eventCountBroadcast() after any mutation + * to the lock-free data structure that a consumer might be waiting on. The + * consumers should poll for work like this: + * + * for (;;) { + * // Fast path--no additional cost to consumer. + * if (lockFreeDequeue(&item)) { + * return item; + * } + * // Two-step wait: get current token and poll state, either cancelling + * // the wait or waiting for the token to be signalled. + * EventToken token = eventCountPrepare(ec); + * if (lockFreeDequeue(&item)) { + * eventCountCancel(ec, token); + * return item; + * } + * eventCountWait(ec, token, NULL); + * // State has changed, but must check condition again, so loop. + * } + * + * Once eventCountPrepare() is called, the caller should neither dally, sleep, + * nor perform long-running or blocking actions before passing the token to + * eventCountCancel() or eventCountWait(). The implementation is optimized for + * a short polling window, and will not perform well if there are outstanding + * tokens that have been signalled but not waited upon. + **/ + +typedef struct eventCount EventCount; + +typedef unsigned int EventToken; + +/** + * Allocate and initialize an EventCount. + * + * @param ecPtr a pointer to hold the new EventCount + **/ +__attribute__((warn_unused_result)) +int makeEventCount(EventCount **ecPtr); + +/** + * Free an EventCount. It must no longer be in use. + * + * @param ec the EventCount to free + **/ +void freeEventCount(EventCount *ec); + +/** + * Wake all threads that are waiting for the next event. 
+ * + * @param ec the EventCount to signal + **/ +void eventCountBroadcast(EventCount *ec); + +/** + * Prepare to wait for the EventCount to change by capturing a token of its + * current state. The caller MUST eventually either call eventCountWait() or + * eventCountCancel() exactly once for each token obtained. + * + * @param ec the EventCount on which to prepare to wait + * + * @return an EventToken to be passed to the next eventCountWait() call + **/ +EventToken eventCountPrepare(EventCount *ec) + __attribute__((warn_unused_result)); + +/** + * Cancel a wait token that has been prepared but not waited upon. This must + * be called after eventCountPrepare() when eventCountWait() is not going to + * be invoked on the token. + * + * @param ec the EventCount from which a wait token was obtained + * @param token the wait token that will never be passed to eventCountWait() + **/ +void eventCountCancel(EventCount *ec, EventToken token); + +/** + * Check if the current event count state corresponds to the provided token, + * and if it is, wait for a signal that the state has changed. If an optional + * timeout is provided, the wait will terminate after the timeout has elapsed. + * Timing out automatically cancels the wait token, so callers must not + * attempt to cancel the token on timeout. + * + * @param ec the EventCount on which to wait + * @param token the EventToken returned by eventCountPrepare() + * @param timeout either NULL or a relative timeout for the wait operation + * + * @return true if the state has already changed or if signalled, otherwise + * false if a timeout was provided and the wait timed out + **/ +bool eventCountWait(EventCount *ec, EventToken token, const RelTime *timeout); + +#endif /* EVENT_COUNT_H */ diff --git a/uds/util/funnelQueue.c b/uds/util/funnelQueue.c new file mode 100644 index 0000000..017e405 --- /dev/null +++ b/uds/util/funnelQueue.c @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/funnelQueue.c#2 $ + */ + +#include "funnelQueue.h" + +#include "memoryAlloc.h" +#include "permassert.h" +#include "uds.h" + +/**********************************************************************/ +int makeFunnelQueue(FunnelQueue **queuePtr) +{ + // Allocate the queue on a cache line boundary so the producer and consumer + // fields in the structure will land on separate cache lines. + FunnelQueue *queue; + int result = ALLOCATE(1, FunnelQueue, "funnel queue", &queue); + if (result != UDS_SUCCESS) { + return result; + } + + // Initialize the stub entry and put it in the queue, establishing the + // invariant that queue->newest and queue->oldest are never null. 
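+  //
+  // Keeping a permanently allocated stub entry means funnelQueuePut() can
+  // unconditionally link onto the previous newest entry, and getOldest()
+  // can treat "only the stub is present" as the empty case instead of
+  // handling NULL pointers.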
+ queue->stub.next = NULL; + queue->newest = &queue->stub; + queue->oldest = &queue->stub; + + *queuePtr = queue; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeFunnelQueue(FunnelQueue *queue) +{ + FREE(queue); +} + +/**********************************************************************/ +static FunnelQueueEntry *getOldest(FunnelQueue *queue) +{ + /* + * Barrier requirements: We need a read barrier between reading a "next" + * field pointer value and reading anything it points to. There's an + * accompanying barrier in funnelQueuePut between its caller setting up the + * entry and making it visible. + */ + FunnelQueueEntry *oldest = queue->oldest; + FunnelQueueEntry *next = oldest->next; + + if (oldest == &queue->stub) { + // When the oldest entry is the stub and it has no successor, the queue is + // logically empty. + if (next == NULL) { + return NULL; + } + // The stub entry has a successor, so the stub can be dequeued and ignored + // without breaking the queue invariants. + oldest = next; + queue->oldest = oldest; + smp_read_barrier_depends(); + next = oldest->next; + } + + // We have a non-stub candidate to dequeue. If it lacks a successor, we'll + // need to put the stub entry back on the queue first. + if (next == NULL) { + FunnelQueueEntry *newest = queue->newest; + if (oldest != newest) { + // Another thread has already swung queue->newest atomically, but not + // yet assigned previous->next. The queue is really still empty. + return NULL; + } + + // Put the stub entry back on the queue, ensuring a successor will + // eventually be seen. + funnelQueuePut(queue, &queue->stub); + + // Check again for a successor. + next = oldest->next; + if (next == NULL) { + // We lost a race with a producer who swapped queue->newest before we + // did, but who hasn't yet updated previous->next. Try again later. + return NULL; + } + } + return oldest; +} + +/**********************************************************************/ +FunnelQueueEntry *funnelQueuePoll(FunnelQueue *queue) +{ + FunnelQueueEntry *oldest = getOldest(queue); + if (oldest == NULL) { + return oldest; + } + + /* + * Dequeue the oldest entry and return it. Only one consumer thread may call + * this function, so no locking, atomic operations, or fences are needed; + * queue->oldest is owned by the consumer and oldest->next is never used by + * a producer thread after it is swung from NULL to non-NULL. + */ + queue->oldest = oldest->next; + /* + * Make sure the caller sees the proper stored data for this entry. + * + * Since we've already fetched the entry pointer we stored in + * "queue->oldest", this also ensures that on entry to the next call we'll + * properly see the dependent data. + */ + smp_rmb(); + /* + * If "oldest" is a very light-weight work item, we'll be looking + * for the next one very soon, so prefetch it now. + */ + prefetchAddress(queue->oldest, true); + oldest->next = NULL; + return oldest; +} + +/**********************************************************************/ +bool isFunnelQueueEmpty(FunnelQueue *queue) +{ + return getOldest(queue) == NULL; +} + +/**********************************************************************/ +bool isFunnelQueueIdle(FunnelQueue *queue) +{ + /* + * Oldest is not the stub, so there's another entry, though if next is + * NULL we can't retrieve it yet. 
+ */ + if (queue->oldest != &queue->stub) { + return false; + } + + /* + * Oldest is the stub, but newest has been updated by _put(); either + * there's another, retrievable entry in the list, or the list is + * officially empty but in the intermediate state of having an entry + * added. + * + * Whether anything is retrievable depends on whether stub.next has + * been updated and become visible to us, but for idleness we don't + * care. And due to memory ordering in _put(), the update to newest + * would be visible to us at the same time or sooner. + */ + if (queue->newest != &queue->stub) { + return false; + } + + // Otherwise, we're idle. + return true; +} diff --git a/uds/util/funnelQueue.h b/uds/util/funnelQueue.h new file mode 100644 index 0000000..083d00b --- /dev/null +++ b/uds/util/funnelQueue.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/funnelQueue.h#2 $ + */ + +#ifndef FUNNEL_QUEUE_H +#define FUNNEL_QUEUE_H + +#include "atomicDefs.h" +#include "compiler.h" +#include "cpu.h" +#include "typeDefs.h" + +/** + * A FunnelQueue is a simple lock-free (almost) queue that accepts entries + * from multiple threads (multi-producer) and delivers them to a single thread + * (single-consumer). "Funnel" is an attempt to evoke the image of requests + * from more than one producer being "funneled down" to a single consumer. + * + * This is an unsynchronized but thread-safe data structure when used as + * intended. There is no mechanism to ensure that only one thread is consuming + * from the queue, so if that is done mistakenly, it will not be trapped, and + * the resulting behavior is undefined. Clients must not directly access or + * manipulate the internals, which are only exposed for the purpose of + * allowing the very simple enqueue operation to be in-lined. + * + * The implementation requires that a FunnelQueueEntry structure (a link + * pointer) be embedded in the queue entries, and pointers to those structures + * are used exclusively by the queue. No macros are defined to template the + * queue, so the offset of the FunnelQueueEntry in the records placed in the + * queue must all have a fixed offset so the client can derive their structure + * pointer from the entry pointer returned by funnelQueuePoll(). + * + * Callers are wholly responsible for allocating and freeing the entries. + * Entries may be freed as soon as they are returned since this queue is not + * susceptible to the "ABA problem" present in many lock-free data structures. + * The queue is dynamically allocated to ensure cache-line alignment, but no + * other dynamic allocation is used. + * + * The algorithm is not actually 100% lock-free. 
There is a single point in + * funnelQueuePut() at which a pre-empted producer will prevent the consumers + * from seeing items added to the queue by later producers, and only if the + * queue is short enough or the consumer fast enough for it to reach what was + * the end of the queue at the time of the pre-empt. + * + * The consumer function, funnelQueuePoll(), will return NULL when the queue + * is empty. To wait for data to consume, spin (if safe) or combine the queue + * with an EventCount to signal the presence of new entries. + **/ + +/** + * The queue link structure that must be embedded in client entries. + **/ +typedef struct funnelQueueEntry { + // The next (newer) entry in the queue. + struct funnelQueueEntry * volatile next; +} FunnelQueueEntry; + +/** + * The dynamically allocated queue structure, which is aligned to a cache line + * boundary when allocated. This should be consider opaque; it is exposed here + * so funnelQueuePut() can be in-lined. + **/ +typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) funnelQueue { + // The producers' end of the queue--an atomically exchanged pointer that + // will never be NULL. + FunnelQueueEntry * volatile newest; + + // The consumer's end of the queue. Owned by the consumer and never NULL. + FunnelQueueEntry *oldest __attribute__((aligned(CACHE_LINE_BYTES))); + + // A re-usable dummy entry used to provide the non-NULL invariants above. + FunnelQueueEntry stub; +} FunnelQueue; + +/** + * Construct and initialize a new, empty queue. + * + * @param queuePtr a pointer in which to store the queue + * + * @return UDS_SUCCESS or an error code + **/ +int makeFunnelQueue(FunnelQueue **queuePtr) + __attribute__((warn_unused_result)); + +/** + * Free a queue. + * + * This will not free any entries in the queue. The caller must ensure that + * either the queue will be empty or that any entries in the queue will not be + * leaked by dropping the references from queue. + * + * @param queue the queue to free + **/ +void freeFunnelQueue(FunnelQueue *queue); + +/** + * Put an entry on the end of the queue. + * + * The entry pointer must be to the FunnelQueueEntry embedded in the caller's + * data structure. The caller must be able to derive the address of the start + * of their data structure from the pointer that passed in here, so every + * entry in the queue must have the FunnelQueueEntry at the same offset within + * the client's structure. + * + * @param queue the queue on which to place the entry + * @param entry the entry to be added to the queue + **/ +static INLINE void funnelQueuePut(FunnelQueue *queue, FunnelQueueEntry *entry) +{ + /* + * Barrier requirements: All stores relating to the entry ("next" pointer, + * containing data structure fields) must happen before the previous->next + * store making it visible to the consumer. Also, the entry's "next" field + * initialization to NULL must happen before any other producer threads can + * see the entry (the xchg) and try to update the "next" field. + * + * xchg implements a full barrier. + */ + entry->next = NULL; + /* + * The xchg macro in the PPC kernel calls a function that takes a void* + * argument, triggering a warning about dropping the volatile qualifier. 
+ */ +#pragma GCC diagnostic push +#if __GNUC__ >= 5 +#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers" +#endif + FunnelQueueEntry *previous = xchg(&queue->newest, entry); +#pragma GCC diagnostic pop + // Pre-empts between these two statements hide the rest of the queue from + // the consumer, preventing consumption until the following assignment runs. + previous->next = entry; +} + +/** + * Poll a queue, removing the oldest entry if the queue is not empty. This + * function must only be called from a single consumer thread. + * + * @param queue the queue from which to remove an entry + * + * @return the oldest entry in the queue, or NULL if the queue is empty. + **/ +FunnelQueueEntry *funnelQueuePoll(FunnelQueue *queue) + __attribute__((warn_unused_result)); + +/** + * Check whether the funnel queue is empty or not. This function must only be + * called from a single consumer thread, as with funnelQueuePoll. + * + * If the queue is in a transition state with one or more entries being added + * such that the list view is incomplete, it may not be possible to retrieve an + * entry with the funnelQueuePoll() function. In such states this function will + * report an empty indication. + * + * @param queue the queue which to check for entries. + * + * @return true iff queue contains no entry which can be retrieved + **/ +bool isFunnelQueueEmpty(FunnelQueue *queue) + __attribute__((warn_unused_result)); + +/** + * Check whether the funnel queue is idle or not. This function must only be + * called from a single consumer thread, as with funnel_queue_poll. + * + * If the queue has entries available to be retrieved, it is not idle. If the + * queue is in a transition state with one or more entries being added such + * that the list view is incomplete, it may not be possible to retrieve an + * entry with the funnel_queue_poll() function, but the queue will not be + * considered idle. + * + * @param queue the queue which to check for entries. + * + * @return true iff queue contains no entry which can be retrieved nor is + * known to be having an entry added + **/ +bool isFunnelQueueIdle(FunnelQueue *queue) + __attribute__((warn_unused_result)); + +#endif /* FUNNEL_QUEUE_H */ diff --git a/uds/util/radixSort.c b/uds/util/radixSort.c new file mode 100644 index 0000000..cae4f90 --- /dev/null +++ b/uds/util/radixSort.c @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/radixSort.c#2 $ + */ + +/* + * Radix sort is implemented using an American Flag sort, an unstable, + * in-place 8-bit radix exchange sort. + * + * Adapted from the algorithm in the paper by Peter M. McIlroy, Keith Bostic, + * and M. Douglas McIlroy, "Engineering Radix Sort". 
+ * http://www.usenix.org/publications/compsystems/1993/win_mcilroy.pdf + */ + +#include "radixSort.h" + +#include "compiler.h" +#include "memoryAlloc.h" +#include "stringUtils.h" +#include "typeDefs.h" +#include "uds.h" + +enum { + // Piles smaller than this are handled with a simple insertion sort. + INSERTION_SORT_THRESHOLD = 12 +}; + +// Sort keys are pointers to immutable fixed-length arrays of bytes. +typedef const uint8_t * Key; + +/** + * The keys are separated into piles based on the byte in each + * keys at the current offset, so the number of keys with each + * byte must be counted. + **/ +typedef struct { + uint16_t used; // number of non-empty bins + uint16_t first; // index (key byte) of the first non-empty bin + uint16_t last; // index (key byte) of the last non-empty bin + uint32_t size[256]; // size[byte] == # of occurrences of byte +} Histogram; + +/** + * Sub-tasks are manually managed on a stack, both for performance + * and to put a logarithmic bound on the stack space needed. + **/ +typedef struct { + Key *firstKey; // Pointers to first and last keys to sort, inclusive. + Key *lastKey; + uint16_t offset; // The offset into the key at which to continue sorting. + uint16_t length; // The number of bytes remaining in the sort keys. +} Task; + +struct radixSorter { + unsigned int count; + Histogram bins; + Key *pile[256]; + Task *endOfStack; + Task isList[256]; + Task stack[]; +}; + +/** + * Compare a segment of two fixed-length keys starting an offset. + * + * @param key1 the first key + * @param key2 the second key + * @param offset the offset into the keys of the first byte to compare + * @param length the number of bytes remaining in each key + **/ +static INLINE int compare(Key key1, Key key2, uint16_t offset, uint16_t length) +{ + return memcmp(&key1[offset], &key2[offset], length); +} + +/** + * Insert the next unsorted key into an array of sorted keys. + * + * @param task the description of the keys being sorted + * @param next the pointer to the unsorted key to insert into + * the array of sorted key pointers preceding it + **/ +static INLINE void insertKey(const Task task, Key *next) +{ + // Pull the unsorted key out, freeing up the array slot. + Key unsorted = *next; + // Compare the key to the preceding sorted entries, shifting + // down the ones that are larger. + while ((--next >= task.firstKey) + && (compare(unsorted, next[0], task.offset, task.length) < 0)) { + next[1] = next[0]; + } + // Insert the key into the last slot that was cleared, sorting it. + next[1] = unsorted; +} + +/** + * Sort a range of key segments using an insertion sort. This simple sort is + * faster than the 256-way radix sort when the number of keys to sort is + * small. + * + * @param task the description of the keys to sort + **/ +static INLINE void insertionSort(const Task task) +{ + // (firstKey .. firstKey) is trivially sorted. Repeatedly insert the next + // key into the sorted list of keys preceding it, and voila! + Key *next; + for (next = task.firstKey + 1; next <= task.lastKey; next++) { + insertKey(task, next); + } +} + +/** + * Push a sorting task onto the task stack, increasing the stack pointer. 
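+ *
+ * @param stackPointer pointer to the top of the task stack
+ * @param firstKey the first key in the pile of keys to be sorted
+ * @param count the number of keys in the pile
+ * @param offset the offset into the keys at which to continue sorting
+ * @param length the number of bytes remaining in the sort keys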
+ **/ +static INLINE void pushTask(Task **stackPointer, + Key *firstKey, + uint32_t count, + uint16_t offset, + uint16_t length) +{ + Task *task = (*stackPointer)++; + task->firstKey = firstKey; + task->lastKey = &firstKey[count - 1]; + task->offset = offset; + task->length = length; +} + +/**********************************************************************/ +static INLINE void swapKeys(Key *a, Key *b) +{ + Key c = *a; + *a = *b; + *b = c; +} + +/** + * Count the number of times each byte value appears in in the arrays of keys + * to sort at the current offset, keeping track of the number of non-empty + * bins, and the index of the first and last non-empty bin. + * + * @param task the description of the keys to sort + * @param bins the histogram bins receiving the counts + **/ +static INLINE void measureBins(const Task task, Histogram *bins) +{ + // Set bogus values that will will be replaced by min and max, respectively. + bins->first = UINT8_MAX; + bins->last = 0; + + // Subtle invariant: bins->used and bins->size[] are zero because the + // sorting code clears it all out as it goes. Even though this structure is + // re-used, we don't need to pay to zero it before starting a new tally. + + Key *keyPtr; + for (keyPtr = task.firstKey; keyPtr <= task.lastKey; keyPtr++) { + // Increment the count for the byte in the key at the current offset. + uint8_t bin = (*keyPtr)[task.offset]; + uint32_t size = ++bins->size[bin]; + + // Track non-empty bins when the count transitions from zero to one. + if (size == 1) { + bins->used += 1; + if (bin < bins->first) { + bins->first = bin; + } + if (bin > bins->last) { + bins->last = bin; + } + } + } +} + +/** + * Convert the bin sizes to pointers to where each pile goes. + * + * pile[0] = firstKey + bin->size[0], + * pile[1] = pile[0] + bin->size[1], etc. + * + * After the keys are moved to the appropriate pile, we'll need to sort + * each of the piles by the next radix position. A new task is put on the + * stack for each pile containing lots of keys, or a new task is is put on + * the list for each pile containing few keys. + * + * @param stack pointer the top of the stack + * @param endOfStack the end of the stack + * @param list pointer the head of the list + * @param pile array that will be filled pointers to the end of each pile + * @param bins the histogram of the sizes of each pile + * @param firstKey the first key of the stack + * @param offset the next radix position to sort by + * @param length the number of bytes remaining in the sort keys + * + * @return UDS_SUCCESS or an error code + **/ +static INLINE int pushBins(Task **stack, + Task *endOfStack, + Task **list, + Key *pile[], + Histogram *bins, + Key *firstKey, + uint16_t offset, + uint16_t length) +{ + Key *pileStart = firstKey; + int bin; + for (bin = bins->first; ; bin++) { + uint32_t size = bins->size[bin]; + // Skip empty piles. + if (size == 0) { + continue; + } + // There's no need to sort empty keys. 
+ if (length > 0) { + if (size > INSERTION_SORT_THRESHOLD) { + if (*stack >= endOfStack) { + return UDS_BAD_STATE; + } + pushTask(stack, pileStart, size, offset, length); + } else if (size > 1) { + pushTask(list, pileStart, size, offset, length); + } + } + pileStart += size; + pile[bin] = pileStart; + if (--bins->used == 0) { + break; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int makeRadixSorter(unsigned int count, RadixSorter **sorter) +{ + unsigned int stackSize = count / INSERTION_SORT_THRESHOLD; + RadixSorter *radixSorter; + int result = ALLOCATE_EXTENDED(RadixSorter, stackSize, Task, __func__, + &radixSorter); + if (result != UDS_SUCCESS) { + return result; + } + radixSorter->count = count; + radixSorter->endOfStack = radixSorter->stack + stackSize; + *sorter = radixSorter; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeRadixSorter(RadixSorter *sorter) +{ + FREE(sorter); +} + +/**********************************************************************/ +int radixSort(RadixSorter *sorter, + const unsigned char *keys[], + unsigned int count, + unsigned short length) +{ + // All zero-length keys are identical and therefore already sorted. + if ((count == 0) || (length == 0)) { + return UDS_SUCCESS; + } + + // The initial task is to sort the entire length of all the keys. + Task start = { + .firstKey = keys, + .lastKey = &keys[count - 1], + .offset = 0, + .length = length, + }; + + if (count <= INSERTION_SORT_THRESHOLD) { + insertionSort(start); + return UDS_SUCCESS; + } + + if (count > sorter->count) { + return UDS_INVALID_ARGUMENT; + } + + Histogram *bins = &sorter->bins; + Key **pile = sorter->pile; + Task *sp = sorter->stack; + + /* + * Repeatedly consume a sorting task from the stack and process it, pushing + * new sub-tasks onto to the stack for each radix-sorted pile. When all + * tasks and sub-tasks have been processed, the stack will be empty and all + * the keys in the starting task will be fully sorted. + */ + for (*sp = start; sp >= sorter->stack; sp--) { + const Task task = *sp; + measureBins(task, bins); + + // Now that we know how large each bin is, generate pointers for each of + // the piles and push a new task to sort each pile by the next radix byte. + Task *lp = sorter->isList; + int result = pushBins(&sp, sorter->endOfStack, &lp, pile, bins, + task.firstKey, task.offset + 1, task.length - 1); + if (result != UDS_SUCCESS) { + memset(bins, 0, sizeof(*bins)); + return result; + } + // Now bins->used is zero again. + + // Don't bother processing the last pile--when piles 0..N-1 are all in + // place, then pile N must also be in place. + Key *end = task.lastKey - bins->size[bins->last]; + bins->size[bins->last] = 0; + + Key *fence; + for (fence = task.firstKey; fence <= end; ) { + uint8_t bin; + Key key = *fence; + // The radix byte of the key tells us which pile it belongs in. Swap it + // for an unprocessed item just below that pile, and repeat. + while (--pile[bin = key[task.offset]] > fence) { + swapKeys(pile[bin], &key); + } + // The pile reached the fence. Put the key at the bottom of that pile. + // completing it, and advance the fence to the next pile. + *fence = key; + fence += bins->size[bin]; + bins->size[bin] = 0; + } + // Now bins->size[] is all zero again. + + // When the number of keys in a task gets small enough, its faster to use + // an insertion sort than to keep subdividing into tiny piles. 
+ while (--lp >= sorter->isList) { + insertionSort(*lp); + } + } + return UDS_SUCCESS; +} diff --git a/uds/util/radixSort.h b/uds/util/radixSort.h new file mode 100644 index 0000000..55f19ba --- /dev/null +++ b/uds/util/radixSort.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/radixSort.h#1 $ + */ + +#ifndef RADIX_SORT_H +#define RADIX_SORT_H + +/* + * The implementation uses one large object allocated on the heap. This + * large object can be reused as many times as desired. There is no + * further heap usage by the sorting. + */ +typedef struct radixSorter RadixSorter; + +/** + * Reserve the heap storage needed by the radixSort routine. The amount of + * heap space is logarithmically proportional to the number of keys. + * + * @param count The maximum number of keys to be sorted + * @param sorter The RadixSorter object is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int makeRadixSorter(unsigned int count, RadixSorter **sorter) + __attribute__((warn_unused_result)); + +/** + * Free the heap storage needed by the radixSort routine. + * + * @param sorter The RadixSorter object to free + **/ +void freeRadixSorter(RadixSorter *sorter); + +/** + * Sort pointers to fixed-length keys (arrays of bytes) using a radix sort. + * + * The sort implementation is unstable--relative ordering of equal keys is not + * preserved. The implementation does not use any heap allocation. + * + * @param [in] sorter the heap storage used by the sorting + * @param keys the array of key pointers to sort (modified in place) + * @param [in] count the number of keys + * @param [in] length the length of every key, in bytes + * + * @return UDS_SUCCESS or an error code + **/ +int radixSort(RadixSorter *sorter, + const unsigned char *keys[], + unsigned int count, + unsigned short length) + __attribute__((warn_unused_result)); + +#endif /* RADIX_SORT_H */ diff --git a/uds/volume.c b/uds/volume.c new file mode 100644 index 0000000..4f320c5 --- /dev/null +++ b/uds/volume.c @@ -0,0 +1,1383 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/volume.c#23 $ + */ + +#include "volume.h" + +#include "cacheCounters.h" +#include "chapterIndex.h" +#include "compiler.h" +#include "errors.h" +#include "geometry.h" +#include "hashUtils.h" +#include "indexConfig.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "recordPage.h" +#include "request.h" +#include "sparseCache.h" +#include "stringUtils.h" +#include "threads.h" + +enum { + MAX_BAD_CHAPTERS = 100, // max number of contiguous bad chapters + DEFAULT_VOLUME_READ_THREADS = 2, // Default number of reader threads + MAX_VOLUME_READ_THREADS = 16, // Maximum number of reader threads +}; + +/**********************************************************************/ +static unsigned int getReadThreads(const struct uds_parameters *userParams) +{ + unsigned int readThreads = (userParams == NULL + ? DEFAULT_VOLUME_READ_THREADS + : userParams->read_threads); + if (readThreads < 1) { + readThreads = 1; + } + if (readThreads > MAX_VOLUME_READ_THREADS) { + readThreads = MAX_VOLUME_READ_THREADS; + } + return readThreads; +} + +/**********************************************************************/ +static INLINE unsigned int mapToPageNumber(Geometry *geometry, + unsigned int physicalPage) +{ + return ((physicalPage - 1) % geometry->pagesPerChapter); +} + +/**********************************************************************/ +static INLINE unsigned int mapToChapterNumber(Geometry *geometry, + unsigned int physicalPage) +{ + return ((physicalPage - 1) / geometry->pagesPerChapter); +} + +/**********************************************************************/ +static INLINE bool isRecordPage(Geometry *geometry, unsigned int physicalPage) +{ + return (((physicalPage - 1) % geometry->pagesPerChapter) + >= geometry->indexPagesPerChapter); +} + +/**********************************************************************/ +static INLINE unsigned int getZoneNumber(Request *request) +{ + return (request == NULL) ? 0 : request->zoneNumber; +} + +/**********************************************************************/ +int mapToPhysicalPage(const Geometry *geometry, int chapter, int page) +{ + // Page zero is the header page, so the first index page in the + // first chapter is physical page one. + return (1 + (geometry->pagesPerChapter * chapter) + page); +} + +/**********************************************************************/ +static void waitForReadQueueNotFull(Volume *volume, Request *request) +{ + unsigned int zoneNumber = getZoneNumber(request); + InvalidateCounter invalidateCounter = getInvalidateCounter(volume->pageCache, + zoneNumber); + if (searchPending(invalidateCounter)) { + // Increment the invalidate counter to avoid deadlock where the reader + // threads cannot make progress because they are waiting on the counter + // and the index thread cannot because the read queue is full. + endPendingSearch(volume->pageCache, zoneNumber); + } + + while (readQueueIsFull(volume->pageCache)) { + logDebug("Waiting until read queue not full"); + signalCond(&volume->readThreadsCond); + waitCond(&volume->readThreadsReadDoneCond, &volume->readThreadsMutex); + } + + if (searchPending(invalidateCounter)) { + // Increment again so we get back to an odd value. 
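+    // (The per-zone invalidate counter is odd while a search is pending and
+    // even otherwise; this pairs with the endPendingSearch() call above.)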
+ beginPendingSearch(volume->pageCache, pageBeingSearched(invalidateCounter), + zoneNumber); + } +} + +/**********************************************************************/ +int enqueuePageRead(Volume *volume, Request *request, int physicalPage) +{ + // Don't allow new requests if we are shutting down, but make sure + // to process any requests that are still in the pipeline. + if ((volume->readerState & READER_STATE_EXIT) != 0) { + logInfo("failed to queue read while shutting down"); + return UDS_SHUTTINGDOWN; + } + + // Mark the page as queued in the volume cache, for chapter invalidation to + // be able to cancel a read. + // If we are unable to do this because the queues are full, flush them first + int result; + while ((result = enqueueRead(volume->pageCache, request, physicalPage)) + == UDS_SUCCESS) { + logDebug("Read queues full, waiting for reads to finish"); + waitForReadQueueNotFull(volume, request); + } + + if (result == UDS_QUEUED) { + /* signal a read thread */ + signalCond(&volume->readThreadsCond); + } + + return result; +} + +/**********************************************************************/ +static INLINE void waitToReserveReadQueueEntry(Volume *volume, + unsigned int *queuePos, + Request **requestList, + unsigned int *physicalPage, + bool *invalid) +{ + while (((volume->readerState & READER_STATE_EXIT) == 0) + && (((volume->readerState & READER_STATE_STOP) != 0) + || !reserveReadQueueEntry(volume->pageCache, queuePos, + requestList, physicalPage, invalid))) { + waitCond(&volume->readThreadsCond, &volume->readThreadsMutex); + } +} + +/**********************************************************************/ +static int initChapterIndexPage(const Volume *volume, + byte *indexPage, + unsigned int chapter, + unsigned int indexPageNumber, + DeltaIndexPage *chapterIndexPage) +{ + Geometry *geometry = volume->geometry; + + int result = initializeChapterIndexPage(chapterIndexPage, geometry, + indexPage, volume->nonce); + if (volume->lookupMode == LOOKUP_FOR_REBUILD) { + return result; + } + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "Reading chapter index page for chapter %u" + " page %u", + chapter, indexPageNumber); + } + + IndexPageBounds bounds; + result = getListNumberBounds(volume->indexPageMap, chapter, + indexPageNumber, &bounds); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t ciVirtual = chapterIndexPage->virtualChapterNumber; + unsigned int ciChapter = mapToPhysicalChapter(geometry, ciVirtual); + if ((chapter == ciChapter) + && (bounds.lowestList == chapterIndexPage->lowestListNumber) + && (bounds.highestList == chapterIndexPage->highestListNumber)) { + return UDS_SUCCESS; + } + + logWarning("Index page map updated to %llu", + getLastUpdate(volume->indexPageMap)); + logWarning("Page map expects that chapter %u page %u has range %u to %u, " + "but chapter index page has chapter %" PRIu64 + " with range %u to %u", + chapter, indexPageNumber, bounds.lowestList, bounds.highestList, + ciVirtual, chapterIndexPage->lowestListNumber, + chapterIndexPage->highestListNumber); + return ASSERT_WITH_ERROR_CODE(false, + UDS_CORRUPT_DATA, + "index page map mismatch with chapter index"); +} + +/**********************************************************************/ +static int initializeIndexPage(const Volume *volume, + unsigned int physicalPage, + CachedPage *page) +{ + unsigned int chapter = mapToChapterNumber(volume->geometry, physicalPage); + unsigned int indexPageNumber = mapToPageNumber(volume->geometry, + physicalPage); + int 
result = initChapterIndexPage(volume, getPageData(&page->cp_pageData), + chapter, indexPageNumber, + &page->cp_indexPage); + return result; +} + +/**********************************************************************/ +static void readThreadFunction(void *arg) +{ + Volume *volume = arg; + unsigned int queuePos; + Request *requestList; + unsigned int physicalPage; + bool invalid = false; + + logDebug("reader starting"); + lockMutex(&volume->readThreadsMutex); + while (true) { + waitToReserveReadQueueEntry(volume, &queuePos, &requestList, &physicalPage, + &invalid); + if ((volume->readerState & READER_STATE_EXIT) != 0) { + break; + } + + volume->busyReaderThreads++; + + bool recordPage = isRecordPage(volume->geometry, physicalPage); + + CachedPage *page = NULL; + int result = UDS_SUCCESS; + if (!invalid) { + // Find a place to put the read queue page we reserved above. + result = selectVictimInCache(volume->pageCache, &page); + if (result == UDS_SUCCESS) { + unlockMutex(&volume->readThreadsMutex); + result = readVolumePage(&volume->volumeStore, physicalPage, + &page->cp_pageData); + if (result != UDS_SUCCESS) { + logWarning("Error reading page %u from volume", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + } + lockMutex(&volume->readThreadsMutex); + } else { + logWarning("Error selecting cache victim for page read"); + } + + if (result == UDS_SUCCESS) { + if (!volume->pageCache->readQueue[queuePos].invalid) { + if (!recordPage) { + result = initializeIndexPage(volume, physicalPage, page); + if (result != UDS_SUCCESS) { + logWarning("Error initializing chapter index page"); + cancelPageInCache(volume->pageCache, physicalPage, page); + } + } + + if (result == UDS_SUCCESS) { + result = putPageInCache(volume->pageCache, physicalPage, page); + if (result != UDS_SUCCESS) { + logWarning("Error putting page %u in cache", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + } + } + } else { + logWarning("Page %u invalidated after read", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + invalid = true; + } + } + } else { + logDebug("Requeuing requests for invalid page"); + } + + if (invalid) { + result = UDS_SUCCESS; + page = NULL; + } + + while (requestList != NULL) { + Request *request = requestList; + requestList = request->nextRequest; + + /* + * If we've read in a record page, we're going to do an immediate search, + * in an attempt to speed up processing when we requeue the request, so + * that it doesn't have to go back into the getRecordFromZone code again. + * However, if we've just read in an index page, we don't want to search. + * We want the request to be processed again and getRecordFromZone to be + * run. We have added new fields in request to allow the index code to + * know whether it can stop processing before getRecordFromZone is called + * again. 
+ */ + if ((result == UDS_SUCCESS) && (page != NULL) && recordPage) { + if (searchRecordPage(getPageData(&page->cp_pageData), + &request->chunkName, volume->geometry, + &request->oldMetadata)) { + request->slLocation = LOC_IN_DENSE; + } else { + request->slLocation = LOC_UNAVAILABLE; + } + request->slLocationKnown = true; + } + + // reflect any read failures in the request status + request->status = result; + restartRequest(request); + } + + releaseReadQueueEntry(volume->pageCache, queuePos); + + volume->busyReaderThreads--; + broadcastCond(&volume->readThreadsReadDoneCond); + } + unlockMutex(&volume->readThreadsMutex); + logDebug("reader done"); +} + +/**********************************************************************/ +static int readPageLocked(Volume *volume, + Request *request, + unsigned int physicalPage, + bool syncRead, + CachedPage **pagePtr) +{ + syncRead |= ((volume->lookupMode == LOOKUP_FOR_REBUILD) + || (request == NULL) + || (request->session == NULL)); + + int result = UDS_SUCCESS; + + CachedPage *page = NULL; + if (syncRead) { + // Find a place to put the page. + result = selectVictimInCache(volume->pageCache, &page); + if (result != UDS_SUCCESS) { + logWarning("Error selecting cache victim for page read"); + return result; + } + result = readVolumePage(&volume->volumeStore, physicalPage, + &page->cp_pageData); + if (result != UDS_SUCCESS) { + logWarning("Error reading page %u from volume", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + return result; + } + if (!isRecordPage(volume->geometry, physicalPage)) { + result = initializeIndexPage(volume, physicalPage, page); + if (result != UDS_SUCCESS) { + if (volume->lookupMode != LOOKUP_FOR_REBUILD) { + logWarning("Corrupt index page %u", physicalPage); + } + cancelPageInCache(volume->pageCache, physicalPage, page); + return result; + } + } + result = putPageInCache(volume->pageCache, physicalPage, page); + if (result != UDS_SUCCESS) { + logWarning("Error putting page %u in cache", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + return result; + } + } else { + result = enqueuePageRead(volume, request, physicalPage); + if (result != UDS_SUCCESS) { + return result; + } + } + + *pagePtr = page; + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getPageLocked(Volume *volume, + Request *request, + unsigned int physicalPage, + CacheProbeType probeType, + CachedPage **pagePtr) +{ + CachedPage *page = NULL; + int result = getPageFromCache(volume->pageCache, physicalPage, probeType, + &page); + if (result != UDS_SUCCESS) { + return result; + } + if (page == NULL) { + result = readPageLocked(volume, request, physicalPage, true, &page); + if (result != UDS_SUCCESS) { + return result; + } + } else if (getZoneNumber(request) == 0) { + // Only 1 zone is responsible for updating LRU + makePageMostRecent(volume->pageCache, page); + } + + *pagePtr = page; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getPageProtected(Volume *volume, + Request *request, + unsigned int physicalPage, + CacheProbeType probeType, + CachedPage **pagePtr) +{ + CachedPage *page = NULL; + int result = getPageFromCache(volume->pageCache, physicalPage, + probeType | CACHE_PROBE_IGNORE_FAILURE, + &page); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int zoneNumber = getZoneNumber(request); + // If we didn't find a page we need to enqueue a read for it, in which + // case we need 
to grab the mutex. + if (page == NULL) { + endPendingSearch(volume->pageCache, zoneNumber); + lockMutex(&volume->readThreadsMutex); + + /* + * Do the lookup again while holding the read mutex (no longer the fast + * case so this should be ok to repeat). We need to do this because an + * page may have been added to the page map by the reader thread between + * the time searched above and the time we went to actually try to enqueue + * it below. This could result in us enqueuing another read for an page + * which is already in the cache, which would mean we end up with two + * entries in the cache for the same page. + */ + result + = getPageFromCache(volume->pageCache, physicalPage, probeType, &page); + if (result != UDS_SUCCESS) { + /* + * In non-success cases (anything not UDS_SUCCESS, meaning both + * UDS_QUEUED and "real" errors), the caller doesn't get a + * handle on a cache page, so it can't continue the search, and + * we don't need to prevent other threads from messing with the + * cache. + * + * However, we do need to set the "search pending" flag because + * the callers expect it to always be set on return, even if + * they can't actually do the search. + * + * Doing the calls in this order ought to be faster, since we + * let other threads have the reader thread mutex (which can + * require a syscall) ASAP, and set the "search pending" state + * that can block the reader thread as the last thing. + */ + unlockMutex(&volume->readThreadsMutex); + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + return result; + } + + // If we found the page now, we can release the mutex and proceed + // as if this were the fast case. + if (page != NULL) { + /* + * If we found a page (*pagePtr != NULL and return + * UDS_SUCCESS), then we're telling the caller where to look for + * the cache page, and need to switch to "reader thread + * unlocked" and "search pending" state in careful order so no + * other thread can mess with the data before our caller gets to + * look at it. + */ + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + unlockMutex(&volume->readThreadsMutex); + } + } + + if (page == NULL) { + result = readPageLocked(volume, request, physicalPage, false, &page); + if (result != UDS_SUCCESS) { + /* + * This code path is used frequently in the UDS_QUEUED case, so + * the performance gain from unlocking first, while "search + * pending" mode is off, turns out to be significant in some + * cases. + */ + unlockMutex(&volume->readThreadsMutex); + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + return result; + } + + // See above re: ordering requirement. + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + unlockMutex(&volume->readThreadsMutex); + } else { + if (getZoneNumber(request) == 0 ) { + // Only 1 zone is responsible for updating LRU + makePageMostRecent(volume->pageCache, page); + } + } + + *pagePtr = page; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getPage(Volume *volume, + unsigned int chapter, + unsigned int pageNumber, + CacheProbeType probeType, + byte **dataPtr, + DeltaIndexPage **indexPagePtr) +{ + unsigned int physicalPage + = mapToPhysicalPage(volume->geometry, chapter, pageNumber); + + lockMutex(&volume->readThreadsMutex); + CachedPage *page = NULL; + int result = getPageLocked(volume, NULL, physicalPage, probeType, &page); + unlockMutex(&volume->readThreadsMutex); + + if (dataPtr != NULL) { + *dataPtr = (page != NULL) ? 
getPageData(&page->cp_pageData) : NULL; + } + if (indexPagePtr != NULL) { + *indexPagePtr = (page != NULL) ? &page->cp_indexPage : NULL; + } + return result; +} + +/** + * Search for a chunk name in a cached index page or chapter index, returning + * the record page number from a chapter index match. + * + * @param volume the volume containing the index page to search + * @param request the request originating the search (may be NULL for + * a direct query from volume replay) + * @param name the name of the block or chunk + * @param chapter the chapter to search + * @param indexPageNumber the index page number of the page to search + * @param recordPageNumber pointer to return the chapter record page number + * (value will be NO_CHAPTER_INDEX_ENTRY if the name + * was not found) + * + * @return UDS_SUCCESS or an error code + **/ +static int searchCachedIndexPage(Volume *volume, + Request *request, + const UdsChunkName *name, + unsigned int chapter, + unsigned int indexPageNumber, + int *recordPageNumber) +{ + unsigned int zoneNumber = getZoneNumber(request); + unsigned int physicalPage + = mapToPhysicalPage(volume->geometry, chapter, indexPageNumber); + + /* + * Make sure the invalidate counter is updated before we try and read from + * the page map. This prevents this thread from reading a page in the + * page map which has already been marked for invalidation by the reader + * thread, before the reader thread has noticed that the invalidateCounter + * has been incremented. + */ + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + + CachedPage *page = NULL; + int result = getPageProtected(volume, request, physicalPage, + cacheProbeType(request, true), &page); + if (result != UDS_SUCCESS) { + endPendingSearch(volume->pageCache, zoneNumber); + return result; + } + + result + = ASSERT_LOG_ONLY(searchPending(getInvalidateCounter(volume->pageCache, + zoneNumber)), + "Search is pending for zone %u", zoneNumber); + if (result != UDS_SUCCESS) { + return result; + } + + result = searchChapterIndexPage(&page->cp_indexPage, volume->geometry, name, + recordPageNumber); + endPendingSearch(volume->pageCache, zoneNumber); + return result; +} + +/**********************************************************************/ +int searchCachedRecordPage(Volume *volume, + Request *request, + const UdsChunkName *name, + unsigned int chapter, + int recordPageNumber, + UdsChunkData *duplicate, + bool *found) +{ + *found = false; + + if (recordPageNumber == NO_CHAPTER_INDEX_ENTRY) { + // No record for that name can exist in the chapter. + return UDS_SUCCESS; + } + + Geometry *geometry = volume->geometry; + int result = ASSERT(((recordPageNumber >= 0) + && ((unsigned int) recordPageNumber + < geometry->recordPagesPerChapter)), + "0 <= %d <= %u", + recordPageNumber, geometry->recordPagesPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int pageNumber = geometry->indexPagesPerChapter + recordPageNumber; + + unsigned int zoneNumber = getZoneNumber(request); + int physicalPage + = mapToPhysicalPage(volume->geometry, chapter, pageNumber); + + /* + * Make sure the invalidate counter is updated before we try and read from + * the page map. This prevents this thread from reading a page in the page + * map which has already been marked for invalidation by the reader thread, + * before the reader thread has noticed that the invalidateCounter has been + * incremented. 
+ */ + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + + CachedPage *recordPage; + result = getPageProtected(volume, request, physicalPage, + cacheProbeType(request, false), &recordPage); + if (result != UDS_SUCCESS) { + endPendingSearch(volume->pageCache, zoneNumber); + return result; + } + + if (searchRecordPage(getPageData(&recordPage->cp_pageData), name, geometry, + duplicate)) { + *found = true; + } + endPendingSearch(volume->pageCache, zoneNumber); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int readChapterIndexFromVolume(const Volume *volume, + uint64_t virtualChapter, + struct volume_page volumePages[], + DeltaIndexPage indexPages[]) +{ + const Geometry *geometry = volume->geometry; + unsigned int physicalChapter = mapToPhysicalChapter(geometry, + virtualChapter); + int physicalPage = mapToPhysicalPage(geometry, physicalChapter, 0); + prefetchVolumePages(&volume->volumeStore, physicalPage, + geometry->indexPagesPerChapter); + + unsigned int i; + struct volume_page volumePage; + int result = initializeVolumePage(geometry, &volumePage); + for (i = 0; i < geometry->indexPagesPerChapter; i++) { + int result = readVolumePage(&volume->volumeStore, physicalPage + i, + &volumePages[i]); + if (result != UDS_SUCCESS) { + break; + } + byte *indexPage = getPageData(&volumePages[i]); + result = initChapterIndexPage(volume, indexPage, physicalChapter, i, + &indexPages[i]); + if (result != UDS_SUCCESS) { + break; + } + } + destroyVolumePage(&volumePage); + return result; +} + +/**********************************************************************/ +int searchVolumePageCache(Volume *volume, + Request *request, + const UdsChunkName *name, + uint64_t virtualChapter, + UdsChunkData *metadata, + bool *found) +{ + unsigned int physicalChapter + = mapToPhysicalChapter(volume->geometry, virtualChapter); + unsigned int indexPageNumber; + int result = findIndexPageNumber(volume->indexPageMap, name, physicalChapter, + &indexPageNumber); + if (result != UDS_SUCCESS) { + return result; + } + + int recordPageNumber; + result = searchCachedIndexPage(volume, request, name, physicalChapter, + indexPageNumber, &recordPageNumber); + if (result == UDS_SUCCESS) { + result = searchCachedRecordPage(volume, request, name, physicalChapter, + recordPageNumber, metadata, found); + } + + return result; +} + +/**********************************************************************/ +int forgetChapter(Volume *volume, + uint64_t virtualChapter, + InvalidationReason reason) +{ + logDebug("forgetting chapter %llu", virtualChapter); + unsigned int physicalChapter + = mapToPhysicalChapter(volume->geometry, virtualChapter); + lockMutex(&volume->readThreadsMutex); + int result + = invalidatePageCacheForChapter(volume->pageCache, physicalChapter, + volume->geometry->pagesPerChapter, + reason); + unlockMutex(&volume->readThreadsMutex); + return result; +} + +/** + * Donate index page data to the page cache for an index page that was just + * written to the volume. The caller must already hold the reader thread + * mutex. 
+ * + * @param volume the volume + * @param physicalChapter the physical chapter number of the index page + * @param indexPageNumber the chapter page number of the index page + * @param scratchPage the index page data + **/ +static int donateIndexPageLocked(Volume *volume, + unsigned int physicalChapter, + unsigned int indexPageNumber, + struct volume_page *scratchPage) +{ + unsigned int physicalPage + = mapToPhysicalPage(volume->geometry, physicalChapter, indexPageNumber); + + // Find a place to put the page. + CachedPage *page = NULL; + int result = selectVictimInCache(volume->pageCache, &page); + if (result != UDS_SUCCESS) { + return result; + } + + // Exchange the scratch page with the cache page + swapVolumePages(&page->cp_pageData, scratchPage); + + result = initChapterIndexPage(volume, getPageData(&page->cp_pageData), + physicalChapter, indexPageNumber, + &page->cp_indexPage); + if (result != UDS_SUCCESS) { + logWarning("Error initialize chapter index page"); + cancelPageInCache(volume->pageCache, physicalPage, page); + return result; + } + + result = putPageInCache(volume->pageCache, physicalPage, page); + if (result != UDS_SUCCESS) { + logWarning("Error putting page %u in cache", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + return result; + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int writeIndexPages(Volume *volume, + int physicalPage, + OpenChapterIndex *chapterIndex, + byte **pages) +{ + Geometry *geometry = volume->geometry; + unsigned int physicalChapterNumber + = mapToPhysicalChapter(geometry, chapterIndex->virtualChapterNumber); + unsigned int deltaListNumber = 0; + + unsigned int indexPageNumber; + for (indexPageNumber = 0; + indexPageNumber < geometry->indexPagesPerChapter; + indexPageNumber++) { + int result = prepareToWriteVolumePage(&volume->volumeStore, + physicalPage + indexPageNumber, + &volume->scratchPage); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, "failed to prepare index page"); + } + + // Pack as many delta lists into the index page as will fit. + unsigned int listsPacked; + bool lastPage = ((indexPageNumber + 1) == geometry->indexPagesPerChapter); + result = packOpenChapterIndexPage(chapterIndex, + getPageData(&volume->scratchPage), + deltaListNumber, lastPage, &listsPacked); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, "failed to pack index page"); + } + + result = writeVolumePage(&volume->volumeStore, + physicalPage + indexPageNumber, + &volume->scratchPage); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write chapter index page"); + } + + if (pages != NULL) { + memcpy(pages[indexPageNumber], getPageData(&volume->scratchPage), + geometry->bytesPerPage); + } + + // Tell the index page map the list number of the last delta list that was + // packed into the index page. + if (listsPacked == 0) { + logDebug("no delta lists packed on chapter %u page %u", + physicalChapterNumber, indexPageNumber); + } else { + deltaListNumber += listsPacked; + } + result = updateIndexPageMap(volume->indexPageMap, + chapterIndex->virtualChapterNumber, + physicalChapterNumber, + indexPageNumber, deltaListNumber - 1); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "failed to update index page map"); + } + + // Donate the page data for the index page to the page cache. 
+ lockMutex(&volume->readThreadsMutex); + result = donateIndexPageLocked(volume, physicalChapterNumber, + indexPageNumber, &volume->scratchPage); + unlockMutex(&volume->readThreadsMutex); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int writeRecordPages(Volume *volume, + int physicalPage, + const UdsChunkRecord records[], + byte **pages) +{ + Geometry *geometry = volume->geometry; + // Skip over the index pages, which come before the record pages + physicalPage += geometry->indexPagesPerChapter; + // The record array from the open chapter is 1-based. + const UdsChunkRecord *nextRecord = &records[1]; + + unsigned int recordPageNumber; + for (recordPageNumber = 0; + recordPageNumber < geometry->recordPagesPerChapter; + recordPageNumber++) { + int result = prepareToWriteVolumePage(&volume->volumeStore, + physicalPage + recordPageNumber, + &volume->scratchPage); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to prepare record page"); + } + + // Sort the next page of records and copy them to the record page as a + // binary tree stored in heap order. + result = encodeRecordPage(volume, nextRecord, + getPageData(&volume->scratchPage)); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to encode record page %u", + recordPageNumber); + } + nextRecord += geometry->recordsPerPage; + + result = writeVolumePage(&volume->volumeStore, + physicalPage + recordPageNumber, + &volume->scratchPage); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write chapter record page"); + } + + if (pages != NULL) { + memcpy(pages[recordPageNumber], getPageData(&volume->scratchPage), + geometry->bytesPerPage); + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int writeChapter(Volume *volume, + OpenChapterIndex *chapterIndex, + const UdsChunkRecord records[]) +{ + // Determine the position of the virtual chapter in the volume file. + Geometry *geometry = volume->geometry; + unsigned int physicalChapterNumber + = mapToPhysicalChapter(geometry, chapterIndex->virtualChapterNumber); + int physicalPage = mapToPhysicalPage(geometry, physicalChapterNumber, 0); + + // Pack and write the delta chapter index pages to the volume. + int result = writeIndexPages(volume, physicalPage, chapterIndex, NULL); + if (result != UDS_SUCCESS) { + return result; + } + // Sort and write the record pages to the volume. + result = writeRecordPages(volume, physicalPage, records, NULL); + if (result != UDS_SUCCESS) { + return result; + } + releaseVolumePage(&volume->scratchPage); + // Flush the data to permanent storage. 
+ return syncVolumeStore(&volume->volumeStore); +} + +/**********************************************************************/ +size_t getCacheSize(Volume *volume) +{ + size_t size = getPageCacheSize(volume->pageCache); + if (isSparse(volume->geometry)) { + size += getSparseCacheMemorySize(volume->sparseCache); + } + return size; +} + +/**********************************************************************/ +static int probeChapter(Volume *volume, + unsigned int chapterNumber, + uint64_t *virtualChapterNumber) +{ + const Geometry *geometry = volume->geometry; + unsigned int expectedListNumber = 0; + uint64_t lastVCN = UINT64_MAX; + + prefetchVolumePages(&volume->volumeStore, + mapToPhysicalPage(geometry, chapterNumber, 0), + geometry->indexPagesPerChapter); + + unsigned int i; + for (i = 0; i < geometry->indexPagesPerChapter; ++i) { + DeltaIndexPage *page; + int result = getPage(volume, chapterNumber, i, CACHE_PROBE_INDEX_FIRST, + NULL, &page); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t vcn = page->virtualChapterNumber; + if (lastVCN == UINT64_MAX) { + lastVCN = vcn; + } else if (vcn != lastVCN) { + logError("inconsistent chapter %u index page %u: expected vcn %" + PRIu64 ", got vcn %llu", + chapterNumber, i, lastVCN, vcn); + return UDS_CORRUPT_COMPONENT; + } + + if (expectedListNumber != page->lowestListNumber) { + logError("inconsistent chapter %u index page %u: expected list number %u" + ", got list number %u", + chapterNumber, i, expectedListNumber, page->lowestListNumber); + return UDS_CORRUPT_COMPONENT; + } + expectedListNumber = page->highestListNumber + 1; + + result = validateChapterIndexPage(page, geometry); + if (result != UDS_SUCCESS) { + return result; + } + } + + if (lastVCN == UINT64_MAX) { + logError("no chapter %u virtual chapter number determined", chapterNumber); + return UDS_CORRUPT_COMPONENT; + } + if (chapterNumber != lastVCN % geometry->chaptersPerVolume) { + logError("chapter %u vcn %llu is out of phase (%u)", + chapterNumber, lastVCN, geometry->chaptersPerVolume); + return UDS_CORRUPT_COMPONENT; + } + *virtualChapterNumber = lastVCN; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int probeWrapper(void *aux, + unsigned int chapterNumber, + uint64_t *virtualChapterNumber) +{ + Volume *volume = aux; + int result = probeChapter(volume, chapterNumber, virtualChapterNumber); + if ((result == UDS_CORRUPT_COMPONENT) || (result == UDS_CORRUPT_DATA)) { + *virtualChapterNumber = UINT64_MAX; + return UDS_SUCCESS; + } + return result; +} + +/**********************************************************************/ +static int findRealEndOfVolume(Volume *volume, + unsigned int limit, + unsigned int *limitPtr) +{ + /* + * Start checking from the end of the volume. As long as we hit corrupt + * data, start skipping larger and larger amounts until we find real data. + * If we find real data, reduce the span and try again until we find + * the exact boundary. + */ + unsigned int span = 1; + unsigned int tries = 0; + while (limit > 0) { + unsigned int chapter = (span > limit) ? 
0 : limit - span; + uint64_t vcn = 0; + int result = probeChapter(volume, chapter, &vcn); + if (result == UDS_SUCCESS) { + if (span == 1) { + break; + } + span /= 2; + tries = 0; + } else if (result == UDS_CORRUPT_COMPONENT) { + limit = chapter; + if (++tries > 1) { + span *= 2; + } + } else { + return logErrorWithStringError(result, "cannot determine end of volume"); + } + } + + if (limitPtr != NULL) { + *limitPtr = limit; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int findVolumeChapterBoundaries(Volume *volume, + uint64_t *lowestVCN, + uint64_t *highestVCN, + bool *isEmpty) +{ + unsigned int chapterLimit = volume->geometry->chaptersPerVolume; + + int result = findRealEndOfVolume(volume, chapterLimit, &chapterLimit); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot find end of volume"); + } + + if (chapterLimit == 0) { + *lowestVCN = 0; + *highestVCN = 0; + *isEmpty = true; + return UDS_SUCCESS; + } + + *isEmpty = false; + return findVolumeChapterBoundariesImpl(chapterLimit, MAX_BAD_CHAPTERS, + lowestVCN, highestVCN, probeWrapper, + volume); +} + +/**********************************************************************/ +int findVolumeChapterBoundariesImpl(unsigned int chapterLimit, + unsigned int maxBadChapters, + uint64_t *lowestVCN, + uint64_t *highestVCN, + int (*probeFunc)(void *aux, + unsigned int chapter, + uint64_t *vcn), + void *aux) +{ + if (chapterLimit == 0) { + *lowestVCN = 0; + *highestVCN = 0; + return UDS_SUCCESS; + } + + /* + * This method assumes there is at most one run of contiguous bad chapters + * caused by unflushed writes. Either the bad spot is at the beginning and + * end, or somewhere in the middle. Wherever it is, the highest and lowest + * VCNs are adjacent to it. Otherwise the volume is cleanly saved and + * somewhere in the middle of it the highest VCN immediately preceeds the + * lowest one. + */ + + uint64_t firstVCN = UINT64_MAX; + + // doesn't matter if this results in a bad spot (UINT64_MAX) + int result = (*probeFunc)(aux, 0, &firstVCN); + if (result != UDS_SUCCESS) { + return UDS_SUCCESS; + } + + /* + * Binary search for end of the discontinuity in the monotonically + * increasing virtual chapter numbers; bad spots are treated as a span of + * UINT64_MAX values. In effect we're searching for the index of the + * smallest value less than firstVCN. In the case we go off the end it means + * that chapter 0 has the lowest vcn. + */ + + unsigned int leftChapter = 0; + unsigned int rightChapter = chapterLimit; + + while (leftChapter < rightChapter) { + unsigned int chapter = (leftChapter + rightChapter) / 2; + uint64_t probeVCN; + + result = (*probeFunc)(aux, chapter, &probeVCN); + if (result != UDS_SUCCESS) { + return result; + } + if (firstVCN <= probeVCN) { + leftChapter = chapter + 1; + } else { + rightChapter = chapter; + } + } + + uint64_t lowest = UINT64_MAX; + uint64_t highest = UINT64_MAX; + + result = ASSERT(leftChapter == rightChapter, "leftChapter == rightChapter"); + if (result != UDS_SUCCESS) { + return result; + } + + leftChapter %= chapterLimit; // in case we're at the end + + // At this point, leftChapter is the chapter with the lowest virtual chapter + // number. 
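+  //
+  // (Editor's illustration with made-up numbers: if chapterLimit is 8 and
+  // physical chapters 0..7 hold VCNs 10 11 4 5 6 7 8 9, then firstVCN is 10,
+  // the binary search above narrows to leftChapter == 2 (VCN 4, the lowest),
+  // and the backward scan below stops at chapter 1 (VCN 11, the highest).)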
+ + result = (*probeFunc)(aux, leftChapter, &lowest); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT((lowest != UINT64_MAX), "invalid lowest chapter"); + if (result != UDS_SUCCESS) { + return result; + } + + // We now circularly scan backwards, moving over any bad chapters until we + // find the chapter with the highest vcn (the first good chapter we + // encounter). + + unsigned int badChapters = 0; + + for (;;) { + rightChapter = (rightChapter + chapterLimit - 1) % chapterLimit; + result = (*probeFunc)(aux, rightChapter, &highest); + if (result != UDS_SUCCESS) { + return result; + } + if (highest != UINT64_MAX) { + break; + } + if (++badChapters >= maxBadChapters) { + logError("too many bad chapters in volume: %u", badChapters); + return UDS_CORRUPT_COMPONENT; + } + } + + *lowestVCN = lowest; + *highestVCN = highest; + return UDS_SUCCESS; +} + +/** + * Allocate a volume. + * + * @param config The configuration to use + * @param layout The index layout + * @param readQueueMaxSize The maximum size of the read queue + * @param zoneCount The number of zones to use + * @param newVolume A pointer to hold the new volume + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int allocateVolume(const Configuration *config, + IndexLayout *layout, + unsigned int readQueueMaxSize, + unsigned int zoneCount, + Volume **newVolume) +{ + Volume *volume; + int result = ALLOCATE(1, Volume, "volume", &volume); + if (result != UDS_SUCCESS) { + return result; + } + volume->nonce = getVolumeNonce(layout); + // It is safe to call freeVolume now to clean up and close the volume + + result = copyGeometry(config->geometry, &volume->geometry); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return logWarningWithStringError(result, + "failed to allocate geometry: error"); + } + + // Need a buffer for each entry in the page cache + unsigned int reservedBuffers + = config->cacheChapters * config->geometry->recordPagesPerChapter; + // And a buffer for the chapter writer + reservedBuffers += 1; + // And a buffer for each entry in the sparse cache + if (isSparse(volume->geometry)) { + reservedBuffers + += config->cacheChapters * config->geometry->indexPagesPerChapter; + } + result = openVolumeStore(&volume->volumeStore, layout, reservedBuffers, + config->geometry->bytesPerPage); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + result = initializeVolumePage(config->geometry, &volume->scratchPage); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + + result = makeRadixSorter(config->geometry->recordsPerPage, + &volume->radixSorter); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + + result = ALLOCATE(config->geometry->recordsPerPage, const UdsChunkRecord *, + "record pointers", &volume->recordPointers); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + + if (isSparse(volume->geometry)) { + result = makeSparseCache(volume->geometry, config->cacheChapters, + zoneCount, &volume->sparseCache); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + } + result = makePageCache(volume->geometry, config->cacheChapters, + readQueueMaxSize, zoneCount, &volume->pageCache); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + result = makeIndexPageMap(volume->geometry, &volume->indexPageMap); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + + *newVolume = volume; + return UDS_SUCCESS; +} + 
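+
+/*
+ * Editor's note on the buffer reservation in allocateVolume(), with purely
+ * illustrative numbers: caching 3 chapters of a geometry that has 256 record
+ * pages and 6 index pages per chapter reserves 3 * 256 = 768 buffers for the
+ * page cache plus 1 for the chapter writer; a sparse index reserves another
+ * 3 * 6 = 18 for the sparse cache, for 787 buffers in all.
+ */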
+/**********************************************************************/ +int makeVolume(const Configuration *config, + IndexLayout *layout, + const struct uds_parameters *userParams, + unsigned int readQueueMaxSize, + unsigned int zoneCount, + Volume **newVolume) +{ + unsigned int volumeReadThreads = getReadThreads(userParams); + + if (readQueueMaxSize <= volumeReadThreads) { + logError("Number of read threads must be smaller than read queue"); + return UDS_INVALID_ARGUMENT; + } + + Volume *volume = NULL; + int result = allocateVolume(config, layout, readQueueMaxSize, zoneCount, + &volume); + if (result != UDS_SUCCESS) { + return result; + } + result = initMutex(&volume->readThreadsMutex); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + result = initCond(&volume->readThreadsReadDoneCond); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + result = initCond(&volume->readThreadsCond); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + + // Start the reader threads. If this allocation succeeds, freeVolume knows + // that it needs to try and stop those threads. + result = ALLOCATE(volumeReadThreads, Thread, "reader threads", + &volume->readerThreads); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + unsigned int i; + for (i = 0; i < volumeReadThreads; i++) { + result = createThread(readThreadFunction, (void *) volume, "reader", + &volume->readerThreads[i]); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + // We only stop as many threads as actually got started. + volume->numReadThreads = i + 1; + } + + *newVolume = volume; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeVolume(Volume *volume) +{ + if (volume == NULL) { + return; + } + + // If readerThreads is NULL, then we haven't set up the reader threads. + if (volume->readerThreads != NULL) { + // Stop the reader threads. It is ok if there aren't any of them. + lockMutex(&volume->readThreadsMutex); + volume->readerState |= READER_STATE_EXIT; + broadcastCond(&volume->readThreadsCond); + unlockMutex(&volume->readThreadsMutex); + unsigned int i; + for (i = 0; i < volume->numReadThreads; i++) { + joinThreads(volume->readerThreads[i]); + } + FREE(volume->readerThreads); + volume->readerThreads = NULL; + } + + // Must close the volume store AFTER freeing the scratch page and the caches + destroyVolumePage(&volume->scratchPage); + freePageCache(volume->pageCache); + freeSparseCache(volume->sparseCache); + closeVolumeStore(&volume->volumeStore); + + destroyCond(&volume->readThreadsCond); + destroyCond(&volume->readThreadsReadDoneCond); + destroyMutex(&volume->readThreadsMutex); + freeIndexPageMap(volume->indexPageMap); + freeRadixSorter(volume->radixSorter); + FREE(volume->geometry); + FREE(volume->recordPointers); + FREE(volume); +} diff --git a/uds/volume.h b/uds/volume.h new file mode 100644 index 0000000..82aef00 --- /dev/null +++ b/uds/volume.h @@ -0,0 +1,426 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/volume.h#14 $ + */ + +#ifndef VOLUME_H +#define VOLUME_H + +#include "cacheCounters.h" +#include "common.h" +#include "chapterIndex.h" +#include "indexConfig.h" +#include "indexLayout.h" +#include "indexPageMap.h" +#include "pageCache.h" +#include "request.h" +#include "sparseCache.h" +#include "uds.h" +#include "util/radixSort.h" +#include "volumeStore.h" + +typedef enum { + READER_STATE_RUN = 1, + READER_STATE_EXIT = 2, + READER_STATE_STOP = 4 +} ReaderState; + +typedef enum indexLookupMode { + /* Always do lookups in all chapters normally. */ + LOOKUP_NORMAL, + /* + * Don't do lookups in closed chapters; assume records not in the + * open chapter are always new. You don't want this normally; it's + * for programs like albfill. (Even then, with multiple runs using + * the same tag, we may actually duplicate older records, but if + * it's in a separate chapter it won't really matter.) + */ + LOOKUP_CURRENT_CHAPTER_ONLY, + /* + * Only do a subset of lookups needed when rebuilding an index. + * This cannot be set externally. + */ + LOOKUP_FOR_REBUILD +} IndexLookupMode; + +typedef struct volume { + /* The layout of the volume */ + Geometry *geometry; + /* The configuration of the volume */ + Configuration *config; + /* The access to the volume's backing store */ + struct volume_store volumeStore; + /* A single page used for writing to the volume */ + struct volume_page scratchPage; + /* The nonce used to save the volume */ + uint64_t nonce; + /* A single page's records, for sorting */ + const UdsChunkRecord **recordPointers; + /* For sorting record pages */ + RadixSorter *radixSorter; + /* The sparse chapter index cache */ + SparseCache *sparseCache; + /* The page cache */ + PageCache *pageCache; + /* The index page map maps delta list numbers to index page numbers */ + IndexPageMap *indexPageMap; + /* Mutex to sync between read threads and index thread */ + Mutex readThreadsMutex; + /* Condvar to indicate when read threads should start working */ + CondVar readThreadsCond; + /* Condvar to indicate when a read thread has finished a read */ + CondVar readThreadsReadDoneCond; + /* Threads to read data from disk */ + Thread *readerThreads; + /* Number of threads busy with reads */ + unsigned int busyReaderThreads; + /* The state of the reader threads */ + ReaderState readerState; + /* The lookup mode for the index */ + IndexLookupMode lookupMode; + /* Number of read threads to use (run-time parameter) */ + unsigned int numReadThreads; +} Volume; + +/** + * Create a volume. + * + * @param config The configuration to use. + * @param layout The index layout + * @param userParams The index session parameters. If NULL, the default + * session parameters will be used. + * @param readQueueMaxSize The maximum size of the read queue. + * @param zoneCount The number of zones to use. + * @param newVolume A pointer to hold a pointer to the new volume. + * + * @return UDS_SUCCESS or an error code + **/ +int makeVolume(const Configuration *config, + IndexLayout *layout, + const struct uds_parameters *userParams, + unsigned int readQueueMaxSize, + unsigned int zoneCount, + Volume **newVolume) + __attribute__((warn_unused_result)); + +/** + * Clean up a volume and its memory. 
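+ *
+ * A minimal lifecycle sketch (error handling abbreviated; the caller's
+ * variables are illustrative, not part of this API):
+ *
+ *   Volume *volume = NULL;
+ *   // readQueueMaxSize must be larger than the number of reader threads.
+ *   int result = makeVolume(config, layout, userParams, readQueueMaxSize,
+ *                           zoneCount, &volume);
+ *   if (result != UDS_SUCCESS) {
+ *     return result;
+ *   }
+ *   // ... use the volume ...
+ *   freeVolume(volume);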
+ * + * @param volume The volume to destroy. + **/ +void freeVolume(Volume *volume); + +/** + * Enqueue a page read. + * + * @param volume the volume + * @param request the request to waiting on the read + * @param physicalPage the page number to read + * + * @return UDS_QUEUED if successful, or an error code + **/ +int enqueuePageRead(Volume *volume, Request *request, int physicalPage) + __attribute__((warn_unused_result)); + +/** + * Find the lowest and highest contiguous chapters and determine their + * virtual chapter numbers. + * + * @param [in] volume The volume to probe. + * @param [out] lowestVCN Pointer for lowest virtual chapter number. + * @param [out] highestVCN Pointer for highest virtual chapter number. + * @param [out] isEmpty Pointer to a bool indicating whether or not the + * volume is empty. + * + * @return UDS_SUCCESS, or an error code. + * + * @note This routine does something similar to a binary search to find + * the location in the volume file where the discontinuity of + * chapter numbers occurs. In a good save, the discontinuity is + * a sharp cliff, but if write failures occured during saving + * there may be one or more chapters which are partially written. + * + * @note This method takes advantage of the fact that the physical + * chapter number in which the index pages are found should have + * headers which state that the virtual chapter number are all + * identical and maintain the invariant that + * pcn == vcn % chaptersPerVolume. + **/ +int findVolumeChapterBoundaries(Volume *volume, + uint64_t *lowestVCN, + uint64_t *highestVCN, + bool *isEmpty) + __attribute__((warn_unused_result)); + +/** + * Find any matching metadata for the given name within a given physical + * chapter. + * + * @param volume The volume. + * @param request The request originating the search. + * @param name The block name of interest. + * @param virtualChapter The number of the chapter to search. + * @param metadata The old metadata for the name. + * @param found A pointer which will be set to + * true if a match was found. + * + * @return UDS_SUCCESS or an error + **/ +int searchVolumePageCache(Volume *volume, + Request *request, + const UdsChunkName *name, + uint64_t virtualChapter, + UdsChunkData *metadata, + bool *found) + __attribute__((warn_unused_result)); + +/** + * Fetch a record page from the cache or read it from the volume and search it + * for a chunk name. + * + * If a match is found, optionally returns the metadata from the stored + * record. If the requested record page is not cached, the page fetch may be + * asynchronously completed on the slow lane, in which case UDS_QUEUED will be + * returned and the request will be requeued for continued processing after + * the page is read and added to the cache. 
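+ *
+ * For illustration, a caller might handle the return codes as follows (the
+ * surrounding variables and dispatch logic are hypothetical):
+ *
+ *   int result = searchCachedRecordPage(volume, request, &name, chapter,
+ *                                       recordPageNumber, &duplicate, &found);
+ *   if (result == UDS_QUEUED) {
+ *     // The request now belongs to a reader thread and will be requeued
+ *     // automatically once the page has been added to the cache.
+ *     return UDS_SUCCESS;
+ *   }
+ *   if ((result == UDS_SUCCESS) && found) {
+ *     // "duplicate" holds the metadata of the matching record.
+ *   }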
+ * + * @param volume the volume containing the record page to search + * @param request the request originating the search (may be NULL for + * a direct query from volume replay) + * @param name the name of the block or chunk + * @param chapter the chapter to search + * @param recordPageNumber the record page number of the page to search + * @param duplicate an array in which to place the metadata of the + * duplicate, if one was found + * @param found a (bool *) which will be set to true if the chunk + * was found + * + * @return UDS_SUCCESS, UDS_QUEUED, or an error code + **/ +int searchCachedRecordPage(Volume *volume, + Request *request, + const UdsChunkName *name, + unsigned int chapter, + int recordPageNumber, + UdsChunkData *duplicate, + bool *found) + __attribute__((warn_unused_result)); + +/** + * Forget the contents of a chapter. Invalidates any cached state for the + * specified chapter. + * + * @param volume the volume containing the chapter + * @param chapter the virtual chapter number + * @param reason the reason for invalidation + * + * @return UDS_SUCCESS or an error code + **/ +int forgetChapter(Volume *volume, + uint64_t chapter, + InvalidationReason reason) + __attribute__((warn_unused_result)); + +/** + * Write a chapter's worth of index pages to a volume + * + * @param volume the volume containing the chapter + * @param physicalPage the page number in the volume for the chapter + * @param chapterIndex the populated delta chapter index + * @param pages pointer to array of page pointers. Used only in testing + * to return what data has been written to disk. + * + * @return UDS_SUCCESS or an error code + **/ +int writeIndexPages(Volume *volume, + int physicalPage, + OpenChapterIndex *chapterIndex, + byte **pages) +__attribute__((warn_unused_result)); + +/** + * Write a chapter's worth of record pages to a volume + * + * @param volume the volume containing the chapter + * @param physicalPage the page number in the volume for the chapter + * @param records a 1-based array of chunk records in the chapter + * @param pages pointer to array of page pointers. Used only in testing + * to return what data has been written to disk. + * + * @return UDS_SUCCESS or an error code + **/ +int writeRecordPages(Volume *volume, + int physicalPage, + const UdsChunkRecord records[], + byte **pages) +__attribute__((warn_unused_result)); + +/** + * Write the index and records from the most recently filled chapter to the + * volume. + * + * @param volume the volume containing the chapter + * @param chapterIndex the populated delta chapter index + * @param records a 1-based array of chunk records in the chapter + * + * @return UDS_SUCCESS or an error code + **/ +int writeChapter(Volume *volume, + OpenChapterIndex *chapterIndex, + const UdsChunkRecord records[]) + __attribute__((warn_unused_result)); + +/** + * Read all the index pages for a chapter from the volume and initialize an + * array of ChapterIndexPages to represent them. 
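+ *
+ * For illustration (assuming the geometry exposes the number of index pages
+ * in a chapter as indexPagesPerChapter; the array setup is the caller's):
+ *
+ *   unsigned int pages = volume->geometry->indexPagesPerChapter;
+ *   // volumePages[] and indexPages[] must each have "pages" entries.
+ *   int result = readChapterIndexFromVolume(volume, virtualChapter,
+ *                                           volumePages, indexPages);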
+ * + * @param [in] volume the volume containing the chapter + * @param [in] virtualChapter the virtual chapter number of the index to read + * @param [out] volumePages an array to receive the raw index page data + * @param [out] indexPages an array of ChapterIndexPages to initialize + * + * @return UDS_SUCCESS or an error code + **/ +int readChapterIndexFromVolume(const Volume *volume, + uint64_t virtualChapter, + struct volume_page volumePages[], + DeltaIndexPage indexPages[]) + __attribute__((warn_unused_result)); + +/** + * Retrieve a page either from the cache (if we can) or from disk. If a read + * from disk is required, this is done immediately in the same thread and the + * page is then returned. + * + * The caller of this function must be holding the volume read mutex before + * calling this function. + * + * As a side-effect, the retrieved page will become the most recent page in + * the cache. + * + * This function is only exposed for the use of unit tests. + * + * @param volume The volume containing the page + * @param request The request originating the search + * @param physicalPage The physical page number + * @param probeType The type of cache access being done + * @param entryPtr A pointer to hold the retrieved cached entry + * + * @return UDS_SUCCESS or an error code + **/ +int getPageLocked(Volume *volume, + Request *request, + unsigned int physicalPage, + CacheProbeType probeType, + CachedPage **entryPtr) + __attribute__((warn_unused_result)); + +/** + * Retrieve a page either from the cache (if we can) or from disk. If a read + * from disk is required, the read request is enqueued for later processing + * by another thread. When that thread finally reads the page into the cache, + * a callback function is called to inform the caller the read is complete. + * + * The caller of this function should not be holding the volume read lock. + * Instead, the caller must call beingPendingSearch() for the given zone + * the request is being processed in. That state will be maintained or + * restored when the call returns, at which point the caller should call + * endPendingSearch(). + * + * As a side-effect, the retrieved page will become the most recent page in + * the cache. + * + * This function is only exposed for the use of unit tests. + * + * @param volume The volume containing the page + * @param request The request originating the search + * @param physicalPage The physical page number + * @param probeType The type of cache access being done + * @param entryPtr A pointer to hold the retrieved cached entry + * + * @return UDS_SUCCESS or an error code + **/ +int getPageProtected(Volume *volume, + Request *request, + unsigned int physicalPage, + CacheProbeType probeType, + CachedPage **entryPtr) + __attribute__((warn_unused_result)); + +/** + * Retrieve a page either from the cache (if we can) or from disk. If a read + * from disk is required, this is done immediately in the same thread and the + * page is then returned. + * + * The caller of this function must not be holding the volume read lock before + * calling this function. This method will grab that lock and release it + * when it returns. + * + * As a side-effect, the retrieved page will become the most recent page in + * the cache. + * + * This function should only be called by areas of the code that do not use + * multi-threading to access the volume. These include rebuild, volume + * explorer, and certain unit tests. 
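+ *
+ * A minimal single-threaded sketch (the caller's variables and the chosen
+ * probe type are illustrative):
+ *
+ *   byte *pageData = NULL;
+ *   int result = getPage(volume, chapter, pageNumber, probeType,
+ *                        &pageData, NULL);
+ *   if (result == UDS_SUCCESS) {
+ *     // pageData may be read directly; no other thread uses the cache here.
+ *   }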
+ * + * @param volume The volume containing the page + * @param chapter The number of the chapter containing the page + * @param pageNumber The number of the page + * @param probeType The type of cache access being done + * @param dataPtr Pointer to hold the retrieved page, NULL if not wanted + * @param indexPagePtr Pointer to hold the retrieved chapter index page, or + * NULL if not wanted + * + * @return UDS_SUCCESS or an error code + **/ +int getPage(Volume *volume, + unsigned int chapter, + unsigned int pageNumber, + CacheProbeType probeType, + byte **dataPtr, + DeltaIndexPage **indexPagePtr) + __attribute__((warn_unused_result)); + +/**********************************************************************/ +size_t getCacheSize(Volume *volume) __attribute__((warn_unused_result)); + +/**********************************************************************/ +int findVolumeChapterBoundariesImpl(unsigned int chapterLimit, + unsigned int maxBadChapters, + uint64_t *lowestVCN, + uint64_t *highestVCN, + int (*probeFunc)(void *aux, + unsigned int chapter, + uint64_t *vcn), + void *aux) + __attribute__((warn_unused_result)); + +/** + * Map a chapter number and page number to a phsical volume page number. + * + * @param geometry the layout of the volume + * @param chapter the chapter number of the desired page + * @param page the chapter page number of the desired page + * + * @return the physical page number + **/ +int mapToPhysicalPage(const Geometry *geometry, int chapter, int page) + __attribute__((warn_unused_result)); + +#endif /* VOLUME_H */ diff --git a/uds/volumeStore.c b/uds/volumeStore.c new file mode 100644 index 0000000..8b9f820 --- /dev/null +++ b/uds/volumeStore.c @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/volumeStore.c#2 $ + */ + +#include "geometry.h" +#include "indexLayout.h" +#include "logger.h" +#include "uds-error.h" +#include "volumeStore.h" + + +/*****************************************************************************/ +void closeVolumeStore(struct volume_store *volumeStore) +{ +#ifdef __KERNEL__ + if (volumeStore->vs_client != NULL) { + dm_bufio_client_destroy(volumeStore->vs_client); + volumeStore->vs_client = NULL; + } +#else + if (volumeStore->vs_region != NULL) { + putIORegion(volumeStore->vs_region); + volumeStore->vs_region = NULL; + } +#endif +} + +/*****************************************************************************/ +void destroyVolumePage(struct volume_page *volumePage) +{ +#ifdef __KERNEL__ + releaseVolumePage(volumePage); +#else + FREE(volumePage->vp_data); + volumePage->vp_data = NULL; +#endif +} + +/*****************************************************************************/ +int initializeVolumePage(const struct geometry *geometry, + struct volume_page *volumePage) +{ +#ifdef __KERNEL__ + volumePage->vp_buffer = NULL; + return UDS_SUCCESS; +#else + return ALLOCATE_IO_ALIGNED(geometry->bytesPerPage, byte, __func__, + &volumePage->vp_data); +#endif +} + +/*****************************************************************************/ +int openVolumeStore(struct volume_store *volumeStore, + IndexLayout *layout, + unsigned int reservedBuffers __attribute__((unused)), + size_t bytesPerPage) +{ +#ifdef __KERNEL__ + return openVolumeBufio(layout, bytesPerPage, reservedBuffers, + &volumeStore->vs_client); +#else + volumeStore->vs_bytesPerPage = bytesPerPage; + return openVolumeRegion(layout, &volumeStore->vs_region); +#endif +} + +/*****************************************************************************/ +void prefetchVolumePages(const struct volume_store *vs __attribute__((unused)), + unsigned int physicalPage __attribute__((unused)), + unsigned int pageCount __attribute__((unused))) +{ +#ifdef __KERNEL__ + dm_bufio_prefetch(vs->vs_client, physicalPage, pageCount); +#else + // Nothing to do in user mode +#endif +} + +/*****************************************************************************/ +int prepareToWriteVolumePage(const struct volume_store *volumeStore + __attribute__((unused)), + unsigned int physicalPage + __attribute__((unused)), + struct volume_page *volumePage + __attribute__((unused))) +{ +#ifdef __KERNEL__ + releaseVolumePage(volumePage); + struct dm_buffer *buffer = NULL; + byte *data = dm_bufio_new(volumeStore->vs_client, physicalPage, &buffer); + if (IS_ERR(data)) { + return -PTR_ERR(data); + } + volumePage->vp_buffer = buffer; +#else + // Nothing to do in user mode +#endif + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int readVolumePage(const struct volume_store *volumeStore, + unsigned int physicalPage, + struct volume_page *volumePage) +{ +#ifdef __KERNEL__ + releaseVolumePage(volumePage); + byte *data = dm_bufio_read(volumeStore->vs_client, physicalPage, + &volumePage->vp_buffer); + if (IS_ERR(data)) { + return logWarningWithStringError(-PTR_ERR(data), + "error reading physical page %u", + physicalPage); + } +#else + off_t offset = (off_t) physicalPage * volumeStore->vs_bytesPerPage; + int result = readFromRegion(volumeStore->vs_region, offset, + getPageData(volumePage), + volumeStore->vs_bytesPerPage, NULL); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "error reading physical page %u", + 
physicalPage); + } +#endif + return UDS_SUCCESS; +} + +/*****************************************************************************/ +void releaseVolumePage(struct volume_page *volumePage __attribute__((unused))) +{ +#ifdef __KERNEL__ + if (volumePage->vp_buffer != NULL) { + dm_bufio_release(volumePage->vp_buffer); + volumePage->vp_buffer = NULL; + } +#else + // Nothing to do in user mode +#endif +} + +/*****************************************************************************/ +void swapVolumePages(struct volume_page *volumePage1, + struct volume_page *volumePage2) +{ + struct volume_page temp = *volumePage1; + *volumePage1 = *volumePage2; + *volumePage2 = temp; +} + +/*****************************************************************************/ +int syncVolumeStore(const struct volume_store *volumeStore) +{ +#ifdef __KERNEL__ + int result = -dm_bufio_write_dirty_buffers(volumeStore->vs_client); +#else + int result = syncRegionContents(volumeStore->vs_region); +#endif + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot sync chapter to volume"); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int writeVolumePage(const struct volume_store *volumeStore, + unsigned int physicalPage, + struct volume_page *volumePage) +{ +#ifdef __KERNEL__ + dm_bufio_mark_buffer_dirty(volumePage->vp_buffer); + return UDS_SUCCESS; +#else + off_t offset = (off_t) physicalPage * volumeStore->vs_bytesPerPage; + return writeToRegion(volumeStore->vs_region, offset, getPageData(volumePage), + volumeStore->vs_bytesPerPage, + volumeStore->vs_bytesPerPage); +#endif +} diff --git a/uds/volumeStore.h b/uds/volumeStore.h new file mode 100644 index 0000000..f475427 --- /dev/null +++ b/uds/volumeStore.h @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/volumeStore.h#2 $ + */ + +#ifndef VOLUME_STORE_H +#define VOLUME_STORE_H + +#include "common.h" +#include "compiler.h" +#include "memoryAlloc.h" + +#ifdef __KERNEL__ +#include +#else +#include "ioRegion.h" +#endif + +struct geometry; +struct indexLayout; + + +struct volume_store { +#ifdef __KERNEL__ + struct dm_bufio_client *vs_client; +#else + IORegion *vs_region; + size_t vs_bytesPerPage; +#endif +}; + + +struct volume_page { +#ifdef __KERNEL__ + struct dm_buffer *vp_buffer; +#else + byte *vp_data; +#endif +}; + +/** + * Close a volume store. + * + * @param volumeStore The volume store + **/ +void closeVolumeStore(struct volume_store *volumeStore); + +/** + * Uninitialize a volume page buffer. + * + * @param volumePage The volume page buffer + **/ +void destroyVolumePage(struct volume_page *volumePage); + +/** + * Get a pointer to the data contained in a volume page buffer. 
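+ *
+ * For illustration, a typical read path (error handling abbreviated;
+ * "geometry", "volumeStore", and "physicalPage" come from the caller):
+ *
+ *   struct volume_page page;
+ *   int result = initializeVolumePage(geometry, &page);
+ *   if (result == UDS_SUCCESS) {
+ *     result = readVolumePage(volumeStore, physicalPage, &page);
+ *   }
+ *   if (result == UDS_SUCCESS) {
+ *     const byte *data = getPageData(&page);
+ *     // ... examine the page contents ...
+ *   }
+ *   destroyVolumePage(&page);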
+ * + * @param volumePage The volume page buffer + * + * @return the address of the data + **/ +__attribute__((warn_unused_result)) +static INLINE byte *getPageData(const struct volume_page *volumePage) +{ +#ifdef __KERNEL__ + return dm_bufio_get_block_data(volumePage->vp_buffer); +#else + return volumePage->vp_data; +#endif +} + +/** + * Initialize a volume page buffer. + * + * @param geometry The volume geometry + * @param volumePage The volume page buffer + * + * @return UDS_SUCCESS or an error status + **/ +int initializeVolumePage(const struct geometry *geometry, + struct volume_page *volumePage) + __attribute__((warn_unused_result)); + +/** + * Open a volume store. + * + * @param volumeStore The volume store + * @param layout The index layout + * @param reservedBuffers The number of buffers that can be reserved + * @param bytesPerPage The number of bytes in a volume page + **/ +int openVolumeStore(struct volume_store *volumeStore, + struct indexLayout *layout, + unsigned int reservedBuffers, + size_t bytesPerPage) + __attribute__((warn_unused_result)); + +/** + * Prefetch volume pages into memory. + * + * @param volumeStore The volume store + * @param physicalPage The volume page number of the first desired page + * @param pageCount The number of volume pages to prefetch + **/ +void prefetchVolumePages(const struct volume_store *volumeStore, + unsigned int physicalPage, + unsigned int pageCount); + +/** + * Prepare a buffer to write a page to the volume. + * + * @param volumeStore The volume store + * @param physicalPage The volume page number of the desired page + * @param volumePage The volume page buffer + * + * @return UDS_SUCCESS or an error code + **/ +int prepareToWriteVolumePage(const struct volume_store *volumeStore, + unsigned int physicalPage, + struct volume_page *volumePage) + __attribute__((warn_unused_result)); + +/** + * Read a page from a volume store. + * + * @param volumeStore The volume store + * @param physicalPage The volume page number of the desired page + * @param volumePage The volume page buffer + * + * @return UDS_SUCCESS or an error code + **/ +int readVolumePage(const struct volume_store *volumeStore, + unsigned int physicalPage, + struct volume_page *volumePage) + __attribute__((warn_unused_result)); + +/** + * Release a volume page buffer, because it will no longer be accessed before a + * call to readVolumePage or prepareToWriteVolumePage. + * + * @param volumePage The volume page buffer + **/ +void releaseVolumePage(struct volume_page *volumePage); + +/** + * Swap volume pages. This is used to put the contents of a newly written + * index page (in the scratch page) into the page cache. + * + * @param volumePage1 The volume page buffer + * @param volumePage2 The volume page buffer + **/ +void swapVolumePages(struct volume_page *volumePage1, + struct volume_page *volumePage2); + +/** + * Sync the volume store to storage. + * + * @param volumeStore The volume store + * + * @return UDS_SUCCESS or an error code + **/ +int syncVolumeStore(const struct volume_store *volumeStore) + __attribute__((warn_unused_result)); + +/** + * Write a page to a volume store. 
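+ *
+ * For illustration, a typical write sequence (error handling abbreviated;
+ * the new page contents and sizes are the caller's):
+ *
+ *   int result = prepareToWriteVolumePage(volumeStore, physicalPage, &page);
+ *   if (result == UDS_SUCCESS) {
+ *     memcpy(getPageData(&page), newContents, bytesPerPage);
+ *     result = writeVolumePage(volumeStore, physicalPage, &page);
+ *   }
+ *   if (result == UDS_SUCCESS) {
+ *     result = syncVolumeStore(volumeStore);
+ *   }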
+ * + * @param volumeStore The volume store + * @param physicalPage The volume page number of the desired page + * @param volumePage The volume page buffer + * + * @return UDS_SUCCESS or an error code + **/ +int writeVolumePage(const struct volume_store *volumeStore, + unsigned int physicalPage, + struct volume_page *volumePage) + __attribute__((warn_unused_result)); + +#endif /* VOLUME_STORE_H */ diff --git a/uds/zone.c b/uds/zone.c new file mode 100644 index 0000000..cc07674 --- /dev/null +++ b/uds/zone.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/zone.c#4 $ + */ + +#include "zone.h" + +#include "logger.h" +#include "threads.h" + +/**********************************************************************/ +unsigned int getZoneCount(const struct uds_parameters *userParams) +{ + unsigned int zoneCount = (userParams == NULL) ? 0 : userParams->zone_count; + if (zoneCount == 0) { + zoneCount = getNumCores() / 2; + } + if (zoneCount < 1) { + zoneCount = 1; + } + if (zoneCount > MAX_ZONES) { + zoneCount = MAX_ZONES; + } + logInfo("Using %u indexing zone%s for concurrency.", zoneCount, + zoneCount == 1 ? "" : "s"); + return zoneCount; +} diff --git a/uds/zone.h b/uds/zone.h new file mode 100644 index 0000000..99daf40 --- /dev/null +++ b/uds/zone.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/zone.h#2 $ + */ + +#ifndef ZONE_H +#define ZONE_H + +#include "uds.h" + +enum { + MAX_ZONES = 16, +}; + +/** + * Return the number of zones. + * + * @param userParams the index session parameters. If NULL, the default + * session parameters will be used. 
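+ *                     (With zone_count left at zero, the count defaults to
+ *                     half the number of cores, clamped to the range
+ *                     [1, MAX_ZONES]; on an 8-core machine that is 4 zones.)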
+ * + * @return the number of zones + **/ +unsigned int getZoneCount(const struct uds_parameters *userParams) + __attribute__((warn_unused_result)); + +#endif /* ZONE_H */ diff --git a/vdo/Makefile b/vdo/Makefile new file mode 100644 index 0000000..816c219 --- /dev/null +++ b/vdo/Makefile @@ -0,0 +1,31 @@ +VDO_VERSION = 6.2.4.26 + +VDO_VERSION_MAJOR = $(word 1,$(subst ., ,$(VDO_VERSION))) +VDO_VERSION_MINOR = $(word 2,$(subst ., ,$(VDO_VERSION))) +VDO_VERSION_MICRO = $(word 3,$(subst ., ,$(VDO_VERSION))) + +SOURCES = $(addprefix base/,$(notdir $(wildcard $(src)/base/*.c))) +SOURCES += $(addprefix kernel/,$(notdir $(wildcard $(src)/kernel/*.c))) +OBJECTS = $(SOURCES:%.c=%.o) +INCLUDES = -I$(src)/base -I$(src)/kernel -I$(src)/../uds + +EXTRA_CFLAGS = -std=gnu99 \ + -fno-builtin-memset \ + -Werror \ + -Wframe-larger-than=400 \ + -Wno-declaration-after-statement \ + -DVDO_VERSION_MAJOR=$(VDO_VERSION_MAJOR) \ + -DVDO_VERSION_MINOR=$(VDO_VERSION_MINOR) \ + -DVDO_VERSION_MICRO=$(VDO_VERSION_MICRO) \ + -DCURRENT_VERSION=\"$(VDO_VERSION)\" \ + $(INCLUDES) + +CFLAGS_REMOVE_vdoPageCache.o= -std=gnu99 +CFLAGS_REMOVE_vio.o= -std=gnu99 + +CFLAGS_vdoPageCache.o= -std=gnu89 +CFLAGS_vio.o= -std=gnu89 + +obj-m += kvdo.o + +kvdo-objs = $(OBJECTS) diff --git a/vdo/base/actionManager.c b/vdo/base/actionManager.c new file mode 100644 index 0000000..664131d --- /dev/null +++ b/vdo/base/actionManager.c @@ -0,0 +1,399 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/actionManager.c#9 $ + */ + +#include "actionManager.h" + +#include "memoryAlloc.h" + +#include "adminState.h" +#include "completion.h" +#include "types.h" + +/** An action to be performed in each of a set of zones */ +typedef struct action Action; +struct action { + /** Whether this structure is in use */ + bool inUse; + /** The admin operation associated with this action */ + AdminStateCode operation; + /** + * The method to run on the initiator thread before the action is applied to + * each zone. 
+ **/ + ActionPreamble *preamble; + /** The action to be performed in each zone */ + ZoneAction *zoneAction; + /** + * The method to run on the initiator thread after the action has been + * applied to each zone + **/ + ActionConclusion *conclusion; + /** The object to notify when the action is complete */ + VDOCompletion *parent; + /** The action specific context */ + void *context; + /** The action to perform after this one */ + Action *next; +}; + +struct actionManager { + /** The completion for performing actions */ + VDOCompletion completion; + /** The state of this action manager */ + AdminState state; + /** The two action slots*/ + Action actions[2]; + /** The current action slot */ + Action *currentAction; + /** The number of zones in which an action is to be applied */ + ZoneCount zones; + /** A function to schedule a default next action */ + ActionScheduler *scheduler; + /** + * A function to get the id of the thread on which to apply an action to a + * zone + **/ + ZoneThreadGetter *getZoneThreadID; + /** The ID of the thread on which actions may be initiated */ + ThreadID initiatorThreadID; + /** Opaque data associated with this action manager */ + void *context; + /** The zone currently being acted upon */ + ZoneCount actingZone; +}; + +/** + * Convert a generic VDOCompletion to a ActionManager. + * + * @param completion The completion to convert + * + * @return The completion as a ActionManager + **/ +static inline ActionManager *asActionManager(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(ActionManager, completion) == 0); + assertCompletionType(completion->type, ACTION_COMPLETION); + return (ActionManager *) completion; +} + +/** + * An action scheduler which does not schedule an action. + * + *

Implements ActionScheduler. + **/ +static bool noDefaultAction(void *context __attribute__((unused))) +{ + return false; +} + +/** + * A default preamble which does nothing. + * + *

Implements ActionPreamble + **/ +static void noPreamble(void *context __attribute__((unused)), + VDOCompletion *completion) +{ + completeCompletion(completion); +} + +/** + * A default conclusion which does nothing. + * + *

Implements ActionConclusion. + **/ +static int noConclusion(void *context __attribute__((unused))) { + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeActionManager(ZoneCount zones, + ZoneThreadGetter *getZoneThreadID, + ThreadID initiatorThreadID, + void *context, + ActionScheduler *scheduler, + PhysicalLayer *layer, + ActionManager **managerPtr) +{ + ActionManager *manager; + int result = ALLOCATE(1, ActionManager, __func__, &manager); + if (result != VDO_SUCCESS) { + return result; + } + + *manager = (ActionManager) { + .zones = zones, + .scheduler = ((scheduler == NULL) ? noDefaultAction : scheduler), + .getZoneThreadID = getZoneThreadID, + .initiatorThreadID = initiatorThreadID, + .context = context, + }; + + manager->actions[0].next = &manager->actions[1]; + manager->currentAction = manager->actions[1].next = &manager->actions[0]; + + result = initializeEnqueueableCompletion(&manager->completion, + ACTION_COMPLETION, layer); + if (result != VDO_SUCCESS) { + freeActionManager(&manager); + return result; + } + + *managerPtr = manager; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeActionManager(ActionManager **managerPtr) +{ + ActionManager *manager = *managerPtr; + if (manager == NULL) { + return; + } + + destroyEnqueueable(&manager->completion); + FREE(manager); + *managerPtr = NULL; +} + +/**********************************************************************/ +AdminStateCode getCurrentManagerOperation(ActionManager *manager) +{ + return manager->state.state; +} + +/**********************************************************************/ +void *getCurrentActionContext(ActionManager *manager) +{ + return (manager->currentAction->inUse + ? manager->currentAction->context : NULL); +} + +/**********************************************************************/ +static void finishActionCallback(VDOCompletion *completion); +static void applyToZone(VDOCompletion *completion); + +/** + * Get the thread ID for the current zone. + * + * @param manager The action manager + * + * @return The ID of the thread on which to run actions for the current zone + **/ +static ThreadID getActingZoneThreadID(ActionManager *manager) +{ + return manager->getZoneThreadID(manager->context, manager->actingZone); +} + +/** + * Prepare the manager's completion to run on the next zone. + * + * @param manager The action manager + **/ +static void prepareForNextZone(ActionManager *manager) +{ + prepareForRequeue(&manager->completion, applyToZone, + preserveErrorAndContinue, getActingZoneThreadID(manager), + manager->currentAction->parent); +} + +/** + * Prepare the manager's completion to run the conclusion on the initiator + * thread. + * + * @param manager The action manager + **/ +static void prepareForConclusion(ActionManager *manager) +{ + prepareForRequeue(&manager->completion, finishActionCallback, + preserveErrorAndContinue, manager->initiatorThreadID, + manager->currentAction->parent); +} + +/** + * Perform an action on the next zone if there is one. + * + * @param completion The action completion + **/ +static void applyToZone(VDOCompletion *completion) +{ + ActionManager *manager = asActionManager(completion); + ASSERT_LOG_ONLY((getCallbackThreadID() == getActingZoneThreadID(manager)), + "applyToZone() called on acting zones's thread"); + + ZoneCount zone = manager->actingZone++; + if (manager->actingZone == manager->zones) { + // We are about to apply to the last zone. 
Once that is finished, + // we're done, so go back to the initiator thread and finish up. + prepareForConclusion(manager); + } else { + // Prepare to come back on the next zone + prepareForNextZone(manager); + } + + manager->currentAction->zoneAction(manager->context, zone, completion); +} + +/** + * The error handler for preamble errors. + * + * @param completion The manager completion + **/ +static void handlePreambleError(VDOCompletion *completion) +{ + // Skip the zone actions since the preamble failed. + completion->callback = finishActionCallback; + preserveErrorAndContinue(completion); +} + +/** + * Launch the current action. + * + * @param manager The action manager + **/ +static void launchCurrentAction(ActionManager *manager) +{ + Action *action = manager->currentAction; + int result = startOperation(&manager->state, action->operation); + if (result != VDO_SUCCESS) { + if (action->parent != NULL) { + setCompletionResult(action->parent, result); + } + + // We aren't going to run the preamble, so don't run the conclusion + action->conclusion = noConclusion; + finishActionCallback(&manager->completion); + return; + } + + if (action->zoneAction == NULL) { + prepareForConclusion(manager); + } else { + manager->actingZone = 0; + prepareForRequeue(&manager->completion, applyToZone, handlePreambleError, + getActingZoneThreadID(manager), + manager->currentAction->parent); + } + + action->preamble(manager->context, &manager->completion); +} + +/**********************************************************************/ +bool scheduleDefaultAction(ActionManager *manager) +{ + // Don't schedule a default action if we are operating or not in normal + // operation. + return ((manager->state.state == ADMIN_STATE_NORMAL_OPERATION) + && manager->scheduler(manager->context)); +} + +/** + * Finish an action now that it has been applied to all zones. This + * callback is registered in applyToZone(). + * + * @param completion The action manager completion + **/ +static void finishActionCallback(VDOCompletion *completion) +{ + ActionManager *manager = asActionManager(completion); + Action action = *(manager->currentAction); + manager->currentAction->inUse = false; + manager->currentAction = manager->currentAction->next; + + // We need to check this now to avoid use-after-free issues if running the + // conclusion or notifying the parent results in the manager being freed. 
+ bool hasNextAction = (manager->currentAction->inUse + || scheduleDefaultAction(manager)); + int result = action.conclusion(manager->context); + finishOperation(&manager->state); + if (action.parent != NULL) { + finishCompletion(action.parent, result); + } + + if (hasNextAction) { + launchCurrentAction(manager); + } +} + +/**********************************************************************/ +bool scheduleAction(ActionManager *manager, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + VDOCompletion *parent) +{ + return scheduleOperation(manager, ADMIN_STATE_OPERATING, preamble, + zoneAction, conclusion, parent); +} + +/**********************************************************************/ +bool scheduleOperation(ActionManager *manager, + AdminStateCode operation, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + VDOCompletion *parent) +{ + return scheduleOperationWithContext(manager, operation, preamble, zoneAction, + conclusion, NULL, parent); +} + +/**********************************************************************/ +bool scheduleOperationWithContext(ActionManager *manager, + AdminStateCode operation, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + void *context, + VDOCompletion *parent) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == manager->initiatorThreadID), + "action initiated from correct thread"); + Action *action; + if (!manager->currentAction->inUse) { + action = manager->currentAction; + } else if (!manager->currentAction->next->inUse) { + action = manager->currentAction->next; + } else { + if (parent != NULL) { + finishCompletion(parent, VDO_COMPONENT_BUSY); + } + + return false; + } + + *action = (Action) { + .inUse = true, + .operation = operation, + .preamble = (preamble == NULL) ? noPreamble : preamble, + .zoneAction = zoneAction, + .conclusion = (conclusion == NULL) ? noConclusion : conclusion, + .context = context, + .parent = parent, + .next = action->next, + }; + + if (action == manager->currentAction) { + launchCurrentAction(manager); + } + + return true; +} diff --git a/vdo/base/actionManager.h b/vdo/base/actionManager.h new file mode 100644 index 0000000..2e0ef13 --- /dev/null +++ b/vdo/base/actionManager.h @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/actionManager.h#6 $ + */ + +#ifndef ACTION_MANAGER_H +#define ACTION_MANAGER_H + +#include "adminState.h" +#include "completion.h" +#include "types.h" + +/** + * ActionManager provides a generic mechanism for applying actions to + * multi-zone entities (such as the block map or slab depot). Each action + * manager is tied to a specific context for which it manages actions. 
The + * manager ensures that only one action is active on that context at a time, + * and supports at most one pending action. Calls to schedule an action when + * there is already a pending action will result in VDO_COMPONENT_BUSY errors. + * Actions may only be submitted to the action manager from a single thread + * (which thread is determined when the action manager is constructed). + * + * A scheduled action consists of four components: + * preamble: an optional method to be run on the initator thread before + * applying the action to all zones + * zoneAction: an optional method to be applied to each of the zones + * conclusion: an optional method to be run on the initiator thread once the + * per-zone method has been applied to all zones + * parent: an optional completion to be finished once the conclusion + * is done + * + * At least one of the three methods must be provided. + **/ + +/** + * A function which is to be applied asynchronously to a set of zones. + * + * @param context The object which holds the per-zone context for the + * action + * @param zoneNumber The number of zone to which the action is being applied + * @param parent The object to notify when the action is complete + **/ +typedef void ZoneAction(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * A function which is to be applied asynchronously on an action manager's + * initiator thread as the preamble of an action. + * + * @param context The object which holds the per-zone context for the action + * @param parent The object to notify when the action is complete + **/ +typedef void ActionPreamble(void *context, VDOCompletion *parent); + +/** + * A function which will run on the action manager's initiator thread as the + * conclusion of an action. + * + * @param context The object which holds the per-zone context for the action + * + * @return VDO_SUCCESS or an error + **/ +typedef int ActionConclusion(void *context); + +/** + * A function to schedule an action. + * + * @param context The object which holds the per-zone context for the action + * + * @return true if an action was scheduled + **/ +typedef bool ActionScheduler(void *context); + +/** + * Get the id of the thread associated with a given zone. + * + * @param context The action context + * @param zoneNumber The number of the zone for which the thread ID is desired + **/ +typedef ThreadID ZoneThreadGetter(void *context, ZoneCount zoneNumber); + +/** + * Make an action manager. + * + * @param [in] zones The number of zones to which actions will be + * applied + * @param [in] getZoneThreadID A function to get the thread id associated + * with a zone + * @param [in] initiatorThreadID The thread on which actions may initiated + * @param [in] context The object which holds the per-zone context + * for the action + * @param [in] scheduler A function to schedule a next action after an + * action concludes if there is no pending + * action (may be NULL) + * @param [in] layer The layer used to make completions + * @param [out] managerPtr A pointer to hold the new action manager + * + * @return VDO_SUCCESS or an error code + **/ +int makeActionManager(ZoneCount zones, + ZoneThreadGetter *getZoneThreadID, + ThreadID initiatorThreadID, + void *context, + ActionScheduler *scheduler, + PhysicalLayer *layer, + ActionManager **managerPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy an action manager and null out the reference to it. 
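+ *
+ * A minimal lifecycle sketch (the zone-thread getter, zone action, context,
+ * layer, and parent are the caller's; error handling abbreviated):
+ *
+ *   ActionManager *manager = NULL;
+ *   int result = makeActionManager(zones, getZoneThread, initiatorThreadID,
+ *                                  context, NULL, layer, &manager);
+ *   if (result != VDO_SUCCESS) {
+ *     return result;
+ *   }
+ *   // Apply an action to every zone, then notify "parent" when it is done.
+ *   scheduleAction(manager, NULL, applyToEachZone, NULL, parent);
+ *   // ... once all scheduled actions have completed ...
+ *   freeActionManager(&manager);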
+ * + * @param managerPtr The reference to the manager to destroy + **/ +void freeActionManager(ActionManager **managerPtr); + +/** + * Get the current operation an action manager is performing. + * + * @param manager The manager to query + * + * @return The manager's current operation + **/ +AdminStateCode getCurrentManagerOperation(ActionManager *manager) + __attribute__((warn_unused_result)); + +/** + * Get the action-specific context for the operation an action manager is + * currently performing. + * + * @param manager The manager to query + * + * @return The action-specific context for the manager's current action or + * NULL if there is no context or no current action + **/ +void *getCurrentActionContext(ActionManager *manager) + __attribute__((warn_unused_result)); + +/** + * Attempt to schedule the default action. If the manager is not operating + * normally, the action will not be scheduled. + * + * @param manager The action manager + * + * @return true if an action was scheduled. + **/ +bool scheduleDefaultAction(ActionManager *manager); + +/** + * Schedule an action to be applied to all zones. The action will be launched + * immediately if there is no current action, or as soon as the current action + * completes. If there is already a pending action, this action will not be + * scheduled, and, if it has a parent, that parent will be notified. At least + * one of the preamble, zoneAction, or conclusion must not be NULL. + * + * @param manager The action manager to schedule the action on + * @param preamble A method to be invoked on the initiator thread once this + * action is started but before applying to each zone; may + * be NULL + * @param zoneAction The action to apply to each zone; may be NULL + * @param conclusion A method to be invoked back on the initiator thread once + * the action has been applied to all zones; may be NULL + * @param parent The object to notify once the action is complete or if + * the action can not be scheduled; may be NULL + * + * @return true if the action was scheduled + **/ +bool scheduleAction(ActionManager *manager, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + VDOCompletion *parent); + +/** + * Schedule an operation to be applied to all zones. The operation's action + * will be launched immediately if there is no current action, or as soon as + * the current action completes. If there is already a pending action, this + * operation will not be scheduled, and, if it has a parent, that parent will + * be notified. At least one of the preamble, zoneAction, or conclusion must + * not be NULL. + * + * @param manager The action manager to schedule the action on + * @param operation The operation this action will perform + * @param preamble A method to be invoked on the initiator thread once this + * action is started but before applying to each zone; may + * be NULL + * @param zoneAction The action to apply to each zone; may be NULL + * @param conclusion A method to be invoked back on the initiator thread once + * the action has been applied to all zones; may be NULL + * @param parent The object to notify once the action is complete or if + * the action can not be scheduled; may be NULL + * + * @return true if the action was scheduled + **/ +bool scheduleOperation(ActionManager *manager, + AdminStateCode operation, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + VDOCompletion *parent); + +/** + * Schedule an operation to be applied to all zones. 
The operation's action + * will be launched immediately if there is no current action, or as soon as + * the current action completes. If there is already a pending action, this + * operation will not be scheduled, and, if it has a parent, that parent will + * be notified. At least one of the preamble, zoneAction, or conclusion must + * not be NULL. + * + * @param manager The action manager to schedule the action on + * @param operation The operation this action will perform + * @param preamble A method to be invoked on the initiator thread once this + * action is started but before applying to each zone; may + * be NULL + * @param zoneAction The action to apply to each zone; may be NULL + * @param conclusion A method to be invoked back on the initiator thread once + * the action has been applied to all zones; may be NULL + * @param context An action-specific context which may be retrieved via + * getCurrentActionContext(); may be NULL + * @param parent The object to notify once the action is complete or if + * the action can not be scheduled; may be NULL + * + * @return true if the action was scheduled + **/ +bool scheduleOperationWithContext(ActionManager *manager, + AdminStateCode operation, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + void *context, + VDOCompletion *parent); + +#endif // ACTION_MANAGER_H diff --git a/vdo/base/adminCompletion.c b/vdo/base/adminCompletion.c new file mode 100644 index 0000000..5c5ed26 --- /dev/null +++ b/vdo/base/adminCompletion.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/adminCompletion.c#4 $ + */ + +#include "adminCompletion.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "atomic.h" +#include "completion.h" +#include "types.h" +#include "vdoInternal.h" + +/**********************************************************************/ +void assertAdminOperationType(AdminCompletion *completion, + AdminOperationType expected) +{ + ASSERT_LOG_ONLY(completion->type == expected, + "admin operation type is %u instead of %u", + completion->type, expected); +} + +/**********************************************************************/ +AdminCompletion *adminCompletionFromSubTask(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(AdminCompletion, completion) == 0); + assertCompletionType(completion->type, SUB_TASK_COMPLETION); + VDOCompletion *parent = completion->parent; + assertCompletionType(parent->type, ADMIN_COMPLETION); + return (AdminCompletion *) parent; +} + +/**********************************************************************/ +void assertAdminPhaseThread(AdminCompletion *adminCompletion, + const char *what, + const char *phaseNames[]) +{ + ThreadID expected = adminCompletion->getThreadID(adminCompletion); + ASSERT_LOG_ONLY((getCallbackThreadID() == expected), + "%s on correct thread for %s", + what, phaseNames[adminCompletion->phase]); +} + +/**********************************************************************/ +VDO *vdoFromAdminSubTask(VDOCompletion *completion, + AdminOperationType expected) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + assertAdminOperationType(adminCompletion, expected); + return adminCompletion->completion.parent; +} + +/**********************************************************************/ +int initializeAdminCompletion(VDO *vdo, AdminCompletion *adminCompletion) +{ + int result = initializeEnqueueableCompletion(&adminCompletion->completion, + ADMIN_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + return result; + } + + result = initializeEnqueueableCompletion(&adminCompletion->subTaskCompletion, + SUB_TASK_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + uninitializeAdminCompletion(adminCompletion); + return result; + } + + atomicStoreBool(&adminCompletion->busy, false); + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void uninitializeAdminCompletion(AdminCompletion *adminCompletion) +{ + destroyEnqueueable(&adminCompletion->subTaskCompletion); + destroyEnqueueable(&adminCompletion->completion); +} + +/**********************************************************************/ +VDOCompletion *resetAdminSubTask(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + resetCompletion(completion); + completion->callbackThreadID = adminCompletion->getThreadID(adminCompletion); + return completion; +} + +/**********************************************************************/ +void prepareAdminSubTaskOnThread(VDO *vdo, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID) +{ + prepareForRequeue(&vdo->adminCompletion.subTaskCompletion, callback, + errorHandler, threadID, &vdo->adminCompletion); +} + +/**********************************************************************/ +void prepareAdminSubTask(VDO *vdo, + VDOAction *callback, + VDOAction *errorHandler) +{ + AdminCompletion *adminCompletion = &vdo->adminCompletion; + prepareAdminSubTaskOnThread(vdo, callback, 
errorHandler, + adminCompletion->completion.callbackThreadID); +} + +/** + * Callback for admin operations which will notify the layer that the operation + * is complete. + * + * @param completion The admin completion + **/ +static void adminOperationCallback(VDOCompletion *completion) +{ + completion->layer->completeAdminOperation(completion->layer); +} + +/**********************************************************************/ +int performAdminOperation(VDO *vdo, + AdminOperationType type, + ThreadIDGetterForPhase *threadIDGetter, + VDOAction *action, + VDOAction *errorHandler) +{ + AdminCompletion *adminCompletion = &vdo->adminCompletion; + if (!compareAndSwapBool(&adminCompletion->busy, false, true)) { + return logErrorWithStringError(VDO_COMPONENT_BUSY, + "Can't start admin operation of type %u, " + "another operation is already in progress", + type); + } + + prepareCompletion(&adminCompletion->completion, adminOperationCallback, + adminOperationCallback, + getAdminThread(getThreadConfig(vdo)), vdo); + adminCompletion->type = type; + adminCompletion->getThreadID = threadIDGetter; + adminCompletion->phase = 0; + prepareAdminSubTask(vdo, action, errorHandler); + + PhysicalLayer *layer = vdo->layer; + layer->enqueue(adminCompletion->subTaskCompletion.enqueueable); + layer->waitForAdminOperation(layer); + int result = adminCompletion->completion.result; + atomicStoreBool(&adminCompletion->busy, false); + return result; +} diff --git a/vdo/base/adminCompletion.h b/vdo/base/adminCompletion.h new file mode 100644 index 0000000..50eeecd --- /dev/null +++ b/vdo/base/adminCompletion.h @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/adminCompletion.h#4 $ + */ + +#ifndef ADMIN_COMPLETION_H +#define ADMIN_COMPLETION_H + +#include "atomic.h" +#include "completion.h" +#include "types.h" + +typedef enum adminOperationType { + ADMIN_OPERATION_UNKNOWN = 0, + ADMIN_OPERATION_GROW_LOGICAL, + ADMIN_OPERATION_GROW_PHYSICAL, + ADMIN_OPERATION_PREPARE_GROW_PHYSICAL, + ADMIN_OPERATION_LOAD, + ADMIN_OPERATION_RESUME, + ADMIN_OPERATION_SAVE, + ADMIN_OPERATION_SUSPEND, +} AdminOperationType; + +typedef struct adminCompletion AdminCompletion; + +/** + * A function which gets the ID of the thread on which the current phase of an + * admin operation should be run. 
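+ *
+ * For illustration, a getter which runs every phase on the admin thread (the
+ * function name is hypothetical; the completion's parent is the VDO, as set
+ * up by performAdminOperation()):
+ *
+ *   static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion)
+ *   {
+ *     VDO *vdo = adminCompletion->completion.parent;
+ *     return getAdminThread(getThreadConfig(vdo));
+ *   }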
+ * + * @param adminCompletion The AdminCompletion + * + * @return The ID of the thread on which the current phase should be performed + **/ +typedef ThreadID ThreadIDGetterForPhase(AdminCompletion *adminCompletion); + +struct adminCompletion { + /** The completion */ + VDOCompletion completion; + /** The sub-task completion */ + VDOCompletion subTaskCompletion; + /** Whether this completion is in use */ + AtomicBool busy; + /** The operation type */ + AdminOperationType type; + /** Method to get the ThreadID for the current phase */ + ThreadIDGetterForPhase *getThreadID; + /** The current phase of the operation */ + uint32_t phase; +}; + +/** + * Check that an AdminCompletion's type is as expected. + * + * @param completion The AdminCompletion to check + * @param expected The expected type + **/ +void assertAdminOperationType(AdminCompletion *completion, + AdminOperationType expected); + +/** + * Convert the sub-task completion of an AdminCompletion to an AdminCompletion. + * + * @param completion the AdminCompletion's sub-task completion + * + * @return The sub-task completion as its enclosing AdminCompletion + **/ +AdminCompletion *adminCompletionFromSubTask(VDOCompletion *completion) + __attribute__((warn_unused_result)); + +/** + * Assert that we are operating on the correct thread for the current phase. + * + * @param adminCompletion The AdminCompletion to check + * @param what The method doing the phase check + * @param phaseNames The names of the phases of the current operation + **/ +void assertAdminPhaseThread(AdminCompletion *adminCompletion, + const char *what, + const char *phaseNames[]); + +/** + * Get the VDO from the sub-task completion of its AdminCompletion. + * + * @param completion the sub-task completion + * @param expected the expected operation type of the AdminCompletion + * + * @return The VDO + **/ +VDO *vdoFromAdminSubTask(VDOCompletion *completion, + AdminOperationType expected) + __attribute__((warn_unused_result)); + +/** + * Initialize an admin completion. + * + * @param vdo The VDO which owns the completion + * @param adminCompletion The AdminCompletion to initialize + * + * @return VDO_SUCCESS or an error + **/ +int initializeAdminCompletion(VDO *vdo, AdminCompletion *adminCompletion) + __attribute__((warn_unused_result)); + +/** + * Clean up an admin completion's resources. + * + * @param adminCompletion The AdminCompletion to uninitialize + **/ +void uninitializeAdminCompletion(AdminCompletion *adminCompletion); + +/** + * Reset an AdminCompletion's sub-task completion. + * + * @param completion The AdminCompletion's sub-task completion + * + * @return The sub-task completion for the convenience of callers + **/ +VDOCompletion *resetAdminSubTask(VDOCompletion *completion); + +/** + * Prepare the sub-task completion of a VDO's AdminCompletion + * + * @param vdo The VDO + * @param callback The callback for the sub-task + * @param errorHandler The error handler for the sub-task + * @param threadID The ID of the thread on which to run the callback + **/ +void prepareAdminSubTaskOnThread(VDO *vdo, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID); + +/** + * Prepare the sub-task completion of a VDO's AdminCompletion to run on the + * same thread as the AdminCompletion's main completion. 
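+ *
+ * A minimal sketch of how an operation's phases typically chain through
+ * this function (the phase and error-handler names here are hypothetical):
+ *
+ *   static void suspendPhase(VDOCompletion *completion)
+ *   {
+ *     VDO *vdo = vdoFromAdminSubTask(completion, ADMIN_OPERATION_SUSPEND);
+ *     prepareAdminSubTask(vdo, nextSuspendPhase, handleSuspendError);
+ *     // ... launch the asynchronous work; finishing the sub-task will
+ *     // eventually invoke nextSuspendPhase on the chosen thread ...
+ *   }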
+ * + * @param vdo The VDO + * @param callback The callback for the sub-task + * @param errorHandler The error handler for the sub-task + **/ +void prepareAdminSubTask(VDO *vdo, + VDOAction *callback, + VDOAction *errorHandler); + +/** + * Perform an administrative operation (load, suspend, grow logical, or grow + * physical). This method should not be called from base threads unless it is + * certain the calling thread won't be needed to perform the operation. It may + * (and should) be called from non-base threads. + * + * @param vdo The VDO on which to perform the operation + * @param type The type of operation to perform + * @param threadIDGetter A function for getting the ID of the thread on which + * a given phase should be run + * @param action The action which starts the operation + * @param errorHandler The error handler for the operation + * + * @return The result of the operation + **/ +int performAdminOperation(VDO *vdo, + AdminOperationType type, + ThreadIDGetterForPhase *threadIDGetter, + VDOAction *action, + VDOAction *errorHandler) + __attribute__((warn_unused_result)); + +#endif /* ADMIN_COMPLETION_H */ diff --git a/vdo/base/adminState.c b/vdo/base/adminState.c new file mode 100644 index 0000000..6b30315 --- /dev/null +++ b/vdo/base/adminState.c @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/adminState.c#14 $ + */ + +#include "adminState.h" + +#include "logger.h" +#include "permassert.h" + +#include "completion.h" +#include "types.h" + +/**********************************************************************/ +const char *getAdminStateCodeName(AdminStateCode code) +{ + switch (code) { + case ADMIN_STATE_NORMAL_OPERATION: + return "ADMIN_STATE_NORMAL_OPERATION"; + + case ADMIN_STATE_OPERATING: + return "ADMIN_STATE_OPERATING"; + + case ADMIN_STATE_FORMATTING: + return "ADMIN_STATE_FORMATTING"; + + case ADMIN_STATE_LOADING: + return "ADMIN_STATE_LOADING"; + + case ADMIN_STATE_LOADING_FOR_RECOVERY: + return "ADMIN_STATE_LOADING_FOR_RECOVERY"; + + case ADMIN_STATE_LOADING_FOR_REBUILD: + return "ADMIN_STATE_LOADING_FOR_REBUILD"; + + case ADMIN_STATE_NEW: + return "ADMIN_STATE_NEW"; + + case ADMIN_STATE_WAITING_FOR_RECOVERY: + return "ADMIN_STATE_WAITING_FOR_RECOVERY"; + + case ADMIN_STATE_RECOVERING: + return "ADMIN_STATE_RECOVERING"; + + case ADMIN_STATE_REBUILDING: + return "ADMIN_STATE_REBUILDING"; + + case ADMIN_STATE_SAVING: + return "ADMIN_STATE_SAVING"; + + case ADMIN_STATE_SAVED: + return "ADMIN_STATE_SAVED"; + + case ADMIN_STATE_SCRUBBING: + return "ADMIN_STATE_SCRUBBING"; + + case ADMIN_STATE_SAVE_FOR_SCRUBBING: + return "ADMIN_STATE_SAVE_FOR_SCRUBBING"; + + case ADMIN_STATE_SUSPENDING: + return "ADMIN_STATE_SUSPENDING"; + + case ADMIN_STATE_SUSPENDED: + return "ADMIN_STATE_SUSPENDED"; + + case ADMIN_STATE_SUSPENDED_OPERATION: + return "ADMIN_STATE_SUSPENDED_OPERATION"; + + case ADMIN_STATE_RESUMING: + return "ADMIN_STATE_RESUMING"; + + default: + return "INVALID ADMIN_STATE"; + } +} + +/**********************************************************************/ +const char *getAdminStateName(const AdminState *state) +{ + return getAdminStateCodeName(state->state); +} + +/**********************************************************************/ +static AdminStateCode getNextState(AdminStateCode previousState, + AdminStateCode operation) +{ + if (isQuiescingCode(operation)) { + return ((operation & ADMIN_TYPE_MASK) | ADMIN_FLAG_QUIESCENT); + } + + if (operation == ADMIN_STATE_SUSPENDED_OPERATION) { + return previousState; + } + + return ADMIN_STATE_NORMAL_OPERATION; +} + +/** + * Finish an operation if one is in progress. If there is a waiter, it will be + * notified. + * + * @param state The AdminState + * @param result The result of the operation + * + * @return true if an operation was in progress and has been + * finished. + **/ +static bool endOperation(AdminState *state, int result) +{ + if (!isOperating(state)) { + return false; + } + + if (state->starting) { + state->complete = true; + if (state->waiter != NULL) { + setCompletionResult(state->waiter, result); + } + } else { + state->complete = false; + state->state = state->nextState; + releaseCompletionWithResult(&state->waiter, result); + } + + return true; +} + +/** + * Begin an operation if it may be started given the current state. 
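+ *
+ * As a worked example of the state arithmetic above: beginning
+ * ADMIN_STATE_SUSPENDING from ADMIN_STATE_NORMAL_OPERATION records a next
+ * state of (ADMIN_TYPE_SUSPEND | ADMIN_FLAG_QUIESCENT), which is
+ * ADMIN_STATE_SUSPENDED, so the state becomes quiescent when the drain
+ * ends. ADMIN_STATE_SUSPENDED_OPERATION instead restores the previous
+ * state when it finishes, and any other non-quiescing operation returns
+ * to ADMIN_STATE_NORMAL_OPERATION.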
+ * + * @param state The AdminState + * @param operation The operation to begin + * @param waiter A completion to notify when the operation is complete; may + * be NULL + * @param initiator The AdminInitiator to call if the operation may begin; may + * be NULL + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int beginOperation(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator) +{ + int result; + if (isOperating(state) + || (isQuiescent(state) != isQuiescentOperation(operation))) { + result = logErrorWithStringError(VDO_INVALID_ADMIN_STATE, + "Can't start %s from %s", + getAdminStateCodeName(operation), + getAdminStateName(state)); + } else if (state->waiter != NULL) { + result = logErrorWithStringError(VDO_COMPONENT_BUSY, + "Can't start %s with extant waiter", + getAdminStateCodeName(operation)); + } else { + state->waiter = waiter; + state->nextState = getNextState(state->state, operation); + state->state = operation; + if (initiator != NULL) { + state->starting = true; + initiator(state); + state->starting = false; + if (state->complete) { + endOperation(state, VDO_SUCCESS); + } + } + + return VDO_SUCCESS; + } + + if (waiter != NULL) { + finishCompletion(waiter, result); + } + + return result; +} + +/** + * Check the result of a state validation. If the result failed, log an invalid + * state error and, if there is a waiter, notify it. + * + * @param valid true if the code is of an appropriate type + * @param code The code which failed to be of the correct type + * @param what What the code failed to be, for logging + * @param waiter The completion to notify of the error; may be NULL + * + * @return The result of the check + **/ +static bool checkCode(bool valid, + AdminStateCode code, + const char *what, + VDOCompletion *waiter) +{ + if (valid) { + return true; + } + + int result = logErrorWithStringError(VDO_INVALID_ADMIN_STATE, + "%s is not a %s", + getAdminStateCodeName(code), what); + if (waiter != NULL) { + finishCompletion(waiter, result); + } + + return false; +} + +/**********************************************************************/ +bool assertDrainOperation(AdminStateCode operation, VDOCompletion *waiter) +{ + return checkCode(isDrainOperation(operation), operation, "drain operation", + waiter); +} + +/**********************************************************************/ +bool startDraining(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator) +{ + return (assertDrainOperation(operation, waiter) + && (beginOperation(state, operation, waiter, initiator) + == VDO_SUCCESS)); +} + +/**********************************************************************/ +bool finishDraining(AdminState *state) +{ + return finishDrainingWithResult(state, VDO_SUCCESS); +} + +/**********************************************************************/ +bool finishDrainingWithResult(AdminState *state, int result) +{ + return (isDraining(state) && endOperation(state, result)); +} + +/**********************************************************************/ +bool assertLoadOperation(AdminStateCode operation, VDOCompletion *waiter) +{ + return checkCode(isLoadOperation(operation), operation, "load operation", + waiter); +} + +/**********************************************************************/ +bool startLoading(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator) +{ + return (assertLoadOperation(operation, waiter) + 
&& (beginOperation(state, operation, waiter, initiator) + == VDO_SUCCESS)); +} + +/**********************************************************************/ +bool finishLoading(AdminState *state) +{ + return finishLoadingWithResult(state, VDO_SUCCESS); +} + +/**********************************************************************/ +bool finishLoadingWithResult(AdminState *state, int result) +{ + return (isLoading(state) && endOperation(state, result)); +} + +/**********************************************************************/ +bool assertResumeOperation(AdminStateCode operation, VDOCompletion *waiter) +{ + return checkCode(isResumeOperation(operation), operation, "resume operation", + waiter); +} + +/**********************************************************************/ +bool startResuming(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator) +{ + return (assertResumeOperation(operation, waiter) + && (beginOperation(state, operation, waiter, initiator) + == VDO_SUCCESS)); +} + +/**********************************************************************/ +bool finishResuming(AdminState *state) +{ + return finishResumingWithResult(state, VDO_SUCCESS); +} + +/**********************************************************************/ +bool finishResumingWithResult(AdminState *state, int result) +{ + return (isResuming(state) && endOperation(state, result)); +} + +/**********************************************************************/ +int resumeIfQuiescent(AdminState *state) +{ + if (!isQuiescent(state)) { + return VDO_INVALID_ADMIN_STATE; + } + + state->state = ADMIN_STATE_NORMAL_OPERATION; + return VDO_SUCCESS; +} + +/** + * Check whether an AdminStateCode is an operation. + * + * @param code The operation to check + * @param waiter The completion to notify if the code is not an operation; may + * be NULL + * + * @return true if the code is an operation + **/ +static bool assertOperation(AdminStateCode code, VDOCompletion *waiter) +{ + return checkCode(isOperation(code), code, "operation", waiter); +} + +/**********************************************************************/ +int startOperation(AdminState *state, AdminStateCode operation) +{ + return (assertOperation(operation, NULL) + ? beginOperation(state, operation, NULL, NULL) + : VDO_INVALID_ADMIN_STATE); +} + +/**********************************************************************/ +bool startOperationWithWaiter(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator) +{ + return (assertOperation(operation, waiter) + && (beginOperation(state, operation, waiter, initiator) + == VDO_SUCCESS)); +} + +/**********************************************************************/ +bool finishOperation(AdminState *state) +{ + return finishOperationWithResult(state, VDO_SUCCESS); +} + +/**********************************************************************/ +bool finishOperationWithResult(AdminState *state, int result) +{ + return endOperation(state, result); +} diff --git a/vdo/base/adminState.h b/vdo/base/adminState.h new file mode 100644 index 0000000..5ab13cb --- /dev/null +++ b/vdo/base/adminState.h @@ -0,0 +1,666 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/adminState.h#17 $ + */ + +#ifndef ADMIN_STATE_H +#define ADMIN_STATE_H + +#include "completion.h" +#include "types.h" + +/** + * The list of state types. + **/ +typedef enum { + /** Normal operation, DataVIOs may be active */ + ADMIN_TYPE_NORMAL = 0, + /** + * Format: an operation for formatting a new VDO. + **/ + ADMIN_TYPE_FORMAT, + /** + * Recover: a recovery operation. + **/ + ADMIN_TYPE_RECOVER, + /** + * Rebuild: write data necessary for a full rebuild, drain outstanding I/O, + * and return to normal operation. + **/ + ADMIN_TYPE_REBUILD, + /** + * Save: write all dirty metadata thereby restoring the VDO to a clean state, + * drain outstanding I/O, and become quiescent. + **/ + ADMIN_TYPE_SAVE, + /** + * Scrub: load and/or save state necessary to scrub a slab. + **/ + ADMIN_TYPE_SCRUB, + /** + * Suspend: write enough dirty metadata to perform resize transactions, + * drain outstanding I/O, and become quiescent. + **/ + ADMIN_TYPE_SUSPEND, + /** + * Resume: return to normal from a quiescent state + **/ + ADMIN_TYPE_RESUME, + /** The mask for extracting the AdminType from and AdminStateCode */ + ADMIN_TYPE_MASK = 0xff, +} AdminType; + + +/** + * The bit position of flags used to categorize states. + **/ +typedef enum { + ADMIN_FLAG_BIT_START = 8, + /** Flag indicating that I/O is draining */ + ADMIN_FLAG_BIT_DRAINING = ADMIN_FLAG_BIT_START, + /** Flag indicating a load operation */ + ADMIN_FLAG_BIT_LOADING, + /** Flag indicating that the next state will be a quiescent state */ + ADMIN_FLAG_BIT_QUIESCING, + /** Flag indicating that the state is quiescent */ + ADMIN_FLAG_BIT_QUIESCENT, + /** + * Flag indicating that an operation is in progress and so no other + * operation may be started. + **/ + ADMIN_FLAG_BIT_OPERATING, +} AdminFlagBit; + +/** + * The flags themselves. + **/ +typedef enum { + ADMIN_FLAG_DRAINING = (uint32_t) (1 << ADMIN_FLAG_BIT_DRAINING), + ADMIN_FLAG_LOADING = (uint32_t) (1 << ADMIN_FLAG_BIT_LOADING), + ADMIN_FLAG_QUIESCING = (uint32_t) (1 << ADMIN_FLAG_BIT_QUIESCING), + ADMIN_FLAG_QUIESCENT = (uint32_t) (1 << ADMIN_FLAG_BIT_QUIESCENT), + ADMIN_FLAG_OPERATING = (uint32_t) (1 << ADMIN_FLAG_BIT_OPERATING), +} AdminFlag; + +/** + * The state codes. 
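+ *
+ * Each code is the bitwise OR of an AdminType (in the low byte, extracted
+ * with ADMIN_TYPE_MASK) and zero or more AdminFlags. For example,
+ * ADMIN_STATE_SAVING below is
+ * (ADMIN_TYPE_SAVE | ADMIN_FLAG_OPERATING | ADMIN_FLAG_DRAINING
+ *  | ADMIN_FLAG_QUIESCING), while the quiescent state it leads to,
+ * ADMIN_STATE_SAVED, is just (ADMIN_TYPE_SAVE | ADMIN_FLAG_QUIESCENT).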
+ **/ +typedef enum { + ADMIN_STATE_NORMAL_OPERATION = ADMIN_TYPE_NORMAL, + ADMIN_STATE_OPERATING = (ADMIN_TYPE_NORMAL + | ADMIN_FLAG_OPERATING), + ADMIN_STATE_FORMATTING = (ADMIN_TYPE_FORMAT + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_LOADING), + ADMIN_STATE_LOADING = (ADMIN_TYPE_NORMAL + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_LOADING), + ADMIN_STATE_LOADING_FOR_RECOVERY = (ADMIN_TYPE_RECOVER + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_LOADING), + ADMIN_STATE_LOADING_FOR_REBUILD = (ADMIN_TYPE_REBUILD + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_LOADING), + ADMIN_STATE_WAITING_FOR_RECOVERY = (ADMIN_TYPE_RECOVER + | ADMIN_FLAG_OPERATING), + ADMIN_STATE_NEW = (ADMIN_TYPE_NORMAL + | ADMIN_FLAG_QUIESCENT), + ADMIN_STATE_RECOVERING = (ADMIN_TYPE_RECOVER + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING), + ADMIN_STATE_REBUILDING = (ADMIN_TYPE_REBUILD + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING), + ADMIN_STATE_SAVING = (ADMIN_TYPE_SAVE + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING + | ADMIN_FLAG_QUIESCING), + ADMIN_STATE_SAVED = (ADMIN_TYPE_SAVE + | ADMIN_FLAG_QUIESCENT), + ADMIN_STATE_SCRUBBING = (ADMIN_TYPE_SCRUB + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING + | ADMIN_FLAG_LOADING), + ADMIN_STATE_SAVE_FOR_SCRUBBING = (ADMIN_TYPE_SCRUB + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING), + ADMIN_STATE_SUSPENDING = (ADMIN_TYPE_SUSPEND + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING + | ADMIN_FLAG_QUIESCING), + ADMIN_STATE_SUSPENDED = (ADMIN_TYPE_SUSPEND + | ADMIN_FLAG_QUIESCENT), + ADMIN_STATE_SUSPENDED_OPERATION = (ADMIN_TYPE_SUSPEND + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_QUIESCENT), + ADMIN_STATE_RESUMING = (ADMIN_TYPE_RESUME + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_QUIESCENT), +} AdminStateCode; + +typedef struct { + /** The current administrative state */ + AdminStateCode state; + /** The next administrative state (when the current operation finishes */ + AdminStateCode nextState; + /** A completion waiting on a state change */ + VDOCompletion *waiter; + /** Whether an operation is being initiated */ + bool starting; + /** Whether an operation has completed in the initiator */ + bool complete; +} AdminState; + +/** + * A method to be called once an admin operation may be initiated. + **/ +typedef void AdminInitiator(AdminState *state); + +/** + * Get the name of an AdminStateCode for logging purposes. + * + * @param code The AdminStateCode + * + * @return The name of the state's code + **/ +const char *getAdminStateCodeName(AdminStateCode code) + __attribute__((warn_unused_result)); + +/** + * Get the name of an AdminState's code for logging purposes. + * + * @param state The AdminState + * + * @return The name of the state's code + **/ +const char *getAdminStateName(const AdminState *state) + __attribute__((warn_unused_result)); + +/** + * Check whether an AdminState is in normal operation. + * + * @param state The AdminState to query + * + * @return true if the state is normal + **/ +__attribute__((warn_unused_result)) +static inline bool isNormal(AdminState *state) +{ + return ((state->state & ADMIN_TYPE_MASK) == ADMIN_TYPE_NORMAL); +} + +/** + * Check whether an AdminStateCode is an operation. + * + * @param code The code to check + * + * @return true if the code is an operation + **/ +__attribute__((warn_unused_result)) +static inline bool isOperation(AdminStateCode code) +{ + return ((code & ADMIN_FLAG_OPERATING) == ADMIN_FLAG_OPERATING); +} + +/** + * Check whether an AdminState is operating. 
+ * + * @param state The AdminState to query + * + * @return true if the state is operating + **/ +__attribute__((warn_unused_result)) +static inline bool isOperating(AdminState *state) +{ + return isOperation(state->state); +} + +/** + * Check whether an AdminState is suspending. + * + * @param state The AdminState to query + * + * @return true if the state is suspending + **/ +__attribute__((warn_unused_result)) +static inline bool isSuspending(AdminState *state) +{ + return (state->state == ADMIN_STATE_SUSPENDING); +} + +/** + * Check whether an AdminState is suspended. + * + * @param state The AdminState to query + * + * @return true if the state is suspended + **/ +__attribute__((warn_unused_result)) +static inline bool isSuspended(AdminState *state) +{ + return (state->state == ADMIN_STATE_SUSPENDED); +} + +/** + * Check whether an AdminState is saving. + * + * @param state The AdminState to query + * + * @return true if the state is saving + **/ +__attribute__((warn_unused_result)) +static inline bool isSaving(AdminState *state) +{ + return (state->state == ADMIN_STATE_SAVING); +} + +/** + * Check whether an AdminState is saved. + * + * @param state The AdminState to query + * + * @return true if the state is saved + **/ +__attribute__((warn_unused_result)) +static inline bool isSaved(AdminState *state) +{ + return (state->state == ADMIN_STATE_SAVED); +} + +/** + * Check whether an AdminStateCode is a drain operation. + * + * @param code The AdminStateCode to check + * + * @return true if the code is for a drain operation + **/ +__attribute__((warn_unused_result)) +static inline bool isDrainOperation(AdminStateCode code) +{ + return ((code & ADMIN_FLAG_DRAINING) == ADMIN_FLAG_DRAINING); +} + +/** + * Check whether an AdminState is draining. + * + * @param state The AdminState to query + * + * @return true if the state is draining + **/ +__attribute__((warn_unused_result)) +static inline bool isDraining(AdminState *state) +{ + return isDrainOperation(state->state); +} + +/** + * Check whether an AdminStateCode is a load operation. + * + * @param code The AdminStateCode to check + * + * @return true if the code is for a load operation + **/ +__attribute__((warn_unused_result)) +static inline bool isLoadOperation(AdminStateCode code) +{ + return ((code & ADMIN_FLAG_LOADING) == ADMIN_FLAG_LOADING); +} + +/** + * Check whether an AdminState is loading. + * + * @param state The AdminState to query + * + * @return true if the state is loading + **/ +__attribute__((warn_unused_result)) +static inline bool isLoading(AdminState *state) +{ + return isLoadOperation(state->state); +} + +/** + * Check whether an AdminStateCode is a resume operation. + * + * @param code The AdminStateCode to check + * + * @return true if the code is for a resume operation + **/ +__attribute__((warn_unused_result)) +static inline bool isResumeOperation(AdminStateCode code) +{ + return ((code & ADMIN_TYPE_MASK) == ADMIN_TYPE_RESUME); +} + +/** + * Check whether an AdminState is resumeing. + * + * @param state The AdminState to query + * + * @return true if the state is resumeing + **/ +__attribute__((warn_unused_result)) +static inline bool isResuming(AdminState *state) +{ + return isResumeOperation(state->state); +} + +/** + * Check whether an AdminState is doing a clean load. 
+ * + * @param state The AdminState to query + * + * @return true if the state is a clean load + **/ +__attribute__((warn_unused_result)) +static inline bool isCleanLoad(AdminState *state) +{ + return ((state->state == ADMIN_STATE_FORMATTING) + || (state->state == ADMIN_STATE_LOADING)); +} + +/** + * Check whether an AdminStateCode is quiescing. + * + * param code The AdminStateCode to check + * + * @return true is the state is quiescing + **/ +__attribute__((warn_unused_result)) +static inline bool isQuiescingCode(AdminStateCode code) +{ + return ((code & ADMIN_FLAG_QUIESCING) == ADMIN_FLAG_QUIESCING); +} + +/** + * Check whether an AdminState is quiescing. + * + * @param state The AdminState to check + * + * @return true if the state is quiescing + **/ +__attribute__((warn_unused_result)) +static inline bool isQuiescing(AdminState *state) +{ + return isQuiescingCode(state->state); +} + +/** + * Check where an AdminStateCode is quiescent. + * + * param code The AdminStateCode to check + * + * @return true is the state is quiescent + **/ +__attribute__((warn_unused_result)) +static inline bool isQuiescentCode(AdminStateCode code) +{ + return ((code & ADMIN_FLAG_QUIESCENT) == ADMIN_FLAG_QUIESCENT); +} + +/** + * Check whether an AdminState is quiescent. + * + * @param state The AdminState to query + * + * @return true is the state is quiescent + **/ +__attribute__((warn_unused_result)) +static inline bool isQuiescent(AdminState *state) +{ + return isQuiescentCode(state->state); +} + +/** + * Check whether an AdminStateCode is a quiescent operation. + * + * @param code The code to check + * + * @return true if the code is a quiescent operation + **/ +__attribute__((warn_unused_result)) +static inline bool isQuiescentOperation(AdminStateCode code) +{ + return (isQuiescentCode(code) && isOperation(code)); +} + +/** + * Check that an operation is a drain. + * + * @param operation The operation to check + * @param waiter The completion to finish with an error if the operation is + * not a drain + * + * @return true if the specified operation is a drain + **/ +bool assertDrainOperation(AdminStateCode operation, VDOCompletion *waiter) + __attribute__((warn_unused_result)); + +/** + * Initiate a drain operation if the current state permits it. + * + * @param state The AdminState + * @param operation The type of drain to initiate + * @param waiter The completion to notify when the drain is complete; may + * be NULL + * @param initiator The AdminInitiator to call if the operation may begin; may + * be NULL + * + * @return true if the drain was initiated, if not the waiter + * will be notified + **/ +bool startDraining(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator); + +/** + * Finish a drain operation if one was in progress. + * + * @param state The AdminState to query + * + * @return true if the state was draining; will notify the waiter + * if so + **/ +bool finishDraining(AdminState *state); + +/** + * Finish a drain operation with a status code. + * + * @param state The AdminState to query + * @param result The result of the drain operation + * + * @return true if the state was draining; will notify the + * waiter if so + **/ +bool finishDrainingWithResult(AdminState *state, int result); + +/** + * Check that an operation is a load. 
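+ *
+ * The load helpers declared below are typically driven in a pattern like
+ * this illustrative sketch ('component', 'parent', and 'initiateLoad' are
+ * hypothetical names):
+ *
+ *   if (startLoading(&component->state, ADMIN_STATE_LOADING, parent,
+ *                    initiateLoad)) {
+ *     // initiateLoad() has launched the asynchronous load; when it is
+ *     // done, a callback calls:
+ *     //   finishLoadingWithResult(&component->state, result);
+ *   }
+ *   // If the load could not start, the parent has already been notified.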
+ * + * @param operation The operation to check + * @param waiter The completion to finish with an error if the operation is + * not a load + * + * @return true if the specified operation is a load + **/ +bool assertLoadOperation(AdminStateCode operation, VDOCompletion *waiter) + __attribute__((warn_unused_result)); + +/** + * Initiate a load operation if the current state permits it. + * + * @param state The AdminState + * @param operation The type of load to initiate + * @param waiter The completion to notify when the load is complete; may be + * NULL + * @param initiator The AdminInitiator to call if the operation may begin; may + * be NULL + * + * @return true if the load was initiated, if not the waiter + * will be notified + **/ +bool startLoading(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator); + +/** + * Finish a load operation if one was in progress. + * + * @param state The AdminState to query + * + * @return true if the state was loading; will notify the waiter + * if so + **/ +bool finishLoading(AdminState *state); + +/** + * Finish a load operation with a status code. + * + * @param state The AdminState to query + * @param result The result of the load operation + * + * @return true if the state was loading; will notify the + * waiter if so + **/ +bool finishLoadingWithResult(AdminState *state, int result); + +/** + * Check whether an AdminStateCode is a resume operation. + * + * @param operation The operation to check + * @param waiter The completion to notify if the operation is not a resume + * operation; may be NULL + * + * @return true if the code is a resume operation + **/ +bool assertResumeOperation(AdminStateCode operation, VDOCompletion *waiter); + +/** + * Initiate a resume operation if the current state permits it. + * + * @param state The AdminState + * @param operation The type of resume to start + * @param waiter The completion to notify when the resume is complete; may + * be NULL + * @param initiator The AdminInitiator to call if the operation may begin; may + * be NULL + * + * @return true if the resume was initiated, if not the waiter + * will be notified + **/ +bool startResuming(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator); + +/** + * Finish a resume operation if one was in progress. + * + * @param state The AdminState to query + * + * @return true if the state was resuming; will notify the waiter + * if so + **/ +bool finishResuming(AdminState *state); + +/** + * Finish a resume operation with a status code. + * + * @param state The AdminState to query + * @param result The result of the resume operation + * + * @return true if the state was resuming; will notify the + * waiter if so + **/ +bool finishResumingWithResult(AdminState *state, int result); + +/** + * Change the state to normal operation if the current state is quiescent. + * + * @param state The AdminState to resume + * + * @return VDO_SUCCESS if the state resumed, VDO_INVALID_ADMIN_STATE otherwise + **/ +int resumeIfQuiescent(AdminState *state); + +/** + * Attempt to start an operation. + * + * @param state the AdminState + * @param operation the operation to start + * + * @return VDO_SUCCESS if the operation was started + * VDO_INVALID_ADMIN_STATE if not + **/ +int startOperation(AdminState *state, AdminStateCode operation); + +/** + * Attempt to start an operation. 
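+ *
+ * For illustration, the waiterless variant declared above is commonly used
+ * to bracket work which is only legal while quiescent (finishOperation()
+ * is declared below):
+ *
+ *   if (startOperation(&state, ADMIN_STATE_SUSPENDED_OPERATION)
+ *       == VDO_SUCCESS) {
+ *     // ... work which requires the component to remain suspended ...
+ *     finishOperation(&state);
+ *   }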
+ * + * @param state the AdminState + * @param operation the operation to start + * @param waiter the completion to notify when the operation completes or + * fails to start; may be NULL + * @param initiator The AdminInitiator to call if the operation may begin; may + * be NULL + * + * @return true if the operation was started + **/ +bool startOperationWithWaiter(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator); + +/** + * Finish the current operation. Will notify the operation waiter if there is + * one. This method should be used for operations started with + * startOperation(). For operations which were started with startDraining(), + * use finishDraining() instead. + * + * @param state The state whose operation is to be finished + * + * @return true if there was an operation to finish + **/ +bool finishOperation(AdminState *state); + +/** + * Finish the current operation with a status code. Will notify the operation + * waiter if there is one. + * + * @param state The state whose operation is to be finished + * @param result The result of the operation + **/ +bool finishOperationWithResult(AdminState *state, int result); + +/** + * Set a result for the current operation. + * + * @param state the AdminState + * @param result the result to set; if there is no waiter, this is a no-op + **/ +static inline void setOperationResult(AdminState *state, int result) +{ + if (state->waiter != NULL) { + setCompletionResult(state->waiter, result); + } +} + +#endif // ADMIN_STATE_H diff --git a/vdo/base/allocatingVIO.c b/vdo/base/allocatingVIO.c new file mode 100644 index 0000000..4e0ffa8 --- /dev/null +++ b/vdo/base/allocatingVIO.c @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocatingVIO.c#4 $ + */ + +#include "allocatingVIO.h" + +#include "logger.h" + +#include "allocationSelector.h" +#include "blockAllocator.h" +#include "dataVIO.h" +#include "pbnLock.h" +#include "slabDepot.h" +#include "types.h" +#include "vdoInternal.h" +#include "vioWrite.h" + +/** + * Make a single attempt to acquire a write lock on a newly-allocated PBN. 
+ * + * @param allocatingVIO The AllocatingVIO that wants a write lock for its + * newly allocated block + * + * @return VDO_SUCCESS or an error code + **/ +static int attemptPBNWriteLock(AllocatingVIO *allocatingVIO) +{ + assertInPhysicalZone(allocatingVIO); + + ASSERT_LOG_ONLY(allocatingVIO->allocationLock == NULL, + "must not acquire a lock while already referencing one"); + + PBNLock *lock; + int result = attemptPBNLock(allocatingVIO->zone, allocatingVIO->allocation, + allocatingVIO->writeLockType, &lock); + if (result != VDO_SUCCESS) { + return result; + } + + if (lock->holderCount > 0) { + // This block is already locked, which should be impossible. + return logErrorWithStringError(VDO_LOCK_ERROR, + "Newly allocated block %" PRIu64 + " was spuriously locked (holderCount=%u)", + allocatingVIO->allocation, + lock->holderCount); + } + + // We've successfully acquired a new lock, so mark it as ours. + lock->holderCount += 1; + allocatingVIO->allocationLock = lock; + assignProvisionalReference(lock); + return VDO_SUCCESS; +} + +/** + * Attempt to allocate and lock a physical block. If successful, continue + * along the write path. + * + * @param allocatingVIO The AllocatingVIO which needs an allocation + * + * @return VDO_SUCCESS or an error if a block could not be allocated + **/ +static int allocateAndLockBlock(AllocatingVIO *allocatingVIO) +{ + BlockAllocator *allocator = getBlockAllocator(allocatingVIO->zone); + int result = allocateBlock(allocator, &allocatingVIO->allocation); + if (result != VDO_SUCCESS) { + return result; + } + + result = attemptPBNWriteLock(allocatingVIO); + if (result != VDO_SUCCESS) { + return result; + } + + // We got a block! + VIO *vio = allocatingVIOAsVIO(allocatingVIO); + vio->physical = allocatingVIO->allocation; + allocatingVIO->allocationCallback(allocatingVIO); + return VDO_SUCCESS; +} + +static void allocateBlockForWrite(VDOCompletion *completion); + +/** + * Retry allocating a block for write. + * + * @param waiter The AllocatingVIO that was waiting to allocate + * @param context The context (unused) + **/ +static void +retryAllocateBlockForWrite(Waiter *waiter, + void *context __attribute__((unused))) +{ + AllocatingVIO *allocatingVIO = waiterAsAllocatingVIO(waiter); + allocateBlockForWrite(allocatingVIOAsCompletion(allocatingVIO)); +} + +/** + * Attempt to enqueue an AllocatingVIO to wait for a slab to be scrubbed in the + * current allocation zone. + * + * @param allocatingVIO The AllocatingVIO which wants to allocate a block + * + * @return VDO_SUCCESS if the AllocatingVIO was queued, VDO_NO_SPACE if there + * are no slabs to be scrubbed in the current zone, or some other + * error + **/ +static int waitForCleanSlab(AllocatingVIO *allocatingVIO) +{ + Waiter *waiter = allocatingVIOAsWaiter(allocatingVIO); + waiter->callback = retryAllocateBlockForWrite; + + BlockAllocator *allocator = getBlockAllocator(allocatingVIO->zone); + int result = enqueueForCleanSlab(allocator, waiter); + if (result != VDO_SUCCESS) { + return result; + } + + // We've successfully enqueued, when we come back, pretend like we've + // never tried this allocation before. + allocatingVIO->waitForCleanSlab = false; + allocatingVIO->allocationAttempts = 0; + return VDO_SUCCESS; +} + +/** + * Attempt to allocate a block in an AllocatingVIO's current allocation zone. 
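+ *
+ * In outline, the retry policy implemented here is: each physical zone is
+ * tried once for a free block. If every zone reports VDO_NO_SPACE,
+ * waitForCleanSlab is set and a second pass begins in which each zone is
+ * asked again and, failing that, the VIO attempts to queue for a slab
+ * awaiting scrubbing in that zone. Only when the second pass also visits
+ * every zone without success is the allocation callback invoked with no
+ * allocation (the allocation field still holds ZERO_BLOCK).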
+ * + * @param allocatingVIO The AllocatingVIO + * + * @return VDO_SUCCESS or an error + **/ +static int allocateBlockInZone(AllocatingVIO *allocatingVIO) +{ + allocatingVIO->allocationAttempts++; + int result = allocateAndLockBlock(allocatingVIO); + if (result != VDO_NO_SPACE) { + return result; + } + + if (allocatingVIO->waitForCleanSlab) { + result = waitForCleanSlab(allocatingVIO); + if (result != VDO_NO_SPACE) { + return result; + } + } + + VDO *vdo = getVDOFromAllocatingVIO(allocatingVIO); + const ThreadConfig *threadConfig = getThreadConfig(vdo); + if (allocatingVIO->allocationAttempts >= threadConfig->physicalZoneCount) { + if (allocatingVIO->waitForCleanSlab) { + // There were no free blocks in any zone, and no zone had slabs to + // scrub. + allocatingVIO->allocationCallback(allocatingVIO); + return VDO_SUCCESS; + } + + allocatingVIO->waitForCleanSlab = true; + allocatingVIO->allocationAttempts = 0; + } + + // Try the next zone + ZoneCount zoneNumber = getPhysicalZoneNumber(allocatingVIO->zone) + 1; + if (zoneNumber == threadConfig->physicalZoneCount) { + zoneNumber = 0; + } + allocatingVIO->zone = vdo->physicalZones[zoneNumber]; + launchPhysicalZoneCallback(allocatingVIO, allocateBlockForWrite, + THIS_LOCATION("$F;cb=allocBlockInZone")); + return VDO_SUCCESS; +} + +/** + * Attempt to allocate a block. This callback is registered in + * allocateDataBlock() and allocateBlockInZone(). + * + * @param completion The AllocatingVIO needing an allocation + **/ +static void allocateBlockForWrite(VDOCompletion *completion) +{ + AllocatingVIO *allocatingVIO = asAllocatingVIO(completion); + assertInPhysicalZone(allocatingVIO); + allocatingVIOAddTraceRecord(allocatingVIO, THIS_LOCATION(NULL)); + int result = allocateBlockInZone(allocatingVIO); + if (result != VDO_SUCCESS) { + setCompletionResult(completion, result); + allocatingVIO->allocationCallback(allocatingVIO); + } +} + +/**********************************************************************/ +void allocateDataBlock(AllocatingVIO *allocatingVIO, + AllocationSelector *selector, + PBNLockType writeLockType, + AllocationCallback *callback) +{ + allocatingVIO->writeLockType = writeLockType; + allocatingVIO->allocationCallback = callback; + allocatingVIO->allocationAttempts = 0; + allocatingVIO->allocation = ZERO_BLOCK; + + VIO *vio = allocatingVIOAsVIO(allocatingVIO); + allocatingVIO->zone + = vio->vdo->physicalZones[getNextAllocationZone(selector)]; + + launchPhysicalZoneCallback(allocatingVIO, allocateBlockForWrite, + THIS_LOCATION("$F;cb=allocDataBlock")); +} + +/**********************************************************************/ +void releaseAllocationLock(AllocatingVIO *allocatingVIO) +{ + assertInPhysicalZone(allocatingVIO); + PhysicalBlockNumber lockedPBN = allocatingVIO->allocation; + if (hasProvisionalReference(allocatingVIO->allocationLock)) { + allocatingVIO->allocation = ZERO_BLOCK; + } + + releasePBNLock(allocatingVIO->zone, lockedPBN, + &allocatingVIO->allocationLock); +} + +/**********************************************************************/ +void resetAllocation(AllocatingVIO *allocatingVIO) +{ + ASSERT_LOG_ONLY(allocatingVIO->allocationLock == NULL, + "must not reset allocation while holding a PBN lock"); + + allocatingVIOAsVIO(allocatingVIO)->physical = ZERO_BLOCK; + allocatingVIO->zone = NULL; + allocatingVIO->allocation = ZERO_BLOCK; + allocatingVIO->allocationAttempts = 0; + allocatingVIO->waitForCleanSlab = false; +} diff --git a/vdo/base/allocatingVIO.h b/vdo/base/allocatingVIO.h new file mode 100644 index 
0000000..a2f2b7b --- /dev/null +++ b/vdo/base/allocatingVIO.h @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocatingVIO.h#4 $ + */ + +#ifndef ALLOCATING_VIO_H +#define ALLOCATING_VIO_H + +#include "atomic.h" +#include "pbnLock.h" +#include "physicalZone.h" +#include "types.h" +#include "vio.h" +#include "waitQueue.h" + +typedef void AllocationCallback(AllocatingVIO *allocationVIO); + +/** + * A VIO which can receive an allocation from the block allocator. Currently, + * these are used both for servicing external data requests and for compressed + * block writes. + **/ +struct allocatingVIO { + /** The underlying VIO */ + VIO vio; + + /** The WaitQueue entry structure */ + Waiter waiter; + + /** The physical zone in which to allocate a physical block */ + PhysicalZone *zone; + + /** The block allocated to this VIO */ + PhysicalBlockNumber allocation; + + /** + * If non-NULL, the pooled PBN lock held on the allocated block. Must be a + * write lock until the block has been written, after which it will become a + * read lock. + **/ + PBNLock *allocationLock; + + /** The type of write lock to obtain on the allocated block */ + PBNLockType writeLockType; + + /** The number of zones in which this VIO has attempted an allocation */ + ZoneCount allocationAttempts; + + /** Whether this VIO should wait for a clean slab */ + bool waitForCleanSlab; + + /** The function to call once allocation is complete */ + AllocationCallback *allocationCallback; +}; + +/** + * Convert a VIO to an AllocatingVIO. + * + * @param vio The VIO to convert + * + * @return The VIO as an AllocatingVIO + **/ +static inline AllocatingVIO *vioAsAllocatingVIO(VIO *vio) +{ + STATIC_ASSERT(offsetof(AllocatingVIO, vio) == 0); + ASSERT_LOG_ONLY(((vio->type == VIO_TYPE_DATA) + || (vio->type == VIO_TYPE_COMPRESSED_BLOCK)), + "VIO is an AllocatingVIO"); + return (AllocatingVIO *) vio; +} + +/** + * Convert an AllocatingVIO to a VIO. + * + * @param allocatingVIO The AllocatingVIO to convert + * + * @return The AllocatingVIO as a VIO + **/ +static inline VIO *allocatingVIOAsVIO(AllocatingVIO *allocatingVIO) +{ + return &allocatingVIO->vio; +} + +/** + * Convert a generic VDOCompletion to an AllocatingVIO. + * + * @param completion The completion to convert + * + * @return The completion as an AllocatingVIO + **/ +static inline AllocatingVIO *asAllocatingVIO(VDOCompletion *completion) +{ + return vioAsAllocatingVIO(asVIO(completion)); +} + +/** + * Convert an AllocatingVIO to a generic completion. 
+ * + * @param allocatingVIO The AllocatingVIO to convert + * + * @return The AllocatingVIO as a completion + **/ +static inline +VDOCompletion *allocatingVIOAsCompletion(AllocatingVIO *allocatingVIO) +{ + return vioAsCompletion(allocatingVIOAsVIO(allocatingVIO)); +} + +/** + * Convert an AllocatingVIO to a generic wait queue entry. + * + * @param allocatingVIO The AllocatingVIO to convert + * + * @return The AllocatingVIO as a wait queue entry + **/ +static inline Waiter *allocatingVIOAsWaiter(AllocatingVIO *allocatingVIO) +{ + return &allocatingVIO->waiter; +} + +/** + * Convert an AllocatingVIO's generic wait queue entry back to the + * AllocatingVIO. + * + * @param waiter The wait queue entry to convert + * + * @return The wait queue entry as an AllocatingVIO + **/ +static inline AllocatingVIO *waiterAsAllocatingVIO(Waiter *waiter) +{ + if (waiter == NULL) { + return NULL; + } + + return + (AllocatingVIO *) ((uintptr_t) waiter - offsetof(AllocatingVIO, waiter)); +} + +/** + * Check whether an AllocatingVIO is a compressed block write. + * + * @param allocatingVIO The AllocatingVIO to check + * + * @return true if the AllocatingVIO is a compressed block write + **/ +static inline bool isCompressedWriteAllocatingVIO(AllocatingVIO *allocatingVIO) +{ + return isCompressedWriteVIO(allocatingVIOAsVIO(allocatingVIO)); +} + +/** + * Add a trace record for the current source location. + * + * @param allocatingVIO The AllocatingVIO structure to be updated + * @param location The source-location descriptor to be recorded + **/ +static inline void allocatingVIOAddTraceRecord(AllocatingVIO *allocatingVIO, + TraceLocation location) +{ + vioAddTraceRecord(allocatingVIOAsVIO(allocatingVIO), location); +} + +/** + * Get the VDO from an AllocatingVIO. + * + * @param allocatingVIO The AllocatingVIO from which to get the VDO + * + * @return The VDO to which an AllocatingVIO belongs + **/ +static inline VDO *getVDOFromAllocatingVIO(AllocatingVIO *allocatingVIO) +{ + return allocatingVIOAsVIO(allocatingVIO)->vdo; +} + +/** + * Check that an AllocatingVIO is running on the physical zone thread in + * which it did its allocation. + * + * @param allocatingVIO The AllocatingVIO in question + **/ +static inline void assertInPhysicalZone(AllocatingVIO *allocatingVIO) +{ + ThreadID expected = getPhysicalZoneThreadID(allocatingVIO->zone); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "AllocatingVIO for allocated physical block %" PRIu64 + " on thread %u, should be on thread %u", + allocatingVIO->allocation, threadID, expected); +} + +/** + * Set a callback as a physical block operation in an AllocatingVIO's allocated + * zone. + * + * @param allocatingVIO The AllocatingVIO + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setPhysicalZoneCallback(AllocatingVIO *allocatingVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(allocatingVIOAsCompletion(allocatingVIO), callback, + getPhysicalZoneThreadID(allocatingVIO->zone)); + allocatingVIOAddTraceRecord(allocatingVIO, location); +} + +/** + * Set a callback as a physical block operation in an AllocatingVIO's allocated + * zone and invoke it immediately. 
+ * + * @param allocatingVIO The AllocatingVIO + * @param callback The callback to invoke + * @param location The tracing info for the call site + **/ +static inline void launchPhysicalZoneCallback(AllocatingVIO *allocatingVIO, + VDOAction *callback, + TraceLocation location) +{ + setPhysicalZoneCallback(allocatingVIO, callback, location); + invokeCallback(allocatingVIOAsCompletion(allocatingVIO)); +} + +/** + * Allocate a data block to an AllocatingVIO. + * + * @param allocatingVIO The AllocatingVIO which needs an allocation + * @param selector The allocation selector for deciding which physical + * zone to allocate from + * @param writeLockType The type of write lock to obtain on the block + * @param callback The function to call once the allocation is complete + **/ +void allocateDataBlock(AllocatingVIO *allocatingVIO, + AllocationSelector *selector, + PBNLockType writeLockType, + AllocationCallback *callback); + +/** + * Release the PBN lock on the allocated block. If the reference to the locked + * block is still provisional, it will be released as well. + * + * @param allocatingVIO The lock holder + **/ +void releaseAllocationLock(AllocatingVIO *allocatingVIO); + +/** + * Reset an AllocatingVIO after it has done an allocation. + * + * @param allocatingVIO The AllocatingVIO + **/ +void resetAllocation(AllocatingVIO *allocatingVIO); + +#endif // ALLOCATING_VIO_H diff --git a/vdo/base/allocationSelector.c b/vdo/base/allocationSelector.c new file mode 100644 index 0000000..e703d09 --- /dev/null +++ b/vdo/base/allocationSelector.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocationSelector.c#1 $ + */ + +#include "allocationSelector.h" +#include "allocationSelectorInternals.h" + +#include "memoryAlloc.h" + +#include "types.h" + +enum { + ALLOCATIONS_PER_ZONE = 128, +}; + +/**********************************************************************/ +int makeAllocationSelector(ZoneCount physicalZoneCount, + ThreadID threadID, + AllocationSelector **selectorPtr) +{ + AllocationSelector *selector; + int result = ALLOCATE(1, AllocationSelector, __func__, &selector); + if (result != VDO_SUCCESS) { + return result; + } + + *selector = (AllocationSelector) { + .nextAllocationZone = threadID % physicalZoneCount, + .lastPhysicalZone = physicalZoneCount - 1, + }; + + *selectorPtr = selector; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeAllocationSelector(AllocationSelector **selectorPtr) +{ + AllocationSelector *selector = *selectorPtr; + if (selector == NULL) { + return; + } + + FREE(selector); + *selectorPtr = NULL; +} + +/**********************************************************************/ +ZoneCount getNextAllocationZone(AllocationSelector *selector) +{ + if (selector->lastPhysicalZone > 0) { + if (selector->allocationCount < ALLOCATIONS_PER_ZONE) { + selector->allocationCount++; + } else { + selector->allocationCount = 1; + if (selector->nextAllocationZone < selector->lastPhysicalZone) { + selector->nextAllocationZone++; + } else { + selector->nextAllocationZone = 0; + } + } + } + + return selector->nextAllocationZone; +} diff --git a/vdo/base/allocationSelector.h b/vdo/base/allocationSelector.h new file mode 100644 index 0000000..7b922e9 --- /dev/null +++ b/vdo/base/allocationSelector.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocationSelector.h#1 $ + */ + +#ifndef ALLOCATION_SELECTOR_H +#define ALLOCATION_SELECTOR_H + +#include "completion.h" + +/** + * An AllocationSelector is used by any zone which does data block allocations. + * The selector is used to round-robin allocation requests to different + * physical zones. Currently, 128 allocations will be made to a given physical + * zone before switching to the next. + **/ + +/** + * Make a new allocation selector. + * + * @param [in] physicalZoneCount The number of physical zones + * @param [in] threadID The ID of the thread using this selector + * @param [out] selectorPtr A pointer to receive the new selector + * + * @return VDO_SUCCESS or an error + **/ +int makeAllocationSelector(ZoneCount physicalZoneCount, + ThreadID threadID, + AllocationSelector **selectorPtr) + __attribute__((warn_unused_result)); + +/** + * Free an AllocationSelector and null out the reference to it. 
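+ *
+ * Illustrative lifecycle sketch (the surrounding variables are
+ * hypothetical):
+ *
+ *   AllocationSelector *selector;
+ *   int result = makeAllocationSelector(physicalZoneCount, threadID,
+ *                                       &selector);
+ *   if (result != VDO_SUCCESS) {
+ *     return result;
+ *   }
+ *   ZoneCount zone = getNextAllocationZone(selector);  // round-robin pick
+ *   // ... allocate from physical zone 'zone' ...
+ *   freeAllocationSelector(&selector);                 // also nulls selector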
+ * + * @param selectorPtr A reference to the selector to free + **/ +void freeAllocationSelector(AllocationSelector **selectorPtr); + +/** + * Get number of the physical zone from which to allocate next. + * + * @param selector The selector to query + * + * @return The number of the physical zone from which to allocate + **/ +ZoneCount getNextAllocationZone(AllocationSelector *selector) + __attribute__((warn_unused_result)); + +#endif /* ALLOCATION_SELECTOR_H */ diff --git a/vdo/base/allocationSelectorInternals.h b/vdo/base/allocationSelectorInternals.h new file mode 100644 index 0000000..13df50f --- /dev/null +++ b/vdo/base/allocationSelectorInternals.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocationSelectorInternals.h#1 $ + */ + +#ifndef ALLOCATION_SELECTOR_INTERNALS_H +#define ALLOCATION_SELECTOR_INTERNALS_H + +#include "types.h" + +/** Structure used to select which physical zone to allocate from */ +struct allocationSelector { + /** The number of allocations done in the current zone */ + BlockCount allocationCount; + /** The physical zone to allocate from next */ + ZoneCount nextAllocationZone; + /** The number of the last physical zone */ + ZoneCount lastPhysicalZone; +}; + +#endif /* ALLOCATION_SELECTOR_INTERNALS_H */ diff --git a/vdo/base/atomic.h b/vdo/base/atomic.h new file mode 100644 index 0000000..93b7318 --- /dev/null +++ b/vdo/base/atomic.h @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/atomic.h#2 $ + */ + +#ifndef ATOMIC_H +#define ATOMIC_H + +#include "atomicDefs.h" +#include "compiler.h" +#include "typeDefs.h" + +#define ATOMIC_INITIALIZER(value) { (value) } + +typedef struct { + atomic_t value; +} __attribute__((aligned(4))) Atomic32; + +typedef struct { + atomic64_t value; +} __attribute__((aligned(8))) Atomic64; + +typedef struct { + Atomic32 value; +} __attribute__((aligned(4))) AtomicBool; + +/** + * Memory load operations that precede this fence will be prevented from + * changing order with any that follow this fence, by either the compiler or + * the CPU. This can be used to ensure that the load operations accessing + * the fields of a structure are not re-ordered so they actually take effect + * before a pointer to the structure is resolved. + **/ +static INLINE void loadFence(void) +{ + smp_rmb(); +} + +/** + * Memory store operations that precede this fence will be prevented from + * changing order with any that follow this fence, by either the compiler or + * the CPU. This can be used to ensure that the store operations initializing + * the fields of a structure are not re-ordered so they actually take effect + * after a pointer to the structure is published. + **/ +static INLINE void storeFence(void) +{ + smp_wmb(); +} + +/** + * Generate a full memory fence for the compiler and CPU. Load and store + * operations issued before the fence will not be re-ordered with operations + * issued after the fence. + **/ +static INLINE void memoryFence(void) +{ + smp_mb(); +} + +/** + * Access the value of a 32-bit atomic variable, ensuring that the load is not + * re-ordered by the compiler or CPU with any subsequent load operations. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE uint32_t atomicLoad32(const Atomic32 *atom) +{ + uint32_t value = atomic_read(&atom->value); + loadFence(); + return value; +} + +/** + * Access the value of a 64-bit atomic variable, ensuring that the memory load + * is not re-ordered by the compiler or CPU with any subsequent load + * operations. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE uint64_t atomicLoad64(const Atomic64 *atom) +{ + uint64_t value = atomic64_read(&atom->value); + loadFence(); + return value; +} + +/** + * Access the value of a boolean atomic variable, ensuring that the load is not + * re-ordered by the compiler or CPU with any subsequent load operations. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE bool atomicLoadBool(const AtomicBool *atom) +{ + return (atomicLoad32(&atom->value) > 0); +} + +/** + * Set the value of a 32-bit atomic variable, ensuring that the memory store + * operation is not re-ordered by the compiler or CPU with any preceding store + * operations. + * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void atomicStore32(Atomic32 *atom, uint32_t newValue) +{ + storeFence(); + atomic_set(&atom->value, newValue); +} + +/** + * Set the value of a 64-bit atomic variable, ensuring that the memory store + * operation is not re-ordered by the compiler or CPU with any preceding store + * operations. 
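+ *
+ * As an illustrative pairing (assumed, not taken from the original sources;
+ * 'stats' and 'count' are placeholder names): a writer publishing a counter
+ * with this call is typically matched by a reader using atomicLoad64(), so
+ * the store fence here pairs with the load fence there:
+ *
+ *   atomicStore64(&stats.blocksWritten, count);          // writer thread
+ *   uint64_t seen = atomicLoad64(&stats.blocksWritten);  // reader thread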
+ * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void atomicStore64(Atomic64 *atom, uint64_t newValue) +{ + storeFence(); + atomic64_set(&atom->value, newValue); +} + +/** + * Set the value of a boolean atomic variable, ensuring that the memory store + * operation is not re-ordered by the compiler or CPU with any preceding store + * operations. + * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void atomicStoreBool(AtomicBool *atom, bool newValue) +{ + atomicStore32(&atom->value, (newValue ? 1 : 0)); +} + +/** + * Add a 32-bit signed delta to a 32-bit atomic variable. + * + * @param atom a pointer to the atomic variable + * @param delta the value to be added (or subtracted) from the variable + * + * @return the new value of the atom after the add operation + **/ +static INLINE uint32_t atomicAdd32(Atomic32 *atom, int32_t delta) +{ + return atomic_add_return(delta, &atom->value); +} + +/** + * Add a 64-bit signed delta to a 64-bit atomic variable. + * + * @param atom a pointer to the atomic variable + * @param delta the value to be added (or subtracted) from the variable + * + * @return the new value of the atom after the add operation + **/ +static INLINE uint64_t atomicAdd64(Atomic64 *atom, int64_t delta) +{ + return atomic64_add_return(delta, &atom->value); +} + +/** + * Atomic 32-bit compare-and-swap. If the atom is identical to a required + * value, atomically replace it with the new value and return true, otherwise + * do nothing and return false. + * + * @param atom a pointer to the atomic variable + * @param requiredValue the value that must be present to perform the swap + * @param newValue the value to be swapped for the required value + * + * @return true if the atom was changed, false otherwise + **/ +static INLINE bool compareAndSwap32(Atomic32 *atom, + uint32_t requiredValue, + uint32_t newValue) +{ + /* + * Our initial implementation, for x86, effectively got a full + * memory barrier because of how "lock cmpxchg" operates. The + * atomic_cmpxchg interface provides for a full barrier *if* the + * exchange is done, but not necessarily if it is not. + * + * Do we need the full barrier always? We need to investigate that, + * as part of (eventually) converting to using that API directly. + * For now, play it safe, and ensure the same behavior on other + * architectures too. + */ +#ifndef __x86_64__ + smp_mb(); +#endif + int oldValue = atomic_cmpxchg(&atom->value, requiredValue, newValue); +#ifndef __x86_64__ + smp_mb(); +#endif + return requiredValue == (uint32_t) oldValue; +} + +/** + * Atomic 64-bit compare-and-swap. If the atom is identical to a required + * value, atomically replace it with the new value and return true, otherwise + * do nothing and return false. + * + * @param atom a pointer to the atomic variable + * @param requiredValue the value that must be present to perform the swap + * @param newValue the value to be swapped for the required value + * + * @return true if the atom was changed, false otherwise + **/ +static INLINE bool compareAndSwap64(Atomic64 *atom, + uint64_t requiredValue, + uint64_t newValue) +{ +#ifndef __x86_64__ + smp_mb(); +#endif + long oldValue = atomic64_cmpxchg(&atom->value, requiredValue, newValue); +#ifndef __x86_64__ + smp_mb(); +#endif + return requiredValue == (uint64_t) oldValue; +} + +/** + * Atomic boolean compare-and-swap. 
If the atom is identical to a required + * value, atomically replace it with the new value and return true, otherwise + * do nothing and return false. + * + * @param atom a pointer to the atomic variable + * @param requiredValue the value that must be present to perform the swap + * @param newValue the value to be swapped for the required value + * + * @return true if the atom was changed, false otherwise + **/ +static INLINE bool compareAndSwapBool(AtomicBool *atom, + bool requiredValue, + bool newValue) +{ + return compareAndSwap32(&atom->value, (requiredValue ? 1 : 0), + (newValue ? 1 : 0)); +} + +/** + * Access the value of a 32-bit atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE uint32_t relaxedLoad32(const Atomic32 *atom) +{ + return atomic_read(&atom->value); +} + +/** + * Access the value of a 64-bit atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE uint64_t relaxedLoad64(const Atomic64 *atom) +{ + return atomic64_read(&atom->value); +} + +/** + * Access the value of a boolean atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE bool relaxedLoadBool(const AtomicBool *atom) +{ + return (relaxedLoad32(&atom->value) > 0); +} + +/** + * Set the value of a 32-bit atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void relaxedStore32(Atomic32 *atom, uint32_t newValue) +{ + atomic_set(&atom->value, newValue); +} + +/** + * Set the value of a 64-bit atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void relaxedStore64(Atomic64 *atom, uint64_t newValue) +{ + atomic64_set(&atom->value, newValue); +} + +/** + * Set the value of a boolean atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void relaxedStoreBool(AtomicBool *atom, bool newValue) +{ + relaxedStore32(&atom->value, (newValue ? 1 : 0)); +} + +/** + * Non-atomically add a 32-bit signed delta to a 32-bit atomic variable, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable + * @param delta the value to be added (or subtracted) from the variable + * + * @return the new value of the atom after the add operation + **/ +static INLINE uint32_t relaxedAdd32(Atomic32 *atom, int32_t delta) +{ + uint32_t newValue = (relaxedLoad32(atom) + delta); + relaxedStore32(atom, newValue); + return newValue; +} + +/** + * Non-atomically add a 64-bit signed delta to a 64-bit atomic variable, + * without any compiler or CPU fences. 
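+ *
+ * Because this is a plain load-add-store, it is only safe when a single
+ * thread ever modifies the atom; other threads may still observe the value
+ * with relaxedLoad64(). The block allocator statistics in this change use
+ * exactly that pattern, e.g.:
+ *
+ *   relaxedAdd64(&allocator->statistics.allocatedBlocks, -freeBlocks);
+ *
+ * on the physical zone thread, with read-only queries from other threads.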
+ * + * @param atom a pointer to the atomic variable + * @param delta the value to be added (or subtracted) from the variable + * + * @return the new value of the atom after the add operation + **/ +static INLINE uint64_t relaxedAdd64(Atomic64 *atom, int64_t delta) +{ + uint64_t newValue = (relaxedLoad64(atom) + delta); + relaxedStore64(atom, newValue); + return newValue; +} + +#endif /* ATOMIC_H */ diff --git a/vdo/base/blockAllocator.c b/vdo/base/blockAllocator.c new file mode 100644 index 0000000..a1eaae4 --- /dev/null +++ b/vdo/base/blockAllocator.c @@ -0,0 +1,952 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockAllocator.c#22 $ + */ + +#include "blockAllocatorInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminState.h" +#include "heap.h" +#include "numUtils.h" +#include "priorityTable.h" +#include "readOnlyNotifier.h" +#include "refCounts.h" +#include "slab.h" +#include "slabDepotInternals.h" +#include "slabIterator.h" +#include "slabJournalEraser.h" +#include "slabJournalInternals.h" +#include "slabScrubber.h" +#include "slabSummary.h" +#include "vdoRecovery.h" +#include "vio.h" +#include "vioPool.h" + +/** + * Assert that a block allocator function was called from the correct thread. + * + * @param threadID The allocator's thread id + * @param functionName The name of the function + **/ +static inline void assertOnAllocatorThread(ThreadID threadID, + const char *functionName) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == threadID), + "%s called on correct thread", functionName); +} + +/** + * Get the priority for a slab in the allocator's slab queue. Slabs are + * essentially prioritized by an approximation of the number of free blocks in + * the slab so slabs with lots of free blocks with be opened for allocation + * before slabs that have few free blocks. + * + * @param slab The slab whose queue priority is desired + * + * @return the queue priority of the slab + **/ +static unsigned int calculateSlabPriority(Slab *slab) +{ + BlockCount freeBlocks = getSlabFreeBlockCount(slab); + + // Slabs that are completely full must be the only ones with the lowest + // priority: zero. + if (freeBlocks == 0) { + return 0; + } + + /* + * Slabs that have never been opened (empty, newly initialized, never been + * written to) have lower priority than previously opened slabs that have a + * signficant number of free blocks. This ranking causes VDO to avoid + * writing physical blocks for the first time until there are very few free + * blocks that have been previously written to. That policy makes VDO a + * better client of any underlying storage that is thinly-provisioned + * [VDOSTORY-123]. 
+ */ + unsigned int unopenedSlabPriority = slab->allocator->unopenedSlabPriority; + if (isSlabJournalBlank(slab->journal)) { + return unopenedSlabPriority; + } + + /* + * For all other slabs, the priority is derived from the logarithm of the + * number of free blocks. Slabs with the same order of magnitude of free + * blocks have the same priority. With 2^23 blocks, the priority will range + * from 1 to 25. The reserved unopenedSlabPriority divides the range and is + * skipped by the logarithmic mapping. + */ + unsigned int priority = (1 + logBaseTwo(freeBlocks)); + return ((priority < unopenedSlabPriority) ? priority : priority + 1); +} + +/** + * Add a slab to the priority queue of slabs available for allocation. + * + * @param slab The slab to prioritize + **/ +static void prioritizeSlab(Slab *slab) +{ + ASSERT_LOG_ONLY(isRingEmpty(&slab->ringNode), + "a slab must not already be on a ring when prioritizing"); + slab->priority = calculateSlabPriority(slab); + priorityTableEnqueue(slab->allocator->prioritizedSlabs, slab->priority, + &slab->ringNode); +} + +/**********************************************************************/ +void registerSlabWithAllocator(BlockAllocator *allocator, Slab *slab) +{ + allocator->slabCount++; + allocator->lastSlab = slab->slabNumber; +} + +/** + * Get an iterator over all the slabs in the allocator. + * + * @param allocator The allocator + * + * @return An iterator over the allocator's slabs + **/ +static SlabIterator getSlabIterator(const BlockAllocator *allocator) +{ + return iterateSlabs(allocator->depot->slabs, allocator->lastSlab, + allocator->zoneNumber, allocator->depot->zoneCount); +} + +/** + * Notify a block allocator that the VDO has entered read-only mode. + * + * Implements ReadOnlyNotification. + * + * @param listener The block allocator + * @param parent The completion to notify in order to acknowledge the + * notification + **/ +static void notifyBlockAllocatorOfReadOnlyMode(void *listener, + VDOCompletion *parent) +{ + BlockAllocator *allocator = listener; + assertOnAllocatorThread(allocator->threadID, __func__); + SlabIterator iterator = getSlabIterator(allocator); + while (hasNextSlab(&iterator)) { + Slab *slab = nextSlab(&iterator); + abortSlabJournalWaiters(slab->journal); + } + + completeCompletion(parent); +} + +/**********************************************************************/ +int makeAllocatorPoolVIOs(PhysicalLayer *layer, + void *parent, + void *buffer, + VIO **vioPtr) +{ + return createVIO(layer, VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, parent, + buffer, vioPtr); +} + +/** + * Allocate those component of the block allocator which are needed only at + * load time, not at format time. + * + * @param allocator The allocator + * @param layer The physical layer below this allocator + * @param vioPoolSize The VIO pool size + * + * @return VDO_SUCCESS or an error + **/ +static int allocateComponents(BlockAllocator *allocator, + PhysicalLayer *layer, + BlockCount vioPoolSize) +{ + /* + * If createVIO is NULL, the block allocator is only being used to format + * or audit the VDO. These only require the SuperBlock component, so we can + * just skip allocating all the memory needed for runtime components. 
+ */ + if (layer->createMetadataVIO == NULL) { + return VDO_SUCCESS; + } + + int result = registerReadOnlyListener(allocator->readOnlyNotifier, + allocator, + notifyBlockAllocatorOfReadOnlyMode, + allocator->threadID); + if (result != VDO_SUCCESS) { + return result; + } + + SlabDepot *depot = allocator->depot; + result = initializeEnqueueableCompletion(&allocator->completion, + BLOCK_ALLOCATOR_COMPLETION, layer); + if (result != VDO_SUCCESS) { + return result; + } + + allocator->summary = getSlabSummaryForZone(depot, allocator->zoneNumber); + + result = makeVIOPool(layer, vioPoolSize, allocator->threadID, + makeAllocatorPoolVIOs, NULL, &allocator->vioPool); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount slabJournalSize = depot->slabConfig.slabJournalBlocks; + result = makeSlabScrubber(layer, slabJournalSize, + allocator->readOnlyNotifier, + &allocator->slabScrubber); + if (result != VDO_SUCCESS) { + return result; + } + + // The number of data blocks is the maximum number of free blocks that could + // be used in calculateSlabPriority(). + BlockCount maxFreeBlocks = depot->slabConfig.dataBlocks; + unsigned int maxPriority = (2 + logBaseTwo(maxFreeBlocks)); + result = makePriorityTable(maxPriority, &allocator->prioritizedSlabs); + if (result != VDO_SUCCESS) { + return result; + } + + /* + * VDOSTORY-123 requires that we try to open slabs that already have + * allocated blocks in preference to slabs that have never been opened. For + * reasons we have not been able to fully understand, performance tests on + * SSD harvards have been very sensitive (50% reduction in test throughput) + * to very slight differences in the timing and locality of block + * allocation. Assigning a low priority to unopened slabs (maxPriority/2, + * say) would be ideal for the story, but anything less than a very high + * threshold (maxPriority - 1) hurts PMI results. + * + * This sets the free block threshold for preferring to open an unopened + * slab to the binary floor of 3/4ths the total number of datablocks in a + * slab, which will generally evaluate to about half the slab size, but + * avoids degenerate behavior in unit tests where the number of data blocks + * is artificially constrained to a power of two. 
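+   *
+   * As a concrete illustration (assuming logBaseTwo() is an integer floor of
+   * log2, as the priority comments above imply): with 2^16 data blocks per
+   * slab, maxPriority is 2 + 16 = 18 and the threshold computed below is
+   * 1 + logBaseTwo(49152) = 1 + 15 = 16.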
+ */ + allocator->unopenedSlabPriority = (1 + logBaseTwo((maxFreeBlocks * 3) / 4)); + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeBlockAllocator(SlabDepot *depot, + ZoneCount zoneNumber, + ThreadID threadID, + Nonce nonce, + BlockCount vioPoolSize, + PhysicalLayer *layer, + ReadOnlyNotifier *readOnlyNotifier, + BlockAllocator **allocatorPtr) +{ + + BlockAllocator *allocator; + int result = ALLOCATE(1, BlockAllocator, __func__, &allocator); + if (result != VDO_SUCCESS) { + return result; + } + + allocator->depot = depot; + allocator->zoneNumber = zoneNumber; + allocator->threadID = threadID; + allocator->nonce = nonce; + allocator->readOnlyNotifier = readOnlyNotifier; + initializeRing(&allocator->dirtySlabJournals); + + result = allocateComponents(allocator, layer, vioPoolSize); + if (result != VDO_SUCCESS) { + freeBlockAllocator(&allocator); + return result; + } + + *allocatorPtr = allocator; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeBlockAllocator(BlockAllocator **blockAllocatorPtr) +{ + BlockAllocator *allocator = *blockAllocatorPtr; + if (allocator == NULL) { + return; + } + + freeSlabScrubber(&allocator->slabScrubber); + freeVIOPool(&allocator->vioPool); + freePriorityTable(&allocator->prioritizedSlabs); + destroyEnqueueable(&allocator->completion); + FREE(allocator); + *blockAllocatorPtr = NULL; +} + +/**********************************************************************/ +int replaceVIOPool(BlockAllocator *allocator, + size_t size, + PhysicalLayer *layer) +{ + freeVIOPool(&allocator->vioPool); + return makeVIOPool(layer, size, allocator->threadID, makeAllocatorPoolVIOs, + NULL, &allocator->vioPool); +} + +/** + * Get the maximum number of data blocks that can be allocated. + * + * @param allocator The block allocator to query + * + * @return The number of data blocks that can be allocated + **/ +__attribute__((warn_unused_result)) +static inline BlockCount getDataBlockCount(const BlockAllocator *allocator) +{ + return (allocator->slabCount * allocator->depot->slabConfig.dataBlocks); +} + +/**********************************************************************/ +BlockCount getAllocatedBlocks(const BlockAllocator *allocator) +{ + return relaxedLoad64(&allocator->statistics.allocatedBlocks); +} + +/**********************************************************************/ +BlockCount getUnrecoveredSlabCount(const BlockAllocator *allocator) +{ + return getScrubberSlabCount(allocator->slabScrubber); +} + +/**********************************************************************/ +void queueSlab(Slab *slab) +{ + ASSERT_LOG_ONLY(isRingEmpty(&slab->ringNode), + "a requeued slab must not already be on a ring"); + BlockAllocator *allocator = slab->allocator; + BlockCount freeBlocks = getSlabFreeBlockCount(slab); + int result = ASSERT((freeBlocks <= allocator->depot->slabConfig.dataBlocks), + "rebuilt slab %u must have a valid free block count" + " (has %llu, expected maximum %llu)", + slab->slabNumber, freeBlocks, + allocator->depot->slabConfig.dataBlocks); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(allocator->readOnlyNotifier, result); + return; + } + + if (isUnrecoveredSlab(slab)) { + registerSlabForScrubbing(allocator->slabScrubber, slab, false); + return; + } + + if (!isSlabResuming(slab)) { + // If the slab is resuming, we've already accounted for it here, so don't + // do it again. 
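+    // The allocated-block statistic is seeded with the total data block count
+    // (see prepareSlabsForAllocation()), so subtracting this slab's free
+    // blocks leaves only the blocks that are actually in use.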
+ relaxedAdd64(&allocator->statistics.allocatedBlocks, -freeBlocks); + if (!isSlabJournalBlank(slab->journal)) { + relaxedAdd64(&allocator->statistics.slabsOpened, 1); + } + } + + // All slabs are kept in a priority queue for allocation. + prioritizeSlab(slab); +} + +/**********************************************************************/ +void adjustFreeBlockCount(Slab *slab, bool increment) +{ + BlockAllocator *allocator = slab->allocator; + // The sense of increment is reversed since allocations are being counted. + relaxedAdd64(&allocator->statistics.allocatedBlocks, (increment ? -1 : 1)); + + // The open slab doesn't need to be reprioritized until it is closed. + if (slab == allocator->openSlab) { + return; + } + + // The slab priority rarely changes; if no change, then don't requeue it. + if (slab->priority == calculateSlabPriority(slab)) { + return; + } + + // Reprioritize the slab to reflect the new free block count by removing it + // from the table and re-enqueuing it with the new priority. + priorityTableRemove(allocator->prioritizedSlabs, &slab->ringNode); + prioritizeSlab(slab); +} + +/** + * Allocate the next free physical block in a slab. + * + * The block allocated will have a provisional reference and the + * reference must be either confirmed with a subsequent call to + * incrementReferenceCount() or vacated with a subsequent call to + * decrementReferenceCount(). + * + * @param [in] slab The slab + * @param [out] blockNumberPtr A pointer to receive the allocated block number + * + * @return UDS_SUCCESS or an error code + **/ +static int allocateSlabBlock(Slab *slab, PhysicalBlockNumber *blockNumberPtr) +{ + PhysicalBlockNumber pbn; + int result = allocateUnreferencedBlock(slab->referenceCounts, &pbn); + if (result != VDO_SUCCESS) { + return result; + } + + adjustFreeBlockCount(slab, false); + + *blockNumberPtr = pbn; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int allocateBlock(BlockAllocator *allocator, + PhysicalBlockNumber *blockNumberPtr) +{ + if (allocator->openSlab != NULL) { + // Try to allocate the next block in the currently open slab. + int result = allocateSlabBlock(allocator->openSlab, blockNumberPtr); + if ((result == VDO_SUCCESS) || (result != VDO_NO_SPACE)) { + return result; + } + + // Put the exhausted open slab back into the priority table. + prioritizeSlab(allocator->openSlab); + } + + // Remove the highest priority slab from the priority table and make it + // the open slab. + allocator->openSlab + = slabFromRingNode(priorityTableDequeue(allocator->prioritizedSlabs)); + + if (isSlabJournalBlank(allocator->openSlab->journal)) { + relaxedAdd64(&allocator->statistics.slabsOpened, 1); + dirtyAllReferenceBlocks(allocator->openSlab->referenceCounts); + } else { + relaxedAdd64(&allocator->statistics.slabsReopened, 1); + } + + // Try allocating again. If we're out of space immediately after opening a + // slab, then every slab must be fully allocated. 
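+  // On success the caller holds a provisional reference to the returned PBN;
+  // per the contract in blockAllocator.h it must later be confirmed with
+  // incrementReferenceCount() or vacated with decrementReferenceCount().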
+ return allocateSlabBlock(allocator->openSlab, blockNumberPtr); +} + +/**********************************************************************/ +void releaseBlockReference(BlockAllocator *allocator, + PhysicalBlockNumber pbn, + const char *why) +{ + if (pbn == ZERO_BLOCK) { + return; + } + + Slab *slab = getSlab(allocator->depot, pbn); + ReferenceOperation operation = { + .type = DATA_DECREMENT, + .pbn = pbn, + }; + int result = modifySlabReferenceCount(slab, NULL, operation); + if (result != VDO_SUCCESS) { + logErrorWithStringError(result, + "Failed to release reference to %s " + "physical block %llu", + why, pbn); + } +} + +/** + * This is a HeapComparator function that orders SlabStatuses using the + * 'isClean' field as the primary key and the 'emptiness' field as the + * secondary key. + * + * Slabs need to be pushed onto the rings in the same order they are to be + * popped off. Popping should always get the most empty first, so pushing + * should be from most empty to least empty. Thus, the comparator order is + * the usual sense since Heap returns larger elements before smaller ones. + * + * @param item1 The first item to compare + * @param item2 The second item to compare + * + * @return 1 if the first item is cleaner or emptier than the second; + * 0 if the two items are equally clean and empty; + -1 otherwise + **/ +static int compareSlabStatuses(const void *item1, const void *item2) +{ + const SlabStatus *info1 = (const SlabStatus *) item1; + const SlabStatus *info2 = (const SlabStatus *) item2; + + if (info1->isClean != info2->isClean) { + return (info1->isClean ? 1 : -1); + } + if (info1->emptiness != info2->emptiness) { + return ((info1->emptiness > info2->emptiness) ? 1 : -1); + } + return ((info1->slabNumber < info2->slabNumber) ? 1 : -1); +} + +/** + * Swap two SlabStatus structures. Implements HeapSwapper. + **/ +static void swapSlabStatuses(void *item1, void *item2) +{ + SlabStatus *info1 = item1; + SlabStatus *info2 = item2; + SlabStatus temp = *info1; + *info1 = *info2; + *info2 = temp; +} + +/** + * Inform the allocator that a slab action has finished on some slab. This + * callback is registered in applyToSlabs(). + * + * @param completion The allocator completion + **/ +static void slabActionCallback(VDOCompletion *completion) +{ + BlockAllocator *allocator = container_of(completion, BlockAllocator, + completion); + SlabActor *actor = &allocator->slabActor; + if (--actor->slabActionCount == 0) { + actor->callback(completion); + return; + } + + resetCompletion(completion); +} + +/** + * Preserve the error from part of an administrative action and continue. + * + * @param completion The allocator completion + **/ +static void handleOperationError(VDOCompletion *completion) +{ + BlockAllocator *allocator = (BlockAllocator *) completion; + setOperationResult(&allocator->state, completion->result); + completion->callback(completion); +} + +/** + * Perform an administrative action on each of an allocator's slabs in + * parallel. + * + * @param allocator The allocator + * @param callback The method to call when the action is complete on every + * slab + **/ +static void applyToSlabs(BlockAllocator *allocator, VDOAction *callback) +{ + prepareCompletion(&allocator->completion, slabActionCallback, + handleOperationError, allocator->threadID, NULL); + allocator->completion.requeue = false; + + // Since we are going to dequeue all of the slabs, the open slab will become + // invalid, so clear it. 
+ allocator->openSlab = NULL; + + // Ensure that we don't finish before we're done starting. + allocator->slabActor = (SlabActor) { + .slabActionCount = 1, + .callback = callback, + }; + + SlabIterator iterator = getSlabIterator(allocator); + while (hasNextSlab(&iterator)) { + Slab *slab = nextSlab(&iterator); + unspliceRingNode(&slab->ringNode); + allocator->slabActor.slabActionCount++; + startSlabAction(slab, allocator->state.state, &allocator->completion); + } + + slabActionCallback(&allocator->completion); +} + +/** + * Inform the allocator that all load I/O has finished. + * + * @param completion The allocator completion + **/ +static void finishLoadingAllocator(VDOCompletion *completion) +{ + BlockAllocator *allocator = (BlockAllocator *) completion; + if (allocator->state.state == ADMIN_STATE_LOADING_FOR_RECOVERY) { + void *context = getCurrentActionContext(allocator->depot->actionManager); + replayIntoSlabJournals(allocator, completion, context); + return; + } + + finishLoading(&allocator->state); +} + +/** + * Initiate a load. + * + * Implements AdminInitiator. + **/ +static void initiateLoad(AdminState *state) +{ + BlockAllocator *allocator = container_of(state, BlockAllocator, state); + if (state->state == ADMIN_STATE_LOADING_FOR_REBUILD) { + prepareCompletion(&allocator->completion, finishLoadingAllocator, + handleOperationError, allocator->threadID, NULL); + eraseSlabJournals(allocator->depot, getSlabIterator(allocator), + &allocator->completion); + return; + } + + applyToSlabs(allocator, finishLoadingAllocator); +} + +/**********************************************************************/ +void loadBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); + startLoading(&allocator->state, + getCurrentManagerOperation(allocator->depot->actionManager), + parent, initiateLoad); +} + +/**********************************************************************/ +void notifySlabJournalsAreRecovered(BlockAllocator *allocator, int result) +{ + finishLoadingWithResult(&allocator->state, result); +} + +/**********************************************************************/ +int prepareSlabsForAllocation(BlockAllocator *allocator) +{ + relaxedStore64(&allocator->statistics.allocatedBlocks, + getDataBlockCount(allocator)); + + SlabDepot *depot = allocator->depot; + SlabCount slabCount = depot->slabCount; + + SlabStatus *slabStatuses; + int result = ALLOCATE(slabCount, SlabStatus, __func__, &slabStatuses); + if (result != VDO_SUCCESS) { + return result; + } + + getSummarizedSlabStatuses(allocator->summary, slabCount, slabStatuses); + + // Sort the slabs by cleanliness, then by emptiness hint. 
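+  // Cleaner and emptier slabs sort higher (see compareSlabStatuses()), so
+  // they are popped from the heap, and therefore queued or registered for
+  // scrubbing, before dirtier and fuller ones.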
+  Heap heap;
+  initializeHeap(&heap, compareSlabStatuses, swapSlabStatuses,
+                 slabStatuses, slabCount, sizeof(SlabStatus));
+  buildHeap(&heap, slabCount);
+
+  SlabStatus currentSlabStatus;
+  while (popMaxHeapElement(&heap, &currentSlabStatus)) {
+    Slab *slab = depot->slabs[currentSlabStatus.slabNumber];
+    if (slab->allocator != allocator) {
+      continue;
+    }
+
+    if ((depot->loadType == REBUILD_LOAD)
+        || (!mustLoadRefCounts(allocator->summary, slab->slabNumber)
+            && currentSlabStatus.isClean)) {
+      queueSlab(slab);
+      continue;
+    }
+
+    markSlabUnrecovered(slab);
+    bool highPriority
+      = ((currentSlabStatus.isClean && (depot->loadType == NORMAL_LOAD))
+         || requiresScrubbing(slab->journal));
+    registerSlabForScrubbing(allocator->slabScrubber, slab, highPriority);
+  }
+  FREE(slabStatuses);
+
+  return VDO_SUCCESS;
+}
+
+/**********************************************************************/
+void prepareAllocatorToAllocate(void *context,
+                                ZoneCount zoneNumber,
+                                VDOCompletion *parent)
+{
+  BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber);
+  int result = prepareSlabsForAllocation(allocator);
+  if (result != VDO_SUCCESS) {
+    finishCompletion(parent, result);
+    return;
+  }
+
+  scrubHighPrioritySlabs(allocator->slabScrubber,
+                         isPriorityTableEmpty(allocator->prioritizedSlabs),
+                         parent, finishParentCallback, finishParentCallback);
+}
+
+/**********************************************************************/
+void registerNewSlabsForAllocator(void *context,
+                                  ZoneCount zoneNumber,
+                                  VDOCompletion *parent)
+{
+  BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber);
+  SlabDepot *depot = allocator->depot;
+  for (SlabCount i = depot->slabCount; i < depot->newSlabCount; i++) {
+    Slab *slab = depot->newSlabs[i];
+    if (slab->allocator == allocator) {
+      registerSlabWithAllocator(allocator, slab);
+    }
+  }
+  completeCompletion(parent);
+}
+
+/**
+ * Perform a step in draining the allocator. This method is its own callback.
+ *
+ * @param completion The allocator's completion
+ **/
+static void doDrainStep(VDOCompletion *completion)
+{
+  BlockAllocator *allocator = (BlockAllocator *) completion;
+  prepareForRequeue(&allocator->completion, doDrainStep, handleOperationError,
+                    allocator->threadID, NULL);
+  switch (++allocator->drainStep) {
+  case DRAIN_ALLOCATOR_STEP_SCRUBBER:
+    stopScrubbing(allocator->slabScrubber, completion);
+    return;
+
+  case DRAIN_ALLOCATOR_STEP_SLABS:
+    applyToSlabs(allocator, doDrainStep);
+    return;
+
+  case DRAIN_ALLOCATOR_STEP_SUMMARY:
+    drainSlabSummaryZone(allocator->summary, allocator->state.state,
+                         completion);
+    return;
+
+  case DRAIN_ALLOCATOR_STEP_FINISHED:
+    ASSERT_LOG_ONLY(!isVIOPoolBusy(allocator->vioPool), "VIO Pool not busy");
+    finishDrainingWithResult(&allocator->state, completion->result);
+    return;
+
+  default:
+    finishDrainingWithResult(&allocator->state, UDS_BAD_STATE);
+  }
+}
+
+/**
+ * Initiate a drain.
+ *
+ * Implements AdminInitiator.
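+ *
+ * The drain then proceeds through the steps in doDrainStep() in order:
+ * slab scrubber, slabs, slab summary, finished.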
+ **/ +static void initiateDrain(AdminState *state) +{ + BlockAllocator *allocator = container_of(state, BlockAllocator, state); + allocator->drainStep = DRAIN_ALLOCATOR_START; + doDrainStep(&allocator->completion); +} + +/**********************************************************************/ +void drainBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); + startDraining(&allocator->state, + getCurrentManagerOperation(allocator->depot->actionManager), + parent, initiateDrain); +} + +/** + * Perform a step in resuming a quiescent allocator. This method is its own + * callback. + * + * @param completion The allocator's completion + **/ +static void doResumeStep(VDOCompletion *completion) +{ + BlockAllocator *allocator = (BlockAllocator *) completion; + prepareForRequeue(&allocator->completion, doResumeStep, handleOperationError, + allocator->threadID, NULL); + switch (--allocator->drainStep) { + case DRAIN_ALLOCATOR_STEP_SUMMARY: + resumeSlabSummaryZone(allocator->summary, completion); + return; + + case DRAIN_ALLOCATOR_STEP_SLABS: + applyToSlabs(allocator, doResumeStep); + return; + + case DRAIN_ALLOCATOR_STEP_SCRUBBER: + resumeScrubbing(allocator->slabScrubber, completion); + return; + + case DRAIN_ALLOCATOR_START: + finishResumingWithResult(&allocator->state, completion->result); + return; + + default: + finishResumingWithResult(&allocator->state, UDS_BAD_STATE); + } +} + +/** + * Initiate a resume. + * + * Implements AdminInitiator. + **/ +static void initiateResume(AdminState *state) +{ + BlockAllocator *allocator = container_of(state, BlockAllocator, state); + allocator->drainStep = DRAIN_ALLOCATOR_STEP_FINISHED; + doResumeStep(&allocator->completion); +} + +/**********************************************************************/ +void resumeBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); + startResuming(&allocator->state, + getCurrentManagerOperation(allocator->depot->actionManager), + parent, initiateResume); +} + +/**********************************************************************/ +void releaseTailBlockLocks(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); + RingNode *ring = &allocator->dirtySlabJournals; + while (!isRingEmpty(ring)) { + if (!releaseRecoveryJournalLock(slabJournalFromDirtyNode(ring->next), + allocator->depot->activeReleaseRequest)) { + break; + } + } + completeCompletion(parent); +} + +/**********************************************************************/ +SlabSummaryZone *getSlabSummaryZone(const BlockAllocator *allocator) +{ + return allocator->summary; +} + +/**********************************************************************/ +int acquireVIO(BlockAllocator *allocator, Waiter *waiter) +{ + return acquireVIOFromPool(allocator->vioPool, waiter); +} + +/**********************************************************************/ +void returnVIO(BlockAllocator *allocator, VIOPoolEntry *entry) +{ + returnVIOToPool(allocator->vioPool, entry); +} + +/**********************************************************************/ +void scrubAllUnrecoveredSlabsInZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); + scrubSlabs(allocator->slabScrubber, allocator->depot, + 
notifyZoneFinishedScrubbing, noopCallback); + completeCompletion(parent); +} + +/**********************************************************************/ +int enqueueForCleanSlab(BlockAllocator *allocator, Waiter *waiter) +{ + return enqueueCleanSlabWaiter(allocator->slabScrubber, waiter); +} + +/**********************************************************************/ +void increaseScrubbingPriority(Slab *slab) +{ + registerSlabForScrubbing(slab->allocator->slabScrubber, slab, true); +} + +/**********************************************************************/ +void allocateFromAllocatorLastSlab(BlockAllocator *allocator) +{ + ASSERT_LOG_ONLY(allocator->openSlab == NULL, "mustn't have an open slab"); + Slab *lastSlab = allocator->depot->slabs[allocator->lastSlab]; + priorityTableRemove(allocator->prioritizedSlabs, &lastSlab->ringNode); + allocator->openSlab = lastSlab; +} + +/**********************************************************************/ +BlockAllocatorStatistics +getBlockAllocatorStatistics(const BlockAllocator *allocator) +{ + const AtomicAllocatorStatistics *atoms = &allocator->statistics; + return (BlockAllocatorStatistics) { + .slabCount = allocator->slabCount, + .slabsOpened = relaxedLoad64(&atoms->slabsOpened), + .slabsReopened = relaxedLoad64(&atoms->slabsReopened), + }; +} + +/**********************************************************************/ +SlabJournalStatistics getSlabJournalStatistics(const BlockAllocator *allocator) +{ + const AtomicSlabJournalStatistics *atoms = &allocator->slabJournalStatistics; + return (SlabJournalStatistics) { + .diskFullCount = atomicLoad64(&atoms->diskFullCount), + .flushCount = atomicLoad64(&atoms->flushCount), + .blockedCount = atomicLoad64(&atoms->blockedCount), + .blocksWritten = atomicLoad64(&atoms->blocksWritten), + .tailBusyCount = atomicLoad64(&atoms->tailBusyCount), + }; +} + +/**********************************************************************/ +RefCountsStatistics getRefCountsStatistics(const BlockAllocator *allocator) +{ + const AtomicRefCountStatistics *atoms = &allocator->refCountStatistics; + return (RefCountsStatistics) { + .blocksWritten = atomicLoad64(&atoms->blocksWritten), + }; +} + +/**********************************************************************/ +void dumpBlockAllocator(const BlockAllocator *allocator) +{ + unsigned int pauseCounter = 0; + logInfo("BlockAllocator zone %u", allocator->zoneNumber); + SlabIterator iterator = getSlabIterator(allocator); + while (hasNextSlab(&iterator)) { + dumpSlab(nextSlab(&iterator)); + + // Wait for a while after each batch of 32 slabs dumped, allowing the + // kernel log a chance to be flushed instead of being overrun. + if (pauseCounter++ == 31) { + pauseCounter = 0; + pauseForLogger(); + } + } + + dumpSlabScrubber(allocator->slabScrubber); +} diff --git a/vdo/base/blockAllocator.h b/vdo/base/blockAllocator.h new file mode 100644 index 0000000..cd8eb39 --- /dev/null +++ b/vdo/base/blockAllocator.h @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockAllocator.h#12 $ + */ + +#ifndef BLOCK_ALLOCATOR_H +#define BLOCK_ALLOCATOR_H + +#include "completion.h" +#include "fixedLayout.h" +#include "statistics.h" +#include "types.h" +#include "vioPool.h" +#include "waitQueue.h" + +/** + * Create a block allocator. + * + * @param [in] depot The slab depot for this allocator + * @param [in] zoneNumber The physical zone number for this allocator + * @param [in] threadID The thread ID for this allocator's zone + * @param [in] nonce The nonce of the VDO + * @param [in] vioPoolSize The size of the VIO pool + * @param [in] layer The physical layer below this allocator + * @param [in] readOnlyNotifier The context for entering read-only mode + * @param [out] allocatorPtr A pointer to hold the allocator + * + * @return A success or error code + **/ +int makeBlockAllocator(SlabDepot *depot, + ZoneCount zoneNumber, + ThreadID threadID, + Nonce nonce, + BlockCount vioPoolSize, + PhysicalLayer *layer, + ReadOnlyNotifier *readOnlyNotifier, + BlockAllocator **allocatorPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a block allocator and null out the reference to it. + * + * @param blockAllocatorPtr The reference to the allocator to destroy + **/ +void freeBlockAllocator(BlockAllocator **blockAllocatorPtr); + +/** + * Queue a slab for allocation or scrubbing. + * + * @param slab The slab to queue + **/ +void queueSlab(Slab *slab); + +/** + * Update the block allocator to reflect an increment or decrement of the free + * block count in a slab. This adjusts the allocated block count and + * reprioritizes the slab when appropriate. + * + * @param slab The slab whose free block count changed + * @param increment True if the free block count went up by one, + * false if it went down by one + **/ +void adjustFreeBlockCount(Slab *slab, bool increment); + +/** + * Allocate a physical block. + * + * The block allocated will have a provisional reference and the + * reference must be either confirmed with a subsequent call to + * incrementReferenceCount() or vacated with a subsequent call to + * decrementReferenceCount(). + * + * @param [in] allocator The block allocator + * @param [out] blockNumberPtr A pointer to receive the allocated block number + * + * @return UDS_SUCCESS or an error code + **/ +int allocateBlock(BlockAllocator *allocator, + PhysicalBlockNumber *blockNumberPtr) + __attribute__((warn_unused_result)); + +/** + * Release an unused provisional reference. + * + * @param allocator The block allocator + * @param pbn The block to dereference + * @param why Why the block was referenced (for logging) + **/ +void releaseBlockReference(BlockAllocator *allocator, + PhysicalBlockNumber pbn, + const char *why); + +/** + * Get the number of allocated blocks, which is the total number of + * blocks in all slabs that have a non-zero reference count. + * + * @param allocator The block allocator + * + * @return The number of blocks with a non-zero reference count + **/ +BlockCount getAllocatedBlocks(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Get the number of unrecovered slabs. 
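+ * That is, the count of slabs still registered with the allocator's slab
+ * scrubber.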
+ * + * @param allocator The block allocator + * + * @return The number of slabs that are unrecovered + **/ +BlockCount getUnrecoveredSlabCount(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Load the state of an allocator from disk. + * + *

Implements ZoneAction. + **/ +void loadBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Inform a block allocator that its slab journals have been recovered from the + * recovery journal. + * + * @param allocator The allocator to inform + * @param result The result of the recovery operation + **/ +void notifySlabJournalsAreRecovered(BlockAllocator *allocator, int result); + +/** + * Prepare the block allocator to come online and start allocating blocks. + * + *
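+ * This queues each clean slab for allocation, registers unrecovered slabs
+ * with the slab scrubber, and then starts scrubbing the high-priority ones
+ * (see prepareSlabsForAllocation()).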

Implements ZoneAction. + **/ +void prepareAllocatorToAllocate(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Register a slab with the allocator, ready for use. + * + * @param allocator The allocator to use + * @param slab The slab in question + **/ +void registerSlabWithAllocator(BlockAllocator *allocator, Slab *slab); + +/** + * Register the new slabs belonging to this allocator. + * + *

Implements ZoneAction. + **/ +void registerNewSlabsForAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Drain all allocator I/O. Depending upon the type of drain, some or all + * dirty metadata may be written to disk. The type of drain will be determined + * from the state of the allocator's depot. + * + *

Implements ZoneAction. + **/ +void drainBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Resume a quiescent allocator. + * + *

Implements ZoneAction. + **/ +void resumeBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Request a commit of all dirty tail blocks which are locking a given recovery + * journal block. + * + *

Implements ZoneAction. + **/ +void releaseTailBlockLocks(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Get the slab summary zone for an allocator. + * + * @param allocator The allocator + * + * @return The SlabSummaryZone for that allocator + **/ +SlabSummaryZone *getSlabSummaryZone(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Acquire a VIO from a block allocator's VIO pool (asynchronous). + * + * @param allocator The allocator from which to get a VIO + * @param waiter The object requesting the VIO + * + * @return VDO_SUCCESS or an error + **/ +int acquireVIO(BlockAllocator *allocator, Waiter *waiter) + __attribute__((warn_unused_result)); + +/** + * Return a VIO to a block allocator's VIO pool + * + * @param allocator The block allocator which owns the VIO + * @param entry The VIO being returned + **/ +void returnVIO(BlockAllocator *allocator, VIOPoolEntry *entry); + +/** + * Initiate scrubbing all unrecovered slabs. + * + *

Implements ZoneAction. + **/ +void scrubAllUnrecoveredSlabsInZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Queue a waiter for a clean slab. + * + * @param allocator The allocator to wait on + * @param waiter The waiter + * + * @return VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no + * slabs to scrub, and some other error otherwise + **/ +int enqueueForCleanSlab(BlockAllocator *allocator, Waiter *waiter) + __attribute__((warn_unused_result)); + +/** + * Increase the scrubbing priority of a slab. + * + * @param slab The slab + **/ +void increaseScrubbingPriority(Slab *slab); + +/** + * Get the statistics for this allocator. + * + * @param allocator The allocator to query + * + * @return A copy of the current statistics for the allocator + **/ +BlockAllocatorStatistics +getBlockAllocatorStatistics(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Get the aggregated slab journal statistics for the slabs in this allocator. + * + * @param allocator The allocator to query + * + * @return A copy of the current statistics for the allocator + **/ +SlabJournalStatistics getSlabJournalStatistics(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Get the cumulative RefCounts statistics for the slabs in this allocator. + * + * @param allocator The allocator to query + * + * @return A copy of the current statistics for the allocator + **/ +RefCountsStatistics getRefCountsStatistics(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Dump information about a block allocator to the log for debugging. + * + * @param allocator The allocator to dump + **/ +void dumpBlockAllocator(const BlockAllocator *allocator); + +#endif // BLOCK_ALLOCATOR_H diff --git a/vdo/base/blockAllocatorInternals.h b/vdo/base/blockAllocatorInternals.h new file mode 100644 index 0000000..83db684 --- /dev/null +++ b/vdo/base/blockAllocatorInternals.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockAllocatorInternals.h#11 $ + */ + +#ifndef BLOCK_ALLOCATOR_INTERNALS_H +#define BLOCK_ALLOCATOR_INTERNALS_H + +#include "adminState.h" +#include "atomic.h" +#include "blockAllocator.h" +#include "priorityTable.h" +#include "ringNode.h" +#include "slabScrubber.h" +#include "vioPool.h" + +enum { + /* + * The number of VIOs in the VIO pool is proportional to the throughput of + * the VDO. 
+ */ + VIO_POOL_SIZE = 128, +}; + +typedef enum { + DRAIN_ALLOCATOR_START = 0, + DRAIN_ALLOCATOR_STEP_SCRUBBER, + DRAIN_ALLOCATOR_STEP_SLABS, + DRAIN_ALLOCATOR_STEP_SUMMARY, + DRAIN_ALLOCATOR_STEP_FINISHED, +} BlockAllocatorDrainStep; + +/** + * A sub-structure for applying actions in parallel to all an allocator's + * slabs. + **/ +typedef struct { + /** The number of slabs performing a slab action */ + SlabCount slabActionCount; + /** The method to call when a slab action has been completed by all slabs */ + VDOAction *callback; +} SlabActor; + +/** + * These fields are only modified by the physical zone thread, but are queried + * by other threads. + **/ +typedef struct atomicAllocatorStatistics { + /** The count of allocated blocks in this zone */ + Atomic64 allocatedBlocks; + /** The number of slabs from which blocks have ever been allocated */ + Atomic64 slabsOpened; + /** The number of times since loading that a slab been re-opened */ + Atomic64 slabsReopened; +} AtomicAllocatorStatistics; + +/** + * The statistics for all the slab journals in the slabs owned by this + * allocator. These fields are all mutated only by the physical zone thread, + * but are read by other threads when gathering statistics for the entire + * depot. + **/ +typedef struct atomicSlabJournalStatistics { + /** Number of times the on-disk journal was full */ + Atomic64 diskFullCount; + /** Number of times an entry was added over the flush threshold */ + Atomic64 flushCount; + /** Number of times an entry was added over the block threshold */ + Atomic64 blockedCount; + /** Number of times the tail block was written */ + Atomic64 blocksWritten; + /** Number of times we had to wait for the tail block commit */ + Atomic64 tailBusyCount; +} AtomicSlabJournalStatistics; + +/** + * The statistics for all the RefCounts in the slabs owned by this + * allocator. These fields are all mutated only by the physical zone thread, + * but are read by other threads when gathering statistics for the entire + * depot. 
+ **/ +typedef struct atomicRefCountStatistics { + /** Number of blocks written */ + Atomic64 blocksWritten; +} AtomicRefCountStatistics; + +struct blockAllocator { + VDOCompletion completion; + /** The slab depot for this allocator */ + SlabDepot *depot; + /** The slab summary zone for this allocator */ + SlabSummaryZone *summary; + /** The notifier for entering read-only mode */ + ReadOnlyNotifier *readOnlyNotifier; + /** The nonce of the VDO */ + Nonce nonce; + /** The physical zone number of this allocator */ + ZoneCount zoneNumber; + /** The thread ID for this allocator's physical zone */ + ThreadID threadID; + /** The number of slabs in this allocator */ + SlabCount slabCount; + /** The number of the last slab owned by this allocator */ + SlabCount lastSlab; + /** The reduced priority level used to preserve unopened slabs */ + unsigned int unopenedSlabPriority; + /** The state of this allocator */ + AdminState state; + /** The actor for applying an action to all slabs */ + SlabActor slabActor; + + /** The slab from which blocks are currently being allocated */ + Slab *openSlab; + /** A priority queue containing all slabs available for allocation */ + PriorityTable *prioritizedSlabs; + /** The slab scrubber */ + SlabScrubber *slabScrubber; + /** What phase of the close operation the allocator is to perform */ + BlockAllocatorDrainStep drainStep; + /** Statistics for this block allocator */ + AtomicAllocatorStatistics statistics; + /** Cumulative statistics for the slab journals in this zone */ + AtomicSlabJournalStatistics slabJournalStatistics; + /** Cumulative statistics for the RefCounts in this zone */ + AtomicRefCountStatistics refCountStatistics; + + /** + * This is the head of a queue of slab journals which have entries in their + * tail blocks which have not yet started to commit. When the recovery + * journal is under space pressure, slab journals which have uncommitted + * entries holding a lock on the recovery journal head are forced to commit + * their blocks early. This list is kept in order, with the tail containing + * the slab journal holding the most recent recovery journal lock. + **/ + RingNode dirtySlabJournals; + + /** The VIO pool for reading and writing block allocator metadata */ + VIOPool *vioPool; +}; + +/** + * Construct allocator metadata VIOs. Exposed for unit tests. + * + * Implements VIOConstructor + **/ +int makeAllocatorPoolVIOs(PhysicalLayer *layer, + void *parent, + void *buffer, + VIO **vioPtr) + __attribute__((warn_unused_result)); + +/** + * Replace the VIO pool in a block allocator. This method exists for unit + * tests. + * + * @param allocator The block allocator + * @param size The number of entries in the pool + * @param layer The physical layer from which to allocate VIOs + * + * @return VDO_SUCCESS or an error + **/ +int replaceVIOPool(BlockAllocator *allocator, + size_t size, + PhysicalLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Prepare slabs for allocation or scrubbing. This method is exposed for + * testing. + * + * @param allocator The allocator to prepare + * + * @return VDO_SUCCESS or an error code + **/ +int prepareSlabsForAllocation(BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Start allocating from the highest numbered slab. 
+ * + * @param allocator The allocator + **/ +void allocateFromAllocatorLastSlab(BlockAllocator *allocator); + +#endif // BLOCK_ALLOCATOR_INTERNALS_H diff --git a/vdo/base/blockMap.c b/vdo/base/blockMap.c new file mode 100644 index 0000000..9a13c30 --- /dev/null +++ b/vdo/base/blockMap.c @@ -0,0 +1,861 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMap.c#24 $ + */ + +#include "blockMap.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "actionManager.h" +#include "adminState.h" +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "blockMapTree.h" +#include "constants.h" +#include "dataVIO.h" +#include "forest.h" +#include "numUtils.h" +#include "recoveryJournal.h" +#include "statusCodes.h" +#include "types.h" +#include "vdoInternal.h" +#include "vdoPageCache.h" + +typedef struct { + PhysicalBlockNumber flatPageOrigin; + BlockCount flatPageCount; + PhysicalBlockNumber rootOrigin; + BlockCount rootCount; +} __attribute__((packed)) BlockMapState2_0; + +static const Header BLOCK_MAP_HEADER_2_0 = { + .id = BLOCK_MAP, + .version = { + .majorVersion = 2, + .minorVersion = 0, + }, + .size = sizeof(BlockMapState2_0), +}; + +/** + * State associated which each block map page while it is in the VDO page + * cache. + **/ +typedef struct { + /** + * The earliest recovery journal block containing uncommitted updates to the + * block map page associated with this context. A reference (lock) is held + * on that block to prevent it from being reaped. When this value changes, + * the reference on the old value must be released and a reference on the + * new value must be acquired. + **/ + SequenceNumber recoveryLock; +} BlockMapPageContext; + +/** + * Implements VDOPageReadFunction. + **/ +static int validatePageOnRead(void *buffer, + PhysicalBlockNumber pbn, + BlockMapZone *zone, + void *pageContext) +{ + BlockMapPage *page = buffer; + BlockMapPageContext *context = pageContext; + Nonce nonce = zone->blockMap->nonce; + + BlockMapPageValidity validity = validateBlockMapPage(page, nonce, pbn); + if (validity == BLOCK_MAP_PAGE_BAD) { + return logErrorWithStringError(VDO_BAD_PAGE, + "Expected page %" PRIu64 + " but got page %llu instead", + pbn, getBlockMapPagePBN(page)); + } + + if (validity == BLOCK_MAP_PAGE_INVALID) { + formatBlockMapPage(page, nonce, pbn, false); + } + + context->recoveryLock = 0; + return VDO_SUCCESS; +} + +/** + * Handle journal updates and torn write protection. + * + * Implements VDOPageWriteFunction. 
+ **/ +static bool handlePageWrite(void *rawPage, + BlockMapZone *zone, + void *pageContext) +{ + BlockMapPage *page = rawPage; + BlockMapPageContext *context = pageContext; + + if (markBlockMapPageInitialized(page, true)) { + // Cause the page to be re-written. + return true; + } + + // Release the page's references on the recovery journal. + releaseRecoveryJournalBlockReference(zone->blockMap->journal, + context->recoveryLock, + ZONE_TYPE_LOGICAL, zone->zoneNumber); + context->recoveryLock = 0; + return false; +} + +/**********************************************************************/ +PageCount computeBlockMapPageCount(BlockCount entries) +{ + return computeBucketCount(entries, BLOCK_MAP_ENTRIES_PER_PAGE); +} + +/**********************************************************************/ +int makeBlockMap(BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockCount flatPageCount, + PhysicalBlockNumber rootOrigin, + BlockCount rootCount, + BlockMap **mapPtr) +{ + STATIC_ASSERT(BLOCK_MAP_ENTRIES_PER_PAGE + == ((VDO_BLOCK_SIZE - sizeof(BlockMapPage)) + / sizeof(BlockMapEntry))); + + BlockMap *map; + int result = ALLOCATE_EXTENDED(BlockMap, threadConfig->logicalZoneCount, + BlockMapZone, __func__, &map); + if (result != UDS_SUCCESS) { + return result; + } + + map->flatPageCount = flatPageCount; + map->rootOrigin = rootOrigin; + map->rootCount = rootCount; + map->entryCount = logicalBlocks; + + ZoneCount zoneCount = threadConfig->logicalZoneCount; + for (ZoneCount zone = 0; zone < zoneCount; zone++) { + BlockMapZone *blockMapZone = &map->zones[zone]; + blockMapZone->zoneNumber = zone; + blockMapZone->threadID = getLogicalZoneThread(threadConfig, zone); + blockMapZone->blockMap = map; + map->zoneCount++; + } + + *mapPtr = map; + return VDO_SUCCESS; +} + +/** + * Decode block map component state version 2.0 from a buffer. 
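Before the parameter list of this decode routine, it may help to see the layout it walks: version 2.0 block map state is four 64-bit values stored little-endian, so the encoded component is exactly 32 bytes. The following standalone sketch mirrors that layout with plain byte arithmetic rather than VDO's Buffer helpers; the field values are arbitrary examples.

/* Illustrative only: the 32-byte little-endian encoding of
 * BlockMapState2_0 (flatPageOrigin, flatPageCount, rootOrigin, rootCount). */
#include <stdint.h>
#include <stdio.h>

static void examplePutU64LE(uint8_t *out, uint64_t value)
{
  for (int i = 0; i < 8; i++) {
    out[i] = (uint8_t) (value >> (8 * i));
  }
}

static uint64_t exampleGetU64LE(const uint8_t *in)
{
  uint64_t value = 0;
  for (int i = 0; i < 8; i++) {
    value |= ((uint64_t) in[i]) << (8 * i);
  }
  return value;
}

int main(void)
{
  // Encode the four fields in declaration order (values are arbitrary).
  uint8_t  encoded[32];
  uint64_t fields[4] = { 1, 0, 2, 60 };
  for (int i = 0; i < 4; i++) {
    examplePutU64LE(encoded + (8 * i), fields[i]);
  }

  // Decoding reads them back in the same order; rootCount is the last field.
  printf("rootCount = %llu\n",
         (unsigned long long) exampleGetU64LE(encoded + 24));
  return 0;
}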
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param state The state structure to receive the decoded values + * + * @return UDS_SUCCESS or an error code + **/ +static int decodeBlockMapState_2_0(Buffer *buffer, BlockMapState2_0 *state) +{ + size_t initialLength = contentLength(buffer); + + PhysicalBlockNumber flatPageOrigin; + int result = getUInt64LEFromBuffer(buffer, &flatPageOrigin); + if (result != UDS_SUCCESS) { + return result; + } + + BlockCount flatPageCount; + result = getUInt64LEFromBuffer(buffer, &flatPageCount); + if (result != UDS_SUCCESS) { + return result; + } + + PhysicalBlockNumber rootOrigin; + result = getUInt64LEFromBuffer(buffer, &rootOrigin); + if (result != UDS_SUCCESS) { + return result; + } + + BlockCount rootCount; + result = getUInt64LEFromBuffer(buffer, &rootCount); + if (result != UDS_SUCCESS) { + return result; + } + + *state = (BlockMapState2_0) { + .flatPageOrigin = flatPageOrigin, + .flatPageCount = flatPageCount, + .rootOrigin = rootOrigin, + .rootCount = rootCount, + }; + + size_t decodedSize = initialLength - contentLength(buffer); + return ASSERT(BLOCK_MAP_HEADER_2_0.size == decodedSize, + "decoded block map component size must match header size"); +} + +/**********************************************************************/ +int decodeBlockMap(Buffer *buffer, + BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockMap **mapPtr) +{ + Header header; + int result = decodeHeader(buffer, &header); + if (result != VDO_SUCCESS) { + return result; + } + + result = validateHeader(&BLOCK_MAP_HEADER_2_0, &header, true, __func__); + if (result != VDO_SUCCESS) { + return result; + } + + BlockMapState2_0 state; + result = decodeBlockMapState_2_0(buffer, &state); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(state.flatPageOrigin == BLOCK_MAP_FLAT_PAGE_ORIGIN, + "Flat page origin must be %u (recorded as %llu)", + BLOCK_MAP_FLAT_PAGE_ORIGIN, state.flatPageOrigin); + if (result != UDS_SUCCESS) { + return result; + } + + BlockMap *map; + result = makeBlockMap(logicalBlocks, threadConfig, + state.flatPageCount, state.rootOrigin, + state.rootCount, &map); + if (result != VDO_SUCCESS) { + return result; + } + + *mapPtr = map; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int decodeSodiumBlockMap(Buffer *buffer, + BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockMap **mapPtr) +{ + // Sodium uses state version 2.0. + return decodeBlockMap(buffer, logicalBlocks, threadConfig, mapPtr); +} + +/** + * Initialize the per-zone portions of the block map. 
+ * + * @param zone The zone to initialize + * @param layer The physical layer on which the zone resides + * @param readOnlyNotifier The read-only context for the VDO + * @param cacheSize The size of the page cache for the zone + * @param maximumAge The number of journal blocks before a dirtied page + * is considered old and must be written out + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int initializeBlockMapZone(BlockMapZone *zone, + PhysicalLayer *layer, + ReadOnlyNotifier *readOnlyNotifier, + PageCount cacheSize, + BlockCount maximumAge) +{ + zone->readOnlyNotifier = readOnlyNotifier; + int result = initializeTreeZone(zone, layer, maximumAge); + if (result != VDO_SUCCESS) { + return result; + } + + return makeVDOPageCache(layer, cacheSize, validatePageOnRead, + handlePageWrite, sizeof(BlockMapPageContext), + maximumAge, zone, &zone->pageCache); +} + +/**********************************************************************/ +BlockMapZone *getBlockMapZone(BlockMap *map, ZoneCount zoneNumber) +{ + return &map->zones[zoneNumber]; +} + +/** + * Get the ID of the thread on which a given block map zone operates. + * + *

Implements ZoneThreadGetter. + **/ +static ThreadID getBlockMapZoneThreadID(void *context, ZoneCount zoneNumber) +{ + return getBlockMapZone(context, zoneNumber)->threadID; +} + +/** + * Prepare for an era advance. + * + *

Implements ActionPreamble. + **/ +static void prepareForEraAdvance(void *context, VDOCompletion *parent) +{ + BlockMap *map = context; + map->currentEraPoint = map->pendingEraPoint; + completeCompletion(parent); +} + +/** + * Update the progress of the era in a zone. + * + *

Implements ZoneAction. + **/ +static void advanceBlockMapZoneEra(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockMapZone *zone = getBlockMapZone(context, zoneNumber); + advanceVDOPageCachePeriod(zone->pageCache, zone->blockMap->currentEraPoint); + advanceZoneTreePeriod(&zone->treeZone, zone->blockMap->currentEraPoint); + finishCompletion(parent, VDO_SUCCESS); +} + +/** + * Schedule an era advance if necessary. This method should not be called + * directly. Rather, call scheduleDefaultAction() on the block map's action + * manager. + * + *
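The era bookkeeping above reduces to two sequence numbers: advanceBlockMapEra() (further below) records the newest recovery journal block as pendingEraPoint, an action is scheduled only when that differs from currentEraPoint, and the action preamble copies the pending value before the per-zone advance. A minimal standalone model of that check, with illustrative names and no threading:

/* Illustrative only: a standalone model of the era bookkeeping. The real
 * code distributes the advance as an action across all logical zone threads. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
  uint64_t currentEraPoint;  /* what the zones have already been told */
  uint64_t pendingEraPoint;  /* newest value not yet distributed */
} ExampleEraTracker;

/* Mirrors scheduleEraAdvance(): nothing to schedule unless the era moved. */
static bool exampleNeedsEraAdvance(const ExampleEraTracker *tracker)
{
  return (tracker->currentEraPoint != tracker->pendingEraPoint);
}

/* Mirrors prepareForEraAdvance(): adopt the pending value before fanning
 * the new era point out to each zone. */
static void examplePrepareForEraAdvance(ExampleEraTracker *tracker)
{
  tracker->currentEraPoint = tracker->pendingEraPoint;
}

int main(void)
{
  ExampleEraTracker tracker = { .currentEraPoint = 5, .pendingEraPoint = 5 };
  tracker.pendingEraPoint = 7;           /* like advanceBlockMapEra(map, 7) */
  if (exampleNeedsEraAdvance(&tracker)) {
    examplePrepareForEraAdvance(&tracker);
  }
  printf("current era point is now %llu\n",
         (unsigned long long) tracker.currentEraPoint);
  return 0;
}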

Implements ActionScheduler. + **/ +static bool scheduleEraAdvance(void *context) +{ + BlockMap *map = context; + if (map->currentEraPoint == map->pendingEraPoint) { + return false; + } + + return scheduleAction(map->actionManager, prepareForEraAdvance, + advanceBlockMapZoneEra, NULL, NULL); +} + +/**********************************************************************/ +int makeBlockMapCaches(BlockMap *map, + PhysicalLayer *layer, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *journal, + Nonce nonce, + PageCount cacheSize, + BlockCount maximumAge) +{ + int result = ASSERT(cacheSize > 0, "block map cache size is specified"); + if (result != UDS_SUCCESS) { + return result; + } + + map->journal = journal; + map->nonce = nonce; + + result = makeForest(map, map->entryCount); + if (result != VDO_SUCCESS) { + return result; + } + + replaceForest(map); + for (ZoneCount zone = 0; zone < map->zoneCount; zone++) { + result = initializeBlockMapZone(&map->zones[zone], layer, readOnlyNotifier, + cacheSize / map->zoneCount, maximumAge); + if (result != VDO_SUCCESS) { + return result; + } + } + + return makeActionManager(map->zoneCount, getBlockMapZoneThreadID, + getRecoveryJournalThreadID(journal), map, + scheduleEraAdvance, layer, + &map->actionManager); +} + +/** + * Clean up a BlockMapZone. + * + * @param zone The zone to uninitialize + **/ +static void uninitializeBlockMapZone(BlockMapZone *zone) +{ + uninitializeBlockMapTreeZone(&zone->treeZone); + freeVDOPageCache(&zone->pageCache); +} + +/**********************************************************************/ +void freeBlockMap(BlockMap **mapPtr) +{ + BlockMap *map = *mapPtr; + if (map == NULL) { + return; + } + + for (ZoneCount zone = 0; zone < map->zoneCount; zone++) { + uninitializeBlockMapZone(&map->zones[zone]); + } + + abandonBlockMapGrowth(map); + freeForest(&map->forest); + freeActionManager(&map->actionManager); + + FREE(map); + *mapPtr = NULL; +} + +/**********************************************************************/ +size_t getBlockMapEncodedSize(void) +{ + return ENCODED_HEADER_SIZE + sizeof(BlockMapState2_0); +} + +/**********************************************************************/ +int encodeBlockMap(const BlockMap *map, Buffer *buffer) +{ + int result = encodeHeader(&BLOCK_MAP_HEADER_2_0, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + size_t initialLength = contentLength(buffer); + + result = putUInt64LEIntoBuffer(buffer, BLOCK_MAP_FLAT_PAGE_ORIGIN); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, map->flatPageCount); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, map->rootOrigin); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, map->rootCount); + if (result != UDS_SUCCESS) { + return result; + } + + size_t encodedSize = contentLength(buffer) - initialLength; + return ASSERT(BLOCK_MAP_HEADER_2_0.size == encodedSize, + "encoded block map component size must match header size"); +} + +/**********************************************************************/ +void initializeBlockMapFromJournal(BlockMap *map, RecoveryJournal *journal) +{ + map->currentEraPoint = getCurrentJournalSequenceNumber(journal); + map->pendingEraPoint = map->currentEraPoint; + + for (ZoneCount zone = 0; zone < map->zoneCount; zone++) { + setTreeZoneInitialPeriod(&map->zones[zone].treeZone, map->currentEraPoint); + setVDOPageCacheInitialPeriod(map->zones[zone].pageCache, + 
map->currentEraPoint); + } +} + +/**********************************************************************/ +ZoneCount computeLogicalZone(DataVIO *dataVIO) +{ + BlockMap *map = getBlockMap(getVDOFromDataVIO(dataVIO)); + TreeLock *treeLock = &dataVIO->treeLock; + PageNumber pageNumber = computePageNumber(dataVIO->logical.lbn); + treeLock->treeSlots[0].pageIndex = pageNumber; + treeLock->rootIndex = pageNumber % map->rootCount; + return (treeLock->rootIndex % map->zoneCount); +} + +/**********************************************************************/ +void findBlockMapSlotAsync(DataVIO *dataVIO, + VDOAction *callback, + ThreadID threadID) +{ + BlockMap *map = getBlockMap(getVDOFromDataVIO(dataVIO)); + if (dataVIO->logical.lbn >= map->entryCount) { + finishDataVIO(dataVIO, VDO_OUT_OF_RANGE); + return; + } + + TreeLock *treeLock = &dataVIO->treeLock; + BlockMapTreeSlot *slot = &treeLock->treeSlots[0]; + slot->blockMapSlot.slot = computeSlot(dataVIO->logical.lbn); + if (slot->pageIndex < map->flatPageCount) { + slot->blockMapSlot.pbn = slot->pageIndex + BLOCK_MAP_FLAT_PAGE_ORIGIN; + launchCallback(dataVIOAsCompletion(dataVIO), callback, threadID); + return; + } + + treeLock->callback = callback; + treeLock->threadID = threadID; + lookupBlockMapPBN(dataVIO); +} + +/**********************************************************************/ +PageCount getNumberOfFixedBlockMapPages(const BlockMap *map) +{ + return (map->flatPageCount + map->rootCount); +} + +/**********************************************************************/ +BlockCount getNumberOfBlockMapEntries(const BlockMap *map) +{ + return map->entryCount; +} + +/**********************************************************************/ +void advanceBlockMapEra(BlockMap *map, SequenceNumber recoveryBlockNumber) +{ + if (map == NULL) { + return; + } + + map->pendingEraPoint = recoveryBlockNumber; + scheduleDefaultAction(map->actionManager); +} + +/**********************************************************************/ +void checkForDrainComplete(BlockMapZone *zone) +{ + if (isDraining(&zone->state) + && !isTreeZoneActive(&zone->treeZone) + && !isPageCacheActive(zone->pageCache)) { + finishDrainingWithResult(&zone->state, + (isReadOnly(zone->readOnlyNotifier) + ? VDO_READ_ONLY : VDO_SUCCESS)); + } +} + +/** + * Initiate a drain of the trees and page cache of a block map zone. + * + * Implements AdminInitiator + **/ +static void initiateDrain(AdminState *state) +{ + BlockMapZone *zone = container_of(state, BlockMapZone, state); + drainZoneTrees(&zone->treeZone); + drainVDOPageCache(zone->pageCache); + checkForDrainComplete(zone); +} + +/** + * Drain a zone of the block map. + * + *
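The routing in computeLogicalZone() and computeSlot() above is pure arithmetic: the page number selects a tree root, and the root selects a logical zone. The sketch below restates it standalone; the entries-per-page value of 812 is an assumption for a 4 KB block, not taken from the real headers (the authoritative value is pinned by the STATIC_ASSERT in makeBlockMap()).

/* Illustrative only: the zone-routing arithmetic from computeLogicalZone()
 * and computeSlot(), with an assumed entries-per-page constant. */
#include <stdint.h>
#include <stdio.h>

enum { EXAMPLE_ENTRIES_PER_PAGE = 812 };  /* assumed, see note above */

typedef struct {
  uint64_t rootCount;  /* number of block map tree roots */
  uint64_t zoneCount;  /* number of logical zones */
} ExampleMapShape;

static uint64_t exampleZoneForLBN(const ExampleMapShape *shape, uint64_t lbn)
{
  uint64_t pageNumber = lbn / EXAMPLE_ENTRIES_PER_PAGE;  /* computePageNumber */
  uint64_t rootIndex  = pageNumber % shape->rootCount;   /* which tree root */
  return rootIndex % shape->zoneCount;                   /* which logical zone */
}

int main(void)
{
  ExampleMapShape shape = { .rootCount = 60, .zoneCount = 3 };
  uint64_t lbn = 100000;
  printf("lbn %llu -> slot %llu in zone %llu\n",
         (unsigned long long) lbn,
         (unsigned long long) (lbn % EXAMPLE_ENTRIES_PER_PAGE), /* computeSlot */
         (unsigned long long) exampleZoneForLBN(&shape, lbn));
  return 0;
}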

Implements ZoneAction. + **/ +static void drainZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockMapZone *zone = getBlockMapZone(context, zoneNumber); + startDraining(&zone->state, + getCurrentManagerOperation(zone->blockMap->actionManager), + parent, initiateDrain); +} + +/**********************************************************************/ +void drainBlockMap(BlockMap *map, + AdminStateCode operation, + VDOCompletion *parent) +{ + scheduleOperation(map->actionManager, operation, NULL, drainZone, NULL, + parent); +} + +/** + * Resume a zone of the block map. + * + *

Implements ZoneAction. + **/ +static void resumeBlockMapZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockMapZone *zone = getBlockMapZone(context, zoneNumber); + finishCompletion(parent, resumeIfQuiescent(&zone->state)); +} + +/**********************************************************************/ +void resumeBlockMap(BlockMap *map, VDOCompletion *parent) +{ + scheduleOperation(map->actionManager, ADMIN_STATE_RESUMING, NULL, + resumeBlockMapZone, NULL, parent); +} + +/**********************************************************************/ +int prepareToGrowBlockMap(BlockMap *map, BlockCount newLogicalBlocks) +{ + if (map->nextEntryCount == newLogicalBlocks) { + return VDO_SUCCESS; + } + + if (map->nextEntryCount > 0) { + abandonBlockMapGrowth(map); + } + + if (newLogicalBlocks < map->entryCount) { + map->nextEntryCount = map->entryCount; + return VDO_SUCCESS; + } + + return makeForest(map, newLogicalBlocks); +} + +/**********************************************************************/ +BlockCount getNewEntryCount(BlockMap *map) +{ + return map->nextEntryCount; +} + +/** + * Grow the block map by replacing the forest with the one which was prepared. + * + * Implements ActionPreamble + **/ +static void growForest(void *context, VDOCompletion *completion) +{ + replaceForest(context); + completeCompletion(completion); +} + +/**********************************************************************/ +void growBlockMap(BlockMap *map, VDOCompletion *parent) +{ + scheduleOperation(map->actionManager, ADMIN_STATE_SUSPENDED_OPERATION, + growForest, NULL, NULL, parent); +} + +/**********************************************************************/ +void abandonBlockMapGrowth(BlockMap *map) +{ + abandonForest(map); +} + +/** + * Finish processing a block map get or put operation. This function releases + * the page completion and then continues the requester. + * + * @param completion The completion for the page fetch + * @param result The result of the block map operation + **/ +static inline void finishProcessingPage(VDOCompletion *completion, int result) +{ + VDOCompletion *parent = completion->parent; + releaseVDOPageCompletion(completion); + continueCompletion(parent, result); +} + +/** + * Handle an error fetching a page from the cache. This error handler is + * registered in setupMappedBlock(). + * + * @param completion The page completion which got an error + **/ +static void handlePageError(VDOCompletion *completion) +{ + finishProcessingPage(completion, completion->result); +} + +/** + * Get the mapping page for a get/put mapped block operation and dispatch to + * the appropriate handler. + * + * @param dataVIO The dataVIO + * @param modifiable Whether we intend to modify the mapping + * @param action The handler to process the mapping page + **/ +static void setupMappedBlock(DataVIO *dataVIO, + bool modifiable, + VDOAction *action) +{ + BlockMapZone *zone = getBlockMapForZone(dataVIO->logical.zone); + if (isDraining(&zone->state)) { + finishDataVIO(dataVIO, VDO_SHUTTING_DOWN); + return; + } + + initVDOPageCompletion(&dataVIO->pageCompletion, zone->pageCache, + dataVIO->treeLock.treeSlots[0].blockMapSlot.pbn, + modifiable, dataVIOAsCompletion(dataVIO), action, + handlePageError); + getVDOPageAsync(&dataVIO->pageCompletion.completion); +} + +/** + * Decode and validate a block map entry and attempt to use it to set the + * mapped location of a DataVIO. 
+ * + * @param dataVIO The DataVIO to update with the map entry + * @param entry The block map entry for the logical block + * + * @return VDO_SUCCESS or VDO_BAD_MAPPING if the map entry is invalid + * or an error code for any other failure + **/ +__attribute__((warn_unused_result)) +static int setMappedEntry(DataVIO *dataVIO, const BlockMapEntry *entry) +{ + // Unpack the PBN for logging purposes even if the entry is invalid. + DataLocation mapped = unpackBlockMapEntry(entry); + + if (isValidLocation(&mapped)) { + int result = setMappedLocation(dataVIO, mapped.pbn, mapped.state); + /* + * Return success and all errors not specifically known to be errors from + * validating the location. Yes, this expression is redundant; it is + * intentional. + */ + if ((result == VDO_SUCCESS) + || ((result != VDO_OUT_OF_RANGE) && (result != VDO_BAD_MAPPING))) { + return result; + } + } + + // Log the corruption even if we wind up ignoring it for write VIOs, + // converting all cases to VDO_BAD_MAPPING. + logErrorWithStringError(VDO_BAD_MAPPING, "PBN %" PRIu64 + " with state %u read from the block map was invalid", + mapped.pbn, mapped.state); + + // A read VIO has no option but to report the bad mapping--reading + // zeros would be hiding known data loss. + if (isReadDataVIO(dataVIO)) { + return VDO_BAD_MAPPING; + } + + // A write VIO only reads this mapping to decref the old block. Treat + // this as an unmapped entry rather than fail the write. + clearMappedLocation(dataVIO); + return VDO_SUCCESS; +} + +/** + * This callback is registered in getMappedBlockAsync(). + **/ +static void getMappingFromFetchedPage(VDOCompletion *completion) +{ + if (completion->result != VDO_SUCCESS) { + finishProcessingPage(completion, completion->result); + return; + } + + const BlockMapPage *page = dereferenceReadableVDOPage(completion); + int result = ASSERT(page != NULL, "page available"); + if (result != VDO_SUCCESS) { + finishProcessingPage(completion, result); + return; + } + + DataVIO *dataVIO = asDataVIO(completion->parent); + BlockMapTreeSlot *treeSlot = &dataVIO->treeLock.treeSlots[0]; + const BlockMapEntry *entry = &page->entries[treeSlot->blockMapSlot.slot]; + + result = setMappedEntry(dataVIO, entry); + finishProcessingPage(completion, result); +} + +/** + * This callback is registered in putMappedBlockAsync(). + **/ +static void putMappingInFetchedPage(VDOCompletion *completion) +{ + if (completion->result != VDO_SUCCESS) { + finishProcessingPage(completion, completion->result); + return; + } + + BlockMapPage *page = dereferenceWritableVDOPage(completion); + int result = ASSERT(page != NULL, "page available"); + if (result != VDO_SUCCESS) { + finishProcessingPage(completion, result); + return; + } + + DataVIO *dataVIO = asDataVIO(completion->parent); + BlockMapPageContext *context = getVDOPageCompletionContext(completion); + SequenceNumber oldLock = context->recoveryLock; + updateBlockMapPage(page, dataVIO, dataVIO->newMapped.pbn, + dataVIO->newMapped.state, &context->recoveryLock); + markCompletedVDOPageDirty(completion, oldLock, context->recoveryLock); + finishProcessingPage(completion, VDO_SUCCESS); +} + +/**********************************************************************/ +void getMappedBlockAsync(DataVIO *dataVIO) +{ + if (dataVIO->treeLock.treeSlots[0].blockMapSlot.pbn == ZERO_BLOCK) { + // We know that the block map page for this LBN has not been allocated, + // so the block must be unmapped. 
+ clearMappedLocation(dataVIO); + continueDataVIO(dataVIO, VDO_SUCCESS); + return; + } + + setupMappedBlock(dataVIO, false, getMappingFromFetchedPage); +} + +/**********************************************************************/ +void putMappedBlockAsync(DataVIO *dataVIO) +{ + setupMappedBlock(dataVIO, true, putMappingInFetchedPage); +} + +/**********************************************************************/ +BlockMapStatistics getBlockMapStatistics(BlockMap *map) +{ + BlockMapStatistics stats; + memset(&stats, 0, sizeof(BlockMapStatistics)); + + for (ZoneCount zone = 0; zone < map->zoneCount; zone++) { + const AtomicPageCacheStatistics *atoms + = getVDOPageCacheStatistics(map->zones[zone].pageCache); + stats.dirtyPages += atomicLoad64(&atoms->counts.dirtyPages); + stats.cleanPages += atomicLoad64(&atoms->counts.cleanPages); + stats.freePages += atomicLoad64(&atoms->counts.freePages); + stats.failedPages += atomicLoad64(&atoms->counts.failedPages); + stats.incomingPages += atomicLoad64(&atoms->counts.incomingPages); + stats.outgoingPages += atomicLoad64(&atoms->counts.outgoingPages); + + stats.cachePressure += atomicLoad64(&atoms->cachePressure); + stats.readCount += atomicLoad64(&atoms->readCount); + stats.writeCount += atomicLoad64(&atoms->writeCount); + stats.failedReads += atomicLoad64(&atoms->failedReads); + stats.failedWrites += atomicLoad64(&atoms->failedWrites); + stats.reclaimed += atomicLoad64(&atoms->reclaimed); + stats.readOutgoing += atomicLoad64(&atoms->readOutgoing); + stats.foundInCache += atomicLoad64(&atoms->foundInCache); + stats.discardRequired += atomicLoad64(&atoms->discardRequired); + stats.waitForPage += atomicLoad64(&atoms->waitForPage); + stats.fetchRequired += atomicLoad64(&atoms->fetchRequired); + stats.pagesLoaded += atomicLoad64(&atoms->pagesLoaded); + stats.pagesSaved += atomicLoad64(&atoms->pagesSaved); + stats.flushCount += atomicLoad64(&atoms->flushCount); + } + + return stats; +} diff --git a/vdo/base/blockMap.h b/vdo/base/blockMap.h new file mode 100644 index 0000000..48073a9 --- /dev/null +++ b/vdo/base/blockMap.h @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMap.h#4 $ + */ + +#ifndef BLOCK_MAP_H +#define BLOCK_MAP_H + +#include "adminState.h" +#include "blockMapEntry.h" +#include "completion.h" +#include "fixedLayout.h" +#include "statistics.h" +#include "types.h" + +/** + * Create a block map. 
+ * + * @param [in] logicalBlocks The number of logical blocks for the VDO + * @param [in] threadConfig The thread configuration of the VDO + * @param [in] flatPageCount The number of flat pages + * @param [in] rootOrigin The absolute PBN of the first root page + * @param [in] rootCount The number of tree roots + * @param [out] mapPtr The pointer to hold the new block map + * + * @return VDO_SUCCESS or an error code + **/ +int makeBlockMap(BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockCount flatPageCount, + PhysicalBlockNumber rootOrigin, + BlockCount rootCount, + BlockMap **mapPtr) + __attribute__((warn_unused_result)); + +/** + * Quiesce all block map I/O, possibly writing out all dirty metadata. + * + * @param map The block map to drain + * @param operation The type of drain to perform + * @param parent The completion to notify when the drain is complete + **/ +void drainBlockMap(BlockMap *map, + AdminStateCode operation, + VDOCompletion *parent); + +/** + * Resume I/O for a quiescent block map. + * + * @param map The block map to resume + * @param parent The completion to notify when the resume is complete + **/ +void resumeBlockMap(BlockMap *map, VDOCompletion *parent); + +/** + * Prepare to grow the block map by allocating an expanded collection of trees. + * + * @param map The block map to grow + * @param newLogicalBlocks The new logical size of the VDO + * + * @return VDO_SUCCESS or an error + **/ +int prepareToGrowBlockMap(BlockMap *map, BlockCount newLogicalBlocks) + __attribute__((warn_unused_result)); + +/** + * Get the logical size to which this block map is prepared to grow. + * + * @param map The block map + * + * @return The new number of entries the block map will be grown to or 0 if + * the block map is not prepared to grow + **/ +BlockCount getNewEntryCount(BlockMap *map) + __attribute__((warn_unused_result)); + +/** + * Grow a block map on which prepareToGrowBlockMap() has already been called. + * + * @param map The block map to grow + * @param parent The object to notify when the growth is complete + **/ +void growBlockMap(BlockMap *map, VDOCompletion *parent); + +/** + * Abandon any preparations which were made to grow this block map. + * + * @param map The map which won't be grown + **/ +void abandonBlockMapGrowth(BlockMap *map); + +/** + * Decode the state of a block map saved in a buffer, without creating page + * caches. + * + * @param [in] buffer A buffer containing the super block state + * @param [in] logicalBlocks The number of logical blocks for the VDO + * @param [in] threadConfig The thread configuration of the VDO + * @param [out] mapPtr The pointer to hold the new block map + * + * @return VDO_SUCCESS or an error code + **/ +int decodeBlockMap(Buffer *buffer, + BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockMap **mapPtr) + __attribute__((warn_unused_result)); + +/** + * Create a block map from the saved state of a Sodium block map, and do any + * necessary upgrade work. + * + * @param [in] buffer A buffer containing the super block state + * @param [in] logicalBlocks The number of logical blocks for the VDO + * @param [in] threadConfig The thread configuration of the VDO + * @param [out] mapPtr The pointer to hold the new block map + * + * @return VDO_SUCCESS or an error code + **/ +int decodeSodiumBlockMap(Buffer *buffer, + BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockMap **mapPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate the page caches for a block map. 
+ * + * @param map The block map needing caches. + * @param layer The physical layer for the cache + * @param readOnlyNotifier The read only mode context + * @param journal The recovery journal (may be NULL) + * @param nonce The nonce to distinguish initialized pages + * @param cacheSize The block map cache size, in pages + * @param maximumAge The number of journal blocks before a dirtied page + * is considered old and must be written out + * + * @return VDO_SUCCESS or an error code + **/ +int makeBlockMapCaches(BlockMap *map, + PhysicalLayer *layer, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *journal, + Nonce nonce, + PageCount cacheSize, + BlockCount maximumAge) + __attribute__((warn_unused_result)); + +/** + * Free a block map and null out the reference to it. + * + * @param mapPtr A pointer to the block map to free + **/ +void freeBlockMap(BlockMap **mapPtr); + +/** + * Get the size of the encoded state of a block map. + * + * @return The encoded size of the map's state + **/ +size_t getBlockMapEncodedSize(void) + __attribute__((warn_unused_result)); + +/** + * Encode the state of a block map into a buffer. + * + * @param map The block map to encode + * @param buffer The buffer to encode into + * + * @return UDS_SUCCESS or an error + **/ +int encodeBlockMap(const BlockMap *map, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Obtain any necessary state from the recovery journal that is needed for + * normal block map operation. + * + * @param map The map in question + * @param journal The journal to initialize from + **/ +void initializeBlockMapFromJournal(BlockMap *map, RecoveryJournal *journal); + +/** + * Get the portion of the block map for a given logical zone. + * + * @param map The map + * @param zoneNumber The number of the zone + * + * @return The requested block map zone + **/ +BlockMapZone *getBlockMapZone(BlockMap *map, ZoneCount zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Compute the logical zone on which the entry for a DataVIO + * resides + * + * @param dataVIO The DataVIO + * + * @return The logical zone number for the DataVIO + **/ +ZoneCount computeLogicalZone(DataVIO *dataVIO); + +/** + * Compute the block map slot in which the block map entry for a DataVIO + * resides, and cache that number in the DataVIO. + * + * @param dataVIO The DataVIO + * @param callback The function to call once the slot has been found + * @param threadID The thread on which to run the callback + **/ +void findBlockMapSlotAsync(DataVIO *dataVIO, + VDOAction *callback, + ThreadID threadID); + +/** + * Get number of block map pages at predetermined locations. + * + * @param map The block map + * + * @return The number of fixed pages used by the map + **/ +PageCount getNumberOfFixedBlockMapPages(const BlockMap *map) + __attribute__((warn_unused_result)); + +/** + * Get number of block map entries. + * + * @param map The block map + * + * @return The number of entries stored in the map + **/ +BlockCount getNumberOfBlockMapEntries(const BlockMap *map) + __attribute__((warn_unused_result)); + +/** + * Notify the block map that the recovery journal has finished a new block. + * This method must be called from the journal zone thread. 
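Taking the declarations in this header together, the following is a hedged sketch of how a caller might wire up a block map. The ordering is only what the declarations imply, the numeric arguments are placeholders, and this is not the actual VDO load path.

/* A sketch only: construction sequence implied by blockMap.h. Error paths
 * beyond the minimum and the real call sites are omitted. */
#include "blockMap.h"

static int exampleBringUpBlockMap(BlockCount          logicalBlocks,
                                  const ThreadConfig *threadConfig,
                                  PhysicalLayer      *layer,
                                  ReadOnlyNotifier   *notifier,
                                  RecoveryJournal    *journal,
                                  Nonce               nonce,
                                  BlockMap          **mapPtr)
{
  // Build the in-memory structure; flat page count, root origin, and root
  // count are placeholder values for illustration.
  int result = makeBlockMap(logicalBlocks, threadConfig, 0,
                            /* rootOrigin */ 1, /* rootCount */ 60, mapPtr);
  if (result != VDO_SUCCESS) {
    return result;
  }

  // Attach the per-zone page caches and tree zones (sizes are placeholders).
  result = makeBlockMapCaches(*mapPtr, layer, notifier, journal, nonce,
                              /* cacheSize */ 128, /* maximumAge */ 16);
  if (result != VDO_SUCCESS) {
    freeBlockMap(mapPtr);
    return result;
  }

  // Pick up the current era point from the recovery journal.
  initializeBlockMapFromJournal(*mapPtr, journal);
  return VDO_SUCCESS;
}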
+ * + * @param map The block map + * @param recoveryBlockNumber The sequence number of the finished recovery + * journal block + **/ +void advanceBlockMapEra(BlockMap *map, SequenceNumber recoveryBlockNumber); + +/** + * Get the block number of the physical block containing the data for the + * specified logical block number. All blocks are mapped to physical block + * zero by default, which is conventionally the zero block. + * + * @param dataVIO The DataVIO of the block to map + **/ +void getMappedBlockAsync(DataVIO *dataVIO); + +/** + * Associate the logical block number for a block represented by a DataVIO + * with the physical block number in its newMapped field. + * + * @param dataVIO The DataVIO of the block to map + **/ +void putMappedBlockAsync(DataVIO *dataVIO); + +/** + * Get the stats for the block map page cache. + * + * @param map The block map containing the cache + * + * @return The block map statistics + **/ +BlockMapStatistics getBlockMapStatistics(BlockMap *map) + __attribute__((warn_unused_result)); + +#endif // BLOCK_MAP_H diff --git a/vdo/base/blockMapEntry.h b/vdo/base/blockMapEntry.h new file mode 100644 index 0000000..78304e9 --- /dev/null +++ b/vdo/base/blockMapEntry.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapEntry.h#4 $ + */ + +#ifndef BLOCK_MAP_ENTRY_H +#define BLOCK_MAP_ENTRY_H + +#include "blockMappingState.h" +#include "constants.h" +#include "numeric.h" +#include "types.h" + +/** + * The entry for each logical block in the block map is encoded into five + * bytes, which saves space in both the on-disk and in-memory layouts. It + * consists of the 36 low-order bits of a PhysicalBlockNumber (addressing 256 + * terabytes with a 4KB block size) and a 4-bit encoding of a + * BlockMappingState. + **/ +typedef union __attribute__((packed)) blockMapEntry { + struct __attribute__((packed)) { + /** + * Bits 7..4: The four highest bits of the 36-bit physical block number + * Bits 3..0: The 4-bit BlockMappingState + **/ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned mappingState : 4; + unsigned pbnHighNibble : 4; +#else + unsigned pbnHighNibble : 4; + unsigned mappingState : 4; +#endif + + /** 32 low-order bits of the 36-bit PBN, in little-endian byte order */ + byte pbnLowWord[4]; + } fields; + + // A raw view of the packed encoding. + uint8_t raw[5]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining packed entries in GDB. 
+ struct __attribute__((packed)) { + unsigned mappingState : 4; + unsigned pbnHighNibble : 4; + uint32_t pbnLowWord; + } littleEndian; +#endif +} BlockMapEntry; + +/** + * Unpack the fields of a BlockMapEntry, returning them as a DataLocation. + * + * @param entry A pointer to the entry to unpack + * + * @return the location of the data mapped by the block map entry + **/ +static inline DataLocation unpackBlockMapEntry(const BlockMapEntry *entry) +{ + PhysicalBlockNumber low32 = getUInt32LE(entry->fields.pbnLowWord); + PhysicalBlockNumber high4 = entry->fields.pbnHighNibble; + return (DataLocation) { + .pbn = ((high4 << 32) | low32), + .state = entry->fields.mappingState, + }; +} + +/**********************************************************************/ +static inline bool isMappedLocation(const DataLocation *location) +{ + return (location->state != MAPPING_STATE_UNMAPPED); +} + +/**********************************************************************/ +static inline bool isValidLocation(const DataLocation *location) +{ + if (location->pbn == ZERO_BLOCK) { + return !isCompressed(location->state); + } else { + return isMappedLocation(location); + } +} + +/** + * Pack a PhysicalBlockNumber into a BlockMapEntry. + * + * @param pbn The physical block number to convert to its + * packed five-byte representation + * @param mappingState The mapping state of the block + * + * @return the packed representation of the block number and mapping state + * + * @note unrepresentable high bits of the unpacked PBN are silently truncated + **/ +static inline BlockMapEntry packPBN(PhysicalBlockNumber pbn, + BlockMappingState mappingState) +{ + BlockMapEntry entry; + entry.fields.mappingState = (mappingState & 0x0F); + entry.fields.pbnHighNibble = ((pbn >> 32) & 0x0F), + storeUInt32LE(entry.fields.pbnLowWord, pbn & UINT_MAX); + return entry; +} + +#endif // BLOCK_MAP_ENTRY_H diff --git a/vdo/base/blockMapInternals.h b/vdo/base/blockMapInternals.h new file mode 100644 index 0000000..9b2f7a5 --- /dev/null +++ b/vdo/base/blockMapInternals.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapInternals.h#12 $ + */ + +#ifndef BLOCK_MAP_INTERNALS_H +#define BLOCK_MAP_INTERNALS_H + +#include "adminState.h" +#include "blockMapEntry.h" +#include "blockMapTree.h" +#include "completion.h" +#include "dirtyLists.h" +#include "header.h" +#include "intMap.h" +#include "ringNode.h" +#include "types.h" +#include "vdoPageCache.h" +#include "vioPool.h" + +/** + * The per-zone fields used by the block map tree. 
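Before moving into blockMapInternals.h, the five-byte entry encoding from blockMapEntry.h above (a 36-bit PBN plus a 4-bit mapping state) can be made concrete with a standalone round trip. This sketch uses explicit byte arithmetic instead of the packed bit-fields, and the state value 1 merely stands in for some mapped state.

/* Illustrative only: pack and unpack the five-byte block map entry layout
 * described in blockMapEntry.h without relying on bit-field order. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
  uint8_t bytes[5];
} ExamplePackedEntry;

static ExamplePackedEntry examplePack(uint64_t pbn, unsigned state)
{
  ExamplePackedEntry entry;
  // Byte 0: mapping state in bits 3..0, the PBN's top nibble in bits 7..4.
  entry.bytes[0] = (uint8_t) ((state & 0x0F) | (((pbn >> 32) & 0x0F) << 4));
  // Bytes 1..4: the low 32 bits of the PBN, little-endian.
  for (int i = 0; i < 4; i++) {
    entry.bytes[i + 1] = (uint8_t) (pbn >> (8 * i));
  }
  return entry;
}

static void exampleUnpack(const ExamplePackedEntry *entry,
                          uint64_t *pbn, unsigned *state)
{
  *state = entry->bytes[0] & 0x0F;
  uint64_t value = ((uint64_t) (entry->bytes[0] >> 4)) << 32;
  for (int i = 0; i < 4; i++) {
    value |= ((uint64_t) entry->bytes[i + 1]) << (8 * i);
  }
  *pbn = value;
}

int main(void)
{
  // 0x123456789 needs all 36 bits; state 1 is an arbitrary non-zero state.
  ExamplePackedEntry entry = examplePack(0x123456789ULL, 1);
  uint64_t pbn;
  unsigned state;
  exampleUnpack(&entry, &pbn, &state);
  assert((pbn == 0x123456789ULL) && (state == 1));
  printf("pbn=0x%llx state=%u\n", (unsigned long long) pbn, state);
  return 0;
}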
+ **/ +struct blockMapTreeZone { + /** The BlockMapZone which owns this tree zone */ + BlockMapZone *mapZone; + /** The lists of dirty tree pages */ + DirtyLists *dirtyLists; + /** The number of tree lookups in progress */ + VIOCount activeLookups; + /** The map of pages currently being loaded */ + IntMap *loadingPages; + /** The pool of VIOs for tree I/O */ + VIOPool *vioPool; + /** The tree page which has issued or will be issuing a flush */ + TreePage *flusher; + /** The queue of pages waiting for a flush so they can be written out */ + WaitQueue flushWaiters; + /** The generation after the most recent flush */ + uint8_t generation; + /** The oldest active generation */ + uint8_t oldestGeneration; + /** The counts of dirty pages in each generation */ + uint32_t dirtyPageCounts[256]; +}; + +/** + * The per-zone fields of the block map. + **/ +struct blockMapZone { + /** The number of the zone this is */ + ZoneCount zoneNumber; + /** The ID of this zone's logical thread */ + ThreadID threadID; + /** The BlockMap which owns this BlockMapZone */ + BlockMap *blockMap; + /** The ReadOnlyNotifier of the VDO */ + ReadOnlyNotifier *readOnlyNotifier; + /** The page cache for this zone */ + VDOPageCache *pageCache; + /** The per-zone portion of the tree for this zone */ + BlockMapTreeZone treeZone; + /** The administrative state of the zone */ + AdminState state; +}; + +struct blockMap { + /** The manager for block map actions */ + ActionManager *actionManager; + /** The count of pages in the linear part of the block map */ + BlockCount flatPageCount; + /** The absolute PBN of the first root of the tree part of the block map */ + PhysicalBlockNumber rootOrigin; + /** The count of root pages of the tree part of the block map */ + BlockCount rootCount; + + /** The era point we are currently distributing to the zones */ + SequenceNumber currentEraPoint; + /** The next era point, not yet distributed to any zone */ + SequenceNumber pendingEraPoint; + + /** The number of entries in block map */ + BlockCount entryCount; + /** The VDO's nonce, for the pages */ + Nonce nonce; + /** The recovery journal for this map */ + RecoveryJournal *journal; + + /** The trees for finding block map pages */ + Forest *forest; + /** The expanded trees awaiting growth */ + Forest *nextForest; + /** The number of entries after growth */ + BlockCount nextEntryCount; + + /** The number of logical zones */ + ZoneCount zoneCount; + /** The per zone block map structure */ + BlockMapZone zones[]; +}; + +/** + * Compute the number of pages required for a block map with the specified + * parameters. + * + * @param entries The number of block map entries + * + * @return The number of pages required + **/ +PageCount computeBlockMapPageCount(BlockCount entries); + +/** + * Compute the number of the block map page on which the entry for a given + * logical block resides. + * + * @param lbn The logical block number whose page is desired + * + * @return The number of the block map page containing the entry for + * the given logical block number + **/ +__attribute__((warn_unused_result)) +static inline PageNumber computePageNumber(LogicalBlockNumber lbn) +{ + return (lbn / BLOCK_MAP_ENTRIES_PER_PAGE); +} + +/** + * Find the block map page slot in which the entry for a given logical + * block resides. 
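struct blockMap above ends with a flexible array member (BlockMapZone zones[]), which is why makeBlockMap() allocates it with ALLOCATE_EXTENDED(BlockMap, logicalZoneCount, BlockMapZone, ...). A minimal standalone equivalent of that allocation pattern, with stand-in types and plain calloc():

/* Illustrative only: one allocation covering a header plus its trailing
 * per-zone structures, as ALLOCATE_EXTENDED does for BlockMap. */
#include <stdio.h>
#include <stdlib.h>

typedef struct {
  unsigned zoneNumber;
} ExampleZone;

typedef struct {
  unsigned    zoneCount;
  ExampleZone zones[];   /* one trailing element per logical zone */
} ExampleMap;

static ExampleMap *exampleMakeMap(unsigned zoneCount)
{
  ExampleMap *map = calloc(1, sizeof(ExampleMap)
                              + (zoneCount * sizeof(ExampleZone)));
  if (map == NULL) {
    return NULL;
  }
  for (unsigned zone = 0; zone < zoneCount; zone++) {
    map->zones[zone].zoneNumber = zone;
    map->zoneCount++;
  }
  return map;
}

int main(void)
{
  ExampleMap *map = exampleMakeMap(3);
  if (map != NULL) {
    printf("zone 2 has number %u\n", map->zones[2].zoneNumber);
    free(map);
  }
  return 0;
}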
+ * + * @param lbn The logical block number whose slot + * + * @return The slot containing the entry for the given logical block number + **/ +__attribute__((warn_unused_result)) +static inline SlotNumber computeSlot(LogicalBlockNumber lbn) +{ + return (lbn % BLOCK_MAP_ENTRIES_PER_PAGE); +} + +/** + * Check whether a zone of the block map has drained, and if so, send a + * notification thereof. + * + * @param zone The zone to check + **/ +void checkForDrainComplete(BlockMapZone *zone); + + +#endif // BLOCK_MAP_INTERNALS_H diff --git a/vdo/base/blockMapPage.c b/vdo/base/blockMapPage.c new file mode 100644 index 0000000..8272e12 --- /dev/null +++ b/vdo/base/blockMapPage.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapPage.c#8 $ + */ + +#include "blockMapPage.h" + +#include "permassert.h" + +#include "blockMap.h" +#include "blockMapInternals.h" +#include "blockMapTree.h" +#include "constants.h" +#include "dataVIO.h" +#include "recoveryJournal.h" +#include "statusCodes.h" +#include "types.h" + +enum { + PAGE_HEADER_4_1_SIZE = 8 + 8 + 8 + 1 + 1 + 1 + 1, +}; + +static const VersionNumber BLOCK_MAP_4_1 = { + .majorVersion = 4, + .minorVersion = 1, +}; + +/**********************************************************************/ +bool isCurrentBlockMapPage(const BlockMapPage *page) +{ + return areSameVersion(BLOCK_MAP_4_1, unpackVersionNumber(page->version)); +} + +/**********************************************************************/ +BlockMapPage *formatBlockMapPage(void *buffer, + Nonce nonce, + PhysicalBlockNumber pbn, + bool initialized) +{ + memset(buffer, 0, VDO_BLOCK_SIZE); + BlockMapPage *page = (BlockMapPage *) buffer; + page->version = packVersionNumber(BLOCK_MAP_4_1); + storeUInt64LE(page->header.fields.nonce, nonce); + storeUInt64LE(page->header.fields.pbn, pbn); + page->header.fields.initialized = initialized; + return page; +} + +/**********************************************************************/ +BlockMapPageValidity validateBlockMapPage(BlockMapPage *page, + Nonce nonce, + PhysicalBlockNumber pbn) +{ + // Make sure the page layout isn't accidentally changed by changing the + // length of the page header. 
+ STATIC_ASSERT_SIZEOF(PageHeader, PAGE_HEADER_4_1_SIZE); + + if (!areSameVersion(BLOCK_MAP_4_1, unpackVersionNumber(page->version)) + || !isBlockMapPageInitialized(page) + || (nonce != getUInt64LE(page->header.fields.nonce))) { + return BLOCK_MAP_PAGE_INVALID; + } + + if (pbn != getBlockMapPagePBN(page)) { + return BLOCK_MAP_PAGE_BAD; + } + + return BLOCK_MAP_PAGE_VALID; +} + +/**********************************************************************/ +void updateBlockMapPage(BlockMapPage *page, + DataVIO *dataVIO, + PhysicalBlockNumber pbn, + BlockMappingState mappingState, + SequenceNumber *recoveryLock) +{ + // Encode the new mapping. + TreeLock *treeLock = &dataVIO->treeLock; + SlotNumber slot = treeLock->treeSlots[treeLock->height].blockMapSlot.slot; + page->entries[slot] = packPBN(pbn, mappingState); + + // Adjust references (locks) on the recovery journal blocks. + BlockMapZone *zone = getBlockMapForZone(dataVIO->logical.zone); + BlockMap *blockMap = zone->blockMap; + RecoveryJournal *journal = blockMap->journal; + SequenceNumber oldLocked = *recoveryLock; + SequenceNumber newLocked = dataVIO->recoverySequenceNumber; + + if ((oldLocked == 0) || (oldLocked > newLocked)) { + // Acquire a lock on the newly referenced journal block. + acquireRecoveryJournalBlockReference(journal, newLocked, ZONE_TYPE_LOGICAL, + zone->zoneNumber); + + // If the block originally held a newer lock, release it. + if (oldLocked > 0) { + releaseRecoveryJournalBlockReference(journal, oldLocked, + ZONE_TYPE_LOGICAL, + zone->zoneNumber); + } + + *recoveryLock = newLocked; + } + + // Release the transferred lock from the DataVIO. + releasePerEntryLockFromOtherZone(journal, newLocked); + dataVIO->recoverySequenceNumber = 0; +} diff --git a/vdo/base/blockMapPage.h b/vdo/base/blockMapPage.h new file mode 100644 index 0000000..ee011b3 --- /dev/null +++ b/vdo/base/blockMapPage.h @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapPage.h#8 $ + */ + +#ifndef BLOCK_MAP_PAGE_H +#define BLOCK_MAP_PAGE_H + +#include "numeric.h" + +#include "blockMapEntry.h" +#include "header.h" +#include "types.h" + +/** + * The packed, on-disk representation of a block map page header. + **/ +typedef union __attribute__((packed)) { + struct __attribute__((packed)) { + /** + * The 64-bit nonce of the current VDO, in little-endian byte order. Used + * to determine whether or not a page has been formatted. + **/ + byte nonce[8]; + + /** The 64-bit PBN of this page, in little-endian byte order */ + byte pbn[8]; + + /** Formerly recoverySequenceNumber; may be non-zero on disk */ + byte unusedLongWord[8]; + + /** Whether this page has been initialized on disk (i.e. 
written twice) */ + bool initialized; + + /** Formerly entryOffset; now unused since it should always be zero */ + byte unusedByte1; + + /** Formerly interiorTreePageWriting; may be non-zero on disk */ + byte unusedByte2; + + /** Formerly generation (for dirty tree pages); may be non-zero on disk */ + byte unusedByte3; + } fields; + + // A raw view of the packed encoding. + uint8_t raw[8 + 8 + 8 + 1 + 1 + 1 + 1]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining packed entries in GDB. + struct __attribute__((packed)) { + uint64_t nonce; + PhysicalBlockNumber pbn; + uint64_t unusedLongWord; + bool initialized; + uint8_t unusedByte1; + uint8_t unusedByte2; + uint8_t unusedByte3; + } littleEndian; +#endif +} PageHeader; + +/** + * The format of a block map page. + **/ +typedef struct __attribute__((packed)) { + PackedVersionNumber version; + PageHeader header; + BlockMapEntry entries[]; +} BlockMapPage; + +typedef enum { + // A block map page is correctly initialized + BLOCK_MAP_PAGE_VALID, + // A block map page is uninitialized + BLOCK_MAP_PAGE_INVALID, + // A block map page is intialized, but is the wrong page + BLOCK_MAP_PAGE_BAD, +} BlockMapPageValidity; + +/** + * Check whether a block map page has been initialized. + * + * @param page The page to check + * + * @return true if the page has been initialized + **/ +__attribute__((warn_unused_result)) +static inline bool isBlockMapPageInitialized(const BlockMapPage *page) +{ + return page->header.fields.initialized; +} + +/** + * Mark whether a block map page has been initialized. + * + * @param page The page to mark + * @param initialized The state to set + * + * @return true if the initialized flag was modified + **/ +static inline bool markBlockMapPageInitialized(BlockMapPage *page, + bool initialized) +{ + if (initialized == page->header.fields.initialized) { + return false; + } + + page->header.fields.initialized = initialized; + return true; +} + +/** + * Get the physical block number where a block map page is stored. + * + * @param page The page to query + * + * @return the page's physical block number + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber getBlockMapPagePBN(const BlockMapPage *page) +{ + return getUInt64LE(page->header.fields.pbn); +} + +/** + * Check whether a block map page is of the current version. + * + * @param page The page to check + * + * @return true if the page has the current version + **/ +bool isCurrentBlockMapPage(const BlockMapPage *page) + __attribute__((warn_unused_result)); + +/** + * Format a block map page in memory. + * + * @param buffer The buffer which holds the page + * @param nonce The VDO nonce + * @param pbn The absolute PBN of the page + * @param initialized Whether the page should be marked as initialized + * + * @return the buffer pointer, as a block map page (for convenience) + **/ +BlockMapPage *formatBlockMapPage(void *buffer, + Nonce nonce, + PhysicalBlockNumber pbn, + bool initialized); + +/** + * Check whether a newly read page is valid, upgrading its in-memory format if + * possible and necessary. If the page is valid, clear fields which are not + * meaningful on disk. 
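The checks behind this declaration (see the validateBlockMapPage() definition in the blockMapPage.c hunk above) reduce to a short decision order: anything that looks unformatted is INVALID, a formatted page recorded for a different PBN is BAD, and everything else is VALID. A standalone restatement over a simplified page descriptor, purely for illustration:

/* Illustrative only: the decision order used by validateBlockMapPage(). */
#include <stdbool.h>
#include <stdint.h>

typedef enum {
  EXAMPLE_PAGE_VALID,    /* correctly initialized and at the expected PBN */
  EXAMPLE_PAGE_INVALID,  /* wrong version, never written, or wrong nonce */
  EXAMPLE_PAGE_BAD,      /* a real page, but not the one that was asked for */
} ExamplePageValidity;

typedef struct {
  bool     currentVersion;
  bool     initialized;
  uint64_t nonce;
  uint64_t pbn;
} ExamplePage;

static ExamplePageValidity exampleValidate(const ExamplePage *page,
                                           uint64_t expectedNonce,
                                           uint64_t expectedPBN)
{
  // An unformatted-looking page is INVALID; the read path reformats it.
  if (!page->currentVersion || !page->initialized
      || (page->nonce != expectedNonce)) {
    return EXAMPLE_PAGE_INVALID;
  }

  // A formatted page recorded for a different PBN is an error (VDO_BAD_PAGE).
  if (page->pbn != expectedPBN) {
    return EXAMPLE_PAGE_BAD;
  }

  return EXAMPLE_PAGE_VALID;
}

int main(void)
{
  ExamplePage page = { .currentVersion = true, .initialized = true,
                       .nonce = 42, .pbn = 7 };
  return (exampleValidate(&page, 42, 9) == EXAMPLE_PAGE_BAD) ? 0 : 1;
}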
+ * + * @param page The page to validate + * @param nonce The VDO nonce + * @param pbn The expected absolute PBN of the page + * + * @return The validity of the page + **/ +BlockMapPageValidity validateBlockMapPage(BlockMapPage *page, + Nonce nonce, + PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Update an entry on a block map page. + * + * @param [in] page The page to update + * @param [in] dataVIO The DataVIO making the update + * @param [in] pbn The new PBN for the entry + * @param [in] mappingState The new mapping state for the entry + * @param [in,out] recoveryLock A reference to the current recovery sequence + * number lock held by the page. Will be updated + * if the lock changes to protect the new entry + **/ +void updateBlockMapPage(BlockMapPage *page, + DataVIO *dataVIO, + PhysicalBlockNumber pbn, + BlockMappingState mappingState, + SequenceNumber *recoveryLock); + +#endif // BLOCK_MAP_PAGE_H diff --git a/vdo/base/blockMapRecovery.c b/vdo/base/blockMapRecovery.c new file mode 100644 index 0000000..f70be42 --- /dev/null +++ b/vdo/base/blockMapRecovery.c @@ -0,0 +1,542 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapRecovery.c#7 $ + */ + +#include "blockMapRecovery.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "heap.h" +#include "numUtils.h" +#include "refCounts.h" +#include "slabDepot.h" +#include "types.h" +#include "vdoInternal.h" +#include "vdoPageCache.h" + +/** + * A completion to manage recovering the block map from the recovery journal. + * Note that the page completions kept in this structure are not immediately + * freed, so the corresponding pages will be locked down in the page cache + * until the recovery frees them. + **/ +typedef struct { + /** completion header */ + VDOCompletion completion; + /** the completion for flushing the block map */ + VDOCompletion subTaskCompletion; + /** the thread from which the block map may be flushed */ + ThreadID adminThread; + /** the thread on which all block map operations must be done */ + ThreadID logicalThreadID; + /** the block map */ + BlockMap *blockMap; + /** whether this recovery has been aborted */ + bool aborted; + /** whether we are currently launching the initial round of requests */ + bool launching; + + // Fields for the journal entries. + /** the journal entries to apply */ + NumberedBlockMapping *journalEntries; + /** + * a heap wrapping journalEntries. It re-orders and sorts journal entries in + * ascending LBN order, then original journal order. This permits efficient + * iteration over the journal entries in order. + **/ + Heap replayHeap; + + // Fields tracking progress through the journal entries. 
+ /** a pointer to the next journal entry to apply */ + NumberedBlockMapping *currentEntry; + /** the next entry for which the block map page has not been requested */ + NumberedBlockMapping *currentUnfetchedEntry; + + // Fields tracking requested pages. + /** the absolute PBN of the current page being processed */ + PhysicalBlockNumber pbn; + /** number of pending (non-ready) requests */ + PageCount outstanding; + /** number of page completions */ + PageCount pageCount; + /** array of requested, potentially ready page completions */ + VDOPageCompletion pageCompletions[]; +} BlockMapRecoveryCompletion; + +/** + * This is a HeapComparator function that orders NumberedBlockMappings using + * the 'blockMapSlot' field as the primary key and the mapping 'number' field + * as the secondary key. Using the mapping number preserves the journal order + * of entries for the same slot, allowing us to sort by slot while still + * ensuring we replay all entries with the same slot in the exact order as they + * appeared in the journal. + * + *
The comparator order is reversed from the usual sense since Heap is a + * max-heap, returning larger elements before smaller ones, but we want to pop + * entries off the heap in ascending LBN order. + **/ +static int compareMappings(const void *item1, const void *item2) +{ + const NumberedBlockMapping *mapping1 = (const NumberedBlockMapping *) item1; + const NumberedBlockMapping *mapping2 = (const NumberedBlockMapping *) item2; + + if (mapping1->blockMapSlot.pbn != mapping2->blockMapSlot.pbn) { + return + ((mapping1->blockMapSlot.pbn < mapping2->blockMapSlot.pbn) ? 1 : -1); + } + + if (mapping1->blockMapSlot.slot != mapping2->blockMapSlot.slot) { + return + ((mapping1->blockMapSlot.slot < mapping2->blockMapSlot.slot) ? 1 : -1); + } + + if (mapping1->number != mapping2->number) { + return ((mapping1->number < mapping2->number) ? 1 : -1); + } + + return 0; +} + +/** + * Swap two NumberedBlockMapping structures. Implements HeapSwapper. + **/ +static void swapMappings(void *item1, void *item2) +{ + NumberedBlockMapping *mapping1 = item1; + NumberedBlockMapping *mapping2 = item2; + NumberedBlockMapping temp = *mapping1; + *mapping1 = *mapping2; + *mapping2 = temp; +} + +/** + * Convert a VDOCompletion to a BlockMapRecoveryCompletion. + * + * @param completion The completion to convert + * + * @return The completion as a BlockMapRecoveryCompletion + **/ +__attribute__((warn_unused_result)) +static inline BlockMapRecoveryCompletion * +asBlockMapRecoveryCompletion(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(BlockMapRecoveryCompletion, completion) == 0); + assertCompletionType(completion->type, BLOCK_MAP_RECOVERY_COMPLETION); + return (BlockMapRecoveryCompletion *) completion; +} + +/** + * Free a BlockMapRecoveryCompletion and null out the reference to it. + * + * @param completionPtr a pointer to the completion to free + **/ +static void freeRecoveryCompletion(VDOCompletion **completionPtr) +{ + VDOCompletion *completion = *completionPtr; + if (completion == NULL) { + return; + } + + BlockMapRecoveryCompletion *recovery + = asBlockMapRecoveryCompletion(*completionPtr); + destroyEnqueueable(completion); + destroyEnqueueable(&recovery->subTaskCompletion); + FREE(recovery); + *completionPtr = NULL; +} + +/** + * Free the BlockMapRecoveryCompletion and notify the parent that the block map + * recovery is done. This callback is registered in makeRecoveryCompletion(). + * + * @param completion The BlockMapRecoveryCompletion + **/ +static void finishBlockMapRecovery(VDOCompletion *completion) +{ + int result = completion->result; + VDOCompletion *parent = completion->parent; + freeRecoveryCompletion(&completion); + finishCompletion(parent, result); +} + +/** + * Make a new block map recovery completion. 
+ * + * @param [in] vdo The VDO + * @param [in] entryCount The number of journal entries + * @param [in] journalEntries An array of journal entries to process + * @param [in] parent The parent of the recovery completion + * @param [out] recoveryPtr The new block map recovery completion + * + * @return a success or error code + **/ +static int makeRecoveryCompletion(VDO *vdo, + BlockCount entryCount, + NumberedBlockMapping *journalEntries, + VDOCompletion *parent, + BlockMapRecoveryCompletion **recoveryPtr) +{ + BlockMap *blockMap = getBlockMap(vdo); + PageCount pageCount + = minPageCount(getConfiguredCacheSize(vdo) >> 1, + MAXIMUM_SIMULTANEOUS_BLOCK_MAP_RESTORATION_READS); + + BlockMapRecoveryCompletion *recovery; + int result = ALLOCATE_EXTENDED(BlockMapRecoveryCompletion, pageCount, + VDOPageCompletion, __func__, &recovery); + if (result != UDS_SUCCESS) { + return result; + } + + result = initializeEnqueueableCompletion(&recovery->completion, + BLOCK_MAP_RECOVERY_COMPLETION, + vdo->layer); + if (result != VDO_SUCCESS) { + VDOCompletion *completion = &recovery->completion; + freeRecoveryCompletion(&completion); + return result; + } + + result = initializeEnqueueableCompletion(&recovery->subTaskCompletion, + SUB_TASK_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + VDOCompletion *completion = &recovery->completion; + freeRecoveryCompletion(&completion); + return result; + } + + recovery->blockMap = blockMap; + recovery->journalEntries = journalEntries; + recovery->pageCount = pageCount; + recovery->currentEntry = &recovery->journalEntries[entryCount - 1]; + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + recovery->adminThread = getAdminThread(threadConfig); + recovery->logicalThreadID = getLogicalZoneThread(threadConfig, 0); + + // Organize the journal entries into a binary heap so we can iterate over + // them in sorted order incrementally, avoiding an expensive sort call. + initializeHeap(&recovery->replayHeap, compareMappings, swapMappings, + journalEntries, entryCount, sizeof(NumberedBlockMapping)); + buildHeap(&recovery->replayHeap, entryCount); + + ASSERT_LOG_ONLY((getCallbackThreadID() == recovery->logicalThreadID), + "%s must be called on logical thread %u (not %u)", __func__, + recovery->logicalThreadID, getCallbackThreadID()); + prepareCompletion(&recovery->completion, finishBlockMapRecovery, + finishBlockMapRecovery, recovery->logicalThreadID, parent); + + // This message must be recognizable by VDOTest::RebuildBase. + logInfo("Replaying %zu recovery entries into block map", + recovery->replayHeap.count); + + *recoveryPtr = recovery; + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void flushBlockMap(VDOCompletion *completion) +{ + logInfo("Flushing block map changes"); + BlockMapRecoveryCompletion *recovery + = asBlockMapRecoveryCompletion(completion->parent); + ASSERT_LOG_ONLY((completion->callbackThreadID == recovery->adminThread), + "flushBlockMap() called on admin thread"); + + prepareToFinishParent(completion, completion->parent); + drainBlockMap(recovery->blockMap, ADMIN_STATE_RECOVERING, completion); +} + +/** + * Check whether the recovery is done. If so, finish it by either flushing the + * block map (if the recovery was successful), or by cleaning up (if it + * wasn't). 
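+ *
+ * (Concretely, the recovery is done once the launch loop has finished, no
+ * page fetches are outstanding, and either an error has aborted the replay
+ * or currentEntry has walked off the front of the journalEntries array.)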
+ * + * @param recovery The recovery completion + * + * @return true if the recovery or recovery is complete + **/ +static bool finishIfDone(BlockMapRecoveryCompletion *recovery) +{ + // Pages are still being launched or there is still work to do + if (recovery->launching || (recovery->outstanding > 0) + || (!recovery->aborted + && (recovery->currentEntry >= recovery->journalEntries))) { + return false; + } + + if (recovery->aborted) { + /* + * We need to be careful here to only free completions that exist. But + * since we know none are outstanding, we just go through the ready ones. + */ + for (size_t i = 0; i < recovery->pageCount; i++) { + VDOPageCompletion *pageCompletion = &recovery->pageCompletions[i]; + if (recovery->pageCompletions[i].ready) { + releaseVDOPageCompletion(&pageCompletion->completion); + } + } + completeCompletion(&recovery->completion); + } else { + launchCallbackWithParent(&recovery->subTaskCompletion, flushBlockMap, + recovery->adminThread, &recovery->completion); + } + + return true; +} + +/** + * Note that there has been an error during the recovery and finish it if there + * is nothing else outstanding. + * + * @param recovery The BlockMapRecoveryCompletion + * @param result The error result to use, if one is not already saved + **/ +static void abortRecovery(BlockMapRecoveryCompletion *recovery, int result) +{ + recovery->aborted = true; + setCompletionResult(&recovery->completion, result); + finishIfDone(recovery); +} + +/** + * Find the first journal entry after a given entry which is not on the same + * block map page. + * + * @param recovery the BlockMapRecoveryCompletion + * @param currentEntry the entry to search from + * @param needsSort Whether sorting is needed to proceed + * + * @return Pointer to the first later journal entry on a different block map + * page, or a pointer to just before the journal entries if no + * subsequent entry is on a different block map page. + **/ +static NumberedBlockMapping * +findEntryStartingNextPage(BlockMapRecoveryCompletion *recovery, + NumberedBlockMapping *currentEntry, + bool needsSort) +{ + // If currentEntry is invalid, return immediately. + if (currentEntry < recovery->journalEntries) { + return currentEntry; + } + size_t currentPage = currentEntry->blockMapSlot.pbn; + + // Decrement currentEntry until it's out of bounds or on a different page. + while ((currentEntry >= recovery->journalEntries) + && (currentEntry->blockMapSlot.pbn == currentPage)) { + if (needsSort) { + NumberedBlockMapping *justSortedEntry + = sortNextHeapElement(&recovery->replayHeap); + ASSERT_LOG_ONLY(justSortedEntry < currentEntry, + "heap is returning elements in an unexpected order"); + } + currentEntry--; + } + return currentEntry; +} + +/** + * Apply a range of journal entries to a block map page. 
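+ *
+ * (The range is walked by decrementing from startingEntry toward endingEntry,
+ * which visits the entries for each slot in ascending journal order, so a
+ * later journal entry for the same slot correctly overwrites an earlier one.)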
+ * + * @param page The block map page being modified + * @param startingEntry The first journal entry to apply + * @param endingEntry The entry just past the last journal entry to apply + **/ +static void applyJournalEntriesToPage(BlockMapPage *page, + NumberedBlockMapping *startingEntry, + NumberedBlockMapping *endingEntry) +{ + NumberedBlockMapping *currentEntry = startingEntry; + while (currentEntry != endingEntry) { + page->entries[currentEntry->blockMapSlot.slot] + = currentEntry->blockMapEntry; + currentEntry--; + } +} + +/**********************************************************************/ +static void recoverReadyPages(BlockMapRecoveryCompletion *recovery, + VDOCompletion *completion); + +/** + * Note that a page is now ready and attempt to process pages. This callback is + * registered in fetchPage(). + * + * @param completion The VDOPageCompletion for the fetched page + **/ +static void pageLoaded(VDOCompletion *completion) +{ + BlockMapRecoveryCompletion *recovery + = asBlockMapRecoveryCompletion(completion->parent); + recovery->outstanding--; + if (!recovery->launching) { + recoverReadyPages(recovery, completion); + } +} + +/** + * Handle an error loading a page. + * + * @param completion The VDOPageCompletion + **/ +static void handlePageLoadError(VDOCompletion *completion) +{ + BlockMapRecoveryCompletion *recovery + = asBlockMapRecoveryCompletion(completion->parent); + recovery->outstanding--; + abortRecovery(recovery, completion->result); +} + +/** + * Fetch a page from the block map. + * + * @param recovery the BlockMapRecoveryCompletion + * @param completion the page completion to use + **/ +static void fetchPage(BlockMapRecoveryCompletion *recovery, + VDOCompletion *completion) +{ + if (recovery->currentUnfetchedEntry < recovery->journalEntries) { + // Nothing left to fetch. + return; + } + + // Fetch the next page we haven't yet requested. + PhysicalBlockNumber newPBN + = recovery->currentUnfetchedEntry->blockMapSlot.pbn; + recovery->currentUnfetchedEntry + = findEntryStartingNextPage(recovery, recovery->currentUnfetchedEntry, + true); + initVDOPageCompletion(((VDOPageCompletion *) completion), + recovery->blockMap->zones[0].pageCache, + newPBN, true, &recovery->completion, + pageLoaded, handlePageLoadError); + recovery->outstanding++; + getVDOPageAsync(completion); +} + +/** + * Get the next page completion to process. If it isn't ready, we'll try again + * when it is. + * + * @param recovery The recovery completion + * @param completion The current page completion + * + * @return The next page completion to process + **/ +static VDOPageCompletion * +getNextPageCompletion(BlockMapRecoveryCompletion *recovery, + VDOPageCompletion *completion) +{ + completion++; + if (completion == (&recovery->pageCompletions[recovery->pageCount])) { + completion = &recovery->pageCompletions[0]; + } + return completion; +} + +/** + * Recover from as many pages as possible. 
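+ *
+ * (Ready completions are consumed in ring order, starting with the one for
+ * the current PBN: each ready page gets its slice of journal entries applied,
+ * is written back, and its completion is immediately reused to fetch the next
+ * unfetched page.)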
+ * + * @param recovery The recovery completion + * @param completion The first page completion to process + **/ +static void recoverReadyPages(BlockMapRecoveryCompletion *recovery, + VDOCompletion *completion) +{ + if (finishIfDone(recovery)) { + return; + } + + VDOPageCompletion *pageCompletion = (VDOPageCompletion *) completion; + if (recovery->pbn != pageCompletion->pbn) { + return; + } + + while (pageCompletion->ready) { + BlockMapPage *page = dereferenceWritableVDOPage(completion); + int result = ASSERT(page != NULL, "page available"); + if (result != VDO_SUCCESS) { + abortRecovery(recovery, result); + return; + } + + NumberedBlockMapping *startOfNextPage + = findEntryStartingNextPage(recovery, recovery->currentEntry, false); + applyJournalEntriesToPage(page, recovery->currentEntry, startOfNextPage); + recovery->currentEntry = startOfNextPage; + requestVDOPageWrite(completion); + releaseVDOPageCompletion(completion); + + if (finishIfDone(recovery)) { + return; + } + + recovery->pbn = recovery->currentEntry->blockMapSlot.pbn; + fetchPage(recovery, completion); + pageCompletion = getNextPageCompletion(recovery, pageCompletion); + completion = &pageCompletion->completion; + } +} + +/**********************************************************************/ +void recoverBlockMap(VDO *vdo, + BlockCount entryCount, + NumberedBlockMapping *journalEntries, + VDOCompletion *parent) +{ + BlockMapRecoveryCompletion *recovery; + int result = makeRecoveryCompletion(vdo, entryCount, journalEntries, parent, + &recovery); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + if (isHeapEmpty(&recovery->replayHeap)) { + finishCompletion(&recovery->completion, VDO_SUCCESS); + return; + } + + NumberedBlockMapping *firstSortedEntry + = sortNextHeapElement(&recovery->replayHeap); + ASSERT_LOG_ONLY(firstSortedEntry == recovery->currentEntry, + "heap is returning elements in an unexpected order"); + + // Prevent any page from being processed until all pages have been launched. + recovery->launching = true; + recovery->pbn = recovery->currentEntry->blockMapSlot.pbn; + recovery->currentUnfetchedEntry = recovery->currentEntry; + for (PageCount i = 0; i < recovery->pageCount; i++) { + if (recovery->currentUnfetchedEntry < recovery->journalEntries) { + break; + } + + fetchPage(recovery, &recovery->pageCompletions[i].completion); + } + recovery->launching = false; + + // Process any ready pages. + recoverReadyPages(recovery, &recovery->pageCompletions[0].completion); +} diff --git a/vdo/base/blockMapRecovery.h b/vdo/base/blockMapRecovery.h new file mode 100644 index 0000000..9029bf0 --- /dev/null +++ b/vdo/base/blockMapRecovery.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapRecovery.h#1 $ + */ + +#ifndef BLOCK_MAP_RECOVERY_H +#define BLOCK_MAP_RECOVERY_H + +#include "blockMap.h" +#include "blockMappingState.h" +#include "types.h" + +/** + * An explicitly numbered block mapping. Numbering the mappings allows them to + * be sorted by logical block number during recovery while still preserving + * the relative order of journal entries with the same logical block number. + **/ +typedef struct { + BlockMapSlot blockMapSlot; // Block map slot to map + BlockMapEntry blockMapEntry; // The encoded block map entry for the LBN + uint32_t number; // The serial number to use during replay +} __attribute__((packed)) NumberedBlockMapping; + +/** + * Recover the block map (normal rebuild). + * + * @param vdo The VDO + * @param entryCount The number of journal entries + * @param journalEntries An array of journal entries to process + * @param parent The completion to notify when the rebuild is complete + **/ +void recoverBlockMap(VDO *vdo, + BlockCount entryCount, + NumberedBlockMapping *journalEntries, + VDOCompletion *parent); + +#endif // BLOCK_MAP_RECOVERY_H diff --git a/vdo/base/blockMapTree.c b/vdo/base/blockMapTree.c new file mode 100644 index 0000000..fb2b4f4 --- /dev/null +++ b/vdo/base/blockMapTree.c @@ -0,0 +1,1272 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapTree.c#21 $ + */ + +#include "blockMapTree.h" + +#include "logger.h" + +#include "blockMap.h" +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "blockMapTreeInternals.h" +#include "constants.h" +#include "dataVIO.h" +#include "dirtyLists.h" +#include "forest.h" +#include "numUtils.h" +#include "recoveryJournal.h" +#include "referenceOperation.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "types.h" +#include "vdoInternal.h" +#include "vdoPageCache.h" +#include "vioPool.h" + +enum { + BLOCK_MAP_VIO_POOL_SIZE = 64, +}; + +typedef struct __attribute__((packed)) { + RootCount rootIndex; + Height height; + PageNumber pageIndex; + SlotNumber slot; +} PageDescriptor; + +typedef union { + PageDescriptor descriptor; + uint64_t key; +} PageKey; + +typedef struct { + BlockMapTreeZone *zone; + uint8_t generation; +} WriteIfNotDirtiedContext; + +/** + * An invalid PBN used to indicate that the page holding the location of a + * tree root has been "loaded". + **/ +const PhysicalBlockNumber INVALID_PBN = 0xFFFFFFFFFFFFFFFF; + +/** + * Convert a RingNode to a TreePage. 
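+ *
+ * This is the usual container_of idiom: the RingNode is embedded in the
+ * TreePage as its 'node' member, so subtracting offsetof(TreePage, node) from
+ * the node's address recovers the enclosing page. A generic sketch of the
+ * same idiom (the macro below is an illustration only, not something this
+ * change defines):
+ *
+ *   #define container_of(ptr, type, member) \
+ *     ((type *) ((char *) (ptr) - offsetof(type, member)))
+ *
+ *   // treePageFromRingNode(node) == container_of(node, TreePage, node)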
+ * + * @param ringNode The RingNode to convert + * + * @return The TreePage which owns the RingNode + **/ +static inline TreePage *treePageFromRingNode(RingNode *ringNode) +{ + return (TreePage *) ((byte *) ringNode - offsetof(TreePage, node)); +} + +/**********************************************************************/ +static void writeDirtyPagesCallback(RingNode *expired, void *context); + +/** + * Make VIOs for reading, writing, and allocating the arboreal block map. + * + * Implements VIOConstructor. + **/ +__attribute__((warn_unused_result)) +static int makeBlockMapVIOs(PhysicalLayer *layer, + void *parent, + void *buffer, + VIO **vioPtr) +{ + return createVIO(layer, VIO_TYPE_BLOCK_MAP_INTERIOR, VIO_PRIORITY_METADATA, + parent, buffer, vioPtr); +} + +/**********************************************************************/ +int initializeTreeZone(BlockMapZone *zone, + PhysicalLayer *layer, + BlockCount eraLength) +{ + STATIC_ASSERT_SIZEOF(PageDescriptor, sizeof(uint64_t)); + BlockMapTreeZone *treeZone = &zone->treeZone; + treeZone->mapZone = zone; + + int result = makeDirtyLists(eraLength, writeDirtyPagesCallback, treeZone, + &treeZone->dirtyLists); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeIntMap(LOCK_MAP_CAPACITY, 0, &treeZone->loadingPages); + if (result != VDO_SUCCESS) { + return result; + } + + return makeVIOPool(layer, BLOCK_MAP_VIO_POOL_SIZE, zone->threadID, + makeBlockMapVIOs, treeZone, &treeZone->vioPool); +} + +/**********************************************************************/ +int replaceTreeZoneVIOPool(BlockMapTreeZone *zone, + PhysicalLayer *layer, + size_t poolSize) +{ + freeVIOPool(&zone->vioPool); + return makeVIOPool(layer, poolSize, zone->mapZone->threadID, + makeBlockMapVIOs, zone, &zone->vioPool); +} + +/**********************************************************************/ +void uninitializeBlockMapTreeZone(BlockMapTreeZone *treeZone) +{ + freeDirtyLists(&treeZone->dirtyLists); + freeVIOPool(&treeZone->vioPool); + freeIntMap(&treeZone->loadingPages); +} + +/**********************************************************************/ +void setTreeZoneInitialPeriod(BlockMapTreeZone *treeZone, + SequenceNumber period) +{ + setCurrentPeriod(treeZone->dirtyLists, period); +} + +/** + * Get the BlockMapTreeZone in which a DataVIO is operating. + * + * @param dataVIO The DataVIO + * + * @return The BlockMapTreeZone + **/ +__attribute__((warn_unused_result)) +static inline BlockMapTreeZone *getBlockMapTreeZone(DataVIO *dataVIO) +{ + return &(getBlockMapForZone(dataVIO->logical.zone)->treeZone); +} + +/** + * Get the TreePage for a given lock. This will be the page referred to by the + * lock's tree slot for the lock's current height. 
+ * + * @param zone The tree zone of the tree + * @param lock The lock describing the page to get + * + * @return The requested page + **/ +static inline TreePage *getTreePage(const BlockMapTreeZone *zone, + const TreeLock *lock) +{ + return getTreePageByIndex(zone->mapZone->blockMap->forest, + lock->rootIndex, + lock->height, + lock->treeSlots[lock->height].pageIndex); +} + +/**********************************************************************/ +bool copyValidPage(char *buffer, + Nonce nonce, + PhysicalBlockNumber pbn, + BlockMapPage *page) +{ + BlockMapPage *loaded = (BlockMapPage *) buffer; + BlockMapPageValidity validity = validateBlockMapPage(loaded, nonce, pbn); + if (validity == BLOCK_MAP_PAGE_VALID) { + memcpy(page, loaded, VDO_BLOCK_SIZE); + return true; + } + + if (validity == BLOCK_MAP_PAGE_BAD) { + logErrorWithStringError(VDO_BAD_PAGE, + "Expected page %" PRIu64 + " but got page %llu instead", + pbn, getBlockMapPagePBN(loaded)); + } + + return false; +} + +/**********************************************************************/ +bool isTreeZoneActive(BlockMapTreeZone *zone) +{ + return ((zone->activeLookups != 0) + || hasWaiters(&zone->flushWaiters) + || isVIOPoolBusy(zone->vioPool)); +} + +/** + * Put the VDO in read-only mode and wake any VIOs waiting for a flush. + * + * @param zone The zone + * @param result The error which is causing read-only mode + **/ +static void enterZoneReadOnlyMode(BlockMapTreeZone *zone, int result) +{ + enterReadOnlyMode(zone->mapZone->readOnlyNotifier, result); + + // We are in read-only mode, so we won't ever write any page out. Just take + // all waiters off the queue so the tree zone can be closed. + while (hasWaiters(&zone->flushWaiters)) { + dequeueNextWaiter(&zone->flushWaiters); + } + + checkForDrainComplete(zone->mapZone); +} + +/** + * Check whether a generation is strictly older than some other generation in + * the context of a zone's current generation range. + * + * @param zone The zone in which to do the comparison + * @param a The generation in question + * @param b The generation to compare to + * + * @return true if generation a is not strictly older than + * generation b in the context of the zone + **/ +__attribute__((warn_unused_result)) +static bool isNotOlder(BlockMapTreeZone *zone, uint8_t a, uint8_t b) +{ + int result = ASSERT((inCyclicRange(zone->oldestGeneration, a, + zone->generation, 1 << 8) + && inCyclicRange(zone->oldestGeneration, b, + zone->generation, 1 << 8)), + "generation(s) %u, %u are out of range [%u, %u]", + a, b, zone->oldestGeneration, zone->generation); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + return true; + } + + return inCyclicRange(b, a, zone->generation, 1 << 8); +} + +/** + * Decrement the count for a generation and roll the oldest generation if there + * are no longer any active pages in it. + * + * @param zone The zone + * @param generation The generation to release + **/ +static void releaseGeneration(BlockMapTreeZone *zone, uint8_t generation) +{ + int result = ASSERT((zone->dirtyPageCounts[generation] > 0), + "dirty page count underflow for generation %u", + generation); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + return; + } + + zone->dirtyPageCounts[generation]--; + while ((zone->dirtyPageCounts[zone->oldestGeneration] == 0) + && (zone->oldestGeneration != zone->generation)) { + zone->oldestGeneration++; + } +} + +/** + * Set the generation of a page and update the dirty page count in the zone. 
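+ *
+ * (Generations are 8-bit counters that wrap modulo 256, so ordering is only
+ * meaningful inside the window [oldestGeneration, generation]; see
+ * isNotOlder() above. As an illustrative example with made-up values: if
+ * oldestGeneration is 250 and generation is 3, then generation 252 is older
+ * than generation 1, even though 252 > 1 numerically.)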
+ * + * @param zone The zone which owns the page + * @param page The page + * @param newGeneration The generation to set + * @param decrementOld Whether to decrement the count of the page's old + * generation + **/ +static void setGeneration(BlockMapTreeZone *zone, + TreePage *page, + uint8_t newGeneration, + bool decrementOld) +{ + uint8_t oldGeneration = page->generation; + if (decrementOld && (oldGeneration == newGeneration)) { + return; + } + + page->generation = newGeneration; + uint32_t newCount = ++zone->dirtyPageCounts[newGeneration]; + int result = ASSERT((newCount != 0), + "dirty page count overflow for generation %u", + newGeneration); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + return; + } + + if (decrementOld) { + releaseGeneration(zone, oldGeneration); + } +} + +/**********************************************************************/ +static void writePage(TreePage *treePage, VIOPoolEntry *entry); + +/** + * Write out a dirty page if it is still covered by the most recent flush + * or if it is the flusher. + * + *
Implements WaiterCallback + * + * @param waiter The page to write + * @param context The VIOPoolEntry with which to do the write + **/ +static void writePageCallback(Waiter *waiter, void *context) +{ + STATIC_ASSERT(offsetof(TreePage, waiter) == 0); + writePage((TreePage *) waiter, (VIOPoolEntry *) context); +} + +/** + * Acquire a VIO for writing a dirty page. + * + * @param waiter The page which needs a VIO + * @param zone The zone + **/ +static void acquireVIO(Waiter *waiter, BlockMapTreeZone *zone) +{ + waiter->callback = writePageCallback; + int result = acquireVIOFromPool(zone->vioPool, waiter); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + } +} + +/** + * Attempt to increment the generation. + * + * @param zone The zone whose generation is to be incremented + * + * @return true if all possible generations were not already + * active + **/ +static bool attemptIncrement(BlockMapTreeZone *zone) +{ + uint8_t generation = zone->generation + 1; + if (zone->oldestGeneration == generation) { + return false; + } + + zone->generation = generation; + return true; +} + +/** + * Enqueue a page to either launch a flush or wait for the current flush which + * is already in progress. + * + * @param page The page to enqueue + * @param zone The zone + **/ +static void enqueuePage(TreePage *page, BlockMapTreeZone *zone) +{ + if ((zone->flusher == NULL) && attemptIncrement(zone)) { + zone->flusher = page; + acquireVIO(&page->waiter, zone); + return; + } + + int result = enqueueWaiter(&zone->flushWaiters, &page->waiter); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + } +} + +/** + * Write pages which were waiting for a flush and have not been redirtied. + * Requeue those pages which were redirtied. + * + *
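+ * (A waiting page whose generation still equals the generation that was just
+ * flushed is handed a VIO and written immediately; a page that was dirtied
+ * again in a newer generation is re-enqueued to wait for the next flush.)
+ *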
Implements WaiterCallback. + * + * @param waiter The dirty page + * @param context The zone and generation + **/ +static void writePageIfNotDirtied(Waiter *waiter, void *context) +{ + STATIC_ASSERT(offsetof(TreePage, waiter) == 0); + TreePage *page = (TreePage *) waiter; + WriteIfNotDirtiedContext *writeContext = context; + if (page->generation == writeContext->generation) { + acquireVIO(waiter, writeContext->zone); + return; + } + + enqueuePage(page, writeContext->zone); +} + +/** + * Return a VIO to the zone's pool. + * + * @param zone The zone which owns the pool + * @param entry The pool entry to return + **/ +static void returnToPool(BlockMapTreeZone *zone, VIOPoolEntry *entry) +{ + returnVIOToPool(zone->vioPool, entry); + checkForDrainComplete(zone->mapZone); +} + +/** + * Handle the successful write of a tree page. This callback is registered in + * writeInitializedPage(). + * + * @param completion The VIO doing the write + **/ +static void finishPageWrite(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + TreePage *page = entry->parent; + BlockMapTreeZone *zone = entry->context; + releaseRecoveryJournalBlockReference(zone->mapZone->blockMap->journal, + page->writingRecoveryLock, + ZONE_TYPE_LOGICAL, + zone->mapZone->zoneNumber); + + bool dirty = (page->writingGeneration != page->generation); + releaseGeneration(zone, page->writingGeneration); + page->writing = false; + + if (zone->flusher == page) { + WriteIfNotDirtiedContext context = { + .zone = zone, + .generation = page->writingGeneration, + }; + notifyAllWaiters(&zone->flushWaiters, writePageIfNotDirtied, &context); + if (dirty && attemptIncrement(zone)) { + writePage(page, entry); + return; + } + + zone->flusher = NULL; + } + + if (dirty) { + enqueuePage(page, zone); + } else if ((zone->flusher == NULL) + && hasWaiters(&zone->flushWaiters) + && attemptIncrement(zone)) { + zone->flusher = (TreePage *) dequeueNextWaiter(&zone->flushWaiters); + writePage(zone->flusher, entry); + return; + } + + returnToPool(zone, entry); +} + +/** + * Handle an error writing a tree page. This error handler is registered in + * writePage() and writeInitializedPage(). + * + * @param completion The VIO doing the write + **/ +static void handleWriteError(VDOCompletion *completion) +{ + int result = completion->result; + VIOPoolEntry *entry = completion->parent; + BlockMapTreeZone *zone = entry->context; + enterZoneReadOnlyMode(zone, result); + returnToPool(zone, entry); +} + +/** + * Write a page which has been written at least once. This callback is + * registered in (or called directly from) writePage(). + * + * @param completion The VIO which will do the write + **/ +static void writeInitializedPage(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + BlockMapTreeZone *zone = (BlockMapTreeZone *) entry->context; + TreePage *treePage = (TreePage *) entry->parent; + + /* + * Set the initialized field of the copy of the page we are writing to true. + * We don't want to set it true on the real page in memory until after this + * write succeeds. + */ + BlockMapPage *page = (BlockMapPage *) entry->buffer; + markBlockMapPageInitialized(page, true); + launchWriteMetadataVIOWithFlush(entry->vio, getBlockMapPagePBN(page), + finishPageWrite, handleWriteError, + (zone->flusher == treePage), false); +} + +/** + * Write a dirty tree page now that we have a VIO with which to write it. 
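+ *
+ * (If the page was re-dirtied in a generation newer than the flush currently
+ * in progress, it is simply requeued and the VIO is returned to the pool,
+ * since the outstanding flush would not cover the new update; otherwise the
+ * page buffer is copied into the VIO and written out.)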
+ * + * @param treePage The page to write + * @param entry The VIOPoolEntry with which to write + **/ +static void writePage(TreePage *treePage, VIOPoolEntry *entry) +{ + BlockMapTreeZone *zone = (BlockMapTreeZone *) entry->context; + if ((zone->flusher != treePage) + && (isNotOlder(zone, treePage->generation, zone->generation))) { + // This page was re-dirtied after the last flush was issued, hence we need + // to do another flush. + enqueuePage(treePage, zone); + returnToPool(zone, entry); + return; + } + + entry->parent = treePage; + memcpy(entry->buffer, treePage->pageBuffer, VDO_BLOCK_SIZE); + + VDOCompletion *completion = vioAsCompletion(entry->vio); + completion->callbackThreadID = zone->mapZone->threadID; + + treePage->writing = true; + treePage->writingGeneration = treePage->generation; + treePage->writingRecoveryLock = treePage->recoveryLock; + + // Clear this now so that we know this page is not on any dirty list. + treePage->recoveryLock = 0; + + BlockMapPage *page = asBlockMapPage(treePage); + if (!markBlockMapPageInitialized(page, true)) { + writeInitializedPage(completion); + return; + } + + launchWriteMetadataVIO(entry->vio, getBlockMapPagePBN(page), + writeInitializedPage, handleWriteError); +} + +/** + * Schedule a batch of dirty pages for writing. + * + *
Implements DirtyListsCallback. + * + * @param expired The pages to write + * @param context The zone + **/ +static void writeDirtyPagesCallback(RingNode *expired, void *context) +{ + BlockMapTreeZone *zone = (BlockMapTreeZone *) context; + uint8_t generation = zone->generation; + while (!isRingEmpty(expired)) { + TreePage *page = treePageFromRingNode(chopRingNode(expired)); + + int result = ASSERT(!isWaiting(&page->waiter), + "Newly expired page not already waiting to write"); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + continue; + } + + setGeneration(zone, page, generation, false); + if (!page->writing) { + enqueuePage(page, zone); + } + } +} + +/**********************************************************************/ +void advanceZoneTreePeriod(BlockMapTreeZone *zone, SequenceNumber period) +{ + advancePeriod(zone->dirtyLists, period); +} + +/**********************************************************************/ +void drainZoneTrees(BlockMapTreeZone *zone) +{ + ASSERT_LOG_ONLY((zone->activeLookups == 0), + "drainZoneTrees() called with no active lookups"); + if (!isSuspending(&zone->mapZone->state)) { + flushDirtyLists(zone->dirtyLists); + } +} + +/** + * Release a lock on a page which was being loaded or allocated. + * + * @param dataVIO The DataVIO releasing the page lock + * @param what What the DataVIO was doing (for logging) + **/ +static void releasePageLock(DataVIO *dataVIO, char *what) +{ + TreeLock *lock = &dataVIO->treeLock; + ASSERT_LOG_ONLY(lock->locked, + "release of unlocked block map page %s for key %" PRIu64 + " in tree %u", + what, lock->key, lock->rootIndex); + BlockMapTreeZone *zone = getBlockMapTreeZone(dataVIO); + TreeLock *lockHolder = intMapRemove(zone->loadingPages, lock->key); + ASSERT_LOG_ONLY((lockHolder == lock), + "block map page %s mismatch for key %llu in tree %u", + what, lock->key, lock->rootIndex); + lock->locked = false; +} + +/** + * Continue a DataVIO now that the lookup is complete. + * + * @param dataVIO The DataVIO + * @param result The result of the lookup + **/ +static void finishLookup(DataVIO *dataVIO, int result) +{ + dataVIO->treeLock.height = 0; + + BlockMapTreeZone *zone = getBlockMapTreeZone(dataVIO); + --zone->activeLookups; + + VDOCompletion *completion = dataVIOAsCompletion(dataVIO); + setCompletionResult(completion, result); + launchCallback(completion, dataVIO->treeLock.callback, + dataVIO->treeLock.threadID); +} + +/** + * Abort a block map PBN lookup due to an error in the load or allocation on + * which we were waiting. + * + * @param waiter The DataVIO which was waiting for a page load or allocation + * @param context The error which caused the abort + **/ +static void abortLookupForWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + int result = *((int *) context); + if (isReadDataVIO(dataVIO)) { + if (result == VDO_NO_SPACE) { + result = VDO_SUCCESS; + } + } else if (result != VDO_NO_SPACE) { + result = VDO_READ_ONLY; + } + + finishLookup(dataVIO, result); +} + +/** + * Abort a block map PBN lookup due to an error loading or allocating a page. 
+ * + * @param dataVIO The DataVIO which was loading or allocating a page + * @param result The error code + * @param what What the DataVIO was doing (for logging) + **/ +static void abortLookup(DataVIO *dataVIO, int result, char *what) +{ + if (result != VDO_NO_SPACE) { + enterZoneReadOnlyMode(getBlockMapTreeZone(dataVIO), result); + } + + if (dataVIO->treeLock.locked) { + releasePageLock(dataVIO, what); + notifyAllWaiters(&dataVIO->treeLock.waiters, abortLookupForWaiter, + &result); + } + + finishLookup(dataVIO, result); +} + +/** + * Abort a block map PBN lookup due to an error loading a page. + * + * @param dataVIO The DataVIO doing the page load + * @param result The error code + **/ +static void abortLoad(DataVIO *dataVIO, int result) +{ + abortLookup(dataVIO, result, "load"); +} + +/** + * Determine if a location represents a valid mapping for a tree page. + * + * @param vdo The VDO + * @param mapping The DataLocation to check + * @param height The height of the entry in the tree + * + * @return true if the entry represents a invalid page mapping + **/ +__attribute__((warn_unused_result)) +static bool isInvalidTreeEntry(const VDO *vdo, + const DataLocation *mapping, + Height height) +{ + if (!isValidLocation(mapping) + || isCompressed(mapping->state) + || (isMappedLocation(mapping) && (mapping->pbn == ZERO_BLOCK))) { + return true; + } + + // Roots aren't physical data blocks, so we can't check their PBNs. + if (height == BLOCK_MAP_TREE_HEIGHT) { + return false; + } + + return !isPhysicalDataBlock(vdo->depot, mapping->pbn); +} + +/**********************************************************************/ +static void loadBlockMapPage(BlockMapTreeZone *zone, DataVIO *dataVIO); +static void allocateBlockMapPage(BlockMapTreeZone *zone, DataVIO *dataVIO); + +/** + * Continue a block map PBN lookup now that a page has been loaded by + * descending one level in the tree. + * + * @param dataVIO The DataVIO doing the lookup + * @param page The page which was just loaded + **/ +static void continueWithLoadedPage(DataVIO *dataVIO, BlockMapPage *page) +{ + TreeLock *lock = &dataVIO->treeLock; + BlockMapTreeSlot slot = lock->treeSlots[lock->height]; + DataLocation mapping + = unpackBlockMapEntry(&page->entries[slot.blockMapSlot.slot]); + if (isInvalidTreeEntry(getVDOFromDataVIO(dataVIO), &mapping, lock->height)) { + logErrorWithStringError(VDO_BAD_MAPPING, + "Invalid block map tree PBN: %llu with " + "state %u for page index %u at height %u", + mapping.pbn, mapping.state, + lock->treeSlots[lock->height - 1].pageIndex, + lock->height - 1); + abortLoad(dataVIO, VDO_BAD_MAPPING); + return; + } + + if (!isMappedLocation(&mapping)) { + // The page we need is unallocated + allocateBlockMapPage(getBlockMapTreeZone(dataVIO), dataVIO); + return; + } + + lock->treeSlots[lock->height - 1].blockMapSlot.pbn = mapping.pbn; + if (lock->height == 1) { + finishLookup(dataVIO, VDO_SUCCESS); + return; + } + + // We know what page we need to load next + loadBlockMapPage(getBlockMapTreeZone(dataVIO), dataVIO); +} + +/** + * Continue a block map PBN lookup now that the page load we were waiting on + * has finished. + * + * @param waiter The DataVIO waiting for a page to be loaded + * @param context The page which was just loaded + **/ +static void continueLoadForWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + dataVIO->treeLock.height--; + continueWithLoadedPage(dataVIO, (BlockMapPage *) context); +} + +/** + * Finish loading a page now that it has been read in from disk. 
This callback + * is registered in loadPage(). + * + * @param completion The VIO doing the page read + **/ +static void finishBlockMapPageLoad(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + DataVIO *dataVIO = entry->parent; + BlockMapTreeZone *zone = (BlockMapTreeZone *) entry->context; + TreeLock *treeLock = &dataVIO->treeLock; + + treeLock->height--; + PhysicalBlockNumber pbn + = treeLock->treeSlots[treeLock->height].blockMapSlot.pbn; + TreePage *treePage = getTreePage(zone, treeLock); + BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; + Nonce nonce = zone->mapZone->blockMap->nonce; + if (!copyValidPage(entry->buffer, nonce, pbn, page)) { + formatBlockMapPage(page, nonce, pbn, false); + } + returnVIOToPool(zone->vioPool, entry); + + // Release our claim to the load and wake any waiters + releasePageLock(dataVIO, "load"); + notifyAllWaiters(&treeLock->waiters, continueLoadForWaiter, page); + continueWithLoadedPage(dataVIO, page); +} + +/** + * Handle an error loading a tree page. + * + * @param completion The VIO doing the page read + **/ +static void handleIOError(VDOCompletion *completion) +{ + int result = completion->result; + VIOPoolEntry *entry = completion->parent; + DataVIO *dataVIO = entry->parent; + BlockMapTreeZone *zone = (BlockMapTreeZone *) entry->context; + returnVIOToPool(zone->vioPool, entry); + abortLoad(dataVIO, result); +} + +/** + * Read a tree page from disk now that we've gotten a VIO with which to do the + * read. This WaiterCallback is registered in loadBlockMapPage(). + * + * @param waiter The DataVIO which requires a page load + * @param context The VIOPool entry with which to do the read + **/ +static void loadPage(Waiter *waiter, void *context) +{ + VIOPoolEntry *entry = context; + DataVIO *dataVIO = waiterAsDataVIO(waiter); + + entry->parent = dataVIO; + entry->vio->completion.callbackThreadID + = getBlockMapForZone(dataVIO->logical.zone)->threadID; + + TreeLock *lock = &dataVIO->treeLock; + launchReadMetadataVIO(entry->vio, + lock->treeSlots[lock->height - 1].blockMapSlot.pbn, + finishBlockMapPageLoad, handleIOError); +} + +/** + * Attempt to acquire a lock on a page in the block map tree. If the page is + * already locked, queue up to wait for the lock to be released. If the lock is + * acquired, the DataVIO's treeLock.locked field will be set to true. + * + * @param zone The BlockMapTreeZone in which the DataVIO operates + * @param dataVIO The DataVIO which desires a page lock + * + * @return VDO_SUCCESS or an error + **/ +static int attemptPageLock(BlockMapTreeZone *zone, DataVIO *dataVIO) +{ + TreeLock *lock = &dataVIO->treeLock; + Height height = lock->height; + BlockMapTreeSlot treeSlot = lock->treeSlots[height]; + PageKey key; + key.descriptor = (PageDescriptor) { + .rootIndex = lock->rootIndex, + .height = height, + .pageIndex = treeSlot.pageIndex, + .slot = treeSlot.blockMapSlot.slot, + }; + lock->key = key.key; + + TreeLock *lockHolder; + int result = intMapPut(zone->loadingPages, lock->key, lock, false, + (void **) &lockHolder); + if (result != VDO_SUCCESS) { + return result; + } + + if (lockHolder == NULL) { + // We got the lock + dataVIO->treeLock.locked = true; + return VDO_SUCCESS; + } + + // Someone else is loading or allocating the page we need + return enqueueDataVIO(&lockHolder->waiters, dataVIO, + THIS_LOCATION("$F;cb=blockMapTreePage")); +} + +/** + * Load a block map tree page from disk. 
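+ *
+ * (The page lock taken via attemptPageLock() serializes concurrent loads of
+ * the same tree page: only the lock winner acquires a VIO and issues the
+ * read, while later DataVIOs queue on the lock holder and are resumed from
+ * finishBlockMapPageLoad() once the page is in memory.)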
+ * + * @param zone The BlockMapTreeZone in which the DataVIO operates + * @param dataVIO The DataVIO which requires a page to be loaded + **/ +static void loadBlockMapPage(BlockMapTreeZone *zone, DataVIO *dataVIO) +{ + int result = attemptPageLock(zone, dataVIO); + if (result != VDO_SUCCESS) { + abortLoad(dataVIO, result); + return; + } + + if (dataVIO->treeLock.locked) { + Waiter *waiter = dataVIOAsWaiter(dataVIO); + waiter->callback = loadPage; + result = acquireVIOFromPool(zone->vioPool, waiter); + if (result != VDO_SUCCESS) { + abortLoad(dataVIO, result); + } + } +} + +/** + * Set the callback of a DataVIO after it has allocated a block map page. + * + * @param dataVIO The DataVIO + **/ +static void setPostAllocationCallback(DataVIO *dataVIO) +{ + setCallback(dataVIOAsCompletion(dataVIO), dataVIO->treeLock.callback, + dataVIO->treeLock.threadID); +} + +/** + * Abort a block map PBN lookup due to an error allocating a page. + * + * @param dataVIO The DataVIO doing the page allocation + * @param result The error code + **/ +static void abortAllocation(DataVIO *dataVIO, int result) +{ + setPostAllocationCallback(dataVIO); + abortLookup(dataVIO, result, "allocation"); +} + +/** + * Callback to handle an error while attempting to allocate a page. This + * callback is used to transfer back to the logical zone along the block map + * page allocation path. + * + * @param completion The DataVIO doing the allocation + **/ +static void allocationFailure(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + abortAllocation(dataVIO, completion->result); +} + +/** + * Continue with page allocations now that a parent page has been allocated. + * + * @param waiter The DataVIO which was waiting for a page to be allocated + * @param context The physical block number of the page which was just + * allocated + **/ +static void continueAllocationForWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + TreeLock *treeLock = &dataVIO->treeLock; + PhysicalBlockNumber pbn = *((PhysicalBlockNumber *) context); + + treeLock->height--; + dataVIO->treeLock.treeSlots[treeLock->height].blockMapSlot.pbn = pbn; + + if (treeLock->height == 0) { + finishLookup(dataVIO, VDO_SUCCESS); + return; + } + + allocateBlockMapPage(getBlockMapTreeZone(dataVIO), dataVIO); +} + +/** + * Finish the page allocation process by recording the allocation in the tree + * and waking any waiters now that the write lock has been released. This + * callback is registered in releaseBlockMapWriteLock(). + * + * @param completion The DataVIO doing the allocation + **/ +static void finishBlockMapAllocation(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + if (completion->result != VDO_SUCCESS) { + allocationFailure(completion); + return; + } + + BlockMapTreeZone *zone = getBlockMapTreeZone(dataVIO); + TreeLock *treeLock = &dataVIO->treeLock; + TreePage *treePage = getTreePage(zone, treeLock); + Height height = treeLock->height; + + PhysicalBlockNumber pbn = treeLock->treeSlots[height - 1].blockMapSlot.pbn; + + // Record the allocation. + BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; + SequenceNumber oldLock = treePage->recoveryLock; + updateBlockMapPage(page, dataVIO, pbn, MAPPING_STATE_UNCOMPRESSED, + &treePage->recoveryLock); + + if (isWaiting(&treePage->waiter)) { + // This page is waiting to be written out. 
+ if (zone->flusher != treePage) { + // The outstanding flush won't cover the update we just made, so mark + // the page as needing another flush. + setGeneration(zone, treePage, zone->generation, true); + } + } else { + // Put the page on a dirty list + if (oldLock == 0) { + initializeRing(&treePage->node); + } + addToDirtyLists(zone->dirtyLists, &treePage->node, oldLock, + treePage->recoveryLock); + } + + treeLock->height--; + if (height > 1) { + // Format the interior node we just allocated (in memory). + treePage = getTreePage(zone, treeLock); + formatBlockMapPage(treePage->pageBuffer, zone->mapZone->blockMap->nonce, + pbn, false); + } + + // Release our claim to the allocation and wake any waiters + releasePageLock(dataVIO, "allocation"); + notifyAllWaiters(&treeLock->waiters, continueAllocationForWaiter, &pbn); + if (treeLock->height == 0) { + finishLookup(dataVIO, VDO_SUCCESS); + return; + } + + allocateBlockMapPage(zone, dataVIO); +} + +/** + * Release the write lock on a newly allocated block map page now that we + * have made its journal entries and reference count updates. This callback + * is registered in setBlockMapPageReferenceCount(). + * + * @param completion The DataVIO doing the allocation + **/ +static void releaseBlockMapWriteLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + AllocatingVIO *allocatingVIO = dataVIOAsAllocatingVIO(dataVIO); + assertInAllocatedZone(dataVIO); + if (completion->result != VDO_SUCCESS) { + launchLogicalCallback(dataVIO, allocationFailure, THIS_LOCATION(NULL)); + return; + } + + releaseAllocationLock(allocatingVIO); + resetAllocation(allocatingVIO); + launchLogicalCallback(dataVIO, finishBlockMapAllocation, + THIS_LOCATION("$F;cb=finishBlockMapAllocation")); +} + +/** + * Set the reference count of a newly allocated block map page to + * MAXIMUM_REFERENCES now that we have made a recovery journal entry for it. + * MAXIMUM_REFERENCES is used to prevent deduplication against the block after + * we release the write lock on it, but before we write out the page. + * + * @param completion The DataVIO doing the allocation + **/ +static void setBlockMapPageReferenceCount(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInAllocatedZone(dataVIO); + if (completion->result != VDO_SUCCESS) { + launchLogicalCallback(dataVIO, allocationFailure, THIS_LOCATION(NULL)); + return; + } + + TreeLock *lock = &dataVIO->treeLock; + PhysicalBlockNumber pbn = lock->treeSlots[lock->height - 1].blockMapSlot.pbn; + completion->callback = releaseBlockMapWriteLock; + addSlabJournalEntry(getSlabJournal(getVDOFromDataVIO(dataVIO)->depot, pbn), + dataVIO); +} + +/** + * Make a recovery journal entry for a newly allocated block map page. + * This callback is registered in continueBlockMapPageAllocation(). + * + * @param completion The DataVIO doing the allocation + **/ +static void journalBlockMapAllocation(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (completion->result != VDO_SUCCESS) { + launchLogicalCallback(dataVIO, allocationFailure, THIS_LOCATION(NULL)); + return; + } + + setAllocatedZoneCallback(dataVIO, setBlockMapPageReferenceCount, + THIS_LOCATION(NULL)); + addRecoveryJournalEntry(getVDOFromDataVIO(dataVIO)->recoveryJournal, + dataVIO); +} + +/** + * Continue the process of allocating a block map page now that the + * BlockAllocator has given us a block. 
This method is supplied as the callback + * to allocateDataBlock() by allocateBlockMapPage(). + * + * @param allocatingVIO The DataVIO which is doing the allocation + **/ +static void continueBlockMapPageAllocation(AllocatingVIO *allocatingVIO) +{ + DataVIO *dataVIO = allocatingVIOAsDataVIO(allocatingVIO); + if (!hasAllocation(dataVIO)) { + setLogicalCallback(dataVIO, allocationFailure, THIS_LOCATION(NULL)); + continueDataVIO(dataVIO, VDO_NO_SPACE); + return; + } + + PhysicalBlockNumber pbn = allocatingVIO->allocation; + TreeLock *lock = &dataVIO->treeLock; + lock->treeSlots[lock->height - 1].blockMapSlot.pbn = pbn; + setUpReferenceOperationWithLock(BLOCK_MAP_INCREMENT, pbn, + MAPPING_STATE_UNCOMPRESSED, + allocatingVIO->allocationLock, + &dataVIO->operation); + launchJournalCallback(dataVIO, journalBlockMapAllocation, + THIS_LOCATION("$F;cb=journalBlockMapAllocation")); +} + +/** + * Allocate a block map page. + * + * @param zone The zone in which the DataVIO is operating + * @param dataVIO The DataVIO which needs to allocate a page + **/ +static void allocateBlockMapPage(BlockMapTreeZone *zone, DataVIO *dataVIO) +{ + if (!isWriteDataVIO(dataVIO) || isTrimDataVIO(dataVIO)) { + // This is a pure read, the read phase of a read-modify-write, or a trim, + // so there's nothing left to do here. + finishLookup(dataVIO, VDO_SUCCESS); + return; + } + + int result = attemptPageLock(zone, dataVIO); + if (result != VDO_SUCCESS) { + abortAllocation(dataVIO, result); + return; + } + + if (!dataVIO->treeLock.locked) { + return; + } + + allocateDataBlock(dataVIOAsAllocatingVIO(dataVIO), + getAllocationSelector(dataVIO->logical.zone), + VIO_BLOCK_MAP_WRITE_LOCK, + continueBlockMapPageAllocation); +} + +/**********************************************************************/ +void lookupBlockMapPBN(DataVIO *dataVIO) +{ + BlockMapTreeZone *zone = getBlockMapTreeZone(dataVIO); + zone->activeLookups++; + if (isDraining(&zone->mapZone->state)) { + finishLookup(dataVIO, VDO_SHUTTING_DOWN); + return; + } + + TreeLock *lock = &dataVIO->treeLock; + PageNumber pageIndex + = ((lock->treeSlots[0].pageIndex - zone->mapZone->blockMap->flatPageCount) + / zone->mapZone->blockMap->rootCount); + BlockMapTreeSlot treeSlot = { + .pageIndex = pageIndex / BLOCK_MAP_ENTRIES_PER_PAGE, + .blockMapSlot = { + .pbn = 0, + .slot = pageIndex % BLOCK_MAP_ENTRIES_PER_PAGE, + }, + }; + + BlockMapPage *page = NULL; + for (lock->height = 1; lock->height <= BLOCK_MAP_TREE_HEIGHT; + lock->height++) { + lock->treeSlots[lock->height] = treeSlot; + page = (BlockMapPage *) (getTreePage(zone, lock)->pageBuffer); + PhysicalBlockNumber pbn = getBlockMapPagePBN(page); + if (pbn != ZERO_BLOCK) { + lock->treeSlots[lock->height].blockMapSlot.pbn = pbn; + break; + } + + // Calculate the index and slot for the next level. + treeSlot.blockMapSlot.slot + = treeSlot.pageIndex % BLOCK_MAP_ENTRIES_PER_PAGE; + treeSlot.pageIndex + = treeSlot.pageIndex / BLOCK_MAP_ENTRIES_PER_PAGE; + } + + // The page at this height has been allocated and loaded. 
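+  // (Illustration with made-up numbers, not from the original source: if each
+  // page held 4 entries, a relative leaf page index of 37 would yield slot
+  // 37 % 4 = 1 at height 1, then slot 9 % 4 = 1 at height 2, then slot 2 at
+  // height 3; the loop above peels one base-BLOCK_MAP_ENTRIES_PER_PAGE digit
+  // per level in exactly this way.)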
+ DataLocation mapping + = unpackBlockMapEntry(&page->entries[treeSlot.blockMapSlot.slot]); + if (isInvalidTreeEntry(getVDOFromDataVIO(dataVIO), &mapping, lock->height)) { + logErrorWithStringError(VDO_BAD_MAPPING, + "Invalid block map tree PBN: %llu with " + "state %u for page index %u at height %u", + mapping.pbn, mapping.state, + lock->treeSlots[lock->height - 1].pageIndex, + lock->height - 1); + abortLoad(dataVIO, VDO_BAD_MAPPING); + return; + } + + if (!isMappedLocation(&mapping)) { + // The page we want one level down has not been allocated, so allocate it. + allocateBlockMapPage(zone, dataVIO); + return; + } + + lock->treeSlots[lock->height - 1].blockMapSlot.pbn = mapping.pbn; + if (lock->height == 1) { + // This is the ultimate block map page, so we're done + finishLookup(dataVIO, VDO_SUCCESS); + return; + } + + // We know what page we need to load. + loadBlockMapPage(zone, dataVIO); +} + +/**********************************************************************/ +PhysicalBlockNumber findBlockMapPagePBN(BlockMap *map, PageNumber pageNumber) +{ + if (pageNumber < map->flatPageCount) { + return (BLOCK_MAP_FLAT_PAGE_ORIGIN + pageNumber); + } + + RootCount rootIndex = pageNumber % map->rootCount; + PageNumber pageIndex = ((pageNumber - map->flatPageCount) / map->rootCount); + SlotNumber slot = pageIndex % BLOCK_MAP_ENTRIES_PER_PAGE; + pageIndex /= BLOCK_MAP_ENTRIES_PER_PAGE; + + TreePage *treePage + = getTreePageByIndex(map->forest, rootIndex, 1, pageIndex); + BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; + if (!isBlockMapPageInitialized(page)) { + return ZERO_BLOCK; + } + + DataLocation mapping = unpackBlockMapEntry(&page->entries[slot]); + if (!isValidLocation(&mapping) || isCompressed(mapping.state)) { + return ZERO_BLOCK; + } + return mapping.pbn; +} + +/**********************************************************************/ +void writeTreePage(TreePage *page, BlockMapTreeZone *zone) +{ + bool waiting = isWaiting(&page->waiter); + if (waiting && (zone->flusher == page)) { + return; + } + + setGeneration(zone, page, zone->generation, waiting); + if (waiting || page->writing) { + return; + } + + enqueuePage(page, zone); +} diff --git a/vdo/base/blockMapTree.h b/vdo/base/blockMapTree.h new file mode 100644 index 0000000..c581454 --- /dev/null +++ b/vdo/base/blockMapTree.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapTree.h#7 $ + */ + +#ifndef BLOCK_MAP_TREE_H +#define BLOCK_MAP_TREE_H + +#include "constants.h" +#include "types.h" + +typedef struct treePage TreePage; + +/** + * Intialize a BlockMapTreeZone. 
+ * + * @param zone The BlockMapZone of the tree zone to intialize + * @param layer The physical layer + * @param maximumAge The number of journal blocks before a dirtied page + * is considered old and may be written out + * + * @return VDO_SUCCESS or an error + **/ +int initializeTreeZone(BlockMapZone *zone, + PhysicalLayer *layer, + BlockCount maximumAge) + __attribute__((warn_unused_result)); + +/** + * Clean up a BlockMapTreeZone. + * + * @param treeZone The zone to clean up + **/ +void uninitializeBlockMapTreeZone(BlockMapTreeZone *treeZone); + +/** + * Set the initial dirty period for a tree zone. + * + * @param treeZone The tree zone + * @param period The initial dirty period to set + **/ +void setTreeZoneInitialPeriod(BlockMapTreeZone *treeZone, + SequenceNumber period); + +/** + * Check whether a tree zone is active (i.e. has any active lookups, + * outstanding I/O, or pending I/O). + * + * @param zone The zone to check + * + * @return true if the zone is active + **/ +bool isTreeZoneActive(BlockMapTreeZone *zone) + __attribute__((warn_unused_result)); + +/** + * Advance the dirty period for a tree zone. + * + * @param zone The BlockMapTreeZone to advance + * @param period The new dirty period + **/ +void advanceZoneTreePeriod(BlockMapTreeZone *zone, SequenceNumber period); + +/** + * Drain the zone trees, i.e. ensure that all I/O is quiesced. If required by + * the drain type, all dirty block map trees will be written to disk. This + * method must not be called when lookups are active. + * + * @param zone The BlockMapTreeZone to drain + **/ +void drainZoneTrees(BlockMapTreeZone *zone); + +/** + * Look up the PBN of the block map page for a DataVIO's LBN in the arboreal + * block map. If necessary, the block map page will be allocated. Also, the + * ancestors of the block map page will be allocated or loaded if necessary. + * + * @param dataVIO The DataVIO requesting the lookup + **/ +void lookupBlockMapPBN(DataVIO *dataVIO); + +/** + * Find the PBN of a leaf block map page. This method may only be used after + * all allocated tree pages have been loaded, otherwise, it may give the wrong + * answer (0). + * + * @param map The block map containing the forest + * @param pageNumber The page number of the desired block map page + * + * @return The PBN of the page + **/ +PhysicalBlockNumber findBlockMapPagePBN(BlockMap *map, PageNumber pageNumber); + +/** + * Write a tree page or indicate that it has been re-dirtied if it is already + * being written. This method is used when correcting errors in the tree during + * read-only rebuild. + * + * @param page The page to write + * @param zone The tree zone managing the page + **/ +void writeTreePage(TreePage *page, BlockMapTreeZone *zone); + +#endif // BLOCK_MAP_TREE_H diff --git a/vdo/base/blockMapTreeInternals.h b/vdo/base/blockMapTreeInternals.h new file mode 100644 index 0000000..49b69eb --- /dev/null +++ b/vdo/base/blockMapTreeInternals.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapTreeInternals.h#4 $ + */ + +#ifndef BLOCK_MAP_TREE_INTERNALS_H +#define BLOCK_MAP_TREE_INTERNALS_H + +#include "blockMapTree.h" + +#include "blockMapPage.h" +#include "types.h" + +/** A single page of a block map tree */ +struct treePage { + /** Waiter for a VIO to write out this page */ + Waiter waiter; + + /** Dirty list node */ + RingNode node; + + /** + * If this is a dirty tree page, the tree zone flush generation in which it + * was last dirtied. + */ + uint8_t generation; + + /** Whether this page is an interior tree page being written out. */ + bool writing; + + /** + * If this page is being written, the tree zone flush generation of the + * copy of the page being written. + **/ + uint8_t writingGeneration; + + /** The earliest journal block containing uncommitted updates to this page */ + SequenceNumber recoveryLock; + + /** The value of recoveryLock when the this page last started writing */ + SequenceNumber writingRecoveryLock; + + /** The buffer to hold the on-disk representation of this page */ + char pageBuffer[VDO_BLOCK_SIZE]; +}; + +typedef struct { + PageNumber levels[BLOCK_MAP_TREE_HEIGHT]; +} Boundary; + +/** + * An invalid PBN used to indicate that the page holding the location of a + * tree root has been "loaded". + **/ +extern const PhysicalBlockNumber INVALID_PBN; + +/** + * Extract the BlockMapPage from a TreePage. + * + * @param treePage The TreePage + * + * @return The BlockMapPage of the TreePage + **/ +__attribute__((warn_unused_result)) +static inline BlockMapPage *asBlockMapPage(TreePage *treePage) +{ + return (BlockMapPage *) treePage->pageBuffer; +} + +/** + * Replace the VIOPool in a tree zone. This method is used by unit tests. + * + * @param zone The zone whose pool is to be replaced + * @param layer The physical layer from which to make VIOs + * @param poolSize The size of the new pool + * + * @return VDO_SUCCESS or an error + **/ +int replaceTreeZoneVIOPool(BlockMapTreeZone *zone, + PhysicalLayer *layer, + size_t poolSize) + __attribute__((warn_unused_result)); + +/** + * Check whether a buffer contains a valid page. If the page is bad, log an + * error. If the page is valid, copy it to the supplied page. + * + * @param buffer The buffer to validate (and copy) + * @param nonce The VDO nonce + * @param pbn The absolute PBN of the page + * @param page The page to copy into if valid + * + * @return true if the page was copied (valid) + **/ +bool copyValidPage(char *buffer, + Nonce nonce, + PhysicalBlockNumber pbn, + BlockMapPage *page); + +#endif // BLOCK_MAP_TREE_INTERNALS_H diff --git a/vdo/base/blockMappingState.h b/vdo/base/blockMappingState.h new file mode 100644 index 0000000..ad2460a --- /dev/null +++ b/vdo/base/blockMappingState.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMappingState.h#1 $ + */ + +#ifndef BLOCK_MAPPING_STATE_H +#define BLOCK_MAPPING_STATE_H + +#include "common.h" + +/** + * Four bits of each five-byte block map entry contain a mapping state value + * used to distinguish unmapped or trimmed logical blocks (which are treated + * as mapped to the zero block) from entries that have been mapped to a + * physical block, including the zero block. + **/ +typedef enum { + MAPPING_STATE_UNMAPPED = 0, // Must be zero to be the default value + MAPPING_STATE_UNCOMPRESSED = 1, // A normal (uncompressed) block + MAPPING_STATE_COMPRESSED_BASE = 2, // Compressed in slot 0 + MAPPING_STATE_COMPRESSED_MAX = 15, // Compressed in slot 13 +} BlockMappingState; + +/** + * The total number of compressed blocks that can live in a physical block. + **/ +enum { + MAX_COMPRESSION_SLOTS = + MAPPING_STATE_COMPRESSED_MAX - MAPPING_STATE_COMPRESSED_BASE + 1, +}; + +/**********************************************************************/ +static inline BlockMappingState getStateForSlot(byte slotNumber) +{ + return (slotNumber + MAPPING_STATE_COMPRESSED_BASE); +} + +/**********************************************************************/ +static inline byte getSlotFromState(BlockMappingState mappingState) +{ + return (mappingState - MAPPING_STATE_COMPRESSED_BASE); +} + +/**********************************************************************/ +static inline bool isCompressed(const BlockMappingState mappingState) +{ + return (mappingState > MAPPING_STATE_UNCOMPRESSED); +} + +#endif // BLOCK_MAPPING_STATE_H diff --git a/vdo/base/completion.c b/vdo/base/completion.c new file mode 100644 index 0000000..d27fd72 --- /dev/null +++ b/vdo/base/completion.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/completion.c#10 $ + */ + +#include "completion.h" + +#include "logger.h" +#include "statusCodes.h" + +static const char *VDO_COMPLETION_TYPE_NAMES[] = { + // Keep UNSET_COMPLETION_TYPE at the top. + "UNSET_COMPLETION_TYPE", + + // Keep the rest of these in sorted order. If you add or remove an entry, + // be sure to update the corresponding list in completion.h. 
+ "ACTION_COMPLETION", + "ADMIN_COMPLETION", + "ASYNC_ACTION_CONTEXT", + "BLOCK_ALLOCATOR_COMPLETION", + "BLOCK_MAP_RECOVERY_COMPLETION", + "CHECK_IDENTIFIER_COMPLETION", + "EXTERNAL_COMPLETION", + "FLUSH_NOTIFICATION_COMPLETION", + "GENERATION_FLUSHED_COMPLETION", + "HEARTBEAT_COMPLETION", + "LOCK_COUNTER_COMPLETION", + "PARTITION_COPY_COMPLETION", + "READ_ONLY_MODE_COMPLETION", + "READ_ONLY_REBUILD_COMPLETION", + "RECOVERY_COMPLETION", + "REFERENCE_COUNT_REBUILD_COMPLETION", + "SLAB_SCRUBBER_COMPLETION", + "SUB_TASK_COMPLETION", + "TEST_COMPLETION", + "VDO_COMMAND_COMPLETION", + "VDO_COMMAND_SUB_COMPLETION", + "VDO_EXTENT_COMPLETION", + "VDO_PAGE_COMPLETION", + "VIO_COMPLETION", + "WRAPPING_COMPLETION", +}; + +/**********************************************************************/ +void initializeCompletion(VDOCompletion *completion, + VDOCompletionType type, + PhysicalLayer *layer) +{ + memset(completion, 0, sizeof(*completion)); + completion->layer = layer; + completion->type = type; + resetCompletion(completion); +} + +/**********************************************************************/ +int initializeEnqueueableCompletion(VDOCompletion *completion, + VDOCompletionType type, + PhysicalLayer *layer) +{ + initializeCompletion(completion, type, layer); + return ((layer->createEnqueueable == NULL) + ? VDO_SUCCESS : layer->createEnqueueable(completion)); +} + +/**********************************************************************/ +void resetCompletion(VDOCompletion *completion) +{ + completion->result = VDO_SUCCESS; + completion->complete = false; +} + +/** + * Assert that a completion is not complete. + * + * @param completion The completion to check + **/ +static inline void assertIncomplete(VDOCompletion *completion) +{ + ASSERT_LOG_ONLY(!completion->complete, "completion is not complete"); +} + +/**********************************************************************/ +void setCompletionResult(VDOCompletion *completion, int result) +{ + assertIncomplete(completion); + if (completion->result == VDO_SUCCESS) { + completion->result = result; + } +} + +/** + * Check whether a completion's callback must be enqueued, or if it can be run + * on the current thread. Side effect: clears the requeue flag if it is set, + * so the caller MUST requeue if this returns true. 
+ * + * @param completion The completion whose callback is to be invoked + * + * @return false if the callback must be run on this thread + * true if the callback must be enqueued + **/ +__attribute__((warn_unused_result)) +static inline bool requiresEnqueue(VDOCompletion *completion) +{ + if (completion->requeue) { + completion->requeue = false; + return true; + } + + ThreadID callbackThread = completion->callbackThreadID; + return (callbackThread != completion->layer->getCurrentThreadID()); +} + +/**********************************************************************/ +void invokeCallback(VDOCompletion *completion) +{ + if (requiresEnqueue(completion)) { + if (completion->enqueueable != NULL) { + completion->layer->enqueue(completion->enqueueable); + return; + } + ASSERT_LOG_ONLY(false, + "non-enqueueable completion (type %s) on correct thread", + getCompletionTypeName(completion->type)); + } + + runCallback(completion); +} + +/**********************************************************************/ +void continueCompletion(VDOCompletion *completion, int result) +{ + setCompletionResult(completion, result); + invokeCallback(completion); +} + +/**********************************************************************/ +void completeCompletion(VDOCompletion *completion) +{ + assertIncomplete(completion); + completion->complete = true; + if (completion->callback != NULL) { + invokeCallback(completion); + } +} + +/**********************************************************************/ +void releaseCompletion(VDOCompletion **completionPtr) +{ + VDOCompletion *completion = *completionPtr; + if (completion == NULL) { + return; + } + + *completionPtr = NULL; + completeCompletion(completion); +} + +/**********************************************************************/ +void releaseCompletionWithResult(VDOCompletion **completionPtr, int result) +{ + if (*completionPtr == NULL) { + return; + } + + setCompletionResult(*completionPtr, result); + releaseCompletion(completionPtr); +} + +/**********************************************************************/ +void finishParentCallback(VDOCompletion *completion) +{ + finishCompletion((VDOCompletion *) completion->parent, completion->result); +} + +/**********************************************************************/ +void preserveErrorAndContinue(VDOCompletion *completion) +{ + if (completion->parent != NULL) { + setCompletionResult(completion->parent, completion->result); + } + + resetCompletion(completion); + invokeCallback(completion); +} + +/**********************************************************************/ +const char *getCompletionTypeName(VDOCompletionType completionType) +{ + // Try to catch failures to update the array when the enum values change. 
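+  /*
+   * Illustrative note (not part of the original change): since
+   * UNSET_COMPLETION_TYPE is 0, the difference below is simply the number
+   * of enumerators preceding MAX_COMPLETION_TYPE, which must equal the
+   * number of strings in VDO_COMPLETION_TYPE_NAMES. Adding an enumerator
+   * in completion.h without adding the matching string above makes
+   * COUNT_OF() come up one short, so this STATIC_ASSERT fails at compile
+   * time.
+   */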
+ STATIC_ASSERT(COUNT_OF(VDO_COMPLETION_TYPE_NAMES) + == (MAX_COMPLETION_TYPE - UNSET_COMPLETION_TYPE)); + + if (completionType >= MAX_COMPLETION_TYPE) { + static char numeric[100]; + snprintf(numeric, 99, "%d (%#x)", completionType, completionType); + return numeric; + } + + return VDO_COMPLETION_TYPE_NAMES[completionType]; +} + +/**********************************************************************/ +void destroyEnqueueable(VDOCompletion *completion) +{ + if ((completion == NULL) || (completion->layer == NULL) + || (completion->layer->destroyEnqueueable == NULL)) { + return; + } + + completion->layer->destroyEnqueueable(&completion->enqueueable); +} + +/**********************************************************************/ +int assertCompletionType(VDOCompletionType actual, + VDOCompletionType expected) +{ + return ASSERT((expected == actual), + "completion type is %s instead of %s", + getCompletionTypeName(actual), + getCompletionTypeName(expected)); +} diff --git a/vdo/base/completion.h b/vdo/base/completion.h new file mode 100644 index 0000000..d245814 --- /dev/null +++ b/vdo/base/completion.h @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/completion.h#11 $ + */ + +#ifndef COMPLETION_H +#define COMPLETION_H + +#include "permassert.h" + +#include "physicalLayer.h" +#include "ringNode.h" +#include "types.h" + +typedef enum __attribute__((packed)) { + // Keep UNSET_COMPLETION_TYPE at the top. + UNSET_COMPLETION_TYPE = 0, + + // Keep the rest of these in sorted order. If you add or remove an entry, + // be sure to update the corresponding list in completion.c. + ACTION_COMPLETION, + ADMIN_COMPLETION, + ASYNC_ACTION_CONTEXT, + BLOCK_ALLOCATOR_COMPLETION, + BLOCK_MAP_RECOVERY_COMPLETION, + CHECK_IDENTIFIER_COMPLETION, + EXTERNAL_COMPLETION, + FLUSH_NOTIFICATION_COMPLETION, + GENERATION_FLUSHED_COMPLETION, + HEARTBEAT_COMPLETION, + LOCK_COUNTER_COMPLETION, + PARTITION_COPY_COMPLETION, + READ_ONLY_MODE_COMPLETION, + READ_ONLY_REBUILD_COMPLETION, + RECOVERY_COMPLETION, + REFERENCE_COUNT_REBUILD_COMPLETION, + SLAB_SCRUBBER_COMPLETION, + SUB_TASK_COMPLETION, + TEST_COMPLETION, // each unit test may define its own + VDO_COMMAND_COMPLETION, + VDO_COMMAND_SUB_COMPLETION, + VDO_EXTENT_COMPLETION, + VDO_PAGE_COMPLETION, + VIO_COMPLETION, + WRAPPING_COMPLETION, + + // Keep MAX_COMPLETION_TYPE at the bottom. + MAX_COMPLETION_TYPE +} VDOCompletionType; + +/** + * An asynchronous VDO operation. + * + * @param completion the completion of the operation + **/ +typedef void VDOAction(VDOCompletion *completion); + +struct vdoCompletion { + /** The type of completion this is */ + VDOCompletionType type; + + /** + * true once the processing of the operation is complete. 
+ * This flag should not be used by waiters external to the VDO base as + * it is used to gate calling the callback. + **/ + bool complete; + + /** + * If true, queue this completion on the next callback invocation, even if + * it is already running on the correct thread. + **/ + bool requeue; + + /** The ID of the thread which should run the next callback */ + ThreadID callbackThreadID; + + /** The result of the operation */ + int result; + + /** The physical layer on which this completion operates */ + PhysicalLayer *layer; + + /** The callback which will be called once the operation is complete */ + VDOAction *callback; + + /** The callback which, if set, will be called if an error result is set */ + VDOAction *errorHandler; + + /** The parent object, if any, that spawned this completion */ + void *parent; + + /** The enqueueable for this completion (may be NULL) */ + Enqueueable *enqueueable; +}; + +/** + * Actually run the callback. This function must be called from the correct + * callback thread. + **/ +static inline void runCallback(VDOCompletion *completion) +{ + if ((completion->result != VDO_SUCCESS) + && (completion->errorHandler != NULL)) { + completion->errorHandler(completion); + return; + } + + completion->callback(completion); +} + +/** + * Set the result of a completion. Older errors will not be masked. + * + * @param completion The completion whose result is to be set + * @param result The result to set + **/ +void setCompletionResult(VDOCompletion *completion, int result); + +/** + * Initialize a completion to a clean state, for reused completions. + * + * @param completion The completion to initialize + * @param type The type of the completion + * @param layer The physical layer of the completion + **/ +void initializeCompletion(VDOCompletion *completion, + VDOCompletionType type, + PhysicalLayer *layer); + +/** + * Initialize a completion to a clean state and make an enqueueable for it. + * + * @param completion The completion to initialize + * @param type The type of the completion + * @param layer The physical layer of the completion + * + * @return VDO_SUCCESS or an error + **/ +int initializeEnqueueableCompletion(VDOCompletion *completion, + VDOCompletionType type, + PhysicalLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Reset a completion to a clean state, while keeping + * the type, layer and parent information. + * + * @param completion the completion to reset + **/ +void resetCompletion(VDOCompletion *completion); + +/** + * Invoke the callback of a completion. If called on the correct thread (i.e. + * the one specified in the completion's callbackThreadID field), the + * completion will be run immediately. Otherwise, the completion will be + * enqueued on the correct callback thread. + **/ +void invokeCallback(VDOCompletion *completion); + +/** + * Continue processing a completion by setting the current result and calling + * invokeCallback(). + * + * @param completion The completion to continue + * @param result The current result (will not mask older errors) + **/ +void continueCompletion(VDOCompletion *completion, int result); + +/** + * Complete a completion. + * + * @param completion The completion to complete + **/ +void completeCompletion(VDOCompletion *completion); + +/** + * Finish a completion. 
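+ *
+ * Illustrative usage sketch (doSomething() is hypothetical, not part of
+ * the original code):
+ *
+ *   int result = doSomething();
+ *   if (result != VDO_SUCCESS) {
+ *     finishCompletion(completion, result);
+ *     return;
+ *   }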
+ * + * @param completion The completion to finish + * @param result The result of the completion (will not mask older errors) + **/ +static inline void finishCompletion(VDOCompletion *completion, int result) +{ + setCompletionResult(completion, result); + completeCompletion(completion); +} + +/** + * Complete a completion and NULL out the reference to it. + * + * @param completionPtr A pointer to the completion to release + **/ +void releaseCompletion(VDOCompletion **completionPtr); + +/** + * Finish a completion and NULL out the reference to it. + * + * @param completionPtr A pointer to the completion to release + * @param result The result of the completion + **/ +void releaseCompletionWithResult(VDOCompletion **completionPtr, int result); + +/** + * A callback to finish the parent of a completion. + * + * @param completion The completion which has finished and whose parent should + * be finished + **/ +void finishParentCallback(VDOCompletion *completion); + +/** + * Error handler which preserves an error in the parent (if there is one), + * and then resets the failing completion and calls its non-error callback. + * + * @param completion The completion which failed + **/ +void preserveErrorAndContinue(VDOCompletion *completion); + +/** + * A callback which does nothing. This callback is intended to be set as an + * error handler in the case where an error should do nothing. + * + * @param completion The completion being called back + **/ +static inline +void noopCallback(VDOCompletion *completion __attribute__((unused))) +{ +} + +/** + * Destroy the enqueueable associated with this completion. + * + * @param completion The completion + **/ +void destroyEnqueueable(VDOCompletion *completion); + +/** + * Assert that a completion is of the correct type + * + * @param actual The actual completion type + * @param expected The expected completion type + * + * @return VDO_SUCCESS or VDO_PARAMETER_MISMATCH + **/ +int assertCompletionType(VDOCompletionType actual, + VDOCompletionType expected); + +/** + * Return the name of a completion type. + * + * @param completionType the completion type + * + * @return a pointer to a static string; if the completionType is unknown + * this is to a static buffer that may be overwritten. + **/ +const char *getCompletionTypeName(VDOCompletionType completionType); + +/** + * Set the callback for a completion. + * + * @param completion The completion + * @param callback The callback to register + * @param threadID The ID of the thread on which the callback should run + **/ +static inline void setCallback(VDOCompletion *completion, + VDOAction *callback, + ThreadID threadID) +{ + completion->callback = callback; + completion->callbackThreadID = threadID; +} + +/** + * Set the callback for a completion and invoke it immediately. + * + * @param completion The completion + * @param callback The callback to register + * @param threadID The ID of the thread on which the callback should run + **/ +static inline void launchCallback(VDOCompletion *completion, + VDOAction *callback, + ThreadID threadID) +{ + setCallback(completion, callback, threadID); + invokeCallback(completion); +} + +/** + * Set the callback and parent for a completion. 
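+ *
+ * Illustrative note (not part of the original comment): this differs from
+ * setCallback() only in also recording the parent;
+ * launchCallbackWithParent() and prepareCompletion() below are built on
+ * top of it.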
+ * + * @param completion The completion + * @param callback The callback to register + * @param threadID The ID of the thread on which the callback should run + * @param parent The new parent of the completion + **/ +static inline void setCallbackWithParent(VDOCompletion *completion, + VDOAction *callback, + ThreadID threadID, + void *parent) +{ + setCallback(completion, callback, threadID); + completion->parent = parent; +} + +/** + * Set the callback and parent for a completion and invoke the callback + * immediately. + * + * @param completion The completion + * @param callback The callback to register + * @param threadID The ID of the thread on which the callback should run + * @param parent The new parent of the completion + **/ +static inline void launchCallbackWithParent(VDOCompletion *completion, + VDOAction *callback, + ThreadID threadID, + void *parent) +{ + setCallbackWithParent(completion, callback, threadID, parent); + invokeCallback(completion); +} + +/** + * Prepare a completion for launch. Reset it, and then set its callback, error + * handler, callback thread, and parent. + * + * @param completion The completion + * @param callback The callback to register + * @param errorHandler The error handler to register + * @param threadID The ID of the thread on which the callback should run + * @param parent The new parent of the completion + **/ +static inline void prepareCompletion(VDOCompletion *completion, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID, + void *parent) +{ + resetCompletion(completion); + setCallbackWithParent(completion, callback, threadID, parent); + completion->errorHandler = errorHandler; +} + +/** + * Prepare a completion for launch ensuring that it will always be requeued. + * Reset it, and then set its callback, error handler, callback thread, and + * parent. + * + * @param completion The completion + * @param callback The callback to register + * @param errorHandler The error handler to register + * @param threadID The ID of the thread on which the callback should run + * @param parent The new parent of the completion + **/ +static inline void prepareForRequeue(VDOCompletion *completion, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID, + void *parent) +{ + prepareCompletion(completion, callback, errorHandler, threadID, parent); + completion->requeue = true; +} + +/** + * Prepare a completion for launch which will complete its parent when + * finished. + * + * @param completion The completion + * @param parent The parent to complete + **/ +static inline void prepareToFinishParent(VDOCompletion *completion, + VDOCompletion *parent) +{ + prepareCompletion(completion, finishParentCallback, finishParentCallback, + parent->callbackThreadID, parent); +} + +#endif // COMPLETION_H diff --git a/vdo/base/compressedBlock.c b/vdo/base/compressedBlock.c new file mode 100644 index 0000000..d9f93e8 --- /dev/null +++ b/vdo/base/compressedBlock.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressedBlock.c#3 $ + */ + +#include "compressedBlock.h" + +#include "memoryAlloc.h" +#include "numeric.h" + +static const VersionNumber COMPRESSED_BLOCK_1_0 = { + .majorVersion = 1, + .minorVersion = 0, +}; + +/**********************************************************************/ +void resetCompressedBlockHeader(CompressedBlockHeader *header) +{ + STATIC_ASSERT(sizeof(header->fields) == sizeof(header->raw)); + + header->fields.version = packVersionNumber(COMPRESSED_BLOCK_1_0); + memset(header->fields.sizes, 0, sizeof(header->fields.sizes)); +} + +/**********************************************************************/ +static uint16_t +getCompressedFragmentSize(const CompressedBlockHeader *header, byte slot) +{ + return getUInt16LE(header->fields.sizes[slot]); +} + +/**********************************************************************/ +int getCompressedBlockFragment(BlockMappingState mappingState, + char *buffer, + BlockSize blockSize, + uint16_t *fragmentOffset, + uint16_t *fragmentSize) +{ + if (!isCompressed(mappingState)) { + return VDO_INVALID_FRAGMENT; + } + + CompressedBlockHeader *header = (CompressedBlockHeader *) buffer; + VersionNumber version = unpackVersionNumber(header->fields.version); + if (!areSameVersion(version, COMPRESSED_BLOCK_1_0)) { + return VDO_INVALID_FRAGMENT; + } + + byte slot = getSlotFromState(mappingState); + if (slot >= MAX_COMPRESSION_SLOTS) { + return VDO_INVALID_FRAGMENT; + } + + uint16_t compressedSize = getCompressedFragmentSize(header, slot); + uint16_t offset = sizeof(CompressedBlockHeader); + for (unsigned int i = 0; i < slot; i++) { + offset += getCompressedFragmentSize(header, i); + if (offset >= blockSize) { + return VDO_INVALID_FRAGMENT; + } + } + + if ((offset + compressedSize) > blockSize) { + return VDO_INVALID_FRAGMENT; + } + + *fragmentOffset = offset; + *fragmentSize = compressedSize; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void putCompressedBlockFragment(CompressedBlock *block, + unsigned int fragment, + uint16_t offset, + const char *data, + uint16_t size) +{ + storeUInt16LE(block->header.fields.sizes[fragment], size); + memcpy(&block->data[offset], data, size); +} diff --git a/vdo/base/compressedBlock.h b/vdo/base/compressedBlock.h new file mode 100644 index 0000000..603841f --- /dev/null +++ b/vdo/base/compressedBlock.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressedBlock.h#3 $ + */ + +#ifndef COMPRESSED_BLOCK_H +#define COMPRESSED_BLOCK_H + +#include "blockMappingState.h" +#include "header.h" + +/** + * The header of a compressed block. + **/ +typedef union __attribute__((packed)) { + struct __attribute__((packed)) { + /** Unsigned 32-bit major and minor versions, in little-endian byte order */ + PackedVersionNumber version; + + /** List of unsigned 16-bit compressed block sizes, in little-endian order */ + byte sizes[MAX_COMPRESSION_SLOTS][2]; + } fields; + + // A raw view of the packed encoding. + byte raw[4 + 4 + (2 * MAX_COMPRESSION_SLOTS)]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining compressed block headers in GDB. + struct __attribute__((packed)) { + VersionNumber version; + uint16_t sizes[MAX_COMPRESSION_SLOTS]; + } littleEndian; +#endif +} CompressedBlockHeader; + +/** + * The compressed block overlay. + **/ +typedef struct { + CompressedBlockHeader header; + char data[]; +} __attribute__((packed)) CompressedBlock; + +/** + * Initializes/resets a compressed block header. + * + * @param header the header + * + * When done, the version number is set to the current version, and all + * fragments are empty. + **/ +void resetCompressedBlockHeader(CompressedBlockHeader *header); + +/** + * Get a reference to a compressed fragment from a compression block. + * + * @param [in] mappingState the mapping state for the look up + * @param [in] buffer buffer that contains compressed data + * @param [in] blockSize size of a data block + * @param [out] fragmentOffset the offset of the fragment within a + * compressed block + * @param [out] fragmentSize the size of the fragment + * + * @return If a valid compressed fragment is found, VDO_SUCCESS; + * otherwise, VDO_INVALID_FRAGMENT if the fragment is invalid. + **/ +int getCompressedBlockFragment(BlockMappingState mappingState, + char *buffer, + BlockSize blockSize, + uint16_t *fragmentOffset, + uint16_t *fragmentSize); + +/** + * Copy a fragment into the compressed block. + * + * @param block the compressed block + * @param fragment the number of the fragment + * @param offset the byte offset of the fragment in the data area + * @param data a pointer to the compressed data + * @param size the size of the data + * + * @note no bounds checking -- the data better fit without smashing other stuff + **/ +void putCompressedBlockFragment(CompressedBlock *block, + unsigned int fragment, + uint16_t offset, + const char *data, + uint16_t size); + +#endif // COMPRESSED_BLOCK_H diff --git a/vdo/base/compressionState.c b/vdo/base/compressionState.c new file mode 100644 index 0000000..d773756 --- /dev/null +++ b/vdo/base/compressionState.c @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressionState.c#2 $ + */ + +#include "compressionStateInternals.h" + +#include "dataVIO.h" +#include "packer.h" + +static const uint32_t STATUS_MASK = 0xff; +static const uint32_t MAY_NOT_COMPRESS_MASK = 0x80000000; + +/**********************************************************************/ +VIOCompressionState getCompressionState(DataVIO *dataVIO) +{ + uint32_t packedValue = atomicLoad32(&dataVIO->compression.state); + return (VIOCompressionState) { + .status = packedValue & STATUS_MASK, + .mayNotCompress = ((packedValue & MAY_NOT_COMPRESS_MASK) != 0), + }; +} + +/** + * Convert a VIOCompressionState into a uint32_t which may be stored + * atomically. + * + * @param state The state to convert + * + * @return The compression state packed into a uint32_t + **/ +__attribute__((warn_unused_result)) +static uint32_t packState(VIOCompressionState state) +{ + return state.status | (state.mayNotCompress ? MAY_NOT_COMPRESS_MASK : 0); +} + +/**********************************************************************/ +bool setCompressionState(DataVIO *dataVIO, + VIOCompressionState state, + VIOCompressionState newState) +{ + return compareAndSwap32(&dataVIO->compression.state, packState(state), + packState(newState)); +} + +/** + * Advance to the next compression state along the compression path. + * + * @param dataVIO The DataVIO to advance + * + * @return The new compression status of the DataVIO + **/ +static VIOCompressionStatus advanceStatus(DataVIO *dataVIO) +{ + for (;;) { + VIOCompressionState state = getCompressionState(dataVIO); + if (state.status == VIO_POST_PACKER) { + // We're already in the last state. + return state.status; + } + + VIOCompressionState newState = state; + if (state.mayNotCompress) { + // Compression has been dis-allowed for this VIO, so skip the rest of the + // path and go to the end. + newState.status = VIO_POST_PACKER; + } else { + // Go to the next state. + newState.status++; + } + + if (setCompressionState(dataVIO, state, newState)) { + return newState.status; + } + + // Another thread changed the state out from under us so try again. + } +} + +/**********************************************************************/ +bool mayCompressDataVIO(DataVIO *dataVIO) +{ + if (!hasAllocation(dataVIO) + || ((getWritePolicy(getVDOFromDataVIO(dataVIO)) != WRITE_POLICY_SYNC) + && vioRequiresFlushAfter(dataVIOAsVIO(dataVIO))) + || !getVDOCompressing(getVDOFromDataVIO(dataVIO))) { + /* + * If this VIO didn't get an allocation, the compressed write probably + * won't either, so don't try compressing it. Also, if compression is off, + * don't compress. + */ + setCompressionDone(dataVIO); + return false; + } + + if (dataVIO->hashLock == NULL) { + // DataVIOs without a HashLock (which should be extremely rare) aren't + // able to share the packer's PBN lock, so don't try to compress them. 
+ return false; + } + + return (advanceStatus(dataVIO) == VIO_COMPRESSING); +} + +/**********************************************************************/ +bool mayPackDataVIO(DataVIO *dataVIO) +{ + if (!isSufficientlyCompressible(dataVIO) + || !getVDOCompressing(getVDOFromDataVIO(dataVIO)) + || getCompressionState(dataVIO).mayNotCompress) { + // If the data in this VIO doesn't compress, or compression is off, or + // compression for this VIO has been canceled, don't send it to the packer. + setCompressionDone(dataVIO); + return false; + } + + return true; +} + +/**********************************************************************/ +bool mayBlockInPacker(DataVIO *dataVIO) +{ + return (advanceStatus(dataVIO) == VIO_PACKING); +} + +/**********************************************************************/ +bool mayWriteCompressedDataVIO(DataVIO *dataVIO) +{ + advanceStatus(dataVIO); + return !getCompressionState(dataVIO).mayNotCompress; +} + +/**********************************************************************/ +void setCompressionDone(DataVIO *dataVIO) +{ + for (;;) { + VIOCompressionState state = getCompressionState(dataVIO); + if (state.status == VIO_POST_PACKER) { + // The VIO is already done. + return; + } + + // If compression was cancelled on this VIO, preserve that fact. + VIOCompressionState newState = { + .status = VIO_POST_PACKER, + .mayNotCompress = true, + }; + if (setCompressionState(dataVIO, state, newState)) { + return; + } + } +} + +/**********************************************************************/ +bool cancelCompression(DataVIO *dataVIO) +{ + VIOCompressionState state; + for (;;) { + state = getCompressionState(dataVIO); + if (state.mayNotCompress || (state.status == VIO_POST_PACKER)) { + // This DataVIO is already set up to not block in the packer. + break; + } + + VIOCompressionState newState = { + .status = state.status, + .mayNotCompress = true, + }; + if (setCompressionState(dataVIO, state, newState)) { + break; + } + } + + return ((state.status == VIO_PACKING) && !state.mayNotCompress); +} diff --git a/vdo/base/compressionState.h b/vdo/base/compressionState.h new file mode 100644 index 0000000..19a4143 --- /dev/null +++ b/vdo/base/compressionState.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressionState.h#2 $ + */ + +#ifndef COMPRESSION_STATE_H +#define COMPRESSION_STATE_H + +#include "atomic.h" +#include "types.h" + +/** + * Where a DataVIO is on the compression path; advanceStatus() depends on the + * order of this enum. 
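+ *
+ * Illustrative note (not part of the original header): the normal
+ * progression is
+ *
+ *   VIO_PRE_COMPRESSOR -> VIO_COMPRESSING -> VIO_PACKING -> VIO_POST_PACKER
+ *
+ * and advanceStatus() in compressionState.c moves a DataVIO one step at a
+ * time with a compare-and-swap, jumping directly to VIO_POST_PACKER once
+ * mayNotCompress has been set.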
+ **/ +typedef enum { + /* A VIO which has not yet entered the compression path */ + VIO_PRE_COMPRESSOR = 0, + /* A VIO which is in the compressor */ + VIO_COMPRESSING, + /* A VIO which is blocked in the packer */ + VIO_PACKING, + /* A VIO which is no longer on the compression path (and never will be) */ + VIO_POST_PACKER, +} VIOCompressionStatus; + +typedef struct { + VIOCompressionStatus status; + bool mayNotCompress; +} VIOCompressionState; + +/** + * Get the compression state of a DataVIO. + * + * @param dataVIO The DataVIO + * + * @return The compression state + **/ +__attribute__((warn_unused_result)) +VIOCompressionState getCompressionState(DataVIO *dataVIO); + +/** + * Check whether a DataVIO may go to the compressor. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO may be compressed at this time + **/ +bool mayCompressDataVIO(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Check whether a DataVIO may go to the packer. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO may be packed at this time + **/ +bool mayPackDataVIO(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Check whether a DataVIO which has gone to the packer may block there. Any + * cancelation after this point and before the DataVIO is written out requires + * this DataVIO to be picked up by the canceling DataVIO. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO may block in the packer + **/ +bool mayBlockInPacker(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Check whether the packer may write out a DataVIO as part of a compressed + * block. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO may be written as part of a + * compressed block at this time + **/ +bool mayWriteCompressedDataVIO(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Indicate that this DataVIO is leaving the compression path. + * + * @param dataVIO The DataVIO leaving the compression path + **/ +void setCompressionDone(DataVIO *dataVIO); + +/** + * Prevent this DataVIO from being compressed or packed. + * + * @param dataVIO The DataVIO to cancel + * + * @return true if the DataVIO is in the packer and the caller + * was the first caller to cancel it + **/ +bool cancelCompression(DataVIO *dataVIO); + +#endif /* COMPRESSION_STATE_H */ diff --git a/vdo/base/compressionStateInternals.h b/vdo/base/compressionStateInternals.h new file mode 100644 index 0000000..a9b8dec --- /dev/null +++ b/vdo/base/compressionStateInternals.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressionStateInternals.h#1 $ + */ + +#ifndef COMPRESSION_STATE_INTERNALS_H +#define COMPRESSION_STATE_INTERNALS_H + +#include "compressionState.h" + +/** + * Set the compression state of a DataVIO (exposed for testing). + * + * @param dataVIO The DataVIO whose compression state is to be set + * @param state The expected current state of the DataVIO + * @param newState The state to set + * + * @return true if the new state was set, false if the DataVIO's + * compression state did not match the expected state, and so was + * left unchanged + **/ +bool setCompressionState(DataVIO *dataVIO, + VIOCompressionState state, + VIOCompressionState newState); + +#endif /* COMPRESSION_STATE_H */ diff --git a/vdo/base/constants.c b/vdo/base/constants.c new file mode 100644 index 0000000..05d3a42 --- /dev/null +++ b/vdo/base/constants.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/constants.c#1 $ + */ + +#include "types.h" + +/** The maximum logical space is 4 petabytes, which is 1 terablock. */ +const BlockCount MAXIMUM_LOGICAL_BLOCKS = 1024ULL * 1024 * 1024 * 1024; + +/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */ +const BlockCount MAXIMUM_PHYSICAL_BLOCKS = 1024ULL * 1024 * 1024 * 64; + +// unit test minimum +const BlockCount MINIMUM_SLAB_JOURNAL_BLOCKS = 2; diff --git a/vdo/base/constants.h b/vdo/base/constants.h new file mode 100644 index 0000000..8b61c5f --- /dev/null +++ b/vdo/base/constants.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/constants.h#2 $ + */ + +#ifndef CONSTANTS_H +#define CONSTANTS_H + +#include "types.h" + +enum { + /** The number of entries on a block map page */ + BLOCK_MAP_ENTRIES_PER_PAGE = 812, + + /** The origin of the flat portion of the block map */ + BLOCK_MAP_FLAT_PAGE_ORIGIN = 1, + + /** + * The height of a block map tree. 
Assuming a root count of 60 and 812 + * entries per page, this is big enough to represent almost 95 PB of logical + * space. + **/ + BLOCK_MAP_TREE_HEIGHT = 5, + + /** The number of trees in the arboreal block map */ + DEFAULT_BLOCK_MAP_TREE_ROOT_COUNT = 60, + + /** The default size of the recovery journal, in blocks */ + DEFAULT_RECOVERY_JOURNAL_SIZE = 32 * 1024, + + /** The default size of each slab journal, in blocks */ + DEFAULT_SLAB_JOURNAL_SIZE = 224, + + /** + * The initial size of lbnOperations and pbnOperations, which is based + * upon the expected maximum number of outstanding VIOs. This value was + * chosen to make it highly unlikely that the maps would need to be resized. + **/ + LOCK_MAP_CAPACITY = 10000, + + /** The maximum number of logical zones */ + MAX_LOGICAL_ZONES = 60, + + /** The maximum number of physical zones */ + MAX_PHYSICAL_ZONES = 16, + + /** The base-2 logarithm of the maximum blocks in one slab */ + MAX_SLAB_BITS = 23, + + /** The maximum number of slabs the slab depot supports */ + MAX_SLABS = 8192, + + /** + * The maximum number of block map pages to load simultaneously during + * recovery or rebuild. + **/ + MAXIMUM_SIMULTANEOUS_BLOCK_MAP_RESTORATION_READS = 1024, + + /** The maximum number of VIOs in the system at once */ + MAXIMUM_USER_VIOS = 2048, + + /** + * The number of in-memory recovery journal blocks is determined by: + * -- 311 journal entries in a 4k block + * -- maximum of 2048 VIOs making entries at once + * so we need at least 2048 / 312 = 7 journal blocks. + **/ + RECOVERY_JOURNAL_TAIL_BUFFER_SIZE = 64, + + /** The number of sectors per block */ + SECTORS_PER_BLOCK = 8, + + /** The only physical block size supported by VDO */ + VDO_BLOCK_SIZE = 4096, + + /** The size of a sector that will not be torn */ + VDO_SECTOR_SIZE = 512, + + /** The physical block number reserved for storing the zero block */ + ZERO_BLOCK = 0, +}; + +/** The maximum logical space is 4 petabytes, which is 1 terablock. */ +extern const BlockCount MAXIMUM_LOGICAL_BLOCKS; + +/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */ + extern const BlockCount MAXIMUM_PHYSICAL_BLOCKS; + +// unit test minimum +extern const BlockCount MINIMUM_SLAB_JOURNAL_BLOCKS; + +#endif // CONSTANTS_H diff --git a/vdo/base/dataVIO.c b/vdo/base/dataVIO.c new file mode 100644 index 0000000..a9778f5 --- /dev/null +++ b/vdo/base/dataVIO.c @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dataVIO.c#7 $ + */ + +#include "dataVIO.h" + +#include "logger.h" + +#include "atomic.h" +#include "blockMap.h" +#include "compressionState.h" +#include "extent.h" +#include "logicalZone.h" +#include "threadConfig.h" +#include "vdoInternal.h" +#include "vioRead.h" +#include "vioWrite.h" + +static const char *ASYNC_OPERATION_NAMES[] = { + "launch", + "acknowledgeWrite", + "acquireHashLock", + "acquireLogicalBlockLock", + "acquirePBNReadLock", + "checkForDedupeForRollover", + "checkForDeduplication", + "compressData", + "continueVIOAsync", + "findBlockMapSlot", + "getMappedBlock", + "getMappedBlockForDedupe", + "getMappedBlockForWrite", + "hashData", + "journalDecrementForDedupe", + "journalDecrementForWrite", + "journalIncrementForCompression", + "journalIncrementForDedupe", + "journalIncrementForWrite", + "journalMappingForCompression", + "journalMappingForDedupe", + "journalMappingForWrite", + "journalUnmappingForDedupe", + "journalUnmappingForWrite", + "attemptPacking", + "putMappedBlock", + "putMappedBlockForDedupe", + "readData", + "updateIndex", + "verifyDeduplication", + "writeData", +}; + +/** + * Initialize the LBN lock of a DataVIO. In addition to recording the LBN on + * which the DataVIO will operate, it will also find the logical zone + * associated with the LBN. + * + * @param dataVIO The dataVIO to initialize + * @param lbn The lbn on which the dataVIO will operate + **/ +static void initializeLBNLock(DataVIO *dataVIO, LogicalBlockNumber lbn) +{ + LBNLock *lock = &dataVIO->logical; + lock->lbn = lbn; + lock->locked = false; + initializeWaitQueue(&lock->waiters); + + VDO *vdo = getVDOFromDataVIO(dataVIO); + lock->zone = getLogicalZone(vdo->logicalZones, computeLogicalZone(dataVIO)); +} + +/**********************************************************************/ +void prepareDataVIO(DataVIO *dataVIO, + LogicalBlockNumber lbn, + VIOOperation operation, + bool isTrim, + VDOAction *callback) +{ + // Clearing the tree lock must happen before initializing the LBN lock, + // which also adds information to the tree lock. + memset(&dataVIO->treeLock, 0, sizeof(dataVIO->treeLock)); + initializeLBNLock(dataVIO, lbn); + initializeRing(&dataVIO->hashLockNode); + initializeRing(&dataVIO->writeNode); + + resetAllocation(dataVIOAsAllocatingVIO(dataVIO)); + + dataVIO->isDuplicate = false; + + memset(&dataVIO->chunkName, 0, sizeof(dataVIO->chunkName)); + memset(&dataVIO->duplicate, 0, sizeof(dataVIO->duplicate)); + + VIO *vio = dataVIOAsVIO(dataVIO); + vio->operation = operation; + vio->callback = callback; + dataVIO->pageCompletion.completion.enqueueable + = vioAsCompletion(vio)->enqueueable; + + dataVIO->mapped.state = MAPPING_STATE_UNCOMPRESSED; + dataVIO->newMapped.state + = (isTrim ? 
MAPPING_STATE_UNMAPPED : MAPPING_STATE_UNCOMPRESSED); + resetCompletion(vioAsCompletion(vio)); + setLogicalCallback(dataVIO, attemptLogicalBlockLock, + THIS_LOCATION("$F;cb=acquireLogicalBlockLock")); +} + +/**********************************************************************/ +void completeDataVIO(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + if (completion->result != VDO_SUCCESS) { + VIO *vio = dataVIOAsVIO(dataVIO); + updateVIOErrorStats(vio, + "Completing %s VIO for LBN %" PRIu64 + " with error after %s", + getVIOReadWriteFlavor(vio), dataVIO->logical.lbn, + getOperationName(dataVIO)); + } + + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION("$F($io)")); + if (isReadDataVIO(dataVIO)) { + cleanupReadDataVIO(dataVIO); + } else { + cleanupWriteDataVIO(dataVIO); + } +} + +/**********************************************************************/ +void finishDataVIO(DataVIO *dataVIO, int result) +{ + VDOCompletion *completion = dataVIOAsCompletion(dataVIO); + setCompletionResult(completion, result); + completeDataVIO(completion); +} + +/**********************************************************************/ +const char *getOperationName(DataVIO *dataVIO) +{ + STATIC_ASSERT((MAX_ASYNC_OPERATION_NUMBER - MIN_ASYNC_OPERATION_NUMBER) + == COUNT_OF(ASYNC_OPERATION_NAMES)); + + return ((dataVIO->lastAsyncOperation < MAX_ASYNC_OPERATION_NUMBER) + ? ASYNC_OPERATION_NAMES[dataVIO->lastAsyncOperation] + : "unknown async operation"); +} + +/**********************************************************************/ +void receiveDedupeAdvice(DataVIO *dataVIO, const DataLocation *advice) +{ + /* + * NOTE: this is called on non-base-code threads. Be very careful to not do + * anything here that needs a base code thread-local variable, such as + * trying to get the current thread ID, or that does a lot of work. + */ + + VDO *vdo = getVDOFromDataVIO(dataVIO); + ZonedPBN duplicate = validateDedupeAdvice(vdo, advice, dataVIO->logical.lbn); + setDuplicateLocation(dataVIO, duplicate); +} + +/**********************************************************************/ +void setDuplicateLocation(DataVIO *dataVIO, const ZonedPBN source) +{ + dataVIO->isDuplicate = (source.pbn != ZERO_BLOCK); + dataVIO->duplicate = source; +} + +/**********************************************************************/ +void clearMappedLocation(DataVIO *dataVIO) +{ + dataVIO->mapped = (ZonedPBN) { .state = MAPPING_STATE_UNMAPPED }; +} + +/**********************************************************************/ +int setMappedLocation(DataVIO *dataVIO, + PhysicalBlockNumber pbn, + BlockMappingState state) +{ + PhysicalZone *zone; + int result = getPhysicalZone(getVDOFromDataVIO(dataVIO), pbn, &zone); + if (result != VDO_SUCCESS) { + return result; + } + + dataVIO->mapped = (ZonedPBN) { + .pbn = pbn, + .state = state, + .zone = zone, + }; + return VDO_SUCCESS; +} + +/** + * Launch a request which has acquired an LBN lock. 
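+ *
+ * Illustrative note (not part of the original comment): this is reached
+ * either directly from attemptLogicalBlockLock() below, when intMapPut()
+ * finds no existing lock holder for the LBN, or later from
+ * releaseLogicalBlockLock(), when the lock is transferred to the first
+ * waiter.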
+ * + * @param dataVIO The DataVIO which has just acquired a lock + **/ +static void launchLockedRequest(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + dataVIO->logical.locked = true; + + if (isWriteDataVIO(dataVIO)) { + launchWriteDataVIO(dataVIO); + } else { + launchReadDataVIO(dataVIO); + } +} + +/**********************************************************************/ +void attemptLogicalBlockLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + + if (dataVIO->logical.lbn + >= getVDOFromDataVIO(dataVIO)->config.logicalBlocks) { + finishDataVIO(dataVIO, VDO_OUT_OF_RANGE); + return; + } + + DataVIO *lockHolder; + LBNLock *lock = &dataVIO->logical; + int result = intMapPut(getLBNLockMap(lock->zone), lock->lbn, dataVIO, false, + (void **) &lockHolder); + if (result != VDO_SUCCESS) { + finishDataVIO(dataVIO, result); + return; + } + + if (lockHolder == NULL) { + // We got the lock + launchLockedRequest(dataVIO); + return; + } + + result = ASSERT(lockHolder->logical.locked, "logical block lock held"); + if (result != VDO_SUCCESS) { + finishDataVIO(dataVIO, result); + return; + } + + /* + * If the new request is a pure read request (not read-modify-write) and + * the lockHolder is writing and has received an allocation (VDO-2683), + * service the read request immediately by copying data from the lockHolder + * to avoid having to flush the write out of the packer just to prevent the + * read from waiting indefinitely. If the lockHolder does not yet have an + * allocation, prevent it from blocking in the packer and wait on it. + */ + if (isReadDataVIO(dataVIO) && atomicLoadBool(&lockHolder->hasAllocation)) { + dataVIOAsCompletion(dataVIO)->layer->copyData(lockHolder, dataVIO); + finishDataVIO(dataVIO, VDO_SUCCESS); + return; + } + + dataVIO->lastAsyncOperation = ACQUIRE_LOGICAL_BLOCK_LOCK; + result = enqueueDataVIO(&lockHolder->logical.waiters, dataVIO, + THIS_LOCATION("$F;cb=logicalBlockLock")); + if (result != VDO_SUCCESS) { + finishDataVIO(dataVIO, result); + return; + } + + // Prevent writes and read-modify-writes from blocking indefinitely on + // lock holders in the packer. + if (!isReadDataVIO(lockHolder) && cancelCompression(lockHolder)) { + dataVIO->compression.lockHolder = lockHolder; + launchPackerCallback(dataVIO, removeLockHolderFromPacker, + THIS_LOCATION("$F;cb=removeLockHolderFromPacker")); + } +} + +/** + * Release an uncontended LBN lock. + * + * @param dataVIO The DataVIO holding the lock + **/ +static void releaseLock(DataVIO *dataVIO) +{ + LBNLock *lock = &dataVIO->logical; + IntMap *lockMap = getLBNLockMap(lock->zone); + if (!lock->locked) { + // The lock is not locked, so it had better not be registered in the lock + // map. + DataVIO *lockHolder = intMapGet(lockMap, lock->lbn); + ASSERT_LOG_ONLY((dataVIO != lockHolder), + "no logical block lock held for block %llu", + lock->lbn); + return; + } + + // Remove the lock from the logical block lock map, releasing the lock. 
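+  // releaseLock() is reached from releaseLogicalBlockLock() only when the
+  // waiter queue is empty; locks with waiters are transferred directly to the
+  // next waiter there, so no waiter can be stranded by this removal.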
+ DataVIO *lockHolder = intMapRemove(lockMap, lock->lbn); + ASSERT_LOG_ONLY((dataVIO == lockHolder), + "logical block lock mismatch for block %llu", lock->lbn); + lock->locked = false; + return; +} + +/**********************************************************************/ +void releaseLogicalBlockLock(DataVIO *dataVIO) +{ + assertInLogicalZone(dataVIO); + if (!hasWaiters(&dataVIO->logical.waiters)) { + releaseLock(dataVIO); + return; + } + + LBNLock *lock = &dataVIO->logical; + ASSERT_LOG_ONLY(lock->locked, "LBNLock with waiters is not locked"); + + // Another DataVIO is waiting for the lock, so just transfer it in a single + // lock map operation + DataVIO *nextLockHolder = waiterAsDataVIO(dequeueNextWaiter(&lock->waiters)); + + // Transfer the remaining lock waiters to the next lock holder. + transferAllWaiters(&lock->waiters, &nextLockHolder->logical.waiters); + + DataVIO *lockHolder; + int result = intMapPut(getLBNLockMap(lock->zone), lock->lbn, nextLockHolder, + true, (void **) &lockHolder); + if (result != VDO_SUCCESS) { + finishDataVIO(nextLockHolder, result); + return; + } + + ASSERT_LOG_ONLY((lockHolder == dataVIO), + "logical block lock mismatch for block %llu", lock->lbn); + lock->locked = false; + + /* + * If there are still waiters, other DataVIOs must be trying to get the lock + * we just transferred. We must ensure that the new lock holder doesn't block + * in the packer. + */ + if (hasWaiters(&nextLockHolder->logical.waiters)) { + cancelCompression(nextLockHolder); + } + + // Avoid stack overflow on lock transfer. + // XXX: this is only an issue in the 1 thread config. + dataVIOAsCompletion(nextLockHolder)->requeue = true; + launchLockedRequest(nextLockHolder); +} diff --git a/vdo/base/dataVIO.h b/vdo/base/dataVIO.h new file mode 100644 index 0000000..ec6e9f6 --- /dev/null +++ b/vdo/base/dataVIO.h @@ -0,0 +1,945 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dataVIO.h#4 $ + */ + +#ifndef DATA_VIO_H +#define DATA_VIO_H + +#include "allocatingVIO.h" +#include "atomic.h" +#include "blockMapEntry.h" +#include "blockMappingState.h" +#include "constants.h" +#include "hashZone.h" +#include "journalPoint.h" +#include "logicalZone.h" +#include "referenceOperation.h" +#include "ringNode.h" +#include "threadConfig.h" +#include "trace.h" +#include "types.h" +#include "vdoPageCache.h" +#include "vio.h" +#include "waitQueue.h" + +/** + * Codes for describing the last asynchronous operation performed on a VIO. 
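+ *
+ * These values are used to index ASYNC_OPERATION_NAMES in dataVIO.c, so the
+ * two lists must be kept in the same order; getOperationName() statically
+ * asserts that they have the same number of entries.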
+ **/ +typedef enum __attribute__((packed)) { + MIN_ASYNC_OPERATION_NUMBER = 0, + LAUNCH = MIN_ASYNC_OPERATION_NUMBER, + ACKNOWLEDGE_WRITE, + ACQUIRE_HASH_LOCK, + ACQUIRE_LOGICAL_BLOCK_LOCK, + ACQUIRE_PBN_READ_LOCK, + CHECK_FOR_DEDUPE_FOR_ROLLOVER, + CHECK_FOR_DEDUPLICATION, + COMPRESS_DATA, + CONTINUE_VIO_ASYNC, + FIND_BLOCK_MAP_SLOT, + GET_MAPPED_BLOCK, + GET_MAPPED_BLOCK_FOR_DEDUPE, + GET_MAPPED_BLOCK_FOR_WRITE, + HASH_DATA, + JOURNAL_DECREMENT_FOR_DEDUPE, + JOURNAL_DECREMENT_FOR_WRITE, + JOURNAL_INCREMENT_FOR_COMPRESSION, + JOURNAL_INCREMENT_FOR_DEDUPE, + JOURNAL_INCREMENT_FOR_WRITE, + JOURNAL_MAPPING_FOR_COMPRESSION, + JOURNAL_MAPPING_FOR_DEDUPE, + JOURNAL_MAPPING_FOR_WRITE, + JOURNAL_UNMAPPING_FOR_DEDUPE, + JOURNAL_UNMAPPING_FOR_WRITE, + PACK_COMPRESSED_BLOCK, + PUT_MAPPED_BLOCK, + PUT_MAPPED_BLOCK_FOR_DEDUPE, + READ_DATA, + UPDATE_INDEX, + VERIFY_DEDUPLICATION, + WRITE_DATA, + MAX_ASYNC_OPERATION_NUMBER, +} AsyncOperationNumber; + +/* + * An LBN lock. + */ +struct lbnLock { + /* The LBN being locked */ + LogicalBlockNumber lbn; + /* Whether the lock is locked */ + bool locked; + /* The queue of waiters for the lock */ + WaitQueue waiters; + /* The logical zone of the LBN */ + LogicalZone *zone; +}; + +/* + * Fields for using the arboreal block map. + */ +typedef struct { + /* The current height at which this DataVIO is operating */ + Height height; + /* The block map tree for this LBN */ + RootCount rootIndex; + /* Whether we hold a page lock */ + bool locked; + /* The thread on which to run the callback */ + ThreadID threadID; + /* The function to call after looking up a block map slot */ + VDOAction *callback; + /* The key for the lock map */ + uint64_t key; + /* The queue of waiters for the page this VIO is allocating or loading */ + WaitQueue waiters; + /* The block map tree slots for this LBN */ + BlockMapTreeSlot treeSlots[BLOCK_MAP_TREE_HEIGHT + 1]; +} TreeLock; + +typedef struct { + /* + * The current compression state of this VIO. This field contains a value + * which consists of a VIOCompressionState possibly ORed with a flag + * indicating that a request has been made to cancel (or prevent) compression + * for this VIO. + * + * This field should be accessed through the getCompressionState() and + * setCompressionState() methods. It should not be accessed directly. + */ + Atomic32 state; + + /* The compressed size of this block */ + uint16_t size; + + /* The packer input or output bin slot which holds the enclosing DataVIO */ + SlotNumber slot; + + /* The packer input bin to which the enclosing DataVIO has been assigned */ + InputBin *bin; + + /* A pointer to the compressed form of this block */ + char *data; + + /* + * A VIO which is blocked in the packer while holding a lock this VIO needs. + */ + DataVIO *lockHolder; + +} CompressionState; + +/** + * A VIO for processing user data requests. 
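+ *
+ * The embedded AllocatingVIO must remain the first member of this structure;
+ * the conversion functions below (allocatingVIOAsDataVIO() and vioAsDataVIO())
+ * statically assert that its offset is zero and cast accordingly.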
+ **/ +struct dataVIO { + /* The underlying AllocatingVIO */ + AllocatingVIO allocatingVIO; + + /* The logical block of this request */ + LBNLock logical; + + /* The state for traversing the block map tree */ + TreeLock treeLock; + + /* The current partition address of this block */ + ZonedPBN mapped; + + /** The hash of this VIO (if not zero) */ + UdsChunkName chunkName; + + /* Used for logging and debugging */ + AsyncOperationNumber lastAsyncOperation; + + /* The operation to record in the recovery and slab journals */ + ReferenceOperation operation; + + /* Whether this VIO is a read-and-write VIO */ + bool isPartialWrite; + + /* Whether this VIO contains all zeros */ + bool isZeroBlock; + + /* Whether this VIO write is a duplicate */ + bool isDuplicate; + + /* + * Whether this VIO has received an allocation (needs to be atomic so it can + * be examined from threads not in the allocation zone). + */ + AtomicBool hasAllocation; + + /* The new partition address of this block after the VIO write completes */ + ZonedPBN newMapped; + + /* The hash zone responsible for the chunk name (NULL if isZeroBlock) */ + HashZone *hashZone; + + /* The lock this VIO holds or shares with other VIOs with the same data */ + HashLock *hashLock; + + /* All DataVIOs sharing a hash lock are kept in a ring linking these nodes */ + RingNode hashLockNode; + + /* The block number in the partition of the albireo deduplication advice */ + ZonedPBN duplicate; + + /* + * The sequence number of the recovery journal block containing the increment + * entry for this VIO. + */ + SequenceNumber recoverySequenceNumber; + + /* The point in the recovery journal where this write last made an entry */ + JournalPoint recoveryJournalPoint; + + /* The RingNode of VIOs in user initiated write requests */ + RingNode writeNode; + + /* A flag indicating that a data write VIO has a flush generation lock */ + bool hasFlushGenerationLock; + + /* The generation number of the VDO that this VIO belongs to */ + SequenceNumber flushGeneration; + + /* The completion to use for fetching block map pages for this vio */ + VDOPageCompletion pageCompletion; + + /* All of the fields necessary for the compression path */ + CompressionState compression; +}; + +/** + * Convert an AllocatingVIO to a DataVIO. + * + * @param allocatingVIO The AllocatingVIO to convert + * + * @return The AllocatingVIO as a DataVIO + **/ +static inline DataVIO *allocatingVIOAsDataVIO(AllocatingVIO *allocatingVIO) +{ + STATIC_ASSERT(offsetof(DataVIO, allocatingVIO) == 0); + ASSERT_LOG_ONLY((allocatingVIOAsVIO(allocatingVIO)->type == VIO_TYPE_DATA), + "AllocatingVIO is a DataVIO"); + return (DataVIO *) allocatingVIO; +} + +/** + * Convert a VIO to a DataVIO. + * + * @param vio The VIO to convert + * + * @return The VIO as a DataVIO + **/ +static inline DataVIO *vioAsDataVIO(VIO *vio) +{ + STATIC_ASSERT(offsetof(DataVIO, allocatingVIO) == 0); + STATIC_ASSERT(offsetof(AllocatingVIO, vio) == 0); + ASSERT_LOG_ONLY((vio->type == VIO_TYPE_DATA), "VIO is a DataVIO"); + return (DataVIO *) vio; +} + +/** + * Convert a DataVIO to an AllocatingVIO. + * + * @param dataVIO The DataVIO to convert + * + * @return The DataVIO as an AllocatingVIO + **/ +static inline AllocatingVIO *dataVIOAsAllocatingVIO(DataVIO *dataVIO) +{ + return &dataVIO->allocatingVIO; +} + +/** + * Convert a DataVIO to a VIO. 
+ * + * @param dataVIO The DataVIO to convert + * + * @return The DataVIO as a VIO + **/ +static inline VIO *dataVIOAsVIO(DataVIO *dataVIO) +{ + return allocatingVIOAsVIO(dataVIOAsAllocatingVIO(dataVIO)); +} + +/** + * Convert a generic VDOCompletion to a DataVIO. + * + * @param completion The completion to convert + * + * @return The completion as a DataVIO + **/ +static inline DataVIO *asDataVIO(VDOCompletion *completion) +{ + return vioAsDataVIO(asVIO(completion)); +} + +/** + * Convert a DataVIO to a generic completion. + * + * @param dataVIO The DataVIO to convert + * + * @return The DataVIO as a completion + **/ +static inline VDOCompletion *dataVIOAsCompletion(DataVIO *dataVIO) +{ + return allocatingVIOAsCompletion(dataVIOAsAllocatingVIO(dataVIO)); +} + +/** + * Convert a DataVIO to a generic wait queue entry. + * + * @param dataVIO The DataVIO to convert + * + * @return The DataVIO as a wait queue entry + **/ +static inline Waiter *dataVIOAsWaiter(DataVIO *dataVIO) +{ + return allocatingVIOAsWaiter(dataVIOAsAllocatingVIO(dataVIO)); +} + +/** + * Convert a DataVIO's generic wait queue entry back to the DataVIO. + * + * @param waiter The wait queue entry to convert + * + * @return The wait queue entry as a DataVIO + **/ +static inline DataVIO *waiterAsDataVIO(Waiter *waiter) +{ + if (waiter == NULL) { + return NULL; + } + + return allocatingVIOAsDataVIO(waiterAsAllocatingVIO(waiter)); +} + +/** + * Check whether a DataVIO is a read. + * + * @param dataVIO The DataVIO to check + **/ +static inline bool isReadDataVIO(DataVIO *dataVIO) +{ + return isReadVIO(dataVIOAsVIO(dataVIO)); +} + +/** + * Check whether a DataVIO is a write. + * + * @param dataVIO The DataVIO to check + **/ +static inline bool isWriteDataVIO(DataVIO *dataVIO) +{ + return isWriteVIO(dataVIOAsVIO(dataVIO)); +} + +/** + * Check whether a DataVIO is a compressed block write. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO is a compressed block write + **/ +static inline bool isCompressedWriteDataVIO(DataVIO *dataVIO) +{ + return isCompressedWriteVIO(dataVIOAsVIO(dataVIO)); +} + +/** + * Check whether a DataVIO is a trim. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO is a trim + **/ +static inline bool isTrimDataVIO(DataVIO *dataVIO) +{ + return (dataVIO->newMapped.state == MAPPING_STATE_UNMAPPED); +} + +/** + * Get the location that should passed Albireo as the new advice for where to + * find the data written by this DataVIO. + * + * @param dataVIO The write DataVIO that is ready to update Albireo + * + * @return a DataLocation containing the advice to store in Albireo + **/ +static inline DataLocation getDataVIONewAdvice(const DataVIO *dataVIO) +{ + return (DataLocation) { + .pbn = dataVIO->newMapped.pbn, + .state = dataVIO->newMapped.state, + }; +} + +/** + * Get the VDO from a DataVIO. + * + * @param dataVIO The DataVIO from which to get the VDO + * + * @return The VDO to which a DataVIO belongs + **/ +static inline VDO *getVDOFromDataVIO(DataVIO *dataVIO) +{ + return dataVIOAsVIO(dataVIO)->vdo; +} + +/** + * Get the ThreadConfig from a DataVIO. + * + * @param dataVIO The DataVIO from which to get the ThreadConfig + * + * @return The ThreadConfig of the VDO to which a DataVIO belongs + **/ +static inline const ThreadConfig *getThreadConfigFromDataVIO(DataVIO *dataVIO) +{ + return getThreadConfig(getVDOFromDataVIO(dataVIO)); +} + +/** + * Get the allocation of a DataVIO. 
+ * + * @param dataVIO The DataVIO + * + * @return The allocation of the DataVIO + **/ +static inline PhysicalBlockNumber getDataVIOAllocation(DataVIO *dataVIO) +{ + return dataVIOAsAllocatingVIO(dataVIO)->allocation; +} + +/** + * Check whether a DataVIO has an allocation. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO has an allocated block + **/ +static inline bool hasAllocation(DataVIO *dataVIO) +{ + return (getDataVIOAllocation(dataVIO) != ZERO_BLOCK); +} + +/** + * (Re)initialize a DataVIO to have a new logical block number, keeping the + * same parent and other state. This method must be called before using a + * DataVIO. + * + * @param dataVIO The DataVIO to initialize + * @param lbn The logical block number of the DataVIO + * @param operation The operation this DataVIO will perform + * @param isTrim true if this DataVIO is for a trim request + * @param callback The function to call once the VIO has completed its + * operation + **/ +void prepareDataVIO(DataVIO *dataVIO, + LogicalBlockNumber lbn, + VIOOperation operation, + bool isTrim, + VDOAction *callback); + +/** + * Complete the processing of a DataVIO. + * + * @param completion The completion of the VIO to complete + **/ +void completeDataVIO(VDOCompletion *completion); + +/** + * Finish processing a DataVIO, possibly due to an error. This function will + * set any error, and then initiate DataVIO clean up. + * + * @param dataVIO The DataVIO to abort + * @param result The result of processing the DataVIO + **/ +void finishDataVIO(DataVIO *dataVIO, int result); + +/** + * Continue processing a DataVIO that has been waiting for an event, setting + * the result from the event and calling the current callback. + * + * @param dataVIO The DataVIO to continue + * @param result The current result (will not mask older errors) + **/ +static inline void continueDataVIO(DataVIO *dataVIO, int result) +{ + continueCompletion(dataVIOAsCompletion(dataVIO), result); +} + +/** + * Get the name of the last asynchronous operation performed on a DataVIO. + * + * @param dataVIO The DataVIO in question + * + * @return The name of the last operation performed on the DataVIO + **/ +const char *getOperationName(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Add a trace record for the current source location. + * + * @param dataVIO The DataVIO structure to be updated + * @param location The source-location descriptor to be recorded + **/ +static inline void dataVIOAddTraceRecord(DataVIO *dataVIO, + TraceLocation location) +{ + vioAddTraceRecord(dataVIOAsVIO(dataVIO), location); +} + +/** + * Add a DataVIO to the tail end of a wait queue. The DataVIO must not already + * be waiting in a queue. A trace record is also generated for the DataVIO. + * + * @param queue The queue to which to add the waiter + * @param waiter The DataVIO to add to the queue + * @param location The source-location descriptor to be traced in the DataVIO + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static inline int enqueueDataVIO(WaitQueue *queue, + DataVIO *waiter, + TraceLocation location) +{ + dataVIOAddTraceRecord(waiter, location); + return enqueueWaiter(queue, dataVIOAsWaiter(waiter)); +} + +/** + * Check that a DataVIO is running on the correct thread for its hash zone. 
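+ *
+ * This is the first of a family of per-zone helpers below: each assertIn*()
+ * checks that the current callback thread is the thread which owns the
+ * relevant zone (hash, logical, allocated, duplicate, mapped, newMapped) or
+ * singleton thread (journal, packer), while the set*Callback() and
+ * launch*Callback() helpers register the next callback on that thread and
+ * record a trace location.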
+ * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInHashZone(DataVIO *dataVIO) +{ + ThreadID expected = getHashZoneThreadID(dataVIO->hashZone); + ThreadID threadID = getCallbackThreadID(); + // It's odd to use the LBN, but converting the chunk name to hex is a bit + // clunky for an inline, and the LBN better than nothing as an identifier. + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for logical block %" PRIu64 + " on thread %u, should be on hash zone thread %u", + dataVIO->logical.lbn, threadID, expected); +} + +/** + * Set a callback as a hash zone operation. This function presumes that the + * hashZone field of the DataVIO has already been set. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setHashZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getHashZoneThreadID(dataVIO->hashZone)); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a hash zone operation and invoke it immediately. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void launchHashZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setHashZoneCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check that a DataVIO is running on the correct thread for its logical zone. + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInLogicalZone(DataVIO *dataVIO) +{ + ThreadID expected = getLogicalZoneThreadID(dataVIO->logical.zone); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for logical block %" PRIu64 + " on thread %u, should be on thread %u", + dataVIO->logical.lbn, threadID, expected); +} + +/** + * Set a callback as a logical block operation. This function presumes that the + * logicalZone field of the DataVIO has already been set. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setLogicalCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getLogicalZoneThreadID(dataVIO->logical.zone)); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a logical block operation and invoke it immediately. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void launchLogicalCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setLogicalCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check that a DataVIO is running on the correct thread for its allocated + * zone. + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInAllocatedZone(DataVIO *dataVIO) +{ + assertInPhysicalZone(dataVIOAsAllocatingVIO(dataVIO)); +} + +/** + * Set a callback as a physical block operation in a DataVIO's allocated zone. 
+ * + * @param dataVIO The DataVIO + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setAllocatedZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setPhysicalZoneCallback(dataVIOAsAllocatingVIO(dataVIO), callback, + location); +} + +/** + * Set a callback as a physical block operation in a DataVIO's allocated zone + * and queue the DataVIO and invoke it immediately. + * + * @param dataVIO The DataVIO + * @param callback The callback to invoke + * @param location The tracing info for the call site + **/ +static inline void launchAllocatedZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + launchPhysicalZoneCallback(dataVIOAsAllocatingVIO(dataVIO), callback, + location); +} + +/** + * Check that a DataVIO is running on the correct thread for its duplicate + * zone. + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInDuplicateZone(DataVIO *dataVIO) +{ + ThreadID expected = getPhysicalZoneThreadID(dataVIO->duplicate.zone); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for duplicate physical block %" PRIu64 + " on thread %u, should be on thread %u", + dataVIO->duplicate.pbn, threadID, expected); +} + +/** + * Set a callback as a physical block operation in a DataVIO's duplicate zone. + * + * @param dataVIO The DataVIO + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setDuplicateZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getPhysicalZoneThreadID(dataVIO->duplicate.zone)); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a physical block operation in a DataVIO's duplicate zone + * and queue the DataVIO and invoke it immediately. + * + * @param dataVIO The DataVIO + * @param callback The callback to invoke + * @param location The tracing info for the call site + **/ +static inline void launchDuplicateZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setDuplicateZoneCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check that a DataVIO is running on the correct thread for its mapped zone. + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInMappedZone(DataVIO *dataVIO) +{ + ThreadID expected = getPhysicalZoneThreadID(dataVIO->mapped.zone); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for mapped physical block %" PRIu64 + " on thread %u, should be on thread %u", + dataVIO->mapped.pbn, threadID, expected); +} + +/** + * Set a callback as a physical block operation in a DataVIO's mapped zone. + * + * @param dataVIO The DataVIO + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setMappedZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getPhysicalZoneThreadID(dataVIO->mapped.zone)); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Check that a DataVIO is running on the correct thread for its newMapped + * zone. 
+ * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInNewMappedZone(DataVIO *dataVIO) +{ + ThreadID expected = getPhysicalZoneThreadID(dataVIO->newMapped.zone); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for newMapped physical block %" PRIu64 + " on thread %u, should be on thread %u", + dataVIO->newMapped.pbn, threadID, expected); +} + +/** + * Set a callback as a physical block operation in a DataVIO's newMapped zone. + * + * @param dataVIO The DataVIO + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setNewMappedZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getPhysicalZoneThreadID(dataVIO->newMapped.zone)); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a physical block operation in a DataVIO's newMapped zone + * and queue the DataVIO and invoke it immediately. + * + * @param dataVIO The DataVIO + * @param callback The callback to invoke + * @param location The tracing info for the call site + **/ +static inline void launchNewMappedZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setNewMappedZoneCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check that a DataVIO is running on the journal thread. + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInJournalZone(DataVIO *dataVIO) +{ + ThreadID expected + = getJournalZoneThread(getThreadConfigFromDataVIO(dataVIO)); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for logical block %" PRIu64 + " on thread %u, should be on journal thread %u", + dataVIO->logical.lbn, threadID, expected); +} + +/** + * Set a callback as a journal operation. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setJournalCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getJournalZoneThread(getThreadConfigFromDataVIO(dataVIO))); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a journal operation and invoke it immediately. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void launchJournalCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setJournalCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check that a DataVIO is running on the packer thread + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInPackerZone(DataVIO *dataVIO) +{ + ThreadID expected = getPackerZoneThread(getThreadConfigFromDataVIO(dataVIO)); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for logical block %" PRIu64 + " on thread %u, should be on packer thread %u", + dataVIO->logical.lbn, threadID, expected); +} + +/** + * Set a callback as a packer operation. 
+ * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setPackerCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getPackerZoneThread(getThreadConfigFromDataVIO(dataVIO))); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a packer operation and invoke it immediately. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void launchPackerCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setPackerCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check whether the advice received from Albireo is a valid data location, + * and if it is, accept it as the location of a potential duplicate of the + * DataVIO. + * + * @param dataVIO The DataVIO that queried Albireo + * @param advice A potential location of the data, or NULL for no advice + **/ +void receiveDedupeAdvice(DataVIO *dataVIO, const DataLocation *advice); + +/** + * Set the location of the duplicate block for a DataVIO, updating the + * isDuplicate and duplicate fields from a ZonedPBN. + * + * @param dataVIO The DataVIO to modify + * @param source The location of the duplicate + **/ +void setDuplicateLocation(DataVIO *dataVIO, const ZonedPBN source); + +/** + * Clear a DataVIO's mapped block location, setting it to be unmapped. This + * indicates the block map entry for the logical block is either unmapped or + * corrupted. + * + * @param dataVIO The DataVIO whose mapped block location is to be reset + **/ +void clearMappedLocation(DataVIO *dataVIO); + +/** + * Set a DataVIO's mapped field to the physical location recorded in the block + * map for the logical block in the VIO. + * + * @param dataVIO The DataVIO whose field is to be set + * @param pbn The physical block number to set + * @param state The mapping state to set + * + * @return VDO_SUCCESS or an error code if the mapping is unusable + **/ +int setMappedLocation(DataVIO *dataVIO, + PhysicalBlockNumber pbn, + BlockMappingState state) + __attribute__((warn_unused_result)); + +/** + * Attempt to acquire the lock on a logical block. This is the start of the + * path for all external requests. It is registered in prepareDataVIO(). + * + * @param completion The DataVIO for an external data request as a completion + **/ +void attemptLogicalBlockLock(VDOCompletion *completion); + +/** + * Release the lock on the logical block, if any, that a DataVIO has acquired. + * + * @param dataVIO The DataVIO releasing its logical block lock + **/ +void releaseLogicalBlockLock(DataVIO *dataVIO); + +#endif // DATA_VIO_H diff --git a/vdo/base/dirtyLists.c b/vdo/base/dirtyLists.c new file mode 100644 index 0000000..d16b790 --- /dev/null +++ b/vdo/base/dirtyLists.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dirtyLists.c#1 $ + */ + +#include "dirtyLists.h" +#include "dirtyListsInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "types.h" + +struct dirtyLists { + /** The number of periods after which an element will be expired */ + BlockCount maximumAge; + /** The oldest period which has unexpired elements */ + SequenceNumber oldestPeriod; + /** One more than the current period */ + SequenceNumber nextPeriod; + /** The function to call on expired elements */ + DirtyCallback *callback; + /** The callback context */ + void *context; + /** The offset in the array of lists of the oldest period */ + BlockCount offset; + /** The list of elements which are being expired */ + RingNode expired; + /** The lists of dirty elements */ + RingNode lists[]; +}; + +/**********************************************************************/ +int makeDirtyLists(BlockCount maximumAge, + DirtyCallback *callback, + void *context, + DirtyLists **dirtyListsPtr) +{ + DirtyLists *dirtyLists; + int result = ALLOCATE_EXTENDED(DirtyLists, maximumAge, RingNode, __func__, + &dirtyLists); + if (result != VDO_SUCCESS) { + return result; + } + + dirtyLists->maximumAge = maximumAge; + dirtyLists->callback = callback; + dirtyLists->context = context; + + initializeRing(&dirtyLists->expired); + for (BlockCount i = 0; i < maximumAge; i++) { + initializeRing(&dirtyLists->lists[i]); + } + + *dirtyListsPtr = dirtyLists; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeDirtyLists(DirtyLists **dirtyListsPtr) +{ + DirtyLists *lists = *dirtyListsPtr; + if (lists == NULL) { + return; + } + + FREE(lists); + *dirtyListsPtr = NULL; +} + +/**********************************************************************/ +void setCurrentPeriod(DirtyLists *dirtyLists, SequenceNumber period) +{ + ASSERT_LOG_ONLY(dirtyLists->nextPeriod == 0, "current period not set"); + dirtyLists->oldestPeriod = period; + dirtyLists->nextPeriod = period + 1; + dirtyLists->offset = period % dirtyLists->maximumAge; +} + +/** + * Expire the oldest list. + * + * @param dirtyLists The DirtyLists to expire + **/ +static void expireOldestList(DirtyLists *dirtyLists) +{ + dirtyLists->oldestPeriod++; + RingNode *ring = &(dirtyLists->lists[dirtyLists->offset++]); + if (!isRingEmpty(ring)) { + spliceRingChainBefore(ring->next, ring->prev, &dirtyLists->expired); + } + + if (dirtyLists->offset == dirtyLists->maximumAge) { + dirtyLists->offset = 0; + } +} + +/** + * Update the period if necessary. + * + * @param dirtyLists The DirtyLists + * @param period The new period + **/ +static void updatePeriod(DirtyLists *dirtyLists, SequenceNumber period) +{ + while (dirtyLists->nextPeriod <= period) { + if ((dirtyLists->nextPeriod - dirtyLists->oldestPeriod) + == dirtyLists->maximumAge) { + expireOldestList(dirtyLists); + } + dirtyLists->nextPeriod++; + } +} + +/** + * Write out the expired list. 
+ *
+ * @param dirtyLists The dirtyLists
+ **/
+static void writeExpiredElements(DirtyLists *dirtyLists)
+{
+  if (isRingEmpty(&dirtyLists->expired)) {
+    return;
+  }
+
+  dirtyLists->callback(&dirtyLists->expired, dirtyLists->context);
+  ASSERT_LOG_ONLY(isRingEmpty(&dirtyLists->expired),
+                  "no expired elements remain");
+}
+
+/**********************************************************************/
+void addToDirtyLists(DirtyLists *dirtyLists,
+                     RingNode *node,
+                     SequenceNumber oldPeriod,
+                     SequenceNumber newPeriod)
+{
+  if ((oldPeriod == newPeriod)
+      || ((oldPeriod != 0) && (oldPeriod < newPeriod))) {
+    return;
+  }
+
+  if (newPeriod < dirtyLists->oldestPeriod) {
+    pushRingNode(&dirtyLists->expired, node);
+  } else {
+    updatePeriod(dirtyLists, newPeriod);
+    pushRingNode(&dirtyLists->lists[newPeriod % dirtyLists->maximumAge], node);
+  }
+
+  writeExpiredElements(dirtyLists);
+}
+
+/**********************************************************************/
+void advancePeriod(DirtyLists *dirtyLists, SequenceNumber period)
+{
+  updatePeriod(dirtyLists, period);
+  writeExpiredElements(dirtyLists);
+}
+
+/**********************************************************************/
+void flushDirtyLists(DirtyLists *dirtyLists)
+{
+  while (dirtyLists->oldestPeriod < dirtyLists->nextPeriod) {
+    expireOldestList(dirtyLists);
+  }
+  writeExpiredElements(dirtyLists);
+}
+
+/**********************************************************************/
+SequenceNumber getDirtyListsNextPeriod(DirtyLists *dirtyLists)
+{
+  return dirtyLists->nextPeriod;
+}
diff --git a/vdo/base/dirtyLists.h b/vdo/base/dirtyLists.h
new file mode 100644
index 0000000..f3d27f7
--- /dev/null
+++ b/vdo/base/dirtyLists.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dirtyLists.h#1 $
+ */
+
+#ifndef DIRTY_LISTS_H
+#define DIRTY_LISTS_H
+
+#include "ringNode.h"
+#include "types.h"
+
+/**
+ * A collection of lists of dirty elements ordered by age. An element is always
+ * placed on the oldest list in which it was dirtied (moving between lists or
+ * removing altogether is cheap). Whenever the current period is advanced, any
+ * elements older than the maximum age are expired. If an element is to be
+ * added with a dirty age older than the maximum age, it is expired
+ * immediately.
+ **/
+typedef struct dirtyLists DirtyLists;
+
+/**
+ * A function which will be called with a ring of dirty elements which have
+ * been expired. All of the expired elements must be removed from the ring
+ * before this function returns.
+ *
+ * @param expired The list of expired elements
+ * @param context The context for the callback
+ **/
+typedef void DirtyCallback(RingNode *expired, void *context);
+
+/**
+ * Construct a new set of dirty lists.
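+ *
+ * A rough usage sketch of this interface (writeElements, journal, element,
+ * and firstPeriod are hypothetical names, not taken from a real caller):
+ *
+ *   DirtyLists *lists;
+ *   int result = makeDirtyLists(maximumAge, writeElements, journal, &lists);
+ *   if (result == VDO_SUCCESS) {
+ *     setCurrentPeriod(lists, firstPeriod);
+ *     addToDirtyLists(lists, &element->node, 0, firstPeriod);
+ *     advancePeriod(lists, firstPeriod + maximumAge); // expires the element
+ *     flushDirtyLists(lists);
+ *     freeDirtyLists(&lists);
+ *   }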
+ *
+ * @param [in]  maximumAge    The age at which an element will be expired
+ * @param [in]  callback      The function to call when a set of elements have
+ *                            expired
+ * @param [in]  context       The context for the callback
+ * @param [out] dirtyListsPtr A pointer to hold the new DirtyLists
+ *
+ * @return VDO_SUCCESS or an error
+ **/
+int makeDirtyLists(BlockCount maximumAge,
+                   DirtyCallback *callback,
+                   void *context,
+                   DirtyLists **dirtyListsPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Free a set of dirty lists and null out the pointer to them.
+ *
+ * @param dirtyListsPtr A pointer to the dirty lists to be freed
+ **/
+void freeDirtyLists(DirtyLists **dirtyListsPtr);
+
+/**
+ * Set the current period. This function should only be called once.
+ *
+ * @param dirtyLists The dirtyLists
+ * @param period     The current period
+ **/
+void setCurrentPeriod(DirtyLists *dirtyLists, SequenceNumber period);
+
+/**
+ * Add an element to the dirty lists.
+ *
+ * @param dirtyLists The DirtyLists receiving the element
+ * @param node       The RingNode of the element to add
+ * @param oldPeriod  The period in which the element was previously dirtied,
+ *                   or 0 if it was not dirty
+ * @param newPeriod  The period in which the element has now been dirtied,
+ *                   or 0 if it does not hold a lock
+ **/
+void addToDirtyLists(DirtyLists *dirtyLists,
+                     RingNode *node,
+                     SequenceNumber oldPeriod,
+                     SequenceNumber newPeriod);
+
+/**
+ * Advance the current period. Any lists which are now older than the maximum
+ * age will be expired.
+ *
+ * @param dirtyLists The DirtyLists to advance
+ * @param period     The new current period
+ **/
+void advancePeriod(DirtyLists *dirtyLists, SequenceNumber period);
+
+/**
+ * Flush all dirty lists. This will cause the oldest period to be advanced
+ * past the current period, expiring every remaining element.
+ *
+ * @param dirtyLists The dirtyLists to flush
+ **/
+void flushDirtyLists(DirtyLists *dirtyLists);
+
+#endif // DIRTY_LISTS_H
diff --git a/vdo/base/dirtyListsInternals.h b/vdo/base/dirtyListsInternals.h
new file mode 100644
index 0000000..d5876d0
--- /dev/null
+++ b/vdo/base/dirtyListsInternals.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dirtyListsInternals.h#1 $
+ */
+
+#ifndef DIRTY_LISTS_INTERNALS_H
+#define DIRTY_LISTS_INTERNALS_H
+
+#include "dirtyLists.h"
+#include "types.h"
+
+/**
+ * Get the next period from a DirtyLists. This method is used by unit tests.
+ * + * @param dirtyLists The DirtyLists to examine + **/ +SequenceNumber getDirtyListsNextPeriod(DirtyLists *dirtyLists) + __attribute__((warn_unused_result)); + +#endif // DIRTY_LISTS_INTERNALS_H diff --git a/vdo/base/extent.c b/vdo/base/extent.c new file mode 100644 index 0000000..5983615 --- /dev/null +++ b/vdo/base/extent.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/extent.c#3 $ + */ + +#include "extent.h" + +#include "memoryAlloc.h" + +#include "completion.h" +#include "constants.h" +#include "logger.h" +#include "physicalLayer.h" +#include "types.h" +#include "vdo.h" +#include "vioRead.h" +#include "vioWrite.h" + +/**********************************************************************/ +int createExtent(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + BlockCount blockCount, + char *data, + VDOExtent **extentPtr) +{ + int result = ASSERT(isMetadataVIOType(vioType), + "createExtent() called for metadata"); + if (result != VDO_SUCCESS) { + return result; + } + + VDOExtent *extent; + result = ALLOCATE_EXTENDED(VDOExtent, blockCount, VIO *, __func__, &extent); + if (result != VDO_SUCCESS) { + return result; + } + + result = initializeEnqueueableCompletion(&extent->completion, + VDO_EXTENT_COMPLETION, layer); + if (result != VDO_SUCCESS) { + FREE(extent); + return result; + } + + for (; extent->count < blockCount; extent->count++) { + result = layer->createMetadataVIO(layer, vioType, priority, extent, data, + &extent->vios[extent->count]); + if (result != VDO_SUCCESS) { + freeExtent(&extent); + return result; + } + + data += VDO_BLOCK_SIZE; + } + + *extentPtr = extent; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeExtent(VDOExtent **extentPtr) +{ + VDOExtent *extent = *extentPtr; + if (extent == NULL) { + return; + } + + for (BlockCount i = 0; i < extent->count; i++) { + freeVIO(&extent->vios[i]); + } + + destroyEnqueueable(&extent->completion); + FREE(extent); + *extentPtr = NULL; +} + +/** + * Launch a metadata extent. 
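+ *
+ * This is the common path behind readMetadataExtent(), writeMetadataExtent(),
+ * and their partial variants. A rough sketch of that public interface (layer,
+ * vioType, priority, blockCount, data, and startBlock are assumed to be in
+ * scope; waiting on the completion is elided):
+ *
+ *   VDOExtent *extent;
+ *   int result = createExtent(layer, vioType, priority, blockCount, data,
+ *                             &extent);
+ *   if (result == VDO_SUCCESS) {
+ *     readMetadataExtent(extent, startBlock);
+ *     // ... wait for extent->completion to finish, then ...
+ *     freeExtent(&extent);
+ *   }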
+ * + * @param extent The extent + * @param startBlock The absolute physical block at which the extent should + * begin its I/O + * @param count The number of blocks to write + * @param operation The operation to perform on the extent + **/ +static void launchMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock, + BlockCount count, + VIOOperation operation) +{ + resetCompletion(&extent->completion); + if (count > extent->count) { + finishCompletion(&extent->completion, VDO_OUT_OF_RANGE); + return; + } + + extent->completeCount = extent->count - count; + for (BlockCount i = 0; i < count; i++) { + VIO *vio = extent->vios[i]; + vio->completion.callbackThreadID = extent->completion.callbackThreadID; + launchMetadataVIO(vio, startBlock++, handleVIOCompletion, + handleVIOCompletion, operation); + } +} + +/**********************************************************************/ +void readPartialMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock, + BlockCount count) +{ + launchMetadataExtent(extent, startBlock, count, VIO_READ); +} + +/**********************************************************************/ +void writePartialMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock, + BlockCount count) +{ + launchMetadataExtent(extent, startBlock, count, VIO_WRITE); +} + +/**********************************************************************/ +void handleVIOCompletion(VDOCompletion *completion) +{ + VDOExtent *extent = asVDOExtent(completion->parent); + if (++extent->completeCount != extent->count) { + setCompletionResult(extentAsCompletion(extent), completion->result); + return; + } + + finishCompletion(extentAsCompletion(extent), completion->result); +} diff --git a/vdo/base/extent.h b/vdo/base/extent.h new file mode 100644 index 0000000..b023c06 --- /dev/null +++ b/vdo/base/extent.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/extent.h#2 $ + */ + +#ifndef EXTENT_H +#define EXTENT_H + +#include "permassert.h" + +#include "completion.h" +#include "types.h" +#include "vio.h" + +/** + * A chain of VIOs which are part of the same request. An extent contains + * a chain of at least 'count' VIOs. The 'next' pointer of the last VIO + * in the extent (as indicated by the count) may not be NULL, but it is not + * part of the extent. A VIO may belong to a single extent. + **/ +struct vdoExtent { + // The completion for asynchronous extent processing + VDOCompletion completion; + // The number of VIOs in the extent + BlockCount count; + // The number of completed VIOs in the extent + BlockCount completeCount; + // The VIOs in the extent + VIO *vios[]; +}; + +/** + * Convert a generic VDOCompletion to a VDOExtent. 
+ * + * @param completion The completion to convert + * + * @return The completion as an extent + **/ +static inline VDOExtent *asVDOExtent(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(VDOExtent, completion) == 0); + assertCompletionType(completion->type, VDO_EXTENT_COMPLETION); + return (VDOExtent *) completion; +} + +/** + * Convert a VDOExtent to VDOCompletion. + * + * @param extent The extent to convert + * + * @return The extent as a VDOCompletion + **/ +static inline VDOCompletion *extentAsCompletion(VDOExtent *extent) +{ + return &extent->completion; +} + +/** + * Create a VDOExtent. + * + * @param [in] layer The layer + * @param [in] vioType The usage type to assign to the VIOs in the extent + * (data / block map / journal) + * @param [in] priority The relative priority to assign to the VIOs + * @param [in] blockCount The number of blocks in the buffer + * @param [in] data The buffer + * @param [out] extentPtr A pointer to hold the new extent + * + * @return VDO_SUCCESS or an error + **/ +int createExtent(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + BlockCount blockCount, + char *data, + VDOExtent **extentPtr) + __attribute__((warn_unused_result)); + +/** + * Free an extent and null out the reference to it. + * + * @param [in,out] extentPtr The reference to the extent to free + **/ +void freeExtent(VDOExtent **extentPtr); + +/** + * Read metadata from the underlying storage. + * + * @param extent The extent to read + * @param startBlock The physical block number of the first block + * in the extent + * @param count The number of blocks to read (must be less than or + * equal to the length of the extent) + **/ +void readPartialMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock, + BlockCount count); + +/** + * Read metadata from the underlying storage. + * + * @param extent The extent to read + * @param startBlock The physical block number of the first block + * in the extent + **/ +static inline void readMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock) +{ + readPartialMetadataExtent(extent, startBlock, extent->count); +} + +/** + * Write metadata to the underlying storage. + * + * @param extent The extent to write + * @param startBlock The physical block number of the first block in the + * extent + * @param count The number of blocks to read (must be less than or + * equal to the length of the extent) + **/ +void writePartialMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock, + BlockCount count); +/** + * Write metadata to the underlying storage. + * + * @param extent The extent to write + * @param startBlock The physical block number of the first block in the + * extent + **/ +static inline void writeMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock) +{ + writePartialMetadataExtent(extent, startBlock, extent->count); +} + +/** + * Notify an extent that one of its VIOs has completed. If the signaling VIO + * is the last of the extent's VIOs to complete, the extent will finish. This + * function is set as the VIO callback in completeVIO(). + * + * @param completion The completion of the VIO which has just finished + **/ +void handleVIOCompletion(VDOCompletion *completion); + +#endif /* EXTENT_H */ diff --git a/vdo/base/fixedLayout.c b/vdo/base/fixedLayout.c new file mode 100644 index 0000000..4ea048a --- /dev/null +++ b/vdo/base/fixedLayout.c @@ -0,0 +1,534 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/fixedLayout.c#3 $ + */ + +#include "fixedLayout.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" + +#include "header.h" +#include "statusCodes.h" + +const BlockCount ALL_FREE_BLOCKS = (uint64_t) -1; + +struct fixedLayout { + PhysicalBlockNumber firstFree; + PhysicalBlockNumber lastFree; + size_t numPartitions; + Partition *head; +}; + +struct partition { + PartitionID id; // The id of this partition + FixedLayout *layout; // The layout to which this partition belongs + PhysicalBlockNumber offset; // The offset into the layout of this partition + PhysicalBlockNumber base; // The untranslated number of the first block + BlockCount count; // The number of blocks in the partition + Partition *next; // A pointer to the next partition in the layout +}; + +typedef struct { + PhysicalBlockNumber firstFree; + PhysicalBlockNumber lastFree; + byte partitionCount; +} __attribute__((packed)) Layout3_0; + +typedef struct { + PartitionID id; + PhysicalBlockNumber offset; + PhysicalBlockNumber base; + BlockCount count; +} __attribute__((packed)) Partition3_0; + +static const Header LAYOUT_HEADER_3_0 = { + .id = FIXED_LAYOUT, + .version = { + .majorVersion = 3, + .minorVersion = 0, + }, + .size = sizeof(Layout3_0), // Minimum size (contains no partitions) +}; + +/**********************************************************************/ +int makeFixedLayout(BlockCount totalBlocks, + PhysicalBlockNumber startOffset, + FixedLayout **layoutPtr) +{ + FixedLayout *layout; + int result = ALLOCATE(1, FixedLayout, "fixed layout", &layout); + if (result != UDS_SUCCESS) { + return result; + } + + layout->firstFree = startOffset; + layout->lastFree = startOffset + totalBlocks; + layout->numPartitions = 0; + layout->head = NULL; + + *layoutPtr = layout; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeFixedLayout(FixedLayout **layoutPtr) +{ + FixedLayout *layout = *layoutPtr; + if (layout == NULL) { + return; + } + + while (layout->head != NULL) { + Partition *part = layout->head; + layout->head = part->next; + FREE(part); + } + + FREE(layout); + *layoutPtr = NULL; +} + +/**********************************************************************/ +BlockCount getTotalFixedLayoutSize(const FixedLayout *layout) +{ + BlockCount size = getFixedLayoutBlocksAvailable(layout); + for (Partition *partition = layout->head; partition != NULL; + partition = partition->next) { + size += partition->count; + } + + return size; +} + +/**********************************************************************/ +int getPartition(FixedLayout *layout, PartitionID id, Partition **partitionPtr) +{ + for (Partition *partition = layout->head; partition != NULL; + partition = 
partition->next) { + if (partition->id == id) { + if (partitionPtr != NULL) { + *partitionPtr = partition; + } + return VDO_SUCCESS; + } + } + + return VDO_UNKNOWN_PARTITION; +} + +/**********************************************************************/ +int translateToPBN(const Partition *partition, + PhysicalBlockNumber partitionBlockNumber, + PhysicalBlockNumber *layerBlockNumber) +{ + if (partition == NULL) { + *layerBlockNumber = partitionBlockNumber; + return VDO_SUCCESS; + } + + if (partitionBlockNumber < partition->base) { + return VDO_OUT_OF_RANGE; + } + + PhysicalBlockNumber offsetFromBase = partitionBlockNumber - partition->base; + if (offsetFromBase >= partition->count) { + return VDO_OUT_OF_RANGE; + } + + *layerBlockNumber = partition->offset + offsetFromBase; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int translateFromPBN(const Partition *partition, + PhysicalBlockNumber layerBlockNumber, + PhysicalBlockNumber *partitionBlockNumberPtr) +{ + if (partition == NULL) { + *partitionBlockNumberPtr = layerBlockNumber; + return VDO_SUCCESS; + } + + if (layerBlockNumber < partition->offset) { + return VDO_OUT_OF_RANGE; + } + + PhysicalBlockNumber partitionBlockNumber + = layerBlockNumber - partition->offset; + if (partitionBlockNumber >= partition->count) { + return VDO_OUT_OF_RANGE; + } + + *partitionBlockNumberPtr = partitionBlockNumber + partition->base; + return VDO_SUCCESS; +} + +/**********************************************************************/ +BlockCount getFixedLayoutBlocksAvailable(const FixedLayout *layout) +{ + return layout->lastFree - layout->firstFree; +} + +/** + * Allocate a partition. The partition will be attached to the partition + * list in the layout. + * + * @param layout The layout containing the partition + * @param id The id of the partition + * @param offset The offset into the layout at which the partition begins + * @param base The number of the first block for users of the partition + * @param blockCount The number of blocks in the partition + * + * @return VDO_SUCCESS or an error + **/ +static int allocatePartition(FixedLayout *layout, + byte id, + PhysicalBlockNumber offset, + PhysicalBlockNumber base, + BlockCount blockCount) +{ + Partition *partition; + int result = ALLOCATE(1, Partition, "fixed layout partition", &partition); + if (result != UDS_SUCCESS) { + return result; + } + + partition->id = id; + partition->layout = layout; + partition->offset = offset; + partition->base = base; + partition->count = blockCount; + partition->next = layout->head; + layout->head = partition; + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeFixedLayoutPartition(FixedLayout *layout, + PartitionID id, + BlockCount blockCount, + PartitionDirection direction, + PhysicalBlockNumber base) +{ + BlockCount freeBlocks = layout->lastFree - layout->firstFree; + if (blockCount == ALL_FREE_BLOCKS) { + if (freeBlocks == 0) { + return VDO_NO_SPACE; + } else { + blockCount = freeBlocks; + } + } else if (blockCount > freeBlocks) { + return VDO_NO_SPACE; + } + + int result = getPartition(layout, id, NULL); + if (result != VDO_UNKNOWN_PARTITION) { + return VDO_PARTITION_EXISTS; + } + + PhysicalBlockNumber offset = ((direction == FROM_END) + ? 
(layout->lastFree - blockCount) + : layout->firstFree); + result = allocatePartition(layout, id, offset, base, blockCount); + if (result != VDO_SUCCESS) { + return result; + } + + layout->numPartitions++; + if (direction == FROM_END) { + layout->lastFree = layout->lastFree - blockCount; + } else { + layout->firstFree += blockCount; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +BlockCount getFixedLayoutPartitionSize(const Partition *partition) +{ + return partition->count; +} + +/**********************************************************************/ +PhysicalBlockNumber getFixedLayoutPartitionOffset(const Partition *partition) +{ + return partition->offset; +} + +/**********************************************************************/ +PhysicalBlockNumber getFixedLayoutPartitionBase(const Partition *partition) +{ + return partition->base; +} + +/**********************************************************************/ +static inline size_t getEncodedSize(const FixedLayout *layout) +{ + return sizeof(Layout3_0) + (sizeof(Partition3_0) * layout->numPartitions); +} + +/**********************************************************************/ +size_t getFixedLayoutEncodedSize(const FixedLayout *layout) +{ + return ENCODED_HEADER_SIZE + getEncodedSize(layout); +} + +/** + * Encode a null-terminated list of fixed layout partitions into a buffer + * using partition format 3.0. + * + * @param layout The layout containing the list of partitions to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error code + **/ +static int encodePartitions_3_0(const FixedLayout *layout, Buffer *buffer) +{ + for (const Partition *partition = layout->head; + partition != NULL; + partition = partition->next) { + STATIC_ASSERT_SIZEOF(PartitionID, sizeof(byte)); + int result = putByte(buffer, partition->id); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, partition->offset); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, partition->base); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, partition->count); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/** + * Encode the header fields of a fixed layout into a buffer using layout + * format 3.0. 
+ * + * @param layout The layout to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error code + **/ +static int encodeLayout_3_0(const FixedLayout *layout, Buffer *buffer) +{ + int result = ASSERT(layout->numPartitions <= UINT8_MAX, + "fixed layout partition count must fit in a byte"); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, layout->firstFree); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, layout->lastFree); + if (result != UDS_SUCCESS) { + return result; + } + + return putByte(buffer, layout->numPartitions); +} + +/**********************************************************************/ +int encodeFixedLayout(const FixedLayout *layout, Buffer *buffer) +{ + if (!ensureAvailableSpace(buffer, getFixedLayoutEncodedSize(layout))) { + return UDS_BUFFER_ERROR; + } + + Header header = LAYOUT_HEADER_3_0; + header.size = getEncodedSize(layout); + int result = encodeHeader(&header, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + size_t initialLength = contentLength(buffer); + + result = encodeLayout_3_0(layout, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + size_t encodedSize = contentLength(buffer) - initialLength; + result = ASSERT(encodedSize == sizeof(Layout3_0), + "encoded size of fixed layout header must match structure"); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodePartitions_3_0(layout, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + encodedSize = contentLength(buffer) - initialLength; + return ASSERT(encodedSize == header.size, + "encoded size of fixed layout must match header size"); +} + +/** + * Decode a sequence of fixed layout partitions from a buffer + * using partition format 3.0. + * + * @param buffer A buffer positioned at the start of the encoding + * @param layout The layout in which to allocate the decoded partitions + * + * @return UDS_SUCCESS or an error code + **/ +static int decodePartitions_3_0(Buffer *buffer, FixedLayout *layout) +{ + for (size_t i = 0; i < layout->numPartitions; i++) { + byte id; + int result = getByte(buffer, &id); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t offset; + result = getUInt64LEFromBuffer(buffer, &offset); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t base; + result = getUInt64LEFromBuffer(buffer, &base); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t count; + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + + result = allocatePartition(layout, id, offset, base, count); + if (result != VDO_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/** + * Decode the header fields of a fixed layout from a buffer using layout + * format 3.0. 
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param layout The structure to receive the decoded fields + * + * @return UDS_SUCCESS or an error code + **/ +static int decodeLayout_3_0(Buffer *buffer, Layout3_0 *layout) +{ + size_t initialLength = contentLength(buffer); + + PhysicalBlockNumber firstFree; + int result = getUInt64LEFromBuffer(buffer, &firstFree); + if (result != UDS_SUCCESS) { + return result; + } + + PhysicalBlockNumber lastFree; + result = getUInt64LEFromBuffer(buffer, &lastFree); + if (result != UDS_SUCCESS) { + return result; + } + + byte partitionCount; + result = getByte(buffer, &partitionCount); + if (result != UDS_SUCCESS) { + return result; + } + + *layout = (Layout3_0) { + .firstFree = firstFree, + .lastFree = lastFree, + .partitionCount = partitionCount, + }; + + size_t decodedSize = initialLength - contentLength(buffer); + return ASSERT(decodedSize == sizeof(Layout3_0), + "decoded size of fixed layout header must match structure"); +} + +/**********************************************************************/ +int decodeFixedLayout(Buffer *buffer, FixedLayout **layoutPtr) +{ + Header header; + int result = decodeHeader(buffer, &header); + if (result != UDS_SUCCESS) { + return result; + } + + // Layout is variable size, so only do a minimum size check here. + result = validateHeader(&LAYOUT_HEADER_3_0, &header, false, __func__); + if (result != VDO_SUCCESS) { + return result; + } + + Layout3_0 layoutHeader; + result = decodeLayout_3_0(buffer, &layoutHeader); + if (result != UDS_SUCCESS) { + return result; + } + + if (contentLength(buffer) + < (sizeof(Partition3_0) * layoutHeader.partitionCount)) { + return VDO_UNSUPPORTED_VERSION; + } + + FixedLayout *layout; + result = ALLOCATE(1, FixedLayout, "fixed layout", &layout); + if (result != UDS_SUCCESS) { + return result; + } + + layout->firstFree = layoutHeader.firstFree; + layout->lastFree = layoutHeader.lastFree; + layout->numPartitions = layoutHeader.partitionCount; + + result = decodePartitions_3_0(buffer, layout); + if (result != VDO_SUCCESS) { + freeFixedLayout(&layout); + return result; + } + + *layoutPtr = layout; + return VDO_SUCCESS; +} diff --git a/vdo/base/fixedLayout.h b/vdo/base/fixedLayout.h new file mode 100644 index 0000000..0907299 --- /dev/null +++ b/vdo/base/fixedLayout.h @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/fixedLayout.h#1 $ + */ + +#ifndef FIXED_LAYOUT_H +#define FIXED_LAYOUT_H + +#include "buffer.h" + +#include "types.h" + +typedef enum { + FROM_BEGINNING, + FROM_END, +} PartitionDirection; + +extern const BlockCount ALL_FREE_BLOCKS; + +/** + * A fixed layout is like a traditional disk partitioning scheme. 
In the + * beginning there is one large unused area, of which parts are carved off. + * Each carved off section has its own internal offset and size. + **/ +typedef struct fixedLayout FixedLayout; +typedef struct partition Partition; + +/** + * Make an unpartitioned fixed layout. + * + * @param [in] totalBlocks The total size of the layout, in blocks + * @param [in] startOffset The block offset in the underlying layer at which + * the fixed layout begins + * @param [out] layoutPtr The pointer to hold the resulting layout + * + * @return a success or error code + **/ +int makeFixedLayout(BlockCount totalBlocks, + PhysicalBlockNumber startOffset, + FixedLayout **layoutPtr) + __attribute__((warn_unused_result)); + +/** + * Free the fixed layout and null out the reference to it. + * + * @param layoutPtr The reference to the layout to free + * + * @note all partitions created by this layout become invalid pointers + **/ +void freeFixedLayout(FixedLayout **layoutPtr); + +/** + * Get the total size of the layout in blocks. + * + * @param layout The layout + * + * @return The size of the layout + **/ +BlockCount getTotalFixedLayoutSize(const FixedLayout *layout) + __attribute__((warn_unused_result)); + +/** + * Get a partition by id. + * + * @param layout The layout from which to get a partition + * @param id The id of the partition + * @param partitionPtr A pointer to hold the partition + * + * @return VDO_SUCCESS or an error + **/ +int getPartition(FixedLayout *layout, PartitionID id, Partition **partitionPtr) + __attribute__((warn_unused_result)); + +/** + * Translate a block number from the partition's view to the layer's + * + * @param partition The partition to use for translation + * @param partitionBlockNumber The block number relative to the partition + * @param layerBlockNumber The block number relative to the layer + * + * @return VDO_SUCCESS or an error code + **/ +int translateToPBN(const Partition *partition, + PhysicalBlockNumber partitionBlockNumber, + PhysicalBlockNumber *layerBlockNumber) + __attribute__((warn_unused_result)); + +/** + * Translate a block number from the layer's view to the partition's. + * This is the inverse of translateToPBN(). + * + * @param partition The partition to use for translation + * @param layerBlockNumber The block number relative to the layer + * @param partitionBlockNumber The block number relative to the partition + * + * @return VDO_SUCCESS or an error code + **/ +int translateFromPBN(const Partition *partition, + PhysicalBlockNumber layerBlockNumber, + PhysicalBlockNumber *partitionBlockNumber) + __attribute__((warn_unused_result)); + +/** + * Return the number of unallocated blocks available. + * + * @param layout the fixed layout + * + * @return the number of blocks yet unallocated to partitions + **/ +BlockCount getFixedLayoutBlocksAvailable(const FixedLayout *layout) + __attribute__((warn_unused_result)); + +/** + * Create a new partition from the beginning or end of the unused space + * within a fixed layout. 
+ * + * @param layout the fixed layout + * @param id the id of the partition to make + * @param blockCount the number of blocks to carve out, if set + * to ALL_FREE_BLOCKS, all remaining blocks will + * be used + * @param direction whether to carve out from beginning or end + * @param base the number of the first block in the partition + * from the point of view of its users + * + * @return a success or error code, particularly + * VDO_NO_SPACE if there are less than blockCount blocks remaining + **/ +int makeFixedLayoutPartition(FixedLayout *layout, + PartitionID id, + BlockCount blockCount, + PartitionDirection direction, + PhysicalBlockNumber base) + __attribute__((warn_unused_result)); + +/** + * Return the size in blocks of a partition. + * + * @param partition a partition of the fixedLayout + * + * @return the size of the partition in blocks + **/ +BlockCount getFixedLayoutPartitionSize(const Partition *partition) + __attribute__((warn_unused_result)); + +/** + * Get the first block of the partition in the layout. + * + * @param partition a partition of the fixedLayout + * + * @return the partition's offset in blocks + **/ +PhysicalBlockNumber getFixedLayoutPartitionOffset(const Partition *partition) + __attribute__((warn_unused_result)); + +/** + * Get the number of the first block in the partition from the partition users + * point of view. + * + * @param partition a partition of the fixedLayout + * + * @return the number of the first block in the partition + **/ +PhysicalBlockNumber getFixedLayoutPartitionBase(const Partition *partition) + __attribute__((warn_unused_result)); + +/** + * Get the size of an encoded layout + * + * @param layout The layout + * + * @return The encoded size of the layout + **/ +size_t getFixedLayoutEncodedSize(const FixedLayout *layout) + __attribute__((warn_unused_result)); + +/** + * Encode a layout into a buffer. + * + * @param layout The layout to encode + * @param buffer The buffer to encode into + * + * @return UDS_SUCCESS or an error + **/ +int encodeFixedLayout(const FixedLayout *layout, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Decode a fixed layout from a buffer. + * + * @param [in] buffer The buffer from which to decode + * @param [out] layoutPtr A pointer to hold the layout + * + * @return VDO_SUCCESS or an error + **/ +int decodeFixedLayout(Buffer *buffer, FixedLayout **layoutPtr) + __attribute__((warn_unused_result)); + +#endif // FIXED_LAYOUT_H diff --git a/vdo/base/flush.c b/vdo/base/flush.c new file mode 100644 index 0000000..4c6b94c --- /dev/null +++ b/vdo/base/flush.c @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/flush.c#3 $ + */ + +#include "flush.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockAllocator.h" +#include "completion.h" +#include "logicalZone.h" +#include "numUtils.h" +#include "readOnlyNotifier.h" +#include "slabDepot.h" +#include "vdoInternal.h" + +struct flusher { + VDOCompletion completion; + /** The VDO to which this flusher belongs */ + VDO *vdo; + /** The current flush generation of the VDO */ + SequenceNumber flushGeneration; + /** The first unacknowledged flush generation */ + SequenceNumber firstUnacknowledgedGeneration; + /** The queue of flush requests waiting to notify other threads */ + WaitQueue notifiers; + /** The queue of flush requests waiting for VIOs to complete */ + WaitQueue pendingFlushes; + /** The flush generation for which notifications are being sent */ + SequenceNumber notifyGeneration; + /** The logical zone to notify next */ + LogicalZone *logicalZoneToNotify; + /** The ID of the thread on which flush requests should be made */ + ThreadID threadID; +}; + +/** + * Convert a generic VDOCompletion to a Flusher. + * + * @param completion The completion to convert + * + * @return The completion as a Flusher + **/ +static Flusher *asFlusher(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(Flusher, completion) == 0); + assertCompletionType(completion->type, FLUSH_NOTIFICATION_COMPLETION); + return (Flusher *) completion; +} + +/** + * Convert a VDOFlush's generic wait queue entry back to the VDOFlush. + * + * @param waiter The wait queue entry to convert + * + * @return The wait queue entry as a VDOFlush + **/ +static VDOFlush *waiterAsFlush(Waiter *waiter) +{ + STATIC_ASSERT(offsetof(VDOFlush, waiter) == 0); + return (VDOFlush *) waiter; +} + +/**********************************************************************/ +int makeFlusher(VDO *vdo) +{ + int result = ALLOCATE(1, Flusher, __func__, &vdo->flusher); + if (result != VDO_SUCCESS) { + return result; + } + + vdo->flusher->vdo = vdo; + vdo->flusher->threadID = getPackerZoneThread(getThreadConfig(vdo)); + return initializeEnqueueableCompletion(&vdo->flusher->completion, + FLUSH_NOTIFICATION_COMPLETION, + vdo->layer); +} + +/**********************************************************************/ +void freeFlusher(Flusher **flusherPtr) +{ + if (*flusherPtr == NULL) { + return; + } + + Flusher *flusher = *flusherPtr; + destroyEnqueueable(&flusher->completion); + FREE(flusher); + *flusherPtr = NULL; +} + +/**********************************************************************/ +ThreadID getFlusherThreadID(Flusher *flusher) +{ + return flusher->threadID; +} + +/**********************************************************************/ +static void notifyFlush(Flusher *flusher); + +/** + * Finish the notification process by checking if any flushes have completed + * and then starting the notification of the next flush request if one came in + * while the current notification was in progress. This callback is registered + * in flushPackerCallback(). 
+ * + * @param completion The flusher completion + **/ +static void finishNotification(VDOCompletion *completion) +{ + Flusher *flusher = asFlusher(completion); + ASSERT_LOG_ONLY((getCallbackThreadID() == flusher->threadID), + "finishNotification() called from flusher thread"); + + Waiter *waiter = dequeueNextWaiter(&flusher->notifiers); + int result = enqueueWaiter(&flusher->pendingFlushes, waiter); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(flusher->vdo->readOnlyNotifier, result); + VDOFlush *flush = waiterAsFlush(waiter); + completion->layer->completeFlush(&flush); + return; + } + + completeFlushes(flusher); + if (hasWaiters(&flusher->notifiers)) { + notifyFlush(flusher); + } +} + +/** + * Flush the packer now that all of the logical and physical zones have been + * notified of the new flush request. This callback is registered in + * incrementGeneration(). + * + * @param completion The flusher completion + **/ +static void flushPackerCallback(VDOCompletion *completion) +{ + Flusher *flusher = asFlusher(completion); + incrementPackerFlushGeneration(flusher->vdo->packer); + launchCallback(completion, finishNotification, flusher->threadID); +} + +/** + * Increment the flush generation in a logical zone. If there are more logical + * zones, go on to the next one, otherwise, prepare the physical zones. This + * callback is registered both in notifyFlush() and in itself. + * + * @param completion The flusher as a completion + **/ +static void incrementGeneration(VDOCompletion *completion) +{ + Flusher *flusher = asFlusher(completion); + incrementFlushGeneration(flusher->logicalZoneToNotify, + flusher->notifyGeneration); + flusher->logicalZoneToNotify + = getNextLogicalZone(flusher->logicalZoneToNotify); + if (flusher->logicalZoneToNotify == NULL) { + launchCallback(completion, flushPackerCallback, flusher->threadID); + return; + } + + launchCallback(completion, incrementGeneration, + getLogicalZoneThreadID(flusher->logicalZoneToNotify)); +} + +/** + * Lauch a flush notification. 
+ * + * @param flusher The flusher doing the notification + **/ +static void notifyFlush(Flusher *flusher) +{ + VDOFlush *flush = waiterAsFlush(getFirstWaiter(&flusher->notifiers)); + flusher->notifyGeneration = flush->flushGeneration; + flusher->logicalZoneToNotify = getLogicalZone(flusher->vdo->logicalZones, 0); + flusher->completion.requeue = true; + launchCallback(&flusher->completion, incrementGeneration, + getLogicalZoneThreadID(flusher->logicalZoneToNotify)); +} + +/**********************************************************************/ +void flush(VDO *vdo, VDOFlush *flush) +{ + Flusher *flusher = vdo->flusher; + ASSERT_LOG_ONLY((getCallbackThreadID() == flusher->threadID), + "flush() called from flusher thread"); + + flush->flushGeneration = flusher->flushGeneration++; + bool mayNotify = !hasWaiters(&flusher->notifiers); + + int result = enqueueWaiter(&flusher->notifiers, &flush->waiter); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(vdo->readOnlyNotifier, result); + flusher->completion.layer->completeFlush(&flush); + return; + } + + if (mayNotify) { + notifyFlush(flusher); + } +} + +/**********************************************************************/ +void completeFlushes(Flusher *flusher) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == flusher->threadID), + "completeFlushes() called from flusher thread"); + + SequenceNumber oldestActiveGeneration = UINT64_MAX; + for (LogicalZone *zone = getLogicalZone(flusher->vdo->logicalZones, 0); + zone != NULL; + zone = getNextLogicalZone(zone)) { + SequenceNumber oldestInZone = getOldestLockedGeneration(zone); + oldestActiveGeneration = minSequenceNumber(oldestActiveGeneration, + oldestInZone); + } + + while (hasWaiters(&flusher->pendingFlushes)) { + VDOFlush *flush = waiterAsFlush(getFirstWaiter(&flusher->pendingFlushes)); + if (flush->flushGeneration >= oldestActiveGeneration) { + return; + } + + ASSERT_LOG_ONLY((flush->flushGeneration + == flusher->firstUnacknowledgedGeneration), + "acknowledged next expected flush, %" PRIu64 + ", was: %llu", + flusher->firstUnacknowledgedGeneration, + flush->flushGeneration); + dequeueNextWaiter(&flusher->pendingFlushes); + flusher->completion.layer->completeFlush(&flush); + flusher->firstUnacknowledgedGeneration++; + } +} + +/**********************************************************************/ +void dumpFlusher(const Flusher *flusher) +{ + logInfo("Flusher"); + logInfo(" flushGeneration=%" PRIu64 + " firstUnacknowledgedGeneration=%llu", + flusher->flushGeneration, flusher->firstUnacknowledgedGeneration); + logInfo(" notifiers queue is %s; pendingFlushes queue is %s", + (hasWaiters(&flusher->notifiers) ? "not empty" : "empty"), + (hasWaiters(&flusher->pendingFlushes) ? "not empty" : "empty")); +} diff --git a/vdo/base/flush.h b/vdo/base/flush.h new file mode 100644 index 0000000..da7c8bc --- /dev/null +++ b/vdo/base/flush.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/flush.h#1 $ + */ + +#ifndef FLUSH_H +#define FLUSH_H + +#include "types.h" +#include "waitQueue.h" + +/** + * A marker for tracking which journal entries are affected by a flush request. + **/ +struct vdoFlush { + /** The wait queue entry for this flush */ + Waiter waiter; + /** Which flush this struct represents */ + SequenceNumber flushGeneration; +}; + +/** + * Make a flusher for a VDO. + * + * @param vdo The VDO which owns the flusher + * + * @return VDO_SUCCESS or an error + **/ +int makeFlusher(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Free a flusher and null out the reference to it. + * + * @param flusherPtr A pointer to the flusher to free + **/ +void freeFlusher(Flusher **flusherPtr); + +/** + * Get the ID of the thread on which flusher functions should be called. + * + * @param flusher The flusher to query + * + * @return The ID of the thread which handles the flusher + **/ +ThreadID getFlusherThreadID(Flusher *flusher) + __attribute__((warn_unused_result)); + +/** + * Handle empty flush requests. + * + * @param vdo The VDO + * @param vdoFlush The opaque flush request + **/ +void flush(VDO *vdo, VDOFlush *vdoFlush); + +/** + * Attempt to complete any flushes which might have finished. + * + * @param flusher The flusher + **/ +void completeFlushes(Flusher *flusher); + +/** + * Dump the flusher, in a thread-unsafe fashion. + * + * @param flusher The flusher + **/ +void dumpFlusher(const Flusher *flusher); + +#endif /* FLUSH_H */ diff --git a/vdo/base/forest.c b/vdo/base/forest.c new file mode 100644 index 0000000..eabd6c3 --- /dev/null +++ b/vdo/base/forest.c @@ -0,0 +1,565 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/forest.c#8 $ + */ + +#include "forest.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMap.h" +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "blockMapTree.h" +#include "blockMapTreeInternals.h" +#include "constants.h" +#include "dirtyLists.h" +#include "forest.h" +#include "numUtils.h" +#include "recoveryJournal.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "types.h" +#include "vdoInternal.h" +#include "vio.h" +#include "vioPool.h" + +enum { + BLOCK_MAP_VIO_POOL_SIZE = 64, +}; + +typedef struct { + TreePage *levels[BLOCK_MAP_TREE_HEIGHT]; +} BlockMapTreeSegment; + +typedef struct blockMapTree { + BlockMapTreeSegment *segments; +} BlockMapTree; + +struct forest { + BlockMap *map; + size_t segments; + Boundary *boundaries; + TreePage **pages; + BlockMapTree trees[]; +}; + +typedef struct { + PageNumber pageIndex; + SlotNumber slot; +} CursorLevel; + +typedef struct cursors Cursors; + +typedef struct { + Waiter waiter; + BlockMapTree *tree; + Height height; + Cursors *parent; + Boundary boundary; + CursorLevel levels[BLOCK_MAP_TREE_HEIGHT]; + VIOPoolEntry *vioPoolEntry; +} Cursor; + +struct cursors { + BlockMap *map; + BlockMapTreeZone *zone; + VIOPool *pool; + EntryCallback *entryCallback; + VDOCompletion *parent; + RootCount activeRoots; + Cursor cursors[]; +}; + +/**********************************************************************/ +TreePage *getTreePageByIndex(Forest *forest, + RootCount rootIndex, + Height height, + PageNumber pageIndex) +{ + PageNumber offset = 0; + for (size_t segment = 0; segment < forest->segments; segment++) { + PageNumber border = forest->boundaries[segment].levels[height - 1]; + if (pageIndex < border) { + BlockMapTree *tree = &forest->trees[rootIndex]; + return &(tree->segments[segment].levels[height - 1][pageIndex - offset]); + } + offset = border; + } + + return NULL; +} + +/** + * Compute the number of pages which must be allocated at each level in order + * to grow the forest to a new number of entries. + * + * @param [in] rootCount The number of roots + * @param [in] flatPageCount The number of flat block map pages + * @param [in] oldSizes The current size of the forest at each level + * @param [in] entries The new number of entries the block map must + * address + * @param [out] newSizes The new size of the forest at each level + * + * @return The total number of non-leaf pages required + **/ +static BlockCount computeNewPages(RootCount rootCount, + BlockCount flatPageCount, + Boundary *oldSizes, + BlockCount entries, + Boundary *newSizes) +{ + PageCount leafPages + = maxPageCount(computeBlockMapPageCount(entries) - flatPageCount, 1); + PageCount levelSize = computeBucketCount(leafPages, rootCount); + BlockCount totalPages = 0; + for (Height height = 0; height < BLOCK_MAP_TREE_HEIGHT; height++) { + levelSize = computeBucketCount(levelSize, BLOCK_MAP_ENTRIES_PER_PAGE); + newSizes->levels[height] = levelSize; + BlockCount newPages = levelSize; + if (oldSizes != NULL) { + newPages -= oldSizes->levels[height]; + } + totalPages += (newPages * rootCount); + } + + return totalPages; +} + +/**********************************************************************/ +static int makeSegment(Forest *oldForest, + BlockCount newPages, + Boundary *newBoundary, + Forest *forest) +{ + size_t index = (oldForest == NULL) ? 
0 : oldForest->segments; + forest->segments = index + 1; + + int result = ALLOCATE(forest->segments, Boundary, "forest boundary array", + &forest->boundaries); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(forest->segments, TreePage *, "forest page pointers", + &forest->pages); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(newPages, TreePage, "new forest pages", + &forest->pages[index]); + if (result != VDO_SUCCESS) { + return result; + } + + if (index > 0) { + memcpy(forest->boundaries, oldForest->boundaries, + index * sizeof(Boundary)); + memcpy(forest->pages, oldForest->pages, index * sizeof(TreePage *)); + } + + memcpy(&(forest->boundaries[index]), newBoundary, sizeof(Boundary)); + + PageCount segmentSizes[BLOCK_MAP_TREE_HEIGHT]; + for (Height height = 0; height < BLOCK_MAP_TREE_HEIGHT; height++) { + segmentSizes[height] = newBoundary->levels[height]; + if (index > 0) { + segmentSizes[height] -= oldForest->boundaries[index - 1].levels[height]; + } + } + + TreePage *pagePtr = forest->pages[index]; + for (RootCount root = 0; root < forest->map->rootCount; root++) { + BlockMapTree *tree = &(forest->trees[root]); + int result = ALLOCATE(forest->segments, BlockMapTreeSegment, + "tree root segments", &tree->segments); + if (result != VDO_SUCCESS) { + return result; + } + + if (index > 0) { + memcpy(tree->segments, oldForest->trees[root].segments, + index * sizeof(BlockMapTreeSegment)); + } + + BlockMapTreeSegment *segment = &(tree->segments[index]); + for (Height height = 0; height < BLOCK_MAP_TREE_HEIGHT; height++) { + if (segmentSizes[height] == 0) { + continue; + } + + segment->levels[height] = pagePtr; + if (height == (BLOCK_MAP_TREE_HEIGHT - 1)) { + // Record the root. + BlockMapPage *page = formatBlockMapPage(pagePtr->pageBuffer, + forest->map->nonce, + INVALID_PBN, true); + page->entries[0] = packPBN(forest->map->rootOrigin + root, + MAPPING_STATE_UNCOMPRESSED); + } + pagePtr += segmentSizes[height]; + } + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void deforest(Forest *forest, size_t firstPageSegment) +{ + if (forest->pages != NULL) { + for (size_t segment = firstPageSegment; segment < forest->segments; + segment++) { + FREE(forest->pages[segment]); + } + FREE(forest->pages); + } + + for (RootCount root = 0; root < forest->map->rootCount; root++) { + BlockMapTree *tree = &(forest->trees[root]); + FREE(tree->segments); + } + + FREE(forest->boundaries); + FREE(forest); +} + +/**********************************************************************/ +int makeForest(BlockMap *map, BlockCount entries) +{ + STATIC_ASSERT(offsetof(TreePage, waiter) == 0); + + Forest *oldForest = map->forest; + Boundary *oldBoundary = NULL; + if (oldForest != NULL) { + oldBoundary = &(oldForest->boundaries[oldForest->segments - 1]); + } + + Boundary newBoundary; + BlockCount newPages = computeNewPages(map->rootCount, map->flatPageCount, + oldBoundary, entries, &newBoundary); + if (newPages == 0) { + map->nextEntryCount = entries; + return VDO_SUCCESS; + } + + Forest *forest; + int result = ALLOCATE_EXTENDED(Forest, map->rootCount, BlockMapTree, + __func__, &forest); + if (result != VDO_SUCCESS) { + return result; + } + + forest->map = map; + result = makeSegment(oldForest, newPages, &newBoundary, forest); + if (result != VDO_SUCCESS) { + deforest(forest, forest->segments - 1); + return result; + } + + map->nextForest = forest; + map->nextEntryCount = entries; + return VDO_SUCCESS; +} + 
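As a rough illustration of the arithmetic makeForest() delegates to computeNewPages(), the following minimal, self-contained sketch walks the same per-root loop: each interior level needs one page for every ENTRIES_PER_PAGE pages of the level beneath it, so the level sizes shrink by a ceiling division at every height. The tree height, entries-per-page value, and leaf-page count used here are assumed, illustrative numbers only, not the constants defined in the block map headers.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins; the real constants live in the block map headers. */
#define EXAMPLE_TREE_HEIGHT      5
#define EXAMPLE_ENTRIES_PER_PAGE 812

/* Ceiling division, playing the role of computeBucketCount(). */
static uint64_t bucketCount(uint64_t items, uint64_t perBucket)
{
  return (items + perBucket - 1) / perBucket;
}

int main(void)
{
  /* Pretend each root must cover this many leaf block map pages. */
  uint64_t levelSize    = 100000;
  uint64_t nonLeafPages = 0;

  for (int height = 0; height < EXAMPLE_TREE_HEIGHT; height++) {
    /* Each interior page addresses EXAMPLE_ENTRIES_PER_PAGE pages below it. */
    levelSize = bucketCount(levelSize, EXAMPLE_ENTRIES_PER_PAGE);
    nonLeafPages += levelSize;
    printf("height %d needs %llu pages per root\n",
           height, (unsigned long long) levelSize);
  }

  printf("total non-leaf pages per root: %llu\n",
         (unsigned long long) nonLeafPages);
  return 0;
}

The real function additionally subtracts the sizes already present in the old forest and multiplies the per-root totals by the root count, which is how growing an existing forest only allocates the incremental pages.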
+/**********************************************************************/ +void freeForest(Forest **forestPtr) +{ + Forest *forest = *forestPtr; + if (forest == NULL) { + return; + } + + deforest(forest, 0); + *forestPtr = NULL; +} + +/**********************************************************************/ +void abandonForest(BlockMap *map) +{ + Forest *forest = map->nextForest; + map->nextForest = NULL; + if (forest != NULL) { + deforest(forest, forest->segments - 1); + } + + map->nextEntryCount = 0; +} + +/**********************************************************************/ +void replaceForest(BlockMap *map) +{ + if (map->nextForest != NULL) { + if (map->forest != NULL) { + deforest(map->forest, map->forest->segments); + } + map->forest = map->nextForest; + map->nextForest = NULL; + } + + map->entryCount = map->nextEntryCount; + map->nextEntryCount = 0; +} + +/** + * Finish the traversal of a single tree. If it was the last cursor, finish + * the traversal. + * + * @param cursor The cursor doing the traversal + **/ +static void finishCursor(Cursor *cursor) +{ + Cursors *cursors = cursor->parent; + returnVIOToPool(cursors->pool, cursor->vioPoolEntry); + if (--cursors->activeRoots > 0) { + return; + } + + VDOCompletion *parent = cursors->parent; + FREE(cursors); + + finishCompletion(parent, VDO_SUCCESS); +} + +/**********************************************************************/ +static void traverse(Cursor *cursor); + +/** + * Continue traversing a block map tree. + * + * @param completion The VIO doing a read or write + **/ +static void continueTraversal(VDOCompletion *completion) +{ + VIOPoolEntry *poolEntry = completion->parent; + Cursor *cursor = poolEntry->parent; + traverse(cursor); +} + +/** + * Continue traversing a block map tree now that a page has been loaded. + * + * @param completion The VIO doing the read + **/ +static void finishTraversalLoad(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + Cursor *cursor = entry->parent; + Height height = cursor->height; + CursorLevel *level = &cursor->levels[height]; + + TreePage *treePage + = &(cursor->tree->segments[0].levels[height][level->pageIndex]); + BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; + copyValidPage(entry->buffer, cursor->parent->map->nonce, + entry->vio->physical, page); + traverse(cursor); +} + +/** + * Traverse a single block map tree. This is the recursive heart of the + * traversal process. + * + * @param cursor The cursor doing the traversal + **/ +static void traverse(Cursor *cursor) +{ + for (; cursor->height < BLOCK_MAP_TREE_HEIGHT; cursor->height++) { + Height height = cursor->height; + CursorLevel *level = &cursor->levels[height]; + TreePage *treePage + = &(cursor->tree->segments[0].levels[height][level->pageIndex]); + BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; + if (!isBlockMapPageInitialized(page)) { + continue; + } + + for (; level->slot < BLOCK_MAP_ENTRIES_PER_PAGE; level->slot++) { + DataLocation location = unpackBlockMapEntry(&page->entries[level->slot]); + if (!isValidLocation(&location)) { + // This entry is invalid, so remove it from the page. + page->entries[level->slot] + = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + writeTreePage(treePage, cursor->parent->zone); + continue; + } + + if (!isMappedLocation(&location)) { + continue; + } + + PageNumber entryIndex + = (BLOCK_MAP_ENTRIES_PER_PAGE * level->pageIndex) + level->slot; + + // Erase mapped entries past the end of the logical space. 
+ if (entryIndex >= cursor->boundary.levels[height]) { + page->entries[level->slot] + = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + writeTreePage(treePage, cursor->parent->zone); + continue; + } + + if (cursor->height < BLOCK_MAP_TREE_HEIGHT - 1) { + int result = cursor->parent->entryCallback(location.pbn, + cursor->parent->parent); + if (result != VDO_SUCCESS) { + page->entries[level->slot] + = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + writeTreePage(treePage, cursor->parent->zone); + continue; + } + } + + if (cursor->height == 0) { + continue; + } + + cursor->height--; + CursorLevel *nextLevel = &cursor->levels[cursor->height]; + nextLevel->pageIndex = entryIndex; + nextLevel->slot = 0; + level->slot++; + launchReadMetadataVIO(cursor->vioPoolEntry->vio, location.pbn, + finishTraversalLoad, continueTraversal); + return; + } + } + + finishCursor(cursor); +} + +/** + * Start traversing a single block map tree now that the Cursor has a VIO with + * which to load pages. + * + *
Implements WaiterCallback. + * + * @param waiter The Cursor + * @param context The VIOPoolEntry just acquired + **/ +static void launchCursor(Waiter *waiter, void *context) +{ + STATIC_ASSERT(offsetof(Cursor, waiter) == 0); + Cursor *cursor = (Cursor *) waiter; + cursor->vioPoolEntry = (VIOPoolEntry *) context; + cursor->vioPoolEntry->parent = cursor; + vioAsCompletion(cursor->vioPoolEntry->vio)->callbackThreadID + = cursor->parent->zone->mapZone->threadID; + traverse(cursor); +} + +/** + * Compute the number of pages used at each level of the given root's tree. + * + * @param map The block map + * @param rootIndex The index of the root to measure + * + * @return The list of page counts as a Boundary + **/ +static Boundary computeBoundary(BlockMap *map, RootCount rootIndex) +{ + PageCount leafPages = computeBlockMapPageCount(map->entryCount); + PageCount treeLeafPages = leafPages - map->flatPageCount; + + /* + * Compute the leaf pages for this root. If the number of leaf pages does + * not distribute evenly, we must determine if this root gets an extra page. + * Extra pages are assigned to roots starting at firstTreeRoot and going up. + */ + PageCount firstTreeRoot = map->flatPageCount % map->rootCount; + PageCount lastTreeRoot = (leafPages - 1) % map->rootCount; + + PageCount levelPages = treeLeafPages / map->rootCount; + if (inCyclicRange(firstTreeRoot, rootIndex, lastTreeRoot, map->rootCount)) { + levelPages++; + } + + Boundary boundary; + for (Height height = 0; height < BLOCK_MAP_TREE_HEIGHT - 1; height++) { + boundary.levels[height] = levelPages; + levelPages = computeBucketCount(levelPages, BLOCK_MAP_ENTRIES_PER_PAGE); + } + + // The root node always exists, even if the root is otherwise unused. + boundary.levels[BLOCK_MAP_TREE_HEIGHT - 1] = 1; + + return boundary; +} + +/**********************************************************************/ +void traverseForest(BlockMap *map, + EntryCallback *entryCallback, + VDOCompletion *parent) +{ + if (computeBlockMapPageCount(map->entryCount) <= map->flatPageCount) { + // There are no tree pages, so there's nothing to do. + finishCompletion(parent, VDO_SUCCESS); + return; + } + + Cursors *cursors; + int result = ALLOCATE_EXTENDED(Cursors, map->rootCount, Cursor, __func__, + &cursors); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + cursors->map = map; + cursors->zone = &(getBlockMapZone(map, 0)->treeZone); + cursors->pool = cursors->zone->vioPool; + cursors->entryCallback = entryCallback; + cursors->parent = parent; + cursors->activeRoots = map->rootCount; + for (RootCount root = 0; root < map->rootCount; root++) { + Cursor *cursor = &cursors->cursors[root]; + *cursor = (Cursor) { + .tree = &map->forest->trees[root], + .height = BLOCK_MAP_TREE_HEIGHT - 1, + .parent = cursors, + .boundary = computeBoundary(map, root), + }; + + cursor->waiter.callback = launchCursor; + acquireVIOFromPool(cursors->pool, &cursor->waiter); + }; +} + +/**********************************************************************/ +BlockCount computeForestSize(BlockCount logicalBlocks, RootCount rootCount) +{ + Boundary newSizes; + BlockCount approximateNonLeaves + = computeNewPages(rootCount, 0, NULL, logicalBlocks, &newSizes); + + // Exclude the tree roots since those aren't allocated from slabs, + // and also exclude the super-roots, which only exist in memory. 
+ approximateNonLeaves + -= rootCount * (newSizes.levels[BLOCK_MAP_TREE_HEIGHT - 2] + + newSizes.levels[BLOCK_MAP_TREE_HEIGHT - 1]); + + BlockCount approximateLeaves + = computeBlockMapPageCount(logicalBlocks - approximateNonLeaves); + + // This can be a slight over-estimate since the tree will never have to + // address these blocks, so it might be a tiny bit smaller. + return (approximateNonLeaves + approximateLeaves); +} diff --git a/vdo/base/forest.h b/vdo/base/forest.h new file mode 100644 index 0000000..9a5a7cf --- /dev/null +++ b/vdo/base/forest.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/forest.h#2 $ + */ + +#ifndef FOREST_H +#define FOREST_H + +#include "blockMapTree.h" +#include "types.h" + +/** + * A function to be called for each allocated PBN when traversing the forest. + * + * @param pbn A PBN of a tree node + * @param completion The parent completion of the traversal + * + * @return VDO_SUCCESS or an error + **/ +typedef int EntryCallback(PhysicalBlockNumber pbn, VDOCompletion *completion); + +/** + * Get the tree page for a given height and page index. + * + * @param forest The forest which holds the page + * @param rootIndex The index of the tree that holds the page + * @param height The height of the desired page + * @param pageIndex The index of the desired page + * + * @return The requested page + **/ +TreePage *getTreePageByIndex(Forest *forest, + RootCount rootIndex, + Height height, + PageNumber pageIndex) + __attribute__((warn_unused_result)); + +/** + * Make a collection of trees for a BlockMap, expanding the existing forest if + * there is one. + * + * @param map The block map + * @param entries The number of entries the block map will hold + * + * @return VDO_SUCCESS or an error + **/ +int makeForest(BlockMap *map, BlockCount entries) + __attribute__((warn_unused_result)); + +/** + * Free a forest and all of the segments it contains and NULL out the reference + * to it. + * + * @param forestPtr A pointer to the forest to free + **/ +void freeForest(Forest **forestPtr); + +/** + * Abandon the unused next forest from a BlockMap. + * + * @param map The block map + **/ +void abandonForest(BlockMap *map); + +/** + * Replace a BlockMap's Forest with the already-prepared larger forest. + * + * @param map The block map + **/ +void replaceForest(BlockMap *map); + +/** + * Walk the entire forest of a block map. 
+ * + * @param map The block map to traverse + * @param entryCallback A function to call with the pbn of each allocated node + * in the forest + * @param parent The completion to notify on each traversed PBN, and + * when the traversal is complete + **/ +void traverseForest(BlockMap *map, + EntryCallback *entryCallback, + VDOCompletion *parent); + +/** + * Compute the approximate number of pages which the forest will allocate in + * order to map the specified number of logical blocks. This method assumes + * that the block map is entirely arboreal. + * + * @param logicalBlocks The number of blocks to map + * @param rootCount The number of trees in the forest + * + * @return A (slight) over-estimate of the total number of possible forest + * pages including the leaves + **/ +BlockCount computeForestSize(BlockCount logicalBlocks, RootCount rootCount) + __attribute__((warn_unused_result)); +#endif // FOREST_H diff --git a/vdo/base/hashLock.c b/vdo/base/hashLock.c new file mode 100644 index 0000000..8494f1d --- /dev/null +++ b/vdo/base/hashLock.c @@ -0,0 +1,1605 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashLock.c#5 $ + */ + +/** + * HashLock controls and coordinates writing, index access, and dedupe among + * groups of DataVIOs concurrently writing identical blocks, allowing them to + * deduplicate not only against advice but also against each other. This save + * on index queries and allows those DataVIOs to concurrently deduplicate + * against a single block instead of being serialized through a PBN read lock. + * Only one index query is needed for each HashLock, instead of one for every + * DataVIO. + * + * A HashLock acts like a state machine perhaps more than as a lock. Other + * than the starting and ending states INITIALIZING and DESTROYING, every + * state represents and is held for the duration of an asynchronous operation. + * All state transitions are performed on the thread of the HashZone + * containing the lock. An asynchronous operation is almost always performed + * upon entering a state, and the callback from that operation triggers + * exiting the state and entering a new state. + * + * In all states except DEDUPING, there is a single DataVIO, called the lock + * agent, performing the asynchronous operations on behalf of the lock. The + * agent will change during the lifetime of the lock if the lock is shared by + * more than one DataVIO. DataVIOs waiting to deduplicate are kept on a wait + * queue. Viewed a different way, the agent holds the lock exclusively until + * the lock enters the DEDUPING state, at which point it becomes a shared lock + * that all the waiters (and any new DataVIOs that arrive) use to share a PBN + * lock. In state DEDUPING, there is no agent. 
When the last DataVIO in the + * lock calls back in DEDUPING, it becomes the agent and the lock becomes + * exclusive again. New DataVIOs that arrive in the lock will also go on the + * wait queue. + * + * The existence of lock waiters is a key factor controlling which state the + * lock transitions to next. When the lock is new or has waiters, it will + * always try to reach DEDUPING, and when it doesn't, it will try to clean up + * and exit. + * + * Deduping requires holding a PBN lock on a block that is known to contain + * data identical to the DataVIOs in the lock, so the lock will send the + * agent to the duplicate zone to acquire the PBN lock (LOCKING), to the + * kernel I/O threads to read and verify the data (VERIFYING), or to write a + * new copy of the data to a full data block or a slot in a compressed block + * (WRITING). + * + * Cleaning up consists of updating the index when the data location is + * different from the initial index query (UPDATING, triggered by stale + * advice, compression, and rollover), releasing the PBN lock on the duplicate + * block (UNLOCKING), and releasing the HashLock itself back to the hash zone + * (DESTROYING). + * + * The shortest sequence of states is for non-concurrent writes of new data: + * INITIALIZING -> QUERYING -> WRITING -> DESTROYING + * This sequence is short because no PBN read lock or index update is needed. + * + * Non-concurrent, finding valid advice looks like this (endpoints elided): + * -> QUERYING -> LOCKING -> VERIFYING -> DEDUPING -> UNLOCKING -> + * Or with stale advice (endpoints elided): + * -> QUERYING -> LOCKING -> VERIFYING -> UNLOCKING -> WRITING -> UPDATING -> + * + * When there are not enough available reference count increments available on + * a PBN for a DataVIO to deduplicate, a new lock is forked and the excess + * waiters roll over to the new lock (which goes directly to WRITING). The new + * lock takes the place of the old lock in the lock map so new DataVIOs will + * be directed to it. The two locks will proceed independently, but only the + * new lock will have the right to update the index (unless it also forks). + * + * Since rollover happens in a lock instance, once a valid data location has + * been selected, it will not change. QUERYING and WRITING are only performed + * once per lock lifetime. All other non-endpoint states can be re-entered. + * + * XXX still need doc on BYPASSING + * + * The function names in this module follow a convention referencing the + * states and transitions in the state machine diagram for VDOSTORY-190. + * [XXX link or repository path to it?] + * For example, for the LOCKING state, there are startLocking() and + * finishLocking() functions. startLocking() is invoked by the finish function + * of the state (or states) that transition to LOCKING. It performs the actual + * lock state change and must be invoked on the hash zone thread. + * finishLocking() is called by (or continued via callback from) the code + * actually obtaining the lock. It does any bookkeeping or decision-making + * required and invokes the appropriate start function of the state being + * transitioned to after LOCKING. 
+ **/ + +#include "hashLock.h" +#include "hashLockInternals.h" + +#include "logger.h" +#include "permassert.h" + +#include "compressionState.h" +#include "constants.h" +#include "dataVIO.h" +#include "hashZone.h" +#include "packer.h" +#include "pbnLock.h" +#include "physicalZone.h" +#include "ringNode.h" +#include "slab.h" +#include "slabDepot.h" +#include "trace.h" +#include "types.h" +#include "vdoInternal.h" +#include "vioWrite.h" +#include "waitQueue.h" + +static const char *LOCK_STATE_NAMES[] = { + [HASH_LOCK_BYPASSING] = "BYPASSING", + [HASH_LOCK_DEDUPING] = "DEDUPING", + [HASH_LOCK_DESTROYING] = "DESTROYING", + [HASH_LOCK_INITIALIZING] = "INITIALIZING", + [HASH_LOCK_LOCKING] = "LOCKING", + [HASH_LOCK_QUERYING] = "QUERYING", + [HASH_LOCK_UNLOCKING] = "UNLOCKING", + [HASH_LOCK_UPDATING] = "UPDATING", + [HASH_LOCK_VERIFYING] = "VERIFYING", + [HASH_LOCK_WRITING] = "WRITING", +}; + +// There are loops in the state diagram, so some forward decl's are needed. +static void startDeduping(HashLock *lock, DataVIO *agent, bool agentIsDone); +static void startLocking(HashLock *lock, DataVIO *agent); +static void startWriting(HashLock *lock, DataVIO *agent); +static void unlockDuplicatePBN(VDOCompletion *completion); +static void transferAllocationLock(DataVIO *dataVIO); + +/**********************************************************************/ +PBNLock *getDuplicateLock(DataVIO *dataVIO) +{ + if (dataVIO->hashLock == NULL) { + return NULL; + } + return dataVIO->hashLock->duplicateLock; +} + +/**********************************************************************/ +const char *getHashLockStateName(HashLockState state) +{ + // Catch if a state has been added without updating the name array. + STATIC_ASSERT((HASH_LOCK_DESTROYING + 1) == COUNT_OF(LOCK_STATE_NAMES)); + return (state < COUNT_OF(LOCK_STATE_NAMES)) ? LOCK_STATE_NAMES[state] : NULL; +} + +/** + * Set the current state of a hash lock. + * + * @param lock The lock to update + * @param newState The new state + **/ +static void setHashLockState(HashLock *lock, HashLockState newState) +{ + if (false) { + logWarning("XXX %" PRIptr " %s -> %s", (void *) lock, + getHashLockStateName(lock->state), + getHashLockStateName(newState)); + } + lock->state = newState; +} + +/** + * Assert that a DataVIO is the agent of its hash lock, and that this is being + * called in the hash zone. + * + * @param dataVIO The DataVIO expected to be the lock agent + * @param where A string describing the function making the assertion + **/ +static void assertHashLockAgent(DataVIO *dataVIO, const char *where) +{ + // Not safe to access the agent field except from the hash zone. + assertInHashZone(dataVIO); + ASSERT_LOG_ONLY(dataVIO == dataVIO->hashLock->agent, + "%s must be for the hash lock agent", where); +} + +/** + * Set or clear the lock agent. + * + * @param lock The hash lock to update + * @param newAgent The new lock agent (may be NULL to clear the agent) + **/ +static void setAgent(HashLock *lock, DataVIO *newAgent) +{ + lock->agent = newAgent; +} + +/** + * Set the duplicate lock held by a hash lock. May only be called in the + * physical zone of the PBN lock. 
+ * + * @param hashLock The hash lock to update + * @param pbnLock The PBN read lock to use as the duplicate lock + **/ +static void setDuplicateLock(HashLock *hashLock, PBNLock *pbnLock) +{ + ASSERT_LOG_ONLY((hashLock->duplicateLock == NULL), + "hash lock must not already hold a duplicate lock"); + + pbnLock->holderCount += 1; + hashLock->duplicateLock = pbnLock; +} + +/** + * Convert a pointer to the hashLockNode field in a DataVIO to the enclosing + * DataVIO. + * + * @param lockNode The RingNode to convert + * + * @return A pointer to the DataVIO containing the RingNode + **/ +static inline DataVIO *dataVIOFromLockNode(RingNode *lockNode) +{ + return (DataVIO *) ((byte *) lockNode - offsetof(DataVIO, hashLockNode)); +} + +/** + * Remove the first DataVIO from the lock's wait queue and return it. + * + * @param lock The lock containing the wait queue + * + * @return The first (oldest) waiter in the queue, or NULL if + * the queue is empty + **/ +static inline DataVIO *dequeueLockWaiter(HashLock *lock) +{ + return waiterAsDataVIO(dequeueNextWaiter(&lock->waiters)); +} + +/** + * Continue processing a DataVIO that has been waiting for an event, setting + * the result from the event, and continuing in a specified callback function. + * + * @param dataVIO The DataVIO to continue + * @param result The current result (will not mask older errors) + * @param callback The function in which to continue processing + **/ +static void continueDataVIOIn(DataVIO *dataVIO, + int result, + VDOAction *callback) +{ + dataVIOAsCompletion(dataVIO)->callback = callback; + continueDataVIO(dataVIO, result); +} + +/** + * Set, change, or clear the hash lock a DataVIO is using. Updates the hash + * lock (or locks) to reflect the change in membership. + * + * @param dataVIO The DataVIO to update + * @param newLock The hash lock the DataVIO is joining + **/ +static void setHashLock(DataVIO *dataVIO, HashLock *newLock) +{ + HashLock *oldLock = dataVIO->hashLock; + if (oldLock != NULL) { + ASSERT_LOG_ONLY(dataVIO->hashZone != NULL, + "must have a hash zone when halding a hash lock"); + ASSERT_LOG_ONLY(!isRingEmpty(&dataVIO->hashLockNode), + "must be on a hash lock ring when holding a hash lock"); + ASSERT_LOG_ONLY(oldLock->referenceCount > 0, + "hash lock reference must be counted"); + + if ((oldLock->state != HASH_LOCK_BYPASSING) + && (oldLock->state != HASH_LOCK_UNLOCKING)) { + // If the reference count goes to zero in a non-terminal state, we're + // most likely leaking this lock. + ASSERT_LOG_ONLY(oldLock->referenceCount > 1, + "hash locks should only become unreferenced in" + " a terminal state, not state %s", + getHashLockStateName(oldLock->state)); + } + + unspliceRingNode(&dataVIO->hashLockNode); + oldLock->referenceCount -= 1; + + dataVIO->hashLock = NULL; + } + + if (newLock != NULL) { + // Keep all DataVIOs sharing the lock on a ring since they can complete in + // any order and we'll always need a pointer to one to compare data. + pushRingNode(&newLock->duplicateRing, &dataVIO->hashLockNode); + newLock->referenceCount += 1; + + // XXX Not needed for VDOSTORY-190, but useful for checking whether a test + // is getting concurrent dedupe, and how much. + if (newLock->maxReferences < newLock->referenceCount) { + newLock->maxReferences = newLock->referenceCount; + } + + dataVIO->hashLock = newLock; + } +} + +/** + * Bottleneck for DataVIOs that have written or deduplicated and that are no + * longer needed to be an agent for the hash lock. 
+ * + * @param dataVIO The DataVIO to complete and send to be cleaned up + **/ +static void exitHashLock(DataVIO *dataVIO) +{ + // XXX trace record? + + // Release the hash lock now, saving a thread transition in cleanup. + releaseHashLock(dataVIO); + + // Complete the DataVIO and start the clean-up path in vioWrite to release + // any locks it still holds. + finishDataVIO(dataVIO, VDO_SUCCESS); +} + +/** + * Retire the active lock agent, replacing it with the first lock waiter, and + * make the retired agent exit the hash lock. + * + * @param lock The hash lock to update + * + * @return The new lock agent (which will be NULL if there was no waiter) + **/ +static DataVIO *retireLockAgent(HashLock *lock) +{ + DataVIO *oldAgent = lock->agent; + DataVIO *newAgent = dequeueLockWaiter(lock); + setAgent(lock, newAgent); + exitHashLock(oldAgent); + if (newAgent != NULL) { + setDuplicateLocation(newAgent, lock->duplicate); + } + return newAgent; +} + +/** + * Callback to call compressData(), putting a DataVIO back on the write path. + * + * @param completion The DataVIO + **/ +static void compressDataCallback(VDOCompletion *completion) +{ + // XXX VDOSTORY-190 need an error check since compressData doesn't have one. + compressData(asDataVIO(completion)); +} + +/** + * Add a DataVIO to the lock's queue of waiters. + * + * @param lock The hash lock on which to wait + * @param dataVIO The DataVIO to add to the queue + **/ +static void waitOnHashLock(HashLock *lock, DataVIO *dataVIO) +{ + int result = enqueueDataVIO(&lock->waiters, dataVIO, THIS_LOCATION(NULL)); + if (result != VDO_SUCCESS) { + // This should be impossible, but if it somehow happens, give up on trying + // to dedupe the data. + setHashLock(dataVIO, NULL); + continueDataVIOIn(dataVIO, result, compressDataCallback); + return; + } + + // Make sure the agent doesn't block indefinitely in the packer since it now + // has at least one other DataVIO waiting on it. + if ((lock->state == HASH_LOCK_WRITING) && cancelCompression(lock->agent)) { + /* + * Even though we're waiting, we also have to send ourselves as a one-way + * message to the packer to ensure the agent continues executing. This is + * safe because cancelCompression() guarantees the agent won't continue + * executing until this message arrives in the packer, and because the + * wait queue link isn't used for sending the message. + */ + dataVIO->compression.lockHolder = lock->agent; + launchPackerCallback(dataVIO, removeLockHolderFromPacker, + THIS_LOCATION("$F;cb=removeLockHolderFromPacker")); + } +} + +/** + * WaiterCallback function that calls compressData on the DataVIO waiter. + * + * @param waiter The DataVIO's waiter link + * @param context Not used + **/ +static void compressWaiter(Waiter *waiter, + void *context __attribute__((unused))) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + dataVIO->isDuplicate = false; + compressData(dataVIO); +} + +/** + * Handle the result of the agent for the lock releasing a read lock on + * duplicate candidate due to aborting the hash lock. This continuation is + * registered in unlockDuplicatePBN(). 
+ * + * @param completion The completion of the DataVIO acting as the lock's agent + **/ +static void finishBypassing(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + assertHashLockAgent(agent, __func__); + HashLock *lock = agent->hashLock; + + ASSERT_LOG_ONLY(lock->duplicateLock == NULL, + "must have released the duplicate lock for the hash lock"); + exitHashLock(agent); +} + +/** + * Stop using the hash lock, resuming the old write path for the lock agent + * and any DataVIOs waiting on it, and put it in a state where DataVIOs + * entering the lock will use the old dedupe path instead of waiting. + * + * @param lock The hash lock + * @param agent The DataVIO acting as the agent for the lock + **/ +static void startBypassing(HashLock *lock, DataVIO *agent) +{ + setHashLockState(lock, HASH_LOCK_BYPASSING); + + // Ensure we don't attempt to update advice when cleaning up. + lock->updateAdvice = false; + + ASSERT_LOG_ONLY(((agent != NULL) || !hasWaiters(&lock->waiters)), + "should not have waiters without an agent"); + notifyAllWaiters(&lock->waiters, compressWaiter, NULL); + + if (lock->duplicateLock != NULL) { + if (agent != NULL) { + // The agent must reference the duplicate zone to launch it. + agent->duplicate = lock->duplicate; + launchDuplicateZoneCallback(agent, unlockDuplicatePBN, + THIS_LOCATION(NULL)); + return; + } + ASSERT_LOG_ONLY(false, "hash lock holding a PBN lock must have an agent"); + } + + if (agent == NULL) { + return; + } + + setAgent(lock, NULL); + agent->isDuplicate = false; + compressData(agent); +} + +/** + * Abort processing on this hash lock when noticing an error. Currently, this + * moves the hash lock to the BYPASSING state, to release all pending DataVIOs. + * + * @param lock The HashLock + * @param dataVIO The DataVIO with the error + **/ +static void abortHashLock(HashLock *lock, DataVIO *dataVIO) +{ + // If we've already aborted the lock, don't try to re-abort it; just exit. + if (lock->state == HASH_LOCK_BYPASSING) { + exitHashLock(dataVIO); + return; + } + + if (dataVIO != lock->agent) { + if ((lock->agent != NULL) || (lock->referenceCount > 1)) { + // Other DataVIOs are still sharing the lock (which should be DEDUPING), + // so just kick this one out of the lock to report its error. + ASSERT_LOG_ONLY(lock->agent == NULL, + "only active agent should call abortHashLock"); + exitHashLock(dataVIO); + return; + } + // Make the lone DataVIO the lock agent so it can abort and clean up. + setAgent(lock, dataVIO); + } + + startBypassing(lock, dataVIO); +} + +/** + * Handle the result of the agent for the lock releasing a read lock on + * duplicate candidate. This continuation is registered in + * unlockDuplicatePBN(). + * + * @param completion The completion of the DataVIO acting as the lock's agent + **/ +static void finishUnlocking(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + assertHashLockAgent(agent, __func__); + HashLock *lock = agent->hashLock; + + ASSERT_LOG_ONLY(lock->duplicateLock == NULL, + "must have released the duplicate lock for the hash lock"); + + if (completion->result != VDO_SUCCESS) { + abortHashLock(lock, agent); + return; + } + + if (!lock->verified) { + /* + * UNLOCKING -> WRITING transition: The lock we released was on an + * unverified block, so it must have been a lock on advice we were + * verifying, not on a location that was used for deduplication. Go write + * (or compress) the block to get a location to dedupe against. 
+ */ + startWriting(lock, agent); + return; + } + + // With the lock released, the verified duplicate block may already have + // changed and will need to be re-verified if a waiter arrived. + lock->verified = false; + + if (hasWaiters(&lock->waiters)) { + /* + * UNLOCKING -> LOCKING transition: A new DataVIO entered the hash lock + * while the agent was releasing the PBN lock. The current agent exits and + * the waiter has to re-lock and re-verify the duplicate location. + */ + // XXX VDOSTORY-190 If we used the current agent to re-acquire the PBN + // lock we wouldn't need to re-verify. + agent = retireLockAgent(lock); + startLocking(lock, agent); + return; + } + + /* + * UNLOCKING -> DESTROYING transition: The agent is done with the lock + * and no other DataVIOs reference it, so remove it from the lock map + * and return it to the pool. + */ + exitHashLock(agent); +} + +/** + * Release a read lock on the PBN of the block that may or may not have + * contained duplicate data. This continuation is launched by + * startUnlocking(), and calls back to finishUnlocking() on the hash zone + * thread. + * + * @param completion The completion of the DataVIO acting as the lock's agent + **/ +static void unlockDuplicatePBN(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + assertInDuplicateZone(agent); + HashLock *lock = agent->hashLock; + + ASSERT_LOG_ONLY(lock->duplicateLock != NULL, + "must have a duplicate lock to release"); + + releasePBNLock(agent->duplicate.zone, agent->duplicate.pbn, + &lock->duplicateLock); + + if (lock->state == HASH_LOCK_BYPASSING) { + launchHashZoneCallback(agent, finishBypassing, THIS_LOCATION(NULL)); + } else { + launchHashZoneCallback(agent, finishUnlocking, THIS_LOCATION(NULL)); + } +} + +/** + * Release a read lock on the PBN of the block that may or may not have + * contained duplicate data. + * + * @param lock The hash lock + * @param agent The DataVIO currently acting as the agent for the lock + **/ +static void startUnlocking(HashLock *lock, DataVIO *agent) +{ + setHashLockState(lock, HASH_LOCK_UNLOCKING); + + /* + * XXX If we arrange to continue on the duplicate zone thread when + * verification fails, and don't explicitly change lock states (or use an + * agent-local state, or an atomic), we can avoid a thread transition here. + */ + launchDuplicateZoneCallback(agent, unlockDuplicatePBN, THIS_LOCATION(NULL)); +} + +/** + * Process the result of a UDS update performed by the agent for the lock. + * This continuation is registered in startQuerying(). + * + * @param completion The completion of the DataVIO that performed the update + **/ +static void finishUpdating(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + assertHashLockAgent(agent, __func__); + HashLock *lock = agent->hashLock; + + if (completion->result != VDO_SUCCESS) { + abortHashLock(lock, agent); + return; + } + + // UDS was updated successfully, so don't update again unless the + // duplicate location changes due to rollover. + lock->updateAdvice = false; + + if (hasWaiters(&lock->waiters)) { + /* + * UPDATING -> DEDUPING transition: A new DataVIO arrived during the UDS + * update. Send it on the verified dedupe path. The agent is done with the + * lock, but the lock may still need to use it to clean up after rollover. + */ + startDeduping(lock, agent, true); + return; + } + + if (lock->duplicateLock != NULL) { + /* + * UPDATING -> UNLOCKING transition: No one is waiting to dedupe, but we + * hold a duplicate PBN lock, so go release it. 
+ */ + startUnlocking(lock, agent); + } else { + /* + * UPDATING -> DESTROYING transition: No one is waiting to dedupe and + * there's no lock to release. + */ + // XXX startDestroying(lock, agent); + startBypassing(lock, NULL); + exitHashLock(agent); + } +} + +/** + * Continue deduplication with the last step, updating UDS with the location + * of the duplicate that should be returned as advice in the future. + * + * @param lock The hash lock + * @param agent The DataVIO currently acting as the agent for the lock + **/ +static void startUpdating(HashLock *lock, DataVIO *agent) +{ + setHashLockState(lock, HASH_LOCK_UPDATING); + + ASSERT_LOG_ONLY(lock->verified, "new advice should have been verified"); + ASSERT_LOG_ONLY(lock->updateAdvice, "should only update advice if needed"); + + agent->lastAsyncOperation = UPDATE_INDEX; + setHashZoneCallback(agent, finishUpdating, THIS_LOCATION(NULL)); + dataVIOAsCompletion(agent)->layer->updateAlbireo(agent); +} + +/** + * Handle a DataVIO that has finished deduplicating against the block locked + * by the hash lock. If there are other DataVIOs still sharing the lock, this + * will just release the DataVIO's share of the lock and finish processing the + * DataVIO. If this is the last DataVIO holding the lock, this makes the + * DataVIO the lock agent and uses it to advance the state of the lock so it + * can eventually be released. + * + * @param lock The hash lock + * @param dataVIO The lock holder that has finished deduplicating + **/ +static void finishDeduping(HashLock *lock, DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(lock->agent == NULL, "shouldn't have an agent in DEDUPING"); + ASSERT_LOG_ONLY(!hasWaiters(&lock->waiters), + "shouldn't have any lock waiters in DEDUPING"); + + // Just release the lock reference if other DataVIOs are still deduping. + if (lock->referenceCount > 1) { + exitHashLock(dataVIO); + return; + } + + // The hash lock must have an agent for all other lock states. + DataVIO *agent = dataVIO; + setAgent(lock, agent); + + if (lock->updateAdvice) { + /* + * DEDUPING -> UPDATING transition: The location of the duplicate block + * changed since the initial UDS query because of compression, rollover, + * or because the query agent didn't have an allocation. The UDS update + * was delayed in case there was another change in location, but with only + * this DataVIO using the hash lock, it's time to update the advice. + */ + startUpdating(lock, agent); + } else { + /* + * DEDUPING -> UNLOCKING transition: Release the PBN read lock on the + * duplicate location so the hash lock itself can be released (contingent + * on no new DataVIOs arriving in the lock before the agent returns). + */ + startUnlocking(lock, agent); + } +} + +/** + * Implements WaiterCallback. Binds the DataVIO that was waiting to a new hash + * lock and waits on that lock. + **/ +static void enterForkedLock(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + HashLock *newLock = (HashLock *) context; + + setHashLock(dataVIO, newLock); + waitOnHashLock(newLock, dataVIO); +} + +/** + * Fork a hash lock because it has run out of increments on the duplicate PBN. + * Transfers the new agent and any lock waiters to a new hash lock instance + * which takes the place of the old lock in the lock map. The old lock remains + * active, but will not update advice. 
+ * + * @param oldLock The hash lock to fork + * @param newAgent The DataVIO that will be the agent for the new lock + **/ +static void forkHashLock(HashLock *oldLock, DataVIO *newAgent) +{ + HashLock *newLock; + int result = acquireHashLockFromZone(newAgent->hashZone, + &newAgent->chunkName, + oldLock, &newLock); + if (result != VDO_SUCCESS) { + abortHashLock(oldLock, newAgent); + return; + } + + // Only one of the two locks should update UDS. The old lock is out of + // references, so it would be poor dedupe advice in the short term. + oldLock->updateAdvice = false; + newLock->updateAdvice = true; + + setHashLock(newAgent, newLock); + setAgent(newLock, newAgent); + + notifyAllWaiters(&oldLock->waiters, enterForkedLock, newLock); + + newAgent->isDuplicate = false; + startWriting(newLock, newAgent); +} + +/** + * Reserve a reference count increment for a DataVIO and launch it on the + * dedupe path. If no increments are available, this will roll over to a new + * hash lock and launch the DataVIO as the writing agent for that lock. + * + * @param lock The hash lock + * @param dataVIO The DataVIO to deduplicate using the hash lock + * @param hasClaim true if the dataVIO already has claimed + * an increment from the duplicate lock + **/ +static void launchDedupe(HashLock *lock, DataVIO *dataVIO, bool hasClaim) +{ + if (!hasClaim && !claimPBNLockIncrement(lock->duplicateLock)) { + // Out of increments, so must roll over to a new lock. + forkHashLock(lock, dataVIO); + return; + } + + // Deduplicate against the lock's verified location. + setDuplicateLocation(dataVIO, lock->duplicate); + launchDuplicateZoneCallback(dataVIO, shareBlock, + THIS_LOCATION("$F;cb=shareBlock")); +} + +/** + * Enter the hash lock state where DataVIOs deduplicate in parallel against a + * true copy of their data on disk. If the agent itself needs to deduplicate, + * an increment for it must already have been claimed from the duplicate lock, + * ensuring the hash lock will still have a DataVIO holding it. + * + * @param lock The hash lock + * @param agent The DataVIO acting as the agent for the lock + * @param agentIsDone true only if the agent has already written + * or deduplicated against its data + **/ +static void startDeduping(HashLock *lock, DataVIO *agent, bool agentIsDone) +{ + setHashLockState(lock, HASH_LOCK_DEDUPING); + + // We don't take the downgraded allocation lock from the agent unless we + // actually need to deduplicate against it. + if (lock->duplicateLock == NULL) { + ASSERT_LOG_ONLY(!isCompressed(agent->newMapped.state), + "compression must have shared a lock"); + ASSERT_LOG_ONLY(agentIsDone, "agent must have written the new duplicate"); + transferAllocationLock(agent); + } + + ASSERT_LOG_ONLY(isPBNReadLock(lock->duplicateLock), + "duplicateLock must be a PBN read lock"); + + /* + * This state is not like any of the other states. There is no designated + * agent--the agent transitioning to this state and all the waiters will be + * launched to deduplicate in parallel. + */ + setAgent(lock, NULL); + + /* + * Launch the agent (if not already deduplicated) and as many lock waiters + * as we have available increments for on the dedupe path. If we run out of + * increments, rollover will be triggered and the remaining waiters will be + * transferred to the new lock. 
+   */
+  if (!agentIsDone) {
+    launchDedupe(lock, agent, true);
+    agent = NULL;
+  }
+  while (hasWaiters(&lock->waiters)) {
+    launchDedupe(lock, dequeueLockWaiter(lock), false);
+  }
+
+  if (agentIsDone) {
+    /*
+     * In the degenerate case where all the waiters rolled over to a new lock,
+     * this will continue to use the old agent to clean up this lock;
+     * otherwise it just lets the agent exit the lock.
+     */
+    finishDeduping(lock, agent);
+  }
+}
+
+/**
+ * Handle the result of the agent for the lock comparing its data to the
+ * duplicate candidate. This continuation is registered in startVerifying().
+ *
+ * @param completion  The completion of the DataVIO used to verify dedupe
+ **/
+static void finishVerifying(VDOCompletion *completion)
+{
+  DataVIO *agent = asDataVIO(completion);
+  assertHashLockAgent(agent, __func__);
+  HashLock *lock = agent->hashLock;
+
+  if (completion->result != VDO_SUCCESS) {
+    // XXX VDOSTORY-190 should convert verify IO errors to verification failure
+    abortHashLock(lock, agent);
+    return;
+  }
+
+  lock->verified = agent->isDuplicate;
+
+  // Only count the result of the initial verification of the advice as valid
+  // or stale, and not any re-verifications due to PBN lock releases.
+  if (!lock->verifyCounted) {
+    lock->verifyCounted = true;
+    if (lock->verified) {
+      bumpHashZoneValidAdviceCount(agent->hashZone);
+    } else {
+      bumpHashZoneStaleAdviceCount(agent->hashZone);
+    }
+  }
+
+  // Even if the block is a verified duplicate, we can't start to deduplicate
+  // unless we can claim a reference count increment for the agent.
+  if (lock->verified && !claimPBNLockIncrement(lock->duplicateLock)) {
+    agent->isDuplicate = false;
+    lock->verified = false;
+  }
+
+  if (lock->verified) {
+    /*
+     * VERIFYING -> DEDUPING transition: The advice is for a true duplicate,
+     * so start deduplicating against it, if references are available.
+     */
+    startDeduping(lock, agent, false);
+  } else {
+    /*
+     * VERIFYING -> UNLOCKING transition: Either the verify failed or we'd try
+     * to dedupe and roll over immediately, which would fail because it would
+     * leave the lock without an agent to release the PBN lock. In both cases,
+     * the data will have to be written or compressed, but first the advice
+     * PBN must be unlocked by the VERIFYING agent.
+     */
+    lock->updateAdvice = true;
+    startUnlocking(lock, agent);
+  }
+}
+
+/**
+ * Continue the deduplication path for a hash lock by using the agent to read
+ * (and possibly decompress) the data at the candidate duplicate location,
+ * comparing it to the data in the agent to verify that the candidate is
+ * identical to all the DataVIOs sharing the hash. If so, it can be
+ * deduplicated against; otherwise a DataVIO allocation will have to be
+ * written to and used for dedupe.
+ *
+ * @param lock   The hash lock (must be LOCKING)
+ * @param agent  The DataVIO to use to read and compare candidate data
+ **/
+static void startVerifying(HashLock *lock, DataVIO *agent)
+{
+  setHashLockState(lock, HASH_LOCK_VERIFYING);
+  ASSERT_LOG_ONLY(!lock->verified, "hash lock only verifies advice once");
+
+  /*
+   * XXX VDOSTORY-190 Optimization: This is one of those places where the zone
+   * and continuation we want to use depend on the outcome of the comparison.
+   * If we could choose which path in the layer thread before continuing, we
+   * could save a thread transition in one of the two cases (assuming we're
+   * willing to delay visibility of the hash lock state change).
+ */ + VDOCompletion *completion = dataVIOAsCompletion(agent); + agent->lastAsyncOperation = VERIFY_DEDUPLICATION; + setHashZoneCallback(agent, finishVerifying, THIS_LOCATION(NULL)); + completion->layer->verifyDuplication(agent); +} + +/** + * Handle the result of the agent for the lock attempting to obtain a PBN read + * lock on the candidate duplicate block. this continuation is registered in + * lockDuplicatePBN(). + * + * @param completion The completion of the DataVIO that attempted to get + * the read lock + **/ +static void finishLocking(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + assertHashLockAgent(agent, __func__); + HashLock *lock = agent->hashLock; + + if (completion->result != VDO_SUCCESS) { + // XXX clearDuplicateLocation()? + agent->isDuplicate = false; + abortHashLock(lock, agent); + return; + } + + if (!agent->isDuplicate) { + ASSERT_LOG_ONLY(lock->duplicateLock == NULL, + "must not hold duplicateLock if not flagged as a duplicate"); + /* + * LOCKING -> WRITING transition: The advice block is being modified or + * has no available references, so try to write or compress the data, + * remembering to update UDS later with the new advice. + */ + bumpHashZoneStaleAdviceCount(agent->hashZone); + lock->updateAdvice = true; + startWriting(lock, agent); + return; + } + + ASSERT_LOG_ONLY(lock->duplicateLock != NULL, + "must hold duplicateLock if flagged as a duplicate"); + + if (!lock->verified) { + /* + * LOCKING -> VERIFYING transition: Continue on the unverified dedupe path, + * reading the candidate duplicate and comparing it to the agent's data to + * decide whether it is a true duplicate or stale advice. + */ + startVerifying(lock, agent); + return; + } + + if (!claimPBNLockIncrement(lock->duplicateLock)) { + /* + * LOCKING -> UNLOCKING transition: The verified block was re-locked, but + * has no available increments left. Must first release the useless PBN + * read lock before rolling over to a new copy of the block. + */ + agent->isDuplicate = false; + lock->verified = false; + lock->updateAdvice = true; + startUnlocking(lock, agent); + return; + } + + /* + * LOCKING -> DEDUPING transition: Continue on the verified dedupe path, + * deduplicating against a location that was previously verified or + * written to. + */ + startDeduping(lock, agent, false); +} + +/** + * Acquire a read lock on the PBN of the block containing candidate duplicate + * data (compressed or uncompressed). If the PBN is already locked for + * writing, the lock attempt is abandoned and isDuplicate will be cleared + * before calling back. this continuation is launched from startLocking(), and + * calls back to finishLocking() on the hash zone thread. + * + * @param completion The completion of the DataVIO attempting to acquire the + * physical block lock on behalf of its hash lock + **/ +static void lockDuplicatePBN(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + PhysicalZone *zone = agent->duplicate.zone; + assertInDuplicateZone(agent); + + setHashZoneCallback(agent, finishLocking, THIS_LOCATION(NULL)); + + // While in the zone that owns it, find out how many additional references + // can be made to the block if it turns out to truly be a duplicate. 
+ SlabDepot *depot = getSlabDepot(getVDOFromDataVIO(agent)); + unsigned int incrementLimit = getIncrementLimit(depot, agent->duplicate.pbn); + if (incrementLimit == 0) { + // We could deduplicate against it later if a reference happened to be + // released during verification, but it's probably better to bail out now. + // XXX clearDuplicateLocation()? + agent->isDuplicate = false; + continueDataVIO(agent, VDO_SUCCESS); + return; + } + + PBNLock *lock; + int result = attemptPBNLock(zone, agent->duplicate.pbn, VIO_READ_LOCK, + &lock); + if (result != VDO_SUCCESS) { + continueDataVIO(agent, result); + return; + } + + if (!isPBNReadLock(lock)) { + /* + * There are three cases of write locks: uncompressed data block writes, + * compressed (packed) block writes, and block map page writes. In all + * three cases, we give up on trying to verify the advice and don't bother + * to try deduplicate against the data in the write lock holder. + * + * 1) We don't ever want to try to deduplicate against a block map page. + * + * 2a) It's very unlikely we'd deduplicate against an entire packed block, + * both because of the chance of matching it, and because we don't record + * advice for it, but for the uncompressed representation of all the + * fragments it contains. The only way we'd be getting lock contention is + * if we've written the same representation coincidentally before, had it + * become unreferenced, and it just happened to be packed together from + * compressed writes when we go to verify the lucky advice. Giving up is a + * miniscule loss of potential dedupe. + * + * 2b) If the advice is for a slot of a compressed block, it's about to + * get smashed, and the write smashing it cannot contain our data--it + * would have to be writing on behalf of our hash lock, but that's + * impossible since we're the lock agent. + * + * 3a) If the lock is held by a DataVIO with different data, the advice is + * already stale or is about to become stale. + * + * 3b) If the lock is held by a DataVIO that matches us, we may as well + * either write it ourselves (or reference the copy we already wrote) + * instead of potentially having many duplicates wait for the lock holder + * to write, journal, hash, and finally arrive in the hash lock. All we + * lose is a chance to avoid a UDS update in the very rare case of advice + * for a free block that just happened to be allocated to a DataVIO with + * the same hash. In async mode, there's also a chance to save on a block + * write, at the cost of a block verify. Saving on a full block compare in + * all stale advice cases almost certainly outweighs saving a UDS update + * in a lucky case where advice would have been saved from becoming stale. + */ + // XXX clearDuplicateLocation()? + agent->isDuplicate = false; + continueDataVIO(agent, VDO_SUCCESS); + return; + } + + if (lock->holderCount == 0) { + // Ensure that the newly-locked block is referenced. + Slab *slab = getSlab(depot, agent->duplicate.pbn); + result = acquireProvisionalReference(slab, agent->duplicate.pbn, lock); + if (result != VDO_SUCCESS) { + logWarningWithStringError(result, + "Error acquiring provisional reference for " + "dedupe candidate; aborting dedupe"); + agent->isDuplicate = false; + releasePBNLock(zone, agent->duplicate.pbn, &lock); + continueDataVIO(agent, result); + return; + } + + /* + * The increment limit we grabbed earlier is still valid. The lock now + * holds the rights to acquire all those references. Those rights will be + * claimed by hash locks sharing this read lock. 
+     */
+    lock->incrementLimit = incrementLimit;
+  }
+
+  // We've successfully acquired a read lock on behalf of the hash lock,
+  // so mark it as such.
+  setDuplicateLock(agent->hashLock, lock);
+
+  /*
+   * XXX VDOSTORY-190 Optimization: Same as startLocking() lazily changing
+   * state to save on having to switch back to the hash zone thread. Here we
+   * could directly launch the block verify, then switch to a hash thread.
+   */
+  continueDataVIO(agent, VDO_SUCCESS);
+}
+
+/**
+ * Continue deduplication for a hash lock that has obtained valid advice
+ * of a potential duplicate through its agent.
+ *
+ * @param lock   The hash lock (currently must be QUERYING)
+ * @param agent  The DataVIO bearing the dedupe advice
+ **/
+static void startLocking(HashLock *lock, DataVIO *agent)
+{
+  ASSERT_LOG_ONLY(lock->duplicateLock == NULL,
+                  "must not acquire a duplicate lock when already holding it");
+
+  setHashLockState(lock, HASH_LOCK_LOCKING);
+
+  /*
+   * XXX VDOSTORY-190 Optimization: If we arrange to continue on the duplicate
+   * zone thread when accepting the advice, and don't explicitly change lock
+   * states (or use an agent-local state, or an atomic), we can avoid a thread
+   * transition here.
+   */
+  agent->lastAsyncOperation = ACQUIRE_PBN_READ_LOCK;
+  launchDuplicateZoneCallback(agent, lockDuplicatePBN, THIS_LOCATION(NULL));
+}
+
+/**
+ * Re-entry point for the lock agent after it has finished writing or
+ * compressing its copy of the data block. The agent will never need to dedupe
+ * against anything, so it's done with the lock, but the lock may not be
+ * finished with it, as a UDS update might still be needed.
+ *
+ * If there are other lock holders, the agent will hand the job to one of them
+ * and exit, leaving the lock to deduplicate against the just-written block.
+ * If there are no other lock holders, the agent either exits (and later tears
+ * down the hash lock), or it remains the agent and updates UDS.
+ *
+ * @param lock   The hash lock, which must be in state WRITING
+ * @param agent  The DataVIO that wrote its data for the lock
+ **/
+static void finishWriting(HashLock *lock, DataVIO *agent)
+{
+  // Dedupe against the data block or compressed block slot the agent wrote.
+  // Since we know the write succeeded, there's no need to verify it.
+  lock->duplicate = agent->newMapped;
+  lock->verified  = true;
+
+  if (isCompressed(lock->duplicate.state) && lock->registered) {
+    // Compression means the location we gave in the UDS query is not the
+    // location we're using to deduplicate.
+    lock->updateAdvice = true;
+  }
+
+  // If there are any waiters, we need to start deduping them.
+  if (hasWaiters(&lock->waiters)) {
+    /*
+     * WRITING -> DEDUPING transition: An asynchronously-written block
+     * failed to compress, so the PBN lock on the written copy was already
+     * transferred. The agent is done with the lock, but the lock may
+     * still need to use it to clean up after rollover.
+     */
+    startDeduping(lock, agent, true);
+    return;
+  }
+
+  // There are no waiters and the agent has successfully written, so take a
+  // step towards being able to release the hash lock (or just release it).
+  if (lock->updateAdvice) {
+    /*
+     * WRITING -> UPDATING transition: There's no waiter and a UDS update is
+     * needed, so retain the WRITING agent and use it to launch the update.
+     * This happens on compression, rollover, or the QUERYING agent not having
+     * an allocation.
+ */ + startUpdating(lock, agent); + } else if (lock->duplicateLock != NULL) { + /* + * WRITING -> UNLOCKING transition: There's no waiter and no update + * needed, but the compressed write gave us a shared duplicate lock that + * we must release. + */ + setDuplicateLocation(agent, lock->duplicate); + startUnlocking(lock, agent); + } else { + /* + * WRITING -> DESTROYING transition: There's no waiter, no update needed, + * and no duplicate lock held, so both the agent and lock have no more + * work to do. The agent will release its allocation lock in cleanup. + */ + // XXX startDestroying(lock, agent); + startBypassing(lock, NULL); + exitHashLock(agent); + } +} + +/** + * Search through the lock waiters for a DataVIO that has an allocation. If + * one is found, swap agents, put the old agent at the head of the wait queue, + * then return the new agent. Otherwise, just return the current agent. + * + * @param lock The hash lock to modify + **/ +static DataVIO *selectWritingAgent(HashLock *lock) +{ + // This should-be-impossible condition is the only cause for + // enqueueDataVIO() to fail later on, where it would be a pain to handle. + int result = ASSERT(!isWaiting(dataVIOAsWaiter(lock->agent)), + "agent must not be waiting"); + if (result != VDO_SUCCESS) { + return lock->agent; + } + + WaitQueue tempQueue; + initializeWaitQueue(&tempQueue); + + // Move waiters to the temp queue one-by-one until we find an allocation. + // Not ideal to search, but it only happens when nearly out of space. + DataVIO *dataVIO; + while (((dataVIO = dequeueLockWaiter(lock)) != NULL) + && !hasAllocation(dataVIO)) { + // Use the lower-level enqueue since we're just moving waiters around. + int result = enqueueWaiter(&tempQueue, dataVIOAsWaiter(dataVIO)); + // The only error is the DataVIO already being on a wait queue, and since + // we just dequeued it, that could only happen due to a memory smash or + // concurrent use of that DataVIO. + ASSERT_LOG_ONLY(result == VDO_SUCCESS, "impossible enqueueWaiter error"); + } + + if (dataVIO != NULL) { + // Move the rest of the waiters over to the temp queue, preserving the + // order they arrived at the lock. + transferAllWaiters(&lock->waiters, &tempQueue); + + // The current agent is being replaced and will have to wait to dedupe; + // make it the first waiter since it was the first to reach the lock. + int result = enqueueDataVIO(&lock->waiters, lock->agent, + THIS_LOCATION(NULL)); + ASSERT_LOG_ONLY(result == VDO_SUCCESS, + "impossible enqueueDataVIO error after isWaiting checked"); + setAgent(lock, dataVIO); + } else { + // No one has an allocation, so keep the current agent. + dataVIO = lock->agent; + } + + // Swap all the waiters back onto the lock's queue. + transferAllWaiters(&tempQueue, &lock->waiters); + return dataVIO; +} + +/** + * Begin the non-duplicate write path for a hash lock that had no advice, + * selecting a DataVIO with an allocation as a new agent, if necessary, + * then resuming the agent on the DataVIO write path. + * + * @param lock The hash lock (currently must be QUERYING) + * @param agent The DataVIO currently acting as the agent for the lock + **/ +static void startWriting(HashLock *lock, DataVIO *agent) +{ + setHashLockState(lock, HASH_LOCK_WRITING); + + // The agent might not have received an allocation and so can't be used for + // writing, but it's entirely possible that one of the waiters did. + if (!hasAllocation(agent)) { + agent = selectWritingAgent(lock); + // If none of the waiters had an allocation, the writes all have to fail. 
+    if (!hasAllocation(agent)) {
+      /*
+       * XXX VDOSTORY-190 Should we keep a variant of BYPASSING that causes
+       * new arrivals to fail immediately if they don't have an allocation? It
+       * might be possible that on some path there would be non-waiters still
+       * referencing the lock, so it would remain in the map as everything is
+       * currently spelled, even if the agent and all the waiters release.
+       */
+      startBypassing(lock, agent);
+      return;
+    }
+  }
+
+  // If the agent compresses, it might wait indefinitely in the packer,
+  // which would be bad if there are any other DataVIOs waiting.
+  if (hasWaiters(&lock->waiters)) {
+    // XXX in sync mode, transition directly to LOCKING to start dedupe?
+    cancelCompression(agent);
+  }
+
+  /*
+   * Send the agent to the compress/pack/async-write path in vioWrite. If it
+   * succeeds, it will return to the hash lock via continueHashLock() and call
+   * finishWriting().
+   */
+  compressData(agent);
+}
+
+/**
+ * Process the result of a UDS query performed by the agent for the lock. This
+ * continuation is registered in startQuerying().
+ *
+ * @param completion  The completion of the DataVIO that performed the query
+ **/
+static void finishQuerying(VDOCompletion *completion)
+{
+  DataVIO *agent = asDataVIO(completion);
+  assertHashLockAgent(agent, __func__);
+  HashLock *lock = agent->hashLock;
+
+  if (completion->result != VDO_SUCCESS) {
+    abortHashLock(lock, agent);
+    return;
+  }
+
+  if (agent->isDuplicate) {
+    lock->duplicate = agent->duplicate;
+    /*
+     * QUERYING -> LOCKING transition: Valid advice was obtained from UDS.
+     * Use the QUERYING agent to start the hash lock on the unverified dedupe
+     * path, verifying that the advice can be used.
+     */
+    startLocking(lock, agent);
+  } else {
+    // The agent will be used as the duplicate if it has an allocation; if it
+    // does, that location was posted to UDS, so no update will be needed.
+    lock->updateAdvice = !hasAllocation(agent);
+    /*
+     * QUERYING -> WRITING transition: There was no advice or the advice
+     * wasn't valid, so try to write or compress the data.
+     */
+    startWriting(lock, agent);
+  }
+}
+
+/**
+ * Start deduplication for a hash lock that has finished initializing by
+ * making the DataVIO that requested it the agent, entering the QUERYING
+ * state, and using the agent to perform the UDS query on behalf of the lock.
+ *
+ * @param lock     The initialized hash lock
+ * @param dataVIO  The DataVIO that has just obtained the new lock
+ **/
+static void startQuerying(HashLock *lock, DataVIO *dataVIO)
+{
+  setAgent(lock, dataVIO);
+  setHashLockState(lock, HASH_LOCK_QUERYING);
+
+  VDOCompletion *completion = dataVIOAsCompletion(dataVIO);
+  dataVIO->lastAsyncOperation = CHECK_FOR_DEDUPLICATION;
+  setHashZoneCallback(dataVIO, finishQuerying, THIS_LOCATION(NULL));
+  completion->layer->checkForDuplication(dataVIO);
+}
+
+/**
+ * Complain that a DataVIO has entered a HashLock that is in an unimplemented
+ * or unusable state and continue the DataVIO with an error.
+ * + * @param lock The hash lock + * @param dataVIO The DataVIO attempting to enter the lock + **/ +static void reportBogusLockState(HashLock *lock, DataVIO *dataVIO) +{ + int result = ASSERT_FALSE("hash lock must not be in unimplemented state %s", + getHashLockStateName(lock->state)); + continueDataVIOIn(dataVIO, result, compressDataCallback); +} + +/**********************************************************************/ +void enterHashLock(DataVIO *dataVIO) +{ + HashLock *lock = dataVIO->hashLock; + switch (lock->state) { + case HASH_LOCK_INITIALIZING: + startQuerying(lock, dataVIO); + break; + + case HASH_LOCK_QUERYING: + case HASH_LOCK_WRITING: + case HASH_LOCK_UPDATING: + case HASH_LOCK_LOCKING: + case HASH_LOCK_VERIFYING: + case HASH_LOCK_UNLOCKING: + // The lock is busy, and can't be shared yet. + waitOnHashLock(lock, dataVIO); + break; + + case HASH_LOCK_BYPASSING: + // Bypass dedupe entirely. + compressData(dataVIO); + break; + + case HASH_LOCK_DEDUPING: + launchDedupe(lock, dataVIO, false); + break; + + case HASH_LOCK_DESTROYING: + // A lock in this state should not be acquired by new VIOs. + reportBogusLockState(lock, dataVIO); + break; + + default: + reportBogusLockState(lock, dataVIO); + } +} + +/**********************************************************************/ +void continueHashLock(DataVIO *dataVIO) +{ + HashLock *lock = dataVIO->hashLock; + // XXX VDOSTORY-190 Eventually we may be able to fold the error handling + // in at this point instead of using a separate entry point for it. + + switch (lock->state) { + case HASH_LOCK_WRITING: + ASSERT_LOG_ONLY(dataVIO == lock->agent, + "only the lock agent may continue the lock"); + finishWriting(lock, dataVIO); + break; + + case HASH_LOCK_DEDUPING: + finishDeduping(lock, dataVIO); + break; + + case HASH_LOCK_BYPASSING: + // This DataVIO has finished the write path and the lock doesn't need it. + // XXX This isn't going to be correct if DEDUPING ever uses BYPASSING. + finishDataVIO(dataVIO, VDO_SUCCESS); + break; + + case HASH_LOCK_INITIALIZING: + case HASH_LOCK_QUERYING: + case HASH_LOCK_UPDATING: + case HASH_LOCK_LOCKING: + case HASH_LOCK_VERIFYING: + case HASH_LOCK_UNLOCKING: + case HASH_LOCK_DESTROYING: + // A lock in this state should never be re-entered. + reportBogusLockState(lock, dataVIO); + break; + + default: + reportBogusLockState(lock, dataVIO); + } +} + +/**********************************************************************/ +void continueHashLockOnError(DataVIO *dataVIO) +{ + // XXX We could simply use continueHashLock() and check for errors in that. + abortHashLock(dataVIO->hashLock, dataVIO); +} + +/** + * Check whether the data in DataVIOs sharing a lock is different than in a + * DataVIO seeking to share the lock, which should only be possible in the + * extremely unlikely case of a hash collision. 
+ * + * @param lock The lock to check + * @param candidate The DataVIO seeking to share the lock + * + * @return true if the given DataVIO must not share the lock + * because it doesn't have the same data as the lock holders + **/ +static bool isHashCollision(HashLock *lock, DataVIO *candidate) +{ + if (isRingEmpty(&lock->duplicateRing)) { + return false; + } + + DataVIO *lockHolder = dataVIOFromLockNode(lock->duplicateRing.next); + PhysicalLayer *layer = dataVIOAsCompletion(candidate)->layer; + bool collides = !layer->compareDataVIOs(lockHolder, candidate); + + if (collides) { + bumpHashZoneCollisionCount(candidate->hashZone); + } else { + bumpHashZoneDataMatchCount(candidate->hashZone); + } + + return collides; +} + +/**********************************************************************/ +static inline int assertHashLockPreconditions(const DataVIO *dataVIO) +{ + int result = ASSERT(dataVIO->hashLock == NULL, + "must not already hold a hash lock"); + if (result != VDO_SUCCESS) { + return result; + } + result = ASSERT(isRingEmpty(&dataVIO->hashLockNode), + "must not already be a member of a hash lock ring"); + if (result != VDO_SUCCESS) { + return result; + } + return ASSERT(dataVIO->recoverySequenceNumber == 0, + "must not hold a recovery lock when getting a hash lock"); +} + +/**********************************************************************/ +int acquireHashLock(DataVIO *dataVIO) +{ + int result = assertHashLockPreconditions(dataVIO); + if (result != VDO_SUCCESS) { + return result; + } + + HashLock *lock; + result = acquireHashLockFromZone(dataVIO->hashZone, &dataVIO->chunkName, + NULL, &lock); + if (result != VDO_SUCCESS) { + return result; + } + + if (isHashCollision(lock, dataVIO)) { + // Hash collisions are extremely unlikely, but the bogus dedupe would be a + // data corruption. Bypass dedupe entirely by leaving hashLock unset. + // XXX clear hashZone too? + return VDO_SUCCESS; + } + + setHashLock(dataVIO, lock); + return VDO_SUCCESS; +} + +/**********************************************************************/ +void releaseHashLock(DataVIO *dataVIO) +{ + HashLock *lock = dataVIO->hashLock; + if (lock == NULL) { + return; + } + + setHashLock(dataVIO, NULL); + + if (lock->referenceCount > 0) { + // The lock is still in use by other DataVIOs. + return; + } + + setHashLockState(lock, HASH_LOCK_DESTROYING); + returnHashLockToZone(dataVIO->hashZone, &lock); +} + +/** + * Transfer a DataVIO's downgraded allocation PBN lock to the DataVIO's hash + * lock, converting it to a duplicate PBN lock. + * + * @param dataVIO The DataVIO holding the allocation lock to transfer + **/ +static void transferAllocationLock(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(dataVIO->newMapped.pbn == getDataVIOAllocation(dataVIO), + "transferred lock must be for the block written"); + + AllocatingVIO *allocatingVIO = dataVIOAsAllocatingVIO(dataVIO); + PBNLock *pbnLock = allocatingVIO->allocationLock; + allocatingVIO->allocationLock = NULL; + allocatingVIO->allocation = ZERO_BLOCK; + + ASSERT_LOG_ONLY(isPBNReadLock(pbnLock), + "must have downgraded the allocation lock before transfer"); + + HashLock *hashLock = dataVIO->hashLock; + hashLock->duplicate = dataVIO->newMapped; + dataVIO->duplicate = dataVIO->newMapped; + + // Since the lock is being transferred, the holder count doesn't change (and + // isn't even safe to examine on this thread). 
+ hashLock->duplicateLock = pbnLock; +} + +/**********************************************************************/ +void shareCompressedWriteLock(DataVIO *dataVIO, PBNLock *pbnLock) +{ + ASSERT_LOG_ONLY(getDuplicateLock(dataVIO) == NULL, + "a duplicate PBN lock should not exist when writing"); + ASSERT_LOG_ONLY(isCompressed(dataVIO->newMapped.state), + "lock transfer must be for a compressed write"); + assertInNewMappedZone(dataVIO); + + // First sharer downgrades the lock. + if (!isPBNReadLock(pbnLock)) { + downgradePBNWriteLock(pbnLock); + } + + // Get a share of the PBN lock, ensuring it cannot be released until + // after this DataVIO has had a chance to journal a reference. + dataVIO->duplicate = dataVIO->newMapped; + dataVIO->hashLock->duplicate = dataVIO->newMapped; + setDuplicateLock(dataVIO->hashLock, pbnLock); + + // Claim a reference for this DataVIO, which is necessary since another + // HashLock might start deduplicating against it before our incRef. + bool claimed = claimPBNLockIncrement(pbnLock); + ASSERT_LOG_ONLY(claimed, "impossible to fail to claim an initial increment"); +} diff --git a/vdo/base/hashLock.h b/vdo/base/hashLock.h new file mode 100644 index 0000000..b21e465 --- /dev/null +++ b/vdo/base/hashLock.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashLock.h#3 $ + */ + +#ifndef HASH_LOCK_H +#define HASH_LOCK_H + +#include "types.h" + +/** + * Get the PBN lock on the duplicate data location for a DataVIO from the + * HashLock the DataVIO holds (if there is one). + * + * @param dataVIO The DataVIO to query + * + * @return The PBN lock on the DataVIO's duplicate location + **/ +PBNLock *getDuplicateLock(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Acquire or share a lock on the hash (chunk name) of the data in a DataVIO, + * updating the DataVIO to reference the lock. This must only be called in the + * correct thread for the zone. In the unlikely case of a hash collision, this + * function will succeed, but the DataVIO will not get a lock reference. + * + * @param dataVIO The DataVIO acquiring a lock on its chunk name + **/ +int acquireHashLock(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Asynchronously process a DataVIO that has just acquired its reference to a + * hash lock. This may place the DataVIO on a wait queue, or it may use the + * DataVIO to perform operations on the lock's behalf. 
+ * + * @param dataVIO The DataVIO that has just acquired a lock on its chunk name + **/ +void enterHashLock(DataVIO *dataVIO); + +/** + * Asynchronously continue processing a DataVIO in its hash lock after it has + * finished writing, compressing, or deduplicating, so it can share the result + * with any DataVIOs waiting in the hash lock, or update Albireo, or simply + * release its share of the lock. This must only be called in the correct + * thread for the hash zone. + * + * @param dataVIO The DataVIO to continue processing in its hash lock + **/ +void continueHashLock(DataVIO *dataVIO); + +/** + * Re-enter the hash lock after encountering an error, to clean up the hash + * lock. + * + * @param dataVIO The DataVIO with an error + **/ +void continueHashLockOnError(DataVIO *dataVIO); + +/** + * Release a DataVIO's share of a hash lock, if held, and null out the + * DataVIO's reference to it. This must only be called in the correct thread + * for the hash zone. + * + * If the DataVIO is the only one holding the lock, this also releases any + * resources or locks used by the hash lock (such as a PBN read lock on a + * block containing data with the same hash) and returns the lock to the hash + * zone's lock pool. + * + * @param dataVIO The DataVIO releasing its hash lock + **/ +void releaseHashLock(DataVIO *dataVIO); + +/** + * Make a DataVIO's hash lock a shared holder of the PBN lock on the + * compressed block to which its data was just written. If the lock is still a + * write lock (as it will be for the first share), it will be converted to a + * read lock. This also reserves a reference count increment for the DataVIO. + * + * @param dataVIO The DataVIO which was just compressed + * @param pbnLock The PBN lock on the compressed block + **/ +void shareCompressedWriteLock(DataVIO *dataVIO, PBNLock *pbnLock); + +#endif // HASH_LOCK_H diff --git a/vdo/base/hashLockInternals.h b/vdo/base/hashLockInternals.h new file mode 100644 index 0000000..67b5634 --- /dev/null +++ b/vdo/base/hashLockInternals.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashLockInternals.h#2 $ + */ + +#ifndef HASH_LOCK_INTERNALS_H +#define HASH_LOCK_INTERNALS_H + +#include "completion.h" +#include "ringNode.h" +#include "types.h" +#include "uds.h" +#include "waitQueue.h" + +typedef enum { + /** State for locks that are not in use or are being initialized. */ + HASH_LOCK_INITIALIZING = 0, + + // This is the sequence of states typically used on the non-dedupe path. + HASH_LOCK_QUERYING, + HASH_LOCK_WRITING, + HASH_LOCK_UPDATING, + + // The remaining states are typically used on the dedupe path in this order. 
+  HASH_LOCK_LOCKING,
+  HASH_LOCK_VERIFYING,
+  HASH_LOCK_DEDUPING,
+  HASH_LOCK_UNLOCKING,
+
+  // XXX This is a temporary state denoting a lock which is sending VIOs back
+  // to the old dedupe and vioWrite pathways. It won't be in the final version
+  // of VDOSTORY-190.
+  HASH_LOCK_BYPASSING,
+
+  /**
+   * Terminal state for locks returning to the pool. Must be last both because
+   * it's the final state, and also because it's used to count the states.
+   **/
+  HASH_LOCK_DESTROYING,
+} HashLockState;
+
+struct hashLock {
+  /** When the lock is unused, this RingNode allows the lock to be pooled */
+  RingNode poolNode;
+
+  /** The block hash covered by this lock */
+  UdsChunkName hash;
+
+  /**
+   * A ring containing the DataVIOs sharing this lock, all having the same
+   * chunk name and data block contents, linked by their hashLockNode fields.
+   **/
+  RingNode duplicateRing;
+
+  /** The number of DataVIOs sharing this lock instance */
+  VIOCount referenceCount;
+
+  /** The maximum value of referenceCount in the lifetime of this lock */
+  VIOCount maxReferences;
+
+  /** The current state of this lock */
+  HashLockState state;
+
+  /** True if the UDS index should be updated with new advice */
+  bool updateAdvice;
+
+  /** True if the advice has been verified to be a true duplicate */
+  bool verified;
+
+  /** True if the lock has already accounted for an initial verification */
+  bool verifyCounted;
+
+  /** True if this lock is registered in the lock map (cleared on rollover) */
+  bool registered;
+
+  /**
+   * If verified is false, this is the location of a possible duplicate.
+   * If verified is true, it is the verified location of a true duplicate.
+   **/
+  ZonedPBN duplicate;
+
+  /** The PBN lock on the block containing the duplicate data */
+  PBNLock *duplicateLock;
+
+  /** The DataVIO designated to act on behalf of the lock */
+  DataVIO *agent;
+
+  /**
+   * Other DataVIOs with data identical to the agent that are currently
+   * waiting for the agent to get the information they all need to
+   * deduplicate--either against each other, or against an existing duplicate
+   * on disk.
+   **/
+  WaitQueue waiters;
+};
+
+/**
+ * Initialize a HashLock instance which has been newly allocated.
+ *
+ * @param lock  The lock to initialize
+ **/
+static inline void initializeHashLock(HashLock *lock)
+{
+  initializeRing(&lock->poolNode);
+  initializeRing(&lock->duplicateRing);
+  initializeWaitQueue(&lock->waiters);
+}
+
+/**
+ * Get the string representation of a hash lock state.
+ *
+ * @param state  The hash lock state
+ *
+ * @return The short string representing the state
+ **/
+const char *getHashLockStateName(HashLockState state)
+  __attribute__((warn_unused_result));
+
+#endif // HASH_LOCK_INTERNALS_H
diff --git a/vdo/base/hashZone.c b/vdo/base/hashZone.c
new file mode 100644
index 0000000..61345a7
--- /dev/null
+++ b/vdo/base/hashZone.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashZone.c#3 $
+ */
+
+#include "hashZone.h"
+
+#include "logger.h"
+#include "memoryAlloc.h"
+#include "numeric.h"
+#include "permassert.h"
+
+#include "constants.h"
+#include "dataVIO.h"
+#include "hashLock.h"
+#include "hashLockInternals.h"
+#include "pointerMap.h"
+#include "ringNode.h"
+#include "statistics.h"
+#include "threadConfig.h"
+#include "types.h"
+#include "vdoInternal.h"
+
+enum {
+  LOCK_POOL_CAPACITY = MAXIMUM_USER_VIOS,
+};
+
+/**
+ * These fields are only modified by the locks sharing the hash zone thread,
+ * but are queried by other threads.
+ **/
+typedef struct atomicHashLockStatistics {
+  /** Number of times the UDS advice proved correct */
+  Atomic64 dedupeAdviceValid;
+
+  /** Number of times the UDS advice proved incorrect */
+  Atomic64 dedupeAdviceStale;
+
+  /** Number of writes with the same data as another in-flight write */
+  Atomic64 concurrentDataMatches;
+
+  /** Number of writes whose hash collided with an in-flight write */
+  Atomic64 concurrentHashCollisions;
+} AtomicHashLockStatistics;
+
+struct hashZone {
+  /** Which hash zone this is */
+  ZoneCount zoneNumber;
+
+  /** The thread ID for this zone */
+  ThreadID threadID;
+
+  /** Mapping from chunkName fields to HashLocks */
+  PointerMap *hashLockMap;
+
+  /** Ring containing all unused HashLocks */
+  RingNode lockPool;
+
+  /** Statistics shared by all hash locks in this zone */
+  AtomicHashLockStatistics statistics;
+
+  /** Array of all HashLocks */
+  HashLock *lockArray;
+};
+
+/**
+ * Implements PointerKeyComparator.
+ **/
+static bool compareKeys(const void *thisKey, const void *thatKey)
+{
+  // Null keys are not supported.
+  return (memcmp(thisKey, thatKey, sizeof(UdsChunkName)) == 0);
+}
+
+/**
+ * Implements PointerKeyHasher.
+ **/
+static uint32_t hashKey(const void *key)
+{
+  const UdsChunkName *name = key;
+  /*
+   * Use a fragment of the chunk name as a hash code. It must not overlap with
+   * fragments used elsewhere to ensure uniform distributions.
+ */ + // XXX pick an offset in the chunk name that isn't used elsewhere + return getUInt32LE(&name->name[4]); +} + +/**********************************************************************/ +static inline HashLock *asHashLock(RingNode *poolNode) +{ + STATIC_ASSERT(offsetof(HashLock, poolNode) == 0); + return (HashLock *) poolNode; +} + +/**********************************************************************/ +int makeHashZone(VDO *vdo, ZoneCount zoneNumber, HashZone **zonePtr) +{ + HashZone *zone; + int result = ALLOCATE(1, HashZone, __func__, &zone); + if (result != VDO_SUCCESS) { + return result; + } + + result = makePointerMap(LOCK_MAP_CAPACITY, 0, compareKeys, hashKey, + &zone->hashLockMap); + if (result != VDO_SUCCESS) { + freeHashZone(&zone); + return result; + } + + zone->zoneNumber = zoneNumber; + zone->threadID = getHashZoneThread(getThreadConfig(vdo), zoneNumber); + initializeRing(&zone->lockPool); + + result = ALLOCATE(LOCK_POOL_CAPACITY, HashLock, "HashLock array", + &zone->lockArray); + if (result != VDO_SUCCESS) { + freeHashZone(&zone); + return result; + } + + for (VIOCount i = 0; i < LOCK_POOL_CAPACITY; i++) { + HashLock *lock = &zone->lockArray[i]; + initializeHashLock(lock); + pushRingNode(&zone->lockPool, &lock->poolNode); + } + + *zonePtr = zone; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeHashZone(HashZone **zonePtr) +{ + if (*zonePtr == NULL) { + return; + } + + HashZone *zone = *zonePtr; + freePointerMap(&zone->hashLockMap); + FREE(zone->lockArray); + FREE(zone); + *zonePtr = NULL; +} + +/**********************************************************************/ +ZoneCount getHashZoneNumber(const HashZone *zone) +{ + return zone->zoneNumber; +} + +/**********************************************************************/ +ThreadID getHashZoneThreadID(const HashZone *zone) +{ + return zone->threadID; +} + +/**********************************************************************/ +HashLockStatistics getHashZoneStatistics(const HashZone *zone) +{ + const AtomicHashLockStatistics *atoms = &zone->statistics; + return (HashLockStatistics) { + .dedupeAdviceValid = relaxedLoad64(&atoms->dedupeAdviceValid), + .dedupeAdviceStale = relaxedLoad64(&atoms->dedupeAdviceStale), + .concurrentDataMatches = relaxedLoad64(&atoms->concurrentDataMatches), + .concurrentHashCollisions + = relaxedLoad64(&atoms->concurrentHashCollisions), + }; +} + +/** + * Return a hash lock to the zone's pool and null out the reference to it. + * + * @param [in] zone The zone from which the lock was borrowed + * @param [in,out] lockPtr The last reference to the lock being returned + **/ +static void returnHashLockToPool(HashZone *zone, HashLock **lockPtr) +{ + HashLock *lock = *lockPtr; + *lockPtr = NULL; + + memset(lock, 0, sizeof(*lock)); + initializeHashLock(lock); + pushRingNode(&zone->lockPool, &lock->poolNode); +} + +/**********************************************************************/ +int acquireHashLockFromZone(HashZone *zone, + const UdsChunkName *hash, + HashLock *replaceLock, + HashLock **lockPtr) +{ + // Borrow and prepare a lock from the pool so we don't have to do two + // PointerMap accesses in the common case of no lock contention. 
+ HashLock *newLock = asHashLock(popRingNode(&zone->lockPool)); + int result = ASSERT(newLock != NULL, + "never need to wait for a free hash lock"); + if (result != VDO_SUCCESS) { + return result; + } + + // Fill in the hash of the new lock so we can map it, since we have to use + // the hash as the map key. + newLock->hash = *hash; + + HashLock *lock; + result = pointerMapPut(zone->hashLockMap, &newLock->hash, newLock, + (replaceLock != NULL), (void **) &lock); + if (result != VDO_SUCCESS) { + returnHashLockToPool(zone, &newLock); + return result; + } + + if (replaceLock != NULL) { + // XXX on mismatch put the old lock back and return a severe error + ASSERT_LOG_ONLY(lock == replaceLock, + "old lock must have been in the lock map"); + // XXX check earlier and bail out? + ASSERT_LOG_ONLY(replaceLock->registered, + "old lock must have been marked registered"); + replaceLock->registered = false; + } + + if (lock == replaceLock) { + lock = newLock; + lock->registered = true; + } else { + // There's already a lock for the hash, so we don't need the borrowed lock. + returnHashLockToPool(zone, &newLock); + } + + *lockPtr = lock; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void returnHashLockToZone(HashZone *zone, HashLock **lockPtr) +{ + HashLock *lock = *lockPtr; + *lockPtr = NULL; + + if (lock->registered) { + HashLock *removed = pointerMapRemove(zone->hashLockMap, &lock->hash); + ASSERT_LOG_ONLY(lock == removed, + "hash lock being released must have been mapped"); + } else { + ASSERT_LOG_ONLY(lock != pointerMapGet(zone->hashLockMap, &lock->hash), + "unregistered hash lock must not be in the lock map"); + } + + ASSERT_LOG_ONLY(!hasWaiters(&lock->waiters), + "hash lock returned to zone must have no waiters"); + ASSERT_LOG_ONLY((lock->duplicateLock == NULL), + "hash lock returned to zone must not reference a PBN lock"); + ASSERT_LOG_ONLY((lock->state == HASH_LOCK_DESTROYING), + "returned hash lock must not be in use with state %s", + getHashLockStateName(lock->state)); + ASSERT_LOG_ONLY(isRingEmpty(&lock->poolNode), + "hash lock returned to zone must not be in a pool ring"); + ASSERT_LOG_ONLY(isRingEmpty(&lock->duplicateRing), + "hash lock returned to zone must not reference DataVIOs"); + + returnHashLockToPool(zone, &lock); +} + +/** + * Dump a compact description of HashLock to the log if the lock is not on the + * free list. + * + * @param lock The hash lock to dump + **/ +static void dumpHashLock(const HashLock *lock) +{ + if (!isRingEmpty(&lock->poolNode)) { + // This lock is on the free list. + return; + } + + // Necessarily cryptic since we can log a lot of these. First three chars of + // state is unambiguous. 'U' indicates a lock not registered in the map. + const char *state = getHashLockStateName(lock->state); + logInfo(" hl %" PRIptr ": %3.3s %c%llu/%u rc=%u wc=%zu agt=%" PRIptr, + (const void *) lock, + state, + (lock->registered ? 'D' : 'U'), + lock->duplicate.pbn, + lock->duplicate.state, + lock->referenceCount, + countWaiters(&lock->waiters), + (void *) lock->agent); +} + +/**********************************************************************/ +void bumpHashZoneValidAdviceCount(HashZone *zone) +{ + // Must only be mutated on the hash zone thread. + relaxedAdd64(&zone->statistics.dedupeAdviceValid, 1); +} + +/**********************************************************************/ +void bumpHashZoneStaleAdviceCount(HashZone *zone) +{ + // Must only be mutated on the hash zone thread. 
+ relaxedAdd64(&zone->statistics.dedupeAdviceStale, 1); +} + +/**********************************************************************/ +void bumpHashZoneDataMatchCount(HashZone *zone) +{ + // Must only be mutated on the hash zone thread. + relaxedAdd64(&zone->statistics.concurrentDataMatches, 1); +} + +/**********************************************************************/ +void bumpHashZoneCollisionCount(HashZone *zone) +{ + // Must only be mutated on the hash zone thread. + relaxedAdd64(&zone->statistics.concurrentHashCollisions, 1); +} + +/**********************************************************************/ +void dumpHashZone(const HashZone *zone) +{ + if (zone->hashLockMap == NULL) { + logInfo("HashZone %u: NULL map", zone->zoneNumber); + return; + } + + logInfo("HashZone %u: mapSize=%zu", + zone->zoneNumber, pointerMapSize(zone->hashLockMap)); + for (VIOCount i = 0; i < LOCK_POOL_CAPACITY; i++) { + dumpHashLock(&zone->lockArray[i]); + } +} diff --git a/vdo/base/hashZone.h b/vdo/base/hashZone.h new file mode 100644 index 0000000..ac1b695 --- /dev/null +++ b/vdo/base/hashZone.h @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashZone.h#1 $ + */ + +#ifndef HASH_ZONE_H +#define HASH_ZONE_H + +#include "uds.h" + +#include "statistics.h" +#include "types.h" + +/** + * Create a hash zone. + * + * @param [in] vdo The VDO to which the zone will belong + * @param [in] zoneNumber The number of the zone to create + * @param [out] zonePtr A pointer to hold the new HashZone + * + * @return VDO_SUCCESS or an error code + **/ +int makeHashZone(VDO *vdo, ZoneCount zoneNumber, HashZone **zonePtr) + __attribute__((warn_unused_result)); + +/** + * Free a hash zone and null out the reference to it. + * + * @param zonePtr A pointer to the zone to free + **/ +void freeHashZone(HashZone **zonePtr); + +/** + * Get the zone number of a hash zone. + * + * @param zone The zone + * + * @return The number of the zone + **/ +ZoneCount getHashZoneNumber(const HashZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the ID of a hash zone's thread. + * + * @param zone The zone + * + * @return The zone's thread ID + **/ +ThreadID getHashZoneThreadID(const HashZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the statistics for this hash zone. + * + * @param zone The hash zone to query + * + * @return A copy of the current statistics for the hash zone + **/ +HashLockStatistics getHashZoneStatistics(const HashZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the lock for the hash (chunk name) of the data in a DataVIO, or if one + * does not exist (or if we are explicitly rolling over), initialize a new + * lock for the hash and register it in the zone. 
This must only be called in + * the correct thread for the zone. + * + * @param [in] zone The zone responsible for the hash + * @param [in] hash The hash to lock + * @param [in] replaceLock If non-NULL, the lock already registered for the + * hash which should be replaced by the new lock + * @param [out] lockPtr A pointer to receive the hash lock + * + * @return VDO_SUCCESS or an error code + **/ +int acquireHashLockFromZone(HashZone *zone, + const UdsChunkName *hash, + HashLock *replaceLock, + HashLock **lockPtr) + __attribute__((warn_unused_result)); + +/** + * Return a hash lock to the zone it was borrowed from, remove it from the + * zone's lock map, returning it to the pool, and nulling out the reference to + * it. This must only be called when the lock has been completely released, + * and only in the correct thread for the zone. + * + * @param [in] zone The zone from which the lock was borrowed + * @param [in,out] lockPtr The lock that is no longer in use + **/ +void returnHashLockToZone(HashZone *zone, HashLock **lockPtr); + +/** + * Increment the valid advice count in the hash zone statistics. + * Must only be called from the hash zone thread. + * + * @param zone The hash zone of the lock that received valid advice + **/ +void bumpHashZoneValidAdviceCount(HashZone *zone); + +/** + * Increment the stale advice count in the hash zone statistics. + * Must only be called from the hash zone thread. + * + * @param zone The hash zone of the lock that received stale advice + **/ +void bumpHashZoneStaleAdviceCount(HashZone *zone); + +/** + * Increment the concurrent dedupe count in the hash zone statistics. + * Must only be called from the hash zone thread. + * + * @param zone The hash zone of the lock that matched a new DataVIO + **/ +void bumpHashZoneDataMatchCount(HashZone *zone); + +/** + * Increment the concurrent hash collision count in the hash zone statistics. + * Must only be called from the hash zone thread. + * + * @param zone The hash zone of the lock that rejected a colliding DataVIO + **/ +void bumpHashZoneCollisionCount(HashZone *zone); + +/** + * Dump information about a hash zone to the log for debugging. + * + * @param zone The zone to dump + **/ +void dumpHashZone(const HashZone *zone); + +#endif // HASH_ZONE_H diff --git a/vdo/base/header.c b/vdo/base/header.c new file mode 100644 index 0000000..8f0582b --- /dev/null +++ b/vdo/base/header.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/header.c#5 $ + */ + +#include "header.h" + +#include "logger.h" +#include "permassert.h" +#include "statusCodes.h" + +/**********************************************************************/ +int validateVersion(VersionNumber expectedVersion, + VersionNumber actualVersion, + const char *componentName) +{ + if (!areSameVersion(expectedVersion, actualVersion)) { + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "%s version mismatch," + " expected %d.%d, got %d.%d", + componentName, + expectedVersion.majorVersion, + expectedVersion.minorVersion, + actualVersion.majorVersion, + actualVersion.minorVersion); + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +int validateHeader(const Header *expectedHeader, + const Header *actualHeader, + bool exactSize, + const char *componentName) +{ + if (expectedHeader->id != actualHeader->id) { + return logErrorWithStringError(VDO_INCORRECT_COMPONENT, + "%s ID mismatch, expected %d, got %d", + componentName, + expectedHeader->id, + actualHeader->id); + } + + int result = validateVersion(expectedHeader->version, + actualHeader->version, + componentName); + if (result != VDO_SUCCESS) { + return result; + } + + if ((expectedHeader->size > actualHeader->size) + || (exactSize && (expectedHeader->size < actualHeader->size))) { + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "%s size mismatch, expected %zu, got %zu", + componentName, + expectedHeader->size, + actualHeader->size); + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int encodeHeader(const Header *header, Buffer *buffer) +{ + if (!ensureAvailableSpace(buffer, ENCODED_HEADER_SIZE)) { + return UDS_BUFFER_ERROR; + } + + int result = putUInt32LEIntoBuffer(buffer, header->id); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodeVersionNumber(header->version, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + return putUInt64LEIntoBuffer(buffer, header->size); +} + +/**********************************************************************/ +int encodeVersionNumber(VersionNumber version, Buffer *buffer) +{ + PackedVersionNumber packed = packVersionNumber(version); + return putBytes(buffer, sizeof(packed), &packed); +} + +/**********************************************************************/ +int decodeHeader(Buffer *buffer, Header *header) +{ + ComponentID id; + int result = getUInt32LEFromBuffer(buffer, &id); + if (result != UDS_SUCCESS) { + return result; + } + + VersionNumber version; + result = decodeVersionNumber(buffer, &version); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t size; + result = getUInt64LEFromBuffer(buffer, &size); + if (result != UDS_SUCCESS) { + return result; + } + + *header = (Header) { + .id = id, + .version = version, + .size = size, + }; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int decodeVersionNumber(Buffer *buffer, VersionNumber *version) +{ + PackedVersionNumber packed; + int result = getBytesFromBuffer(buffer, sizeof(packed), &packed); + if (result != UDS_SUCCESS) { + return result; + } + + *version = unpackVersionNumber(packed); + return UDS_SUCCESS; +} diff --git a/vdo/base/header.h b/vdo/base/header.h new file mode 100644 index 0000000..d5b4f0e --- /dev/null +++ b/vdo/base/header.h @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/header.h#4 $ + */ + +#ifndef HEADER_H +#define HEADER_H + +#include "buffer.h" +#include "numeric.h" + +#include "types.h" + +/** + * An in-memory representation of a version number for versioned structures on + * disk. + * + * A version number consists of two portions, a major version and a + * minor version. Any format change which does not require an explicit + * upgrade step from the previous version should increment the minor + * version. Any format change which either requires an explicit + * upgrade step, or is wholly incompatible (i.e. can not be upgraded + * to), should increment the major version, and set the minor version + * to 0. + **/ +typedef struct { + uint32_t majorVersion; + uint32_t minorVersion; +} __attribute__((packed)) VersionNumber; + +/** + * A packed, machine-independent, on-disk representation of a VersionNumber. + * Both fields are stored in little-endian byte order. + **/ +typedef struct { + byte majorVersion[4]; + byte minorVersion[4]; +} __attribute__((packed)) PackedVersionNumber; + +/** + * The registry of component ids for use in headers + **/ +typedef enum { + SUPER_BLOCK = 0, + FIXED_LAYOUT = 1, + RECOVERY_JOURNAL = 2, + SLAB_DEPOT = 3, + BLOCK_MAP = 4, + GEOMETRY_BLOCK = 5, +} ComponentID; + +/** + * The header for versioned data stored on disk. + **/ +typedef struct { + ComponentID id; // The component this is a header for + VersionNumber version; // The version of the data format + size_t size; // The size of the data following this header +} __attribute__((packed)) Header; + +enum { + ENCODED_HEADER_SIZE = sizeof(Header), +}; + +/** + * Check whether two version numbers are the same. + * + * @param versionA The first version + * @param versionB The second version + * + * @return true if the two versions are the same + **/ +static inline bool areSameVersion(VersionNumber versionA, + VersionNumber versionB) +{ + return ((versionA.majorVersion == versionB.majorVersion) + && (versionA.minorVersion == versionB.minorVersion)); +} + +/** + * Check whether an actual version is upgradable to an expected version. + * An actual version is upgradable if its major number is expected but + * its minor number differs, and the expected version's minor number + * is greater than the actual version's minor number. + * + * @param expectedVersion The expected version + * @param actualVersion The version being validated + * + * @return true if the actual version is upgradable + **/ +static inline bool isUpgradableVersion(VersionNumber expectedVersion, + VersionNumber actualVersion) +{ + return ((expectedVersion.majorVersion == actualVersion.majorVersion) + && (expectedVersion.minorVersion > actualVersion.minorVersion)); +} + +/** + * Check whether a version matches an expected version. 
Logs an error + * describing a mismatch. + * + * @param expectedVersion The expected version + * @param actualVersion The version being validated + * @param componentName The name of the component or the calling function + * (for error logging) + * + * @return VDO_SUCCESS if the versions are the same + * VDO_UNSUPPORTED_VERSION if the versions don't match + **/ +int validateVersion(VersionNumber expectedVersion, + VersionNumber actualVersion, + const char *componentName) + __attribute__((warn_unused_result)); + +/** + * Check whether a header matches expectations. Logs an error describing the + * first mismatch found. + * + * @param expectedHeader The expected header + * @param actualHeader The header being validated + * @param exactSize If true, the size fields of the two headers must be + * the same, otherwise actualSize >= expectedSize is OK + * @param componentName The name of the component or the calling function + * (for error logging) + * + * @return VDO_SUCCESS if the header meets expectations + * VDO_INCORRECT_COMPONENT if the component ids don't match + * VDO_UNSUPPORTED_VERSION if the versions or sizes don't match + **/ +int validateHeader(const Header *expectedHeader, + const Header *actualHeader, + bool exactSize, + const char *componentName) + __attribute__((warn_unused_result)); + +/** + * Encode a header into a buffer. + * + * @param header The header to encode + * @param buffer The buffer in which to encode the header + * + * @return UDS_SUCCESS or an error + **/ +int encodeHeader(const Header *header, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Encode a version number into a buffer. + * + * @param version The version to encode + * @param buffer The buffer in which to encode the version + * + * @return UDS_SUCCESS or an error + **/ +int encodeVersionNumber(VersionNumber version, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Decode a header from a buffer. + * + * @param [in] buffer The buffer from which to decode the header + * @param [out] header The header to decode + * + * @return UDS_SUCCESS or an error + **/ +int decodeHeader(Buffer *buffer, Header *header) + __attribute__((warn_unused_result)); + +/** + * Decode a version number from a buffer. + * + * @param buffer The buffer from which to decode the version + * @param version The version structure to decode into + * + * @return UDS_SUCCESS or an error + **/ +int decodeVersionNumber(Buffer *buffer, VersionNumber *version) + __attribute__((warn_unused_result)); + +/** + * Convert a VersionNumber to its packed on-disk representation. + * + * @param version The version number to convert + * + * @return the platform-independent representation of the version + **/ +static inline PackedVersionNumber packVersionNumber(VersionNumber version) +{ + PackedVersionNumber packed; + storeUInt32LE(packed.majorVersion, version.majorVersion); + storeUInt32LE(packed.minorVersion, version.minorVersion); + return packed; +} + +/** + * Convert a PackedVersionNumber to its native in-memory representation. 
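+ *
+ * For example, given the packed layout above (two little-endian 32-bit
+ * fields), version 2.0 is stored on disk as the byte sequence
+ * 02 00 00 00 00 00 00 00, which this function converts back to
+ * { .majorVersion = 2, .minorVersion = 0 }.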
+ * + * @param version The version number to convert + * + * @return the platform-independent representation of the version + **/ +static inline VersionNumber unpackVersionNumber(PackedVersionNumber version) +{ + return (VersionNumber) { + .majorVersion = getUInt32LE(version.majorVersion), + .minorVersion = getUInt32LE(version.minorVersion), + }; +} + +#endif // HEADER_H diff --git a/vdo/base/heap.c b/vdo/base/heap.c new file mode 100644 index 0000000..0928023 --- /dev/null +++ b/vdo/base/heap.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/heap.c#2 $ + */ + +#include "heap.h" + +#include "errors.h" +#include "logger.h" +#include "numeric.h" + +#include "statusCodes.h" + +/**********************************************************************/ +void initializeHeap(Heap *heap, + HeapComparator *comparator, + HeapSwapper *swapper, + void *array, + size_t capacity, + size_t elementSize) +{ + *heap = (Heap) { + .comparator = comparator, + .swapper = swapper, + .capacity = capacity, + .elementSize = elementSize, + }; + if (array != NULL) { + // Calculating child indexes is simplified by pretending the element array + // is 1-based. + heap->array = ((byte *) array - elementSize); + } +} + +/**********************************************************************/ +static void siftHeapDown(Heap *heap, size_t topNode, size_t lastNode) +{ + // Keep sifting until the sub-heap rooted at topNode has no children. + size_t leftChild; + while ((leftChild = (2 * topNode)) <= lastNode) { + // If there are two children, select the largest child to swap with. + size_t swapNode = leftChild; + if (leftChild < lastNode) { + size_t rightChild = leftChild + heap->elementSize; + if (heap->comparator(&heap->array[leftChild], + &heap->array[rightChild]) < 0) { + swapNode = rightChild; + } + } + + // Stop sifting if topNode is at least as large as its largest child, + // which means the heap invariant was restored by the previous swap. + if (heap->comparator(&heap->array[topNode], &heap->array[swapNode]) >= 0) { + return; + } + + // Swap the element we've been sifting down with the larger child. + heap->swapper(&heap->array[topNode], &heap->array[swapNode]); + + // Descend into the sub-heap rooted at that child, going around the loop + // again in place of a tail-recursive call to siftHeapDown(). + topNode = swapNode; + } + + // We sifted the element all the way to a leaf node of the heap, so the heap + // invariant has now been restored. 
+} + +/**********************************************************************/ +void buildHeap(Heap *heap, size_t count) +{ + heap->count = minSizeT(count, heap->capacity); + + if ((heap->count < 2) || (heap->elementSize == 0)) { + return; + } + + /* + * All the leaf nodes are trivially valid sub-heaps. Starting with the parent + * of the right-most leaf node, restore the heap invariant in that sub-heap + * by sifting the top node of the sub-heap down into one of its children's + * valid sub-heaps (or not, if the top node is already larger than its + * children). Continue iterating through all the interior nodes in the heap, + * in sort of a reverse breadth-first traversal, restoring the heap + * invariant for each (increasingly larger) sub-heap until we reach the root + * of the heap. Once we sift the root node down into one of its two valid + * children, the entire heap must be valid, by induction. + * + * Even though we operate on every node and potentially perform an O(log N) + * traversal for each node, the combined probabilities of actually needing + * to do a swap and the heights of the sub-heaps sum to a constant, so + * restoring a heap from the bottom-up like this has only O(N) complexity. + */ + size_t size = heap->elementSize; + size_t lastParent = size * (heap->count / 2); + size_t lastNode = size * heap->count; + for (size_t topNode = lastParent; topNode > 0; topNode -= size) { + siftHeapDown(heap, topNode, lastNode); + } +} + +/**********************************************************************/ +bool popMaxHeapElement(Heap *heap, void *elementPtr) +{ + if (heap->count == 0) { + return false; + } + + size_t rootNode = (heap->elementSize * 1); + size_t lastNode = (heap->elementSize * heap->count); + + // Return the maximum element (the root of the heap) if the caller wanted it. + if (elementPtr != NULL) { + memcpy(elementPtr, &heap->array[rootNode], heap->elementSize); + } + + // Move the right-most leaf node to the vacated root node, reducing the + // number of elements by one and violating the heap invariant. + if (rootNode != lastNode) { + memcpy(&heap->array[rootNode], &heap->array[lastNode], heap->elementSize); + } + heap->count -= 1; + lastNode -= heap->elementSize; + + // Restore the heap invariant by sifting the root back down into the heap. + siftHeapDown(heap, rootNode, lastNode); + return true; +} + +/**********************************************************************/ +static inline size_t siftAndSort(Heap *heap, size_t rootNode, size_t lastNode) +{ + /* + * We have a valid heap, so the largest unsorted element is now at the top + * of the heap. That element belongs at the start of the partially-sorted + * array, preceding all the larger elements that we've already removed + * from the heap. Swap that largest unsorted element with the the + * right-most leaf node in the heap, moving it to its sorted position in + * the array. + */ + heap->swapper(&heap->array[rootNode], &heap->array[lastNode]); + // The sorted list is now one element larger and valid. The heap is + // one element smaller, and invalid. + lastNode -= heap->elementSize; + // Restore the heap invariant by sifting the swapped element back down + // into the heap. + siftHeapDown(heap, rootNode, lastNode); + return lastNode; +} + +/**********************************************************************/ +size_t sortHeap(Heap *heap) +{ + // All zero-length records are identical and therefore already sorted, as + // are empty or singleton arrays. 
+  if ((heap->count < 2) || (heap->elementSize == 0)) {
+    return heap->count;
+  }
+
+  // Get the byte array offset of the root node, and the right-most leaf node
+  // in the 1-based array of records that will form the heap.
+  size_t rootNode = (heap->elementSize * 1);
+  size_t lastNode = (heap->elementSize * heap->count);
+
+  while (lastNode > rootNode) {
+    lastNode = siftAndSort(heap, rootNode, lastNode);
+  }
+
+  size_t count = heap->count;
+  heap->count = 0;
+  return count;
+}
+
+/**********************************************************************/
+void *sortNextHeapElement(Heap *heap)
+{
+  if ((heap->count == 0) || (heap->elementSize == 0)) {
+    return NULL;
+  }
+
+  // Get the byte array offset of the root node, and the right-most leaf node
+  // in the 1-based array of records that will form the heap.
+  size_t rootNode = (heap->elementSize * 1);
+  size_t lastNode = (heap->elementSize * heap->count);
+  if (heap->count > 1) {
+    siftAndSort(heap, rootNode, lastNode);
+  }
+  heap->count--;
+
+  return &heap->array[lastNode];
+}
diff --git a/vdo/base/heap.h b/vdo/base/heap.h
new file mode 100644
index 0000000..916f017
--- /dev/null
+++ b/vdo/base/heap.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/heap.h#2 $
+ */
+
+#ifndef HEAP_H
+#define HEAP_H
+
+#include "common.h"
+
+/**
+ * Prototype for functions which compare two array elements. All the time
+ * complexity claims in this module assume this operation has O(1) time
+ * complexity.
+ *
+ * @param item1 The first element to compare
+ * @param item2 The second element to compare
+ *
+ * @return An integer which is less than, equal to, or greater than 0
+ * depending on whether item1 is less than, equal to, or greater
+ * than item2, respectively
+ **/
+typedef int HeapComparator(const void *item1, const void *item2);
+
+/**
+ * Prototype for functions which swap two array elements.
+ *
+ * @param item1 The first element to swap
+ * @param item2 The second element to swap
+ **/
+typedef void HeapSwapper(void *item1, void *item2);
+
+/**
+ * A heap array can be any array of fixed-length elements in which the heap
+ * invariant can be established. In a max-heap, every node must be at least
+ * as large as its children. Once that invariant is established in an array
+ * by calling buildHeap(), all the other heap operations may be used on that
+ * array.
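+ *
+ * For illustration only, a typical caller (with a hypothetical record type
+ * and caller-supplied comparator and swapper functions) might use the module
+ * roughly as follows:
+ *
+ *   Heap heap;
+ *   initializeHeap(&heap, compareRecords, swapRecords,
+ *                  records, recordCount, sizeof(records[0]));
+ *   buildHeap(&heap, recordCount);
+ *   size_t sorted = sortHeap(&heap);  // records[] is now in ascending order
+ *
+ * where compareRecords and swapRecords match the HeapComparator and
+ * HeapSwapper prototypes above, and records is an array of recordCount
+ * fixed-size elements.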
+ **/ +typedef struct heap { + /** the 1-based array of heap elements (nodes) */ + byte *array; + /** the function to use to compare two elements */ + HeapComparator *comparator; + /** the function to use to swap two elements */ + HeapSwapper *swapper; + /** the maximum number of elements that can be stored */ + size_t capacity; + /** the size of every element (in bytes) */ + size_t elementSize; + /** the current number of elements in the heap */ + size_t count; +} Heap; + +/** + * Initialize an binary heap by wrapping it around an array of elements. + * + * The heap will not own the array it wraps. Use buildHeap() subsequently to + * arrange any elements contained in the array into a valid heap. + * + * @param heap The heap to initialize + * @param comparator The function to use to compare two heap elements + * @param swapper The function to use to swap two heap elements + * @param array The array of elements (not modified by this call) + * @param capacity The maximum number of elements which fit in the array + * @param elementSize The size of every array element, in bytes + **/ +void initializeHeap(Heap *heap, + HeapComparator *comparator, + HeapSwapper *swapper, + void *array, + size_t capacity, + size_t elementSize); + +/** + * Build a max-heap in place in an array (heapify it) by re-ordering the + * elements to establish the heap invariant. Before calling this function, + * first copy the elements to be arranged into a heap into the array that was + * passed to initializeHeap(). This operation has O(N) time complexity in the + * number of elements in the array. + * + * @param heap The heap to build + * @param count The number of elements in the array to build into a heap + **/ +void buildHeap(Heap *heap, size_t count); + +/** + * Check whether the heap is currently empty. + * + * @param heap The heap to query + * + * @return true if there are no elements in the heap + **/ +static inline bool isHeapEmpty(const Heap *heap) +{ + return (heap->count == 0); +} + +/** + * Remove the largest element from the top of the heap and restore the heap + * invariant on the remaining elements. This operation has O(log2(N)) time + * complexity. + * + * @param [in] heap The heap to modify + * @param [out] elementPtr A pointer to receive the largest element (may be + * NULL if the caller just wishes to discard it) + * + * @return false if the heap was empty, so no element was removed + **/ +bool popMaxHeapElement(Heap *heap, void *elementPtr); + +/** + * Sort the elements contained in a heap. + * + * This function re-orders the elements contained in the heap to a sorted + * array in-place by repeatedly popping the maximum element off the heap and + * moving it to the spot vacated at the end of the heap array. When the + * function returns, the heap will be empty and the array will contain the + * elements in sorted order, from heap minimum to heap maximum. The sort is + * unstable--relative ordering of equal keys is not preserved. This operation + * has O(N*log2(N)) time complexity. + * + * @param heap The heap containing the elements to sort + * + * @return the number of elements that were sorted + **/ +size_t sortHeap(Heap *heap); + +/** + * Gets the next sorted heap element and returns a pointer to it, in O(log2(N)) + * time. + * + * @param heap The heap to sort one more step + * + * @return a pointer to the element sorted, or NULL if already fully sorted. 
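+ *
+ * Note that successive calls return the remaining elements from largest to
+ * smallest; once the heap is empty, the backing array is fully sorted in
+ * ascending order, just as with sortHeap().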
+ **/ +void *sortNextHeapElement(Heap *heap); + +#endif /* HEAP_H */ diff --git a/vdo/base/intMap.c b/vdo/base/intMap.c new file mode 100644 index 0000000..2c690a6 --- /dev/null +++ b/vdo/base/intMap.c @@ -0,0 +1,661 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/intMap.c#1 $ + */ + +/** + * Hash table implementation of a map from integers to pointers, implemented + * using the Hopscotch Hashing algorithm by Herlihy, Shavit, and Tzafrir (see + * http://en.wikipedia.org/wiki/Hopscotch_hashing). This implementation does + * not contain any of the locking/concurrency features of the algorithm, just + * the collision resolution scheme. + * + * Hopscotch Hashing is based on hashing with open addressing and linear + * probing. All the entries are stored in a fixed array of buckets, with no + * dynamic allocation for collisions. Unlike linear probing, all the entries + * that hash to a given bucket are stored within a fixed neighborhood starting + * at that bucket. Chaining is effectively represented as a bit vector + * relative to each bucket instead of as pointers or explicit offsets. + * + * When an empty bucket cannot be found within a given neighborhood, + * subsequent neighborhoods are searched, and one or more entries will "hop" + * into those neighborhoods. When this process works, an empty bucket will + * move into the desired neighborhood, allowing the entry to be added. When + * that process fails (typically when the buckets are around 90% full), the + * table must be resized and the all entries rehashed and added to the + * expanded table. + * + * Unlike linear probing, the number of buckets that must be searched in the + * worst case has a fixed upper bound (the size of the neighborhood). Those + * entries occupy a small number of memory cache lines, leading to improved + * use of the cache (fewer misses on both successful and unsuccessful + * searches). Hopscotch hashing outperforms linear probing at much higher load + * factors, so even with the increased memory burden for maintaining the hop + * vectors, less memory is needed to achieve that performance. Hopscotch is + * also immune to "contamination" from deleting entries since entries are + * genuinely removed instead of being replaced by a placeholder. + * + * The published description of the algorithm used a bit vector, but the paper + * alludes to an offset scheme which is used by this implementation. Since the + * entries in the neighborhood are within N entries of the hash bucket at the + * start of the neighborhood, a pair of small offset fields each log2(N) bits + * wide is all that's needed to maintain the hops as a linked list. In order + * to encode "no next hop" (i.e. 
NULL) as the natural initial value of zero, + * the offsets are biased by one (i.e. 0 => NULL, 1 => offset=0, 2 => + * offset=1, etc.) We can represent neighborhoods of up to 255 entries with + * just 8+8=16 bits per entry. The hop list is sorted by hop offset so the + * first entry in the list is always the bucket closest to the start of the + * neighborhood. + * + * While individual accesses tend to be very fast, the table resize operations + * are very very expensive. If an upper bound on the latency of adding an + * entry to the table is needed, we either need to ensure the table is + * pre-sized to be large enough so no resize is ever needed, or we'll need to + * develop an approach to incrementally resize the table. + **/ + +#include "intMap.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "permassert.h" + +enum { + DEFAULT_CAPACITY = 16, // the number of neighborhoods in a new table + NEIGHBORHOOD = 255, // the number of buckets in each neighborhood + MAX_PROBES = 1024, // limit on the number of probes for a free bucket + NULL_HOP_OFFSET = 0, // the hop offset value terminating the hop list + DEFAULT_LOAD = 75 // a compromise between memory use and performance +}; + +/** + * Buckets are packed together to reduce memory usage and improve cache + * efficiency. It would be tempting to encode the hop offsets separately and + * maintain alignment of key/value pairs, but it's crucial to keep the hop + * fields near the buckets that they use them so they'll tend to share cache + * lines. + **/ +typedef struct __attribute__((packed)) bucket { + uint8_t firstHop; // the biased offset of the first entry in the hop list + // of the neighborhood that hashes to this bucket + uint8_t nextHop; // the biased offset of the next bucket in the hop list + + uint64_t key; // the key stored in this bucket + void *value; // the value stored in this bucket (NULL if empty) +} Bucket; + +/** + * The concrete definition of the opaque IntMap type. To avoid having to wrap + * the neighborhoods of the last entries back around to the start of the + * bucket array, we allocate a few more buckets at the end of the array + * instead, which is why capacity and bucketCount are different. + **/ +struct intMap { + size_t size; // the number of entries stored in the map + size_t capacity; // the number of neighborhoods in the map + size_t bucketCount; // the number of buckets in the bucket array + Bucket *buckets; // the array of hash buckets +}; + +/** + * This is the Google CityHash 16-byte hash mixing function. + * + * @param input1 the first input value + * @param input2 the second input value + * + * @return a hash of the two inputs + **/ +static uint64_t mix(uint64_t input1, uint64_t input2) +{ + static const uint64_t CITY_MULTIPLIER = 0x9ddfea08eb382d69ULL; + + uint64_t hash = (input1 ^ input2); + hash *= CITY_MULTIPLIER; + hash ^= (hash >> 47); + hash ^= input2; + hash *= CITY_MULTIPLIER; + hash ^= (hash >> 47); + hash *= CITY_MULTIPLIER; + return hash; +} + +/** + * Calculate a 64-bit non-cryptographic hash value for the provided 64-bit + * integer key. The implementation is based on Google's CityHash, only + * handling the specific case of an 8-byte input. + * + * @param key the mapping key + * + * @return the hash of the mapping key + **/ +static uint64_t hashKey(uint64_t key) +{ + // Aliasing restrictions forbid us from casting pointer types, so use a + // union to convert a single uint64_t to two uint32_t values. 
+ union { + uint64_t u64; + uint32_t u32[2]; + } pun = { .u64 = key }; + return mix(sizeof(key) + (((uint64_t) pun.u32[0]) << 3), pun.u32[1]); +} + +/** + * Initialize an IntMap. + * + * @param map the map to initialize + * @param capacity the initial capacity of the map + * + * @return UDS_SUCCESS or an error code + **/ +static int allocateBuckets(IntMap *map, size_t capacity) +{ + map->size = 0; + map->capacity = capacity; + + // Allocate NEIGHBORHOOD - 1 extra buckets so the last bucket can have a + // full neighborhood without have to wrap back around to element zero. + map->bucketCount = capacity + (NEIGHBORHOOD - 1); + return ALLOCATE(map->bucketCount, Bucket, "IntMap buckets", &map->buckets); +} + +/**********************************************************************/ +int makeIntMap(size_t initialCapacity, + unsigned int initialLoad, + IntMap **mapPtr) +{ + // Use the default initial load if the caller did not specify one. + if (initialLoad == 0) { + initialLoad = DEFAULT_LOAD; + } + if (initialLoad > 100) { + return UDS_INVALID_ARGUMENT; + } + + IntMap *map; + int result = ALLOCATE(1, IntMap, "IntMap", &map); + if (result != UDS_SUCCESS) { + return result; + } + + // Use the default capacity if the caller did not specify one. + size_t capacity = (initialCapacity > 0) ? initialCapacity : DEFAULT_CAPACITY; + + // Scale up the capacity by the specified initial load factor. + // (i.e to hold 1000 entries at 80% load we need a capacity of 1250) + capacity = capacity * 100 / initialLoad; + + result = allocateBuckets(map, capacity); + if (result != UDS_SUCCESS) { + freeIntMap(&map); + return result; + } + + *mapPtr = map; + return UDS_SUCCESS; +} + +/** + * Free the bucket array for the map. + * + * @param map the map whose bucket array is to be freed + **/ +static void freeBuckets(IntMap *map) +{ + FREE(map->buckets); + map->buckets = NULL; +} + +/**********************************************************************/ +void freeIntMap(IntMap **mapPtr) +{ + if (*mapPtr != NULL) { + freeBuckets(*mapPtr); + FREE(*mapPtr); + *mapPtr = NULL; + } +} + +/**********************************************************************/ +size_t intMapSize(const IntMap *map) +{ + return map->size; +} + +/** + * Convert a biased hop offset within a neighborhood to a pointer to the + * bucket it references. + * + * @param neighborhood the first bucket in the neighborhood + * @param hopOffset the biased hop offset to the desired bucket + * + * @return NULL if hopOffset is zero, otherwise a pointer to + * the bucket in the neighborhood at hopOffset - 1 + **/ +static Bucket *dereferenceHop(Bucket *neighborhood, unsigned int hopOffset) +{ + if (hopOffset == NULL_HOP_OFFSET) { + return NULL; + } + + STATIC_ASSERT(NULL_HOP_OFFSET == 0); + return &neighborhood[hopOffset - 1]; +} + +/** + * Add a bucket into the hop list for the neighborhood, inserting it into the + * list so the hop list remains sorted by hop offset. + * + * @param neighborhood the first bucket in the neighborhood + * @param newBucket the bucket to add to the hop list + **/ +static void insertInHopList(Bucket *neighborhood, Bucket *newBucket) +{ + // Zero indicates a NULL hop offset, so bias the hop offset by one. + int hopOffset = 1 + (newBucket - neighborhood); + + // Handle the special case of adding a bucket at the start of the list. 
+ int nextHop = neighborhood->firstHop; + if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) { + newBucket->nextHop = nextHop; + neighborhood->firstHop = hopOffset; + return; + } + + // Search the hop list for the insertion point that maintains the sort + // order. + for (;;) { + Bucket *bucket = dereferenceHop(neighborhood, nextHop); + nextHop = bucket->nextHop; + + if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) { + newBucket->nextHop = nextHop; + bucket->nextHop = hopOffset; + return; + } + } +} + +/** + * Select and return the hash bucket for a given search key. + * + * @param map the map to search + * @param key the mapping key + **/ +static Bucket *selectBucket(const IntMap *map, uint64_t key) +{ + // Calculate a good hash value for the provided key. We want exactly 32 + // bits, so mask the result. + uint64_t hash = hashKey(key) & 0xFFFFFFFF; + + /* + * Scale the 32-bit hash to a bucket index by treating it as a binary + * fraction and multiplying that by the capacity. If the hash is uniformly + * distributed over [0 .. 2^32-1], then (hash * capacity / 2^32) should be + * uniformly distributed over [0 .. capacity-1]. The multiply and shift is + * much faster than a divide (modulus) on X86 CPUs. + */ + return &map->buckets[(hash * map->capacity) >> 32]; +} + +/** + * Search the hop list associated with given hash bucket for a given search + * key. If the key is found, returns a pointer to the entry (bucket or + * collision), otherwise returns NULL. + * + * @param [in] map the map being searched + * @param [in] bucket the map bucket to search for the key + * @param [in] key the mapping key + * @param [out] previousPtr if not NULL, a pointer in which to + * store the bucket in the list preceding the one + * that had the matching key + * + * @return an entry that matches the key, or NULL if not found + **/ +static Bucket *searchHopList(IntMap *map __attribute__((unused)), + Bucket *bucket, + uint64_t key, + Bucket **previousPtr) +{ + Bucket *previous = NULL; + unsigned int nextHop = bucket->firstHop; + while (nextHop != NULL_HOP_OFFSET) { + // Check the neighboring bucket indexed by the offset for the desired key. + Bucket *entry = dereferenceHop(bucket, nextHop); + if ((key == entry->key) && (entry->value != NULL)) { + if (previousPtr != NULL) { + *previousPtr = previous; + } + return entry; + } + nextHop = entry->nextHop; + previous = entry; + } + return NULL; +} + +/**********************************************************************/ +void *intMapGet(IntMap *map, uint64_t key) +{ + Bucket *match = searchHopList(map, selectBucket(map, key), key, NULL); + return ((match != NULL) ? match->value : NULL); +} + +/** + * Increase the number of hash buckets and rehash all the existing entries, + * storing them in the new buckets. + * + * @param map the map to resize + **/ +static int resizeBuckets(IntMap *map) +{ + // Copy the top-level map data to the stack. + IntMap oldMap = *map; + + // Re-initialize the map to be empty and 50% larger. + size_t newCapacity = map->capacity / 2 * 3; + logInfo("%s: attempting resize from %zu to %zu, current size=%zu", + __func__, map->capacity, newCapacity, map->size); + int result = allocateBuckets(map, newCapacity); + if (result != UDS_SUCCESS) { + *map = oldMap; + return result; + } + + // Populate the new hash table from the entries in the old bucket array. 
+ for (size_t i = 0; i < oldMap.bucketCount; i++) { + Bucket *entry = &oldMap.buckets[i]; + if (entry->value == NULL) { + continue; + } + + result = intMapPut(map, entry->key, entry->value, true, NULL); + if (result != UDS_SUCCESS) { + // Destroy the new partial map and restore the map from the stack. + freeBuckets(map); + *map = oldMap; + return result; + } + } + + // Destroy the old bucket array. + freeBuckets(&oldMap); + return UDS_SUCCESS; +} + +/** + * Probe the bucket array starting at the given bucket for the next empty + * bucket, returning a pointer to it. NULL will be returned if + * the search reaches the end of the bucket array or if the number of linear + * probes exceeds a specified limit. + * + * @param map the map containing the buckets to search + * @param bucket the bucket at which to start probing + * @param maxProbes the maximum number of buckets to search + * + * @return the next empty bucket, or NULL if the search failed + **/ +static Bucket *findEmptyBucket(IntMap *map, + Bucket *bucket, + unsigned int maxProbes) +{ + // Limit the search to either the nearer of the end of the bucket array or a + // fixed distance beyond the initial bucket. + size_t remaining = &map->buckets[map->bucketCount] - bucket; + Bucket *sentinel = &bucket[minSizeT(remaining, maxProbes)]; + + for (Bucket *entry = bucket; entry < sentinel; entry++) { + if (entry->value == NULL) { + return entry; + } + } + return NULL; +} + +/** + * Move an empty bucket closer to the start of the bucket array. This searches + * the neighborhoods that contain the empty bucket for a non-empty bucket + * closer to the start of the array. If such a bucket is found, this swaps the + * two buckets by moving the entry to the empty bucket. + * + * @param map the map containing the bucket + * @param hole the empty bucket to fill with an entry that precedes it in one + * of its enclosing neighborhoods + * + * @return the bucket that was vacated by moving its entry to the provided + * hole, or NULL if no entry could be moved + **/ +static Bucket *moveEmptyBucket(IntMap *map __attribute__((unused)), + Bucket *hole) +{ + /* + * Examine every neighborhood that the empty bucket is part of, starting + * with the one in which it is the last bucket. No boundary check is needed + * for the negative array arithmetic since this function is only called when + * hole is at least NEIGHBORHOOD cells deeper into the array than a valid + * bucket. + */ + for (Bucket *bucket = &hole[1 - NEIGHBORHOOD]; bucket < hole; bucket++) { + // Find the entry that is nearest to the bucket, which means it will be + // nearest to the hash bucket whose neighborhood is full. + Bucket *newHole = dereferenceHop(bucket, bucket->firstHop); + if (newHole == NULL) { + // There are no buckets in this neighborhood that are in use by this one + // (they must all be owned by overlapping neighborhoods). + continue; + } + + // Skip this bucket if its first entry is actually further away than the + // hole that we're already trying to fill. + if (hole < newHole) { + continue; + } + + /* + * We've found an entry in this neighborhood that we can "hop" further + * away, moving the hole closer to the hash bucket, if not all the way + * into its neighborhood. + */ + + // The entry that will be the new hole is the first bucket in the list, + // so setting firstHop is all that's needed remove it from the list. + bucket->firstHop = newHole->nextHop; + newHole->nextHop = NULL_HOP_OFFSET; + + // Move the entry into the original hole. 
+ hole->key = newHole->key; + hole->value = newHole->value; + newHole->value = NULL; + + // Insert the filled hole into the hop list for the neighborhood. + insertInHopList(bucket, hole); + return newHole; + } + + // We couldn't find an entry to relocate to the hole. + return NULL; +} + +/** + * Find and update any existing mapping for a given key, returning the value + * associated with the key in the provided pointer. + * + * @param [in] map the IntMap to attempt to modify + * @param [in] neighborhood the first bucket in the neighborhood that + * would contain the search key + * @param [in] key the key with which to associate the new value + * @param [in] newValue the value to be associated with the key + * @param [in] update whether to overwrite an existing value + * @param [out] oldValuePtr a pointer in which to store the old value + * (unmodified if no mapping was found) + * + * @return true if the map contains a mapping for the key + * false if it does not + **/ +static bool updateMapping(IntMap *map, + Bucket *neighborhood, + uint64_t key, + void *newValue, + bool update, + void **oldValuePtr) +{ + Bucket *bucket = searchHopList(map, neighborhood, key, NULL); + if (bucket == NULL) { + // There is no bucket containing the key in the neighborhood. + return false; + } + + // Return the value of the current mapping (if desired) and update the + // mapping with the new value (if desired). + if (oldValuePtr != NULL) { + *oldValuePtr = bucket->value; + } + if (update) { + bucket->value = newValue; + } + return true; +} + +/** + * Find an empty bucket in a specified neighborhood for a new mapping or + * attempt to re-arrange mappings so there is such a bucket. This operation + * may fail (returning NULL) if an empty bucket is not available or could not + * be relocated to the neighborhood. + * + * @param map the IntMap to search or modify + * @param neighborhood the first bucket in the neighborhood in which + * an empty bucket is needed for a new mapping + * + * @return a pointer to an empty bucket in the desired neighborhood, or + * NULL if a vacancy could not be found or arranged + **/ +static Bucket *findOrMakeVacancy(IntMap *map, Bucket *neighborhood) +{ + // Probe within and beyond the neighborhood for the first empty bucket. + Bucket *hole = findEmptyBucket(map, neighborhood, MAX_PROBES); + + // Keep trying until the empty bucket is in the bucket's neighborhood or we + // are unable to move it any closer by swapping it with a filled bucket. + while (hole != NULL) { + int distance = hole - neighborhood; + if (distance < NEIGHBORHOOD) { + // We've found or relocated an empty bucket close enough to the initial + // hash bucket to be referenced by its hop vector. + return hole; + } + + // The nearest empty bucket isn't within the neighborhood that must + // contain the new entry, so try to swap it with bucket that is closer. + hole = moveEmptyBucket(map, hole); + } + + return NULL; +} + +/**********************************************************************/ +int intMapPut(IntMap *map, + uint64_t key, + void *newValue, + bool update, + void **oldValuePtr) +{ + if (newValue == NULL) { + return UDS_INVALID_ARGUMENT; + } + + // Select the bucket at the start of the neighborhood that must contain any + // entry for the provided key. + Bucket *neighborhood = selectBucket(map, key); + + // Check whether the neighborhood already contains an entry for the key, in + // which case we optionally update it, returning the old value. 
+ if (updateMapping(map, neighborhood, key, newValue, update, oldValuePtr)) { + return UDS_SUCCESS; + } + + /* + * Find an empty bucket in the desired neighborhood for the new entry or + * re-arrange entries in the map so there is such a bucket. This operation + * will usually succeed; the loop body will only be executed on the rare + * occasions that we have to resize the map. + */ + Bucket *bucket; + while ((bucket = findOrMakeVacancy(map, neighborhood)) == NULL) { + /* + * There is no empty bucket in which to put the new entry in the current + * map, so we're forced to allocate a new bucket array with a larger + * capacity, re-hash all the entries into those buckets, and try again (a + * very expensive operation for large maps). + */ + int result = resizeBuckets(map); + if (result != UDS_SUCCESS) { + return result; + } + + // Resizing the map invalidates all pointers to buckets, so recalculate + // the neighborhood pointer. + neighborhood = selectBucket(map, key); + } + + // Put the new entry in the empty bucket, adding it to the neighborhood. + bucket->key = key; + bucket->value = newValue; + insertInHopList(neighborhood, bucket); + map->size += 1; + + // There was no existing entry, so there was no old value to be returned. + if (oldValuePtr != NULL) { + *oldValuePtr = NULL; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +void *intMapRemove(IntMap *map, uint64_t key) +{ + // Select the bucket to search and search it for an existing entry. + Bucket *bucket = selectBucket(map, key); + Bucket *previous; + Bucket *victim = searchHopList(map, bucket, key, &previous); + + if (victim == NULL) { + // There is no matching entry to remove. + return NULL; + } + + // We found an entry to remove. Save the mapped value to return later and + // empty the bucket. + map->size -= 1; + void *value = victim->value; + victim->value = NULL; + victim->key = 0; + + // The victim bucket is now empty, but it still needs to be spliced out of + // the hop list. + if (previous == NULL) { + // The victim is the head of the list, so swing firstHop. + bucket->firstHop = victim->nextHop; + } else { + previous->nextHop = victim->nextHop; + } + victim->nextHop = NULL_HOP_OFFSET; + + return value; +} diff --git a/vdo/base/intMap.h b/vdo/base/intMap.h new file mode 100644 index 0000000..0b18209 --- /dev/null +++ b/vdo/base/intMap.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/intMap.h#1 $ + */ + +#ifndef INT_MAP_H +#define INT_MAP_H + +#include "common.h" + +/** + * IntMap associates pointers (void *) with integer keys + * (uint64_t). NULL pointer values are not + * supported. 
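+ *
+ * For illustration only, typical usage (with error handling elided and a
+ * hypothetical non-NULL value pointer) looks roughly like:
+ *
+ *   IntMap *map;
+ *   makeIntMap(0, 0, &map);            // default capacity and load factor
+ *   intMapPut(map, 42, value, true, NULL);
+ *   void *found = intMapGet(map, 42);  // == value
+ *   intMapRemove(map, 42);             // returns value and unmaps the key
+ *   freeIntMap(&map);                  // map is set to NULL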
+ * + * The map is implemented as hash table, which should provide constant-time + * insert, query, and remove operations, although the insert may occasionally + * grow the table, which is linear in the number of entries in the map. The + * table will grow as needed to hold new entries, but will not shrink as + * entries are removed. + **/ + +typedef struct intMap IntMap; + +/** + * Allocate and initialize an IntMap. + * + * @param [in] initialCapacity the number of entries the map should + * initially be capable of holding (zero tells + * the map to use its own small default) + * @param [in] initialLoad the load factor of the map, expressed as an + * integer percentage (typically in the range + * 50 to 90, with zero telling the map to use + * its own default) + * @param [out] mapPtr a pointer to hold the new IntMap + * + * @return UDS_SUCCESS or an error code + **/ +int makeIntMap(size_t initialCapacity, + unsigned int initialLoad, + IntMap **mapPtr) + __attribute__((warn_unused_result)); + +/** + * Free an IntMap and null out the reference to it. NOTE: The map does not own + * the pointer values stored in the map and they are not freed by this call. + * + * @param [in,out] mapPtr the reference to the IntMap to free + **/ +void freeIntMap(IntMap **mapPtr); + +/** + * Get the number of entries stored in an IntMap. + * + * @param map the IntMap to query + * + * @return the number of entries in the map + **/ +size_t intMapSize(const IntMap *map); + +/** + * Retrieve the value associated with a given key from the IntMap. + * + * @param map the IntMap to query + * @param key the key to look up + * + * @return the value associated with the given key, or NULL + * if the key is not mapped to any value + **/ +void *intMapGet(IntMap *map, uint64_t key); + +/** + * Try to associate a value (a pointer) with an integer in an IntMap. If the + * map already contains a mapping for the provided key, the old value is + * only replaced with the specified value if update is true. In either case + * the old value is returned. If the map does not already contain a value for + * the specified key, the new value is added regardless of the value of update. + * + * @param [in] map the IntMap to attempt to modify + * @param [in] key the key with which to associate the new value + * @param [in] newValue the value to be associated with the key + * @param [in] update whether to overwrite an existing value + * @param [out] oldValuePtr a pointer in which to store either the old value + * (if the key was already mapped) or + * NULL if the map did not contain the + * key; NULL may be provided if the + * caller does not need to know the old value + * + * @return UDS_SUCCESS or an error code + **/ +int intMapPut(IntMap *map, + uint64_t key, + void *newValue, + bool update, + void **oldValuePtr) + __attribute__((warn_unused_result)); + +/** + * Remove the mapping for a given key from the IntMap. + * + * @param map the IntMap from which to remove the mapping + * @param key the key whose mapping is to be removed + * + * @return the value that was associated with the key, or + * NULL if it was not mapped + **/ +void *intMapRemove(IntMap *map, uint64_t key); + +#endif /* INT_MAP_H */ diff --git a/vdo/base/journalPoint.h b/vdo/base/journalPoint.h new file mode 100644 index 0000000..30d44cd --- /dev/null +++ b/vdo/base/journalPoint.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/journalPoint.h#2 $ + */ + +#ifndef JOURNAL_POINT_H +#define JOURNAL_POINT_H + +#include "numeric.h" +#include "types.h" + +typedef uint16_t JournalEntryCount; + +/** + * The absolute position of an entry in a recovery journal or slab journal. + **/ +typedef struct { + SequenceNumber sequenceNumber; + JournalEntryCount entryCount; +} JournalPoint; + +/** + * A packed, platform-independent encoding of a JournalPoint. + **/ +typedef struct { + /** + * The packed representation is the little-endian 64-bit representation of + * the low-order 48 bits of the sequence number, shifted up 16 bits, or'ed + * with the 16-bit entry count. + * + * Very long-term, the top 16 bits of the sequence number may not always be + * zero, as this encoding assumes--see BZ 1523240. + **/ + byte encodedPoint[8]; +} __attribute__((packed)) PackedJournalPoint; + +/** + * Move the given journal point forward by one entry. + * + * @param point the journal point to adjust + * @param entriesPerBlock the number of entries in one full block + **/ +static inline void advanceJournalPoint(JournalPoint *point, + JournalEntryCount entriesPerBlock) +{ + point->entryCount++; + if (point->entryCount == entriesPerBlock) { + point->sequenceNumber++; + point->entryCount = 0; + } +} + +/** + * Check whether a journal point is valid. + * + * @param point the journal point + * + * @return true if the journal point is valid + **/ +static inline bool isValidJournalPoint(const JournalPoint *point) +{ + return ((point != NULL) && (point->sequenceNumber > 0)); +} + +/** + * Check whether the first point precedes the second point. + * + * @param first the first journal point + * @param second the second journal point + + * + * @return true if the first point precedes the second point. + **/ +static inline bool beforeJournalPoint(const JournalPoint *first, + const JournalPoint *second) +{ + return ((first->sequenceNumber < second->sequenceNumber) + || ((first->sequenceNumber == second->sequenceNumber) + && (first->entryCount < second->entryCount))); +} + +/** + * Check whether the first point is the same as the second point. + * + * @param first the first journal point + * @param second the second journal point + * + * @return true if both points reference the same logical + * position of an entry the journal + **/ +static inline bool areEquivalentJournalPoints(const JournalPoint *first, + const JournalPoint *second) +{ + return ((first->sequenceNumber == second->sequenceNumber) + && (first->entryCount == second->entryCount)); +} + +/** + * Encode the journal location represented by a JournalPoint into a + * PackedJournalPoint. 
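+ *
+ * As an illustrative example (not part of the original comment): a point
+ * with sequenceNumber == 5 and entryCount == 3 is stored as the
+ * little-endian encoding of
+ *
+ *   (5 << 16) | 3 == 0x0000000000050003
+ *
+ * and unpackJournalPoint() recovers {5, 3} by shifting right 16 bits and
+ * masking with 0xffff.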
+ * + * @param unpacked The unpacked input point + * @param packed The packed output point + **/ +static inline void packJournalPoint(const JournalPoint *unpacked, + PackedJournalPoint *packed) +{ + uint64_t native = ((unpacked->sequenceNumber << 16) | unpacked->entryCount); + storeUInt64LE(packed->encodedPoint, native); +} + +/** + * Decode the journal location represented by a PackedJournalPoint into a + * JournalPoint. + * + * @param packed The packed input point + * @param unpacked The unpacked output point + **/ +static inline void unpackJournalPoint(const PackedJournalPoint *packed, + JournalPoint *unpacked) +{ + uint64_t native = getUInt64LE(packed->encodedPoint); + unpacked->sequenceNumber = (native >> 16); + unpacked->entryCount = (native & 0xffff); +} + +#endif // JOURNAL_POINT_H diff --git a/vdo/base/lockCounter.c b/vdo/base/lockCounter.c new file mode 100644 index 0000000..e762576 --- /dev/null +++ b/vdo/base/lockCounter.c @@ -0,0 +1,391 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/lockCounter.c#3 $ + */ + +#include "lockCounter.h" + +#include "atomic.h" +#include "memoryAlloc.h" + +/** + * LockCounter is intended to keep all of the locks for the blocks in the + * recovery journal. The per-zone counters are all kept in a single array which + * is arranged by zone (i.e. zone 0's lock 0 is at index 0, zone 0's lock 1 is + * at index 1, and zone 1's lock 0 is at index 'locks'. This arrangement is + * intended to minimize cache-line contention for counters from different + * zones. + * + * The locks are implemented as a single object instead of as a lock counter + * per lock both to afford this opportunity to reduce cache line contention and + * also to eliminate the need to have a completion per lock. + * + * Lock sets are laid out with the set for recovery journal first, followed by + * the logical zones, and then the physical zones. 
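+ *
+ * As an illustrative example (not part of the original comment): with
+ * 'locks' == 4, the counter for lock 2 held by logical zone 1 lives at
+ * index (4 * 1) + 2 == 6 of the logicalCounters array, which is the same
+ * calculation getCounter() performs below.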
+ **/ +typedef enum lockCounterState { + LOCK_COUNTER_STATE_NOT_NOTIFYING = 0, + LOCK_COUNTER_STATE_NOTIFYING, + LOCK_COUNTER_STATE_SUSPENDED, +} LockCounterState; + +struct lockCounter { + /** The completion for notifying the owner of a lock release */ + VDOCompletion completion; + /** The number of logical zones which may hold locks */ + ZoneCount logicalZones; + /** The number of physical zones which may hold locks */ + ZoneCount physicalZones; + /** The number of locks */ + BlockCount locks; + /** Whether the lock release notification is in flight */ + Atomic32 state; + /** The number of logical zones which hold each lock */ + Atomic32 *logicalZoneCounts; + /** The number of physical zones which hold each lock */ + Atomic32 *physicalZoneCounts; + /** The per-zone, per-lock counts for the journal zone */ + uint16_t *journalCounters; + /** The per-zone, per-lock decrement counts for the journal zone */ + Atomic32 *journalDecrementCounts; + /** The per-zone, per-lock reference counts for logical zones */ + uint16_t *logicalCounters; + /** The per-zone, per-lock reference counts for physical zones */ + uint16_t *physicalCounters; +}; + +/**********************************************************************/ +int makeLockCounter(PhysicalLayer *layer, + void *parent, + VDOAction callback, + ThreadID threadID, + ZoneCount logicalZones, + ZoneCount physicalZones, + BlockCount locks, + LockCounter **lockCounterPtr) +{ + LockCounter *lockCounter; + + int result = ALLOCATE(1, LockCounter, __func__, &lockCounter); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(locks, uint16_t, __func__, &lockCounter->journalCounters); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = ALLOCATE(locks, Atomic32, __func__, + &lockCounter->journalDecrementCounts); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = ALLOCATE(locks * logicalZones, uint16_t, __func__, + &lockCounter->logicalCounters); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = ALLOCATE(locks, Atomic32, __func__, + &lockCounter->logicalZoneCounts); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = ALLOCATE(locks * physicalZones, uint16_t, __func__, + &lockCounter->physicalCounters); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = ALLOCATE(locks, Atomic32, __func__, + &lockCounter->physicalZoneCounts); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = initializeEnqueueableCompletion(&lockCounter->completion, + LOCK_COUNTER_COMPLETION, layer); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + setCallbackWithParent(&lockCounter->completion, callback, threadID, parent); + lockCounter->logicalZones = logicalZones; + lockCounter->physicalZones = physicalZones; + lockCounter->locks = locks; + *lockCounterPtr = lockCounter; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeLockCounter(LockCounter **lockCounterPtr) +{ + if (*lockCounterPtr == NULL) { + return; + } + + LockCounter *lockCounter = *lockCounterPtr; + destroyEnqueueable(&lockCounter->completion); + freeVolatile(lockCounter->physicalZoneCounts); + freeVolatile(lockCounter->logicalZoneCounts); + freeVolatile(lockCounter->journalDecrementCounts); + FREE(lockCounter->journalCounters); + 
FREE(lockCounter->logicalCounters); + FREE(lockCounter->physicalCounters); + FREE(lockCounter); + *lockCounterPtr = NULL; +} + +/** + * Get a pointer to the zone count for a given lock on a given zone. + * + * @param counter The lock counter + * @param lockNumber The lock to get + * @param zoneType The zone type whose count is desired + * + * @return A pointer to the zone count for the given lock and zone + **/ +static inline Atomic32 *getZoneCountPtr(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType) +{ + return ((zoneType == ZONE_TYPE_LOGICAL) + ? &counter->logicalZoneCounts[lockNumber] + : &counter->physicalZoneCounts[lockNumber]); +} + +/** + * Get the zone counter for a given lock on a given zone. + * + * @param counter The lock counter + * @param lockNumber The lock to get + * @param zoneType The zone type whose count is desired + * @param zoneID The zone index whose count is desired + * + * @return The counter for the given lock and zone + **/ +static inline uint16_t *getCounter(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + BlockCount zoneCounter = (counter->locks * zoneID) + lockNumber; + if (zoneType == ZONE_TYPE_JOURNAL) { + return &counter->journalCounters[zoneCounter]; + } + + if (zoneType == ZONE_TYPE_LOGICAL) { + return &counter->logicalCounters[zoneCounter]; + } + + return &counter->physicalCounters[zoneCounter]; +} + +/** + * Check whether the journal zone is locked for a given lock. + * + * @param counter The LockCounter + * @param lockNumber The lock to check + * + * @return true if the journal zone is locked + **/ +static bool isJournalZoneLocked(LockCounter *counter, BlockCount lockNumber) +{ + uint16_t journalValue + = *(getCounter(counter, lockNumber, ZONE_TYPE_JOURNAL, 0)); + uint32_t decrements + = atomicLoad32(&(counter->journalDecrementCounts[lockNumber])); + ASSERT_LOG_ONLY((decrements <= journalValue), + "journal zone lock counter must not underflow"); + + return (journalValue != decrements); +} + +/**********************************************************************/ +bool isLocked(LockCounter *lockCounter, + BlockCount lockNumber, + ZoneType zoneType) +{ + ASSERT_LOG_ONLY((zoneType != ZONE_TYPE_JOURNAL), + "isLocked() called for non-journal zone"); + return (isJournalZoneLocked(lockCounter, lockNumber) + || (atomicLoad32(getZoneCountPtr(lockCounter, lockNumber, zoneType)) + != 0)); +} + +/** + * Check that we are on the journal thread. 
+ * + * @param counter The LockCounter + * @param caller The name of the caller (for logging) + **/ +static void assertOnJournalThread(LockCounter *counter, const char *caller) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + == counter->completion.callbackThreadID), + "%s() called from journal zone", caller); +} + +/**********************************************************************/ +void initializeLockCount(LockCounter *counter, + BlockCount lockNumber, + uint16_t value) +{ + assertOnJournalThread(counter, __func__); + uint16_t *journalValue = getCounter(counter, lockNumber, ZONE_TYPE_JOURNAL, + 0); + Atomic32 *decrementCount = &(counter->journalDecrementCounts[lockNumber]); + ASSERT_LOG_ONLY((*journalValue == atomicLoad32(decrementCount)), + "count to be initialized not in use"); + + *journalValue = value; + atomicStore32(decrementCount, 0); +} + +/**********************************************************************/ +void acquireLockCountReference(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + ASSERT_LOG_ONLY((zoneType != ZONE_TYPE_JOURNAL), + "invalid lock count increment from journal zone"); + + uint16_t *currentValue = getCounter(counter, lockNumber, zoneType, zoneID); + ASSERT_LOG_ONLY(*currentValue < UINT16_MAX, + "increment of lock counter must not overflow"); + + if (*currentValue == 0) { + // This zone is acquiring this lock for the first time. + atomicAdd32(getZoneCountPtr(counter, lockNumber, zoneType), 1); + } + *currentValue += 1; +} + +/** + * Decrement a non-atomic counter. + * + * @param counter The LockCounter + * @param lockNumber Which lock to decrement + * @param zoneType The type of the zone releasing the reference + * @param zoneID The ID of the zone releasing the reference + * + * @return The new value of the counter + **/ +static uint16_t releaseReference(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + uint16_t *currentValue = getCounter(counter, lockNumber, zoneType, zoneID); + ASSERT_LOG_ONLY((*currentValue >= 1), + "decrement of lock counter must not underflow"); + + *currentValue -= 1; + return *currentValue; +} + +/** + * Attempt to notify the owner of this LockCounter that some lock has been + * released for some zone type. Will do nothing if another notification is + * already in progress. + * + * @param counter The LockCounter + **/ +static void attemptNotification(LockCounter *counter) +{ + if (compareAndSwap32(&counter->state, + LOCK_COUNTER_STATE_NOT_NOTIFYING, + LOCK_COUNTER_STATE_NOTIFYING)) { + resetCompletion(&counter->completion); + invokeCallback(&counter->completion); + } +} + +/**********************************************************************/ +void releaseLockCountReference(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + ASSERT_LOG_ONLY((zoneType != ZONE_TYPE_JOURNAL), + "invalid lock count decrement from journal zone"); + if (releaseReference(counter, lockNumber, zoneType, zoneID) != 0) { + return; + } + + if (atomicAdd32(getZoneCountPtr(counter, lockNumber, zoneType), -1) == 0) { + // This zone was the last lock holder of its type, so try to notify the + // owner. 
+ attemptNotification(counter); + } +} + +/**********************************************************************/ +void releaseJournalZoneReference(LockCounter *counter, BlockCount lockNumber) +{ + assertOnJournalThread(counter, __func__); + releaseReference(counter, lockNumber, ZONE_TYPE_JOURNAL, 0); + if (!isJournalZoneLocked(counter, lockNumber)) { + // The journal zone is not locked, so try to notify the owner. + attemptNotification(counter); + } +} + +/**********************************************************************/ +void releaseJournalZoneReferenceFromOtherZone(LockCounter *counter, + BlockCount lockNumber) +{ + atomicAdd32(&(counter->journalDecrementCounts[lockNumber]), 1); +} + +/**********************************************************************/ +void acknowledgeUnlock(LockCounter *counter) +{ + atomicStore32(&counter->state, LOCK_COUNTER_STATE_NOT_NOTIFYING); +} + +/**********************************************************************/ +bool suspendLockCounter(LockCounter *counter) +{ + assertOnJournalThread(counter, __func__); + return ((atomicLoad32(&counter->state) == LOCK_COUNTER_STATE_SUSPENDED) + || compareAndSwap32(&counter->state, + LOCK_COUNTER_STATE_NOT_NOTIFYING, + LOCK_COUNTER_STATE_SUSPENDED)); +} + +/**********************************************************************/ +bool resumeLockCounter(LockCounter *counter) +{ + assertOnJournalThread(counter, __func__); + return compareAndSwap32(&counter->state, + LOCK_COUNTER_STATE_SUSPENDED, + LOCK_COUNTER_STATE_NOT_NOTIFYING); +} diff --git a/vdo/base/lockCounter.h b/vdo/base/lockCounter.h new file mode 100644 index 0000000..cbda7bd --- /dev/null +++ b/vdo/base/lockCounter.h @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/lockCounter.h#2 $ + */ + +#ifndef LOCK_COUNTER_H +#define LOCK_COUNTER_H + +#include "completion.h" +#include "types.h" + +/** + * LockCounter provides a set of shared reference count locks which is safe + * across multiple zones with a minimum of cross-thread synchronization + * operations. For each lock in the set, it maintains a set of per-zone lock + * counts, and a single, atomic count of the number of zones holding locks. + * Whenever a zone's individual counter for a lock goes from 0 to 1, the + * zone count for that lock is incremented. Whenever a zone's individual + * counter for a lock goes from 1 to 0, the zone count for that lock is + * decremented. If the zone count goes to 0, and the lock counter's + * completion is not in use, the completion is launched to inform the counter's + * owner that some lock has been released. 
It is the owner's responsibility to + * check for which locks have been released, and to inform the lock counter + * that it has received the notification by calling acknowledgeUnlock(). + **/ + +/** + * Create a lock counter. + * + * @param [in] layer The physical layer of the VDO + * @param [in] parent The parent to notify when the lock count goes + * to zero + * @param [in] callback The function to call when the lock count goes + * to zero + * @param [in] threadID The id of thread on which to run the callback + * @param [in] logicalZones The total number of logical zones + * @param [in] physicalZones The total number of physical zones + * @param [in] locks The number of locks + * @param [out] lockCounterPtr A pointer to hold the new counter + * + * @return VDO_SUCCESS or an error + **/ +int makeLockCounter(PhysicalLayer *layer, + void *parent, + VDOAction callback, + ThreadID threadID, + ZoneCount logicalZones, + ZoneCount physicalZones, + BlockCount locks, + LockCounter **lockCounterPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a lock counter and NULL out the reference to it. + * + * @param lockCounterPtr A pointer to the lock counter reference to free + **/ +void freeLockCounter(LockCounter **lockCounterPtr); + +/** + * Check whether a lock is locked for a zone type. If the recovery journal has + * a lock on the lock number, both logical and physical zones are considered + * locked. + * + * @param lockCounter The set of locks to check + * @param lockNumber The lock to check + * @param zoneType The type of the zone + * + * @return true if the specified lock has references (is locked) + **/ +bool isLocked(LockCounter *lockCounter, + BlockCount lockNumber, + ZoneType zoneType) + __attribute__((warn_unused_result)); + +/** + * Initialize the value of the journal zone's counter for a given lock. This + * must be called from the journal zone. + * + * @param counter The counter to initialize + * @param lockNumber Which lock to initialize + * @param value The value to set + **/ +void initializeLockCount(LockCounter *counter, + BlockCount lockNumber, + uint16_t value); + +/** + * Acquire a reference to a given lock in the specified zone. This method must + * not be used from the journal zone. + * + * @param counter The LockCounter + * @param lockNumber Which lock to increment + * @param zoneType The type of the zone acquiring the reference + * @param zoneID The ID of the zone acquiring the reference + **/ +void acquireLockCountReference(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID); + +/** + * Release a reference to a given lock in the specified zone. This method + * must not be used from the journal zone. + * + * @param counter The LockCounter + * @param lockNumber Which lock to increment + * @param zoneType The type of the zone releasing the reference + * @param zoneID The ID of the zone releasing the reference + **/ +void releaseLockCountReference(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID); + +/** + * Release a single journal zone reference from the journal zone. This method + * must be called from the journal zone. + * + * @param counter The counter from which to release a reference + * @param lockNumber The lock from which to release a reference + **/ +void releaseJournalZoneReference(LockCounter *counter, BlockCount lockNumber); + +/** + * Release a single journal zone reference from any zone. 
This method shouldn't + * be called from the journal zone as it would be inefficient; use + * releaseJournalZoneReference() instead. + * + * @param counter The counter from which to release a reference + * @param lockNumber The lock from which to release a reference + **/ +void releaseJournalZoneReferenceFromOtherZone(LockCounter *counter, + BlockCount lockNumber); + +/** + * Inform a lock counter that an unlock notification was received by the + * caller. + * + * @param counter The counter to inform + **/ +void acknowledgeUnlock(LockCounter *counter); + +/** + * Prevent the lock counter from issuing notifications. + * + * @param counter The counter + * + * @return true if the lock counter was not notifying and hence + * the suspend was efficacious + **/ +bool suspendLockCounter(LockCounter *counter) + __attribute__((warn_unused_result)); + +/** + * Re-allow notifications from a suspended lock counter. + * + * @param counter The counter + * + * @return true if the lock counter was suspended + **/ +bool resumeLockCounter(LockCounter *counter) + __attribute__((warn_unused_result)); + +#endif // LOCK_COUNTER_H diff --git a/vdo/base/logicalZone.c b/vdo/base/logicalZone.c new file mode 100644 index 0000000..0834ff1 --- /dev/null +++ b/vdo/base/logicalZone.c @@ -0,0 +1,463 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/logicalZone.c#6 $ + */ + +#include "logicalZone.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "actionManager.h" +#include "adminState.h" +#include "allocationSelector.h" +#include "atomic.h" +#include "blockMap.h" +#include "completion.h" +#include "constants.h" +#include "dataVIO.h" +#include "flush.h" +#include "intMap.h" +#include "vdoInternal.h" + +struct logicalZone { + /** The completion for flush notifications */ + VDOCompletion completion; + /** The owner of this zone */ + LogicalZones *zones; + /** Which logical zone this is */ + ZoneCount zoneNumber; + /** The thread id for this zone */ + ThreadID threadID; + /** In progress operations keyed by LBN */ + IntMap *lbnOperations; + /** The logical to physical map */ + BlockMapZone *blockMapZone; + /** The current flush generation */ + SequenceNumber flushGeneration; + /** The oldest active generation in this zone */ + SequenceNumber oldestActiveGeneration; + /** The number of IOs in the current flush generation */ + BlockCount iosInFlushGeneration; + /** + * The oldest locked generation in this zone (an atomic copy of + * oldestActiveGeneration) + **/ + Atomic64 oldestLockedGeneration; + /** The youngest generation of the current notification */ + SequenceNumber notificationGeneration; + /** Whether a notification is in progress */ + bool notifying; + /** The queue of active data write VIOs */ + RingNode writeVIOs; + /** The administrative state of the zone */ + AdminState state; + /** The selector for determining which physical zone to allocate from */ + AllocationSelector *selector; +}; + +struct logicalZones { + /** The VDO whose zones these are */ + VDO *vdo; + /** The manager for administrative actions */ + ActionManager *manager; + /** The number of zones */ + ZoneCount zoneCount; + /** The logical zones themselves */ + LogicalZone zones[]; +}; + +/** + * Convert a generic VDOCompletion to a LogicalZone. + * + * @param completion The completion to convert + * + * @return The completion as a LogicalZone + **/ +static LogicalZone *asLogicalZone(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(LogicalZone, completion) == 0); + assertCompletionType(completion->type, GENERATION_FLUSHED_COMPLETION); + return (LogicalZone *) completion; +} + +/**********************************************************************/ +LogicalZone *getLogicalZone(LogicalZones *zones, ZoneCount zoneNumber) +{ + return (zoneNumber < zones->zoneCount) ? &zones->zones[zoneNumber] : NULL; +} + +/** + * Implements ZoneThreadGetter + **/ +static ThreadID getThreadIDForZone(void *context, ZoneCount zoneNumber) +{ + return getLogicalZoneThreadID(getLogicalZone(context, zoneNumber)); +} + +/** + * Initialize a logical zone. 
+ * + * @param zones The LogicalZones to which this zone belongs + * @param zoneNumber The LogicalZone's index + **/ +static int initializeZone(LogicalZones *zones, ZoneCount zoneNumber) +{ + LogicalZone *zone = &zones->zones[zoneNumber]; + zone->zones = zones; + int result = makeIntMap(LOCK_MAP_CAPACITY, 0, &zone->lbnOperations); + if (result != VDO_SUCCESS) { + return result; + } + + VDO *vdo = zones->vdo; + result = initializeEnqueueableCompletion(&zone->completion, + GENERATION_FLUSHED_COMPLETION, + vdo->layer); + if (result != VDO_SUCCESS) { + return result; + } + + zone->zoneNumber = zoneNumber; + zone->threadID = getLogicalZoneThread(getThreadConfig(vdo), + zoneNumber); + zone->blockMapZone = getBlockMapZone(vdo->blockMap, zoneNumber); + initializeRing(&zone->writeVIOs); + atomicStore64(&zone->oldestLockedGeneration, 0); + + return makeAllocationSelector(getThreadConfig(vdo)->physicalZoneCount, + zone->threadID, &zone->selector); +} + +/**********************************************************************/ +int makeLogicalZones(VDO *vdo, LogicalZones **zonesPtr) +{ + const ThreadConfig *threadConfig = getThreadConfig(vdo); + if (threadConfig->logicalZoneCount == 0) { + return VDO_SUCCESS; + } + + LogicalZones *zones; + int result = ALLOCATE_EXTENDED(LogicalZones, threadConfig->logicalZoneCount, + LogicalZone, __func__, &zones); + if (result != VDO_SUCCESS) { + return result; + } + + zones->vdo = vdo; + zones->zoneCount = threadConfig->logicalZoneCount; + for (ZoneCount zone = 0; zone < threadConfig->logicalZoneCount; zone++) { + result = initializeZone(zones, zone); + if (result != VDO_SUCCESS) { + freeLogicalZones(&zones); + return result; + } + } + + result = makeActionManager(zones->zoneCount, getThreadIDForZone, + getAdminThread(threadConfig), zones, NULL, + vdo->layer, &zones->manager); + if (result != VDO_SUCCESS) { + freeLogicalZones(&zones); + return result; + } + + *zonesPtr = zones; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeLogicalZones(LogicalZones **zonesPtr) +{ + LogicalZones *zones = *zonesPtr; + if (zones == NULL) { + return; + } + + freeActionManager(&zones->manager); + + for (ZoneCount index = 0; index < zones->zoneCount; index++) { + LogicalZone *zone = &zones->zones[index]; + freeAllocationSelector(&zone->selector); + destroyEnqueueable(&zone->completion); + freeIntMap(&zone->lbnOperations); + } + + FREE(zones); + *zonesPtr = NULL; +} + +/**********************************************************************/ +static inline void assertOnZoneThread(LogicalZone *zone, const char *what) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == zone->threadID), + "%s() called on correct thread", what); +} + +/** + * Check whether this zone has drained. + * + * @param zone The zone to check + **/ +static void checkForDrainComplete(LogicalZone *zone) +{ + if (!isDraining(&zone->state) || zone->notifying + || !isRingEmpty(&zone->writeVIOs)) { + return; + } + + finishDraining(&zone->state); +} + +/** + * Initiate a drain. + * + * Implements AdminInitiator. + **/ +static void initiateDrain(AdminState *state) +{ + checkForDrainComplete(container_of(state, LogicalZone, state)); +} + +/** + * Drain a logical zone. + * + *
Implements ZoneAction. + **/ +static void drainLogicalZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + LogicalZone *zone = getLogicalZone(context, zoneNumber); + startDraining(&zone->state, getCurrentManagerOperation(zone->zones->manager), + parent, initiateDrain); +} + +/**********************************************************************/ +void drainLogicalZones(LogicalZones *zones, + AdminStateCode operation, + VDOCompletion *parent) +{ + scheduleOperation(zones->manager, operation, NULL, drainLogicalZone, NULL, + parent); +} + +/** + * Resume a logical zone. + * + *
Implements ZoneAction. + **/ +static void resumeLogicalZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + LogicalZone *zone = getLogicalZone(context, zoneNumber); + finishCompletion(parent, resumeIfQuiescent(&zone->state)); +} + +/**********************************************************************/ +void resumeLogicalZones(LogicalZones *zones, VDOCompletion *parent) +{ + scheduleOperation(zones->manager, ADMIN_STATE_RESUMING, NULL, + resumeLogicalZone, NULL, parent); +} + +/**********************************************************************/ +ThreadID getLogicalZoneThreadID(const LogicalZone *zone) +{ + return zone->threadID; +} + +/**********************************************************************/ +BlockMapZone *getBlockMapForZone(const LogicalZone *zone) +{ + return zone->blockMapZone; +} + +/**********************************************************************/ +IntMap *getLBNLockMap(const LogicalZone *zone) +{ + return zone->lbnOperations; +} + +/**********************************************************************/ +LogicalZone *getNextLogicalZone(const LogicalZone *zone) +{ + return getLogicalZone(zone->zones, zone->zoneNumber + 1); +} + +/** + * Convert a RingNode to a DataVIO. + * + * @param ringNode The RingNode to convert + * + * @return The DataVIO which owns the RingNode + **/ +static inline DataVIO *dataVIOFromRingNode(RingNode *ringNode) +{ + return (DataVIO *) ((byte *) ringNode - offsetof(DataVIO, writeNode)); +} + +/** + * Update the oldest active generation. If it has changed, update the + * atomic copy as well. + * + * @param zone The zone + * + * @return true if the oldest active generation has changed + **/ +static bool updateOldestActiveGeneration(LogicalZone *zone) +{ + SequenceNumber currentOldest = zone->oldestActiveGeneration; + if (isRingEmpty(&zone->writeVIOs)) { + zone->oldestActiveGeneration = zone->flushGeneration; + } else { + zone->oldestActiveGeneration + = dataVIOFromRingNode(zone->writeVIOs.next)->flushGeneration; + } + + if (zone->oldestActiveGeneration == currentOldest) { + return false; + } + + atomicStore64(&zone->oldestLockedGeneration, zone->oldestActiveGeneration); + return true; +} + +/**********************************************************************/ +void incrementFlushGeneration(LogicalZone *zone, + SequenceNumber expectedGeneration) +{ + assertOnZoneThread(zone, __func__); + ASSERT_LOG_ONLY((zone->flushGeneration == expectedGeneration), + "logical zone %u flush generation %" PRIu64 + " should be %llu before increment", + zone->zoneNumber, zone->flushGeneration, + expectedGeneration); + + zone->flushGeneration++; + zone->iosInFlushGeneration = 0; + updateOldestActiveGeneration(zone); +} + +/**********************************************************************/ +SequenceNumber getOldestLockedGeneration(const LogicalZone *zone) +{ + return (SequenceNumber) atomicLoad64(&zone->oldestLockedGeneration); +} + +/**********************************************************************/ +int acquireFlushGenerationLock(DataVIO *dataVIO) +{ + LogicalZone *zone = dataVIO->logical.zone; + assertOnZoneThread(zone, __func__); + if (!isNormal(&zone->state)) { + return VDO_INVALID_ADMIN_STATE; + } + + dataVIO->flushGeneration = zone->flushGeneration; + pushRingNode(&zone->writeVIOs, &dataVIO->writeNode); + dataVIO->hasFlushGenerationLock = true; + zone->iosInFlushGeneration++; + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void 
attemptGenerationCompleteNotification(VDOCompletion *completion); + +/** + * Notify the flush that at least one generation no longer has active VIOs. + * This callback is registered in attemptGenerationCompleteNotification(). + * + * @param completion The zone completion + **/ +static void notifyFlusher(VDOCompletion *completion) +{ + LogicalZone *zone = asLogicalZone(completion); + completeFlushes(zone->zones->vdo->flusher); + launchCallback(completion, attemptGenerationCompleteNotification, + zone->threadID); +} + +/** + * Notify the flusher if some generation no longer has active VIOs. + * + * @param completion The zone completion + **/ +static void attemptGenerationCompleteNotification(VDOCompletion *completion) +{ + LogicalZone *zone = asLogicalZone(completion); + assertOnZoneThread(zone, __func__); + if (zone->oldestActiveGeneration <= zone->notificationGeneration) { + zone->notifying = false; + checkForDrainComplete(zone); + return; + } + + zone->notifying = true; + zone->notificationGeneration = zone->oldestActiveGeneration; + launchCallback(&zone->completion, notifyFlusher, + getFlusherThreadID(zone->zones->vdo->flusher)); +} + +/**********************************************************************/ +void releaseFlushGenerationLock(DataVIO *dataVIO) +{ + LogicalZone *zone = dataVIO->logical.zone; + assertOnZoneThread(zone, __func__); + if (isRingEmpty(&dataVIO->writeNode)) { + // This VIO never got a lock, either because it is a read, or because + // we are in read-only mode. + ASSERT_LOG_ONLY(!dataVIO->hasFlushGenerationLock, + "hasFlushGenerationLock false for VIO not on active list"); + return; + } + + unspliceRingNode(&dataVIO->writeNode); + dataVIO->hasFlushGenerationLock = false; + ASSERT_LOG_ONLY(zone->oldestActiveGeneration <= dataVIO->flushGeneration, + "DataVIO releasing lock on generation %" PRIu64 + " is not older than oldest active generation %llu", + dataVIO->flushGeneration, zone->oldestActiveGeneration); + + if (!updateOldestActiveGeneration(zone) || zone->notifying) { + return; + } + + attemptGenerationCompleteNotification(&zone->completion); +} + +/**********************************************************************/ +AllocationSelector *getAllocationSelector(LogicalZone *zone) +{ + return zone->selector; +} + +/**********************************************************************/ +void dumpLogicalZone(const LogicalZone *zone) +{ + logInfo("LogicalZone %u", zone->zoneNumber); + logInfo(" flushGeneration=%llu oldestActiveGeneration=%" PRIu64 + " oldestLockedGeneration=%llu notificationGeneration=%" PRIu64 + " notifying=%s iosInCurrentGeneration=%llu", + zone->flushGeneration, zone->oldestActiveGeneration, + relaxedLoad64(&zone->oldestLockedGeneration), + zone->notificationGeneration, boolToString(zone->notifying), + zone->iosInFlushGeneration); +} diff --git a/vdo/base/logicalZone.h b/vdo/base/logicalZone.h new file mode 100644 index 0000000..8e0eae6 --- /dev/null +++ b/vdo/base/logicalZone.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/logicalZone.h#3 $ + */ + +#ifndef LOGICAL_ZONE_H +#define LOGICAL_ZONE_H + +#include "adminState.h" +#include "intMap.h" +#include "types.h" + +/** + * Get a logical zone by number. + * + * @param zones A set of logical zones + * @param zoneNumber The number of the zone to get + * + * @return The requested zone + **/ +LogicalZone *getLogicalZone(LogicalZones *zones, ZoneCount zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Create a set of logical zones. + * + * @param [in] vdo The VDO to which the zones will belong + * @param [out] zonesPtr A pointer to hold the new zones + * + * @return VDO_SUCCESS or an error code + **/ +int makeLogicalZones(VDO *vdo, LogicalZones **zonesPtr) + __attribute__((warn_unused_result)); + +/** + * Free a set of logical zones and null out the reference to it. + * + * @param zonePtr A pointer to the zone to free + **/ +void freeLogicalZones(LogicalZones **zonePtr); + +/** + * Drain a set of logical zones. + * + * @param zones The logical zones to suspend + * @param operation The type of drain to perform + * @param completion The object to notify when the zones are suspended + **/ +void drainLogicalZones(LogicalZones *zones, + AdminStateCode operation, + VDOCompletion *completion); + +/** + * Resume a set of logical zones. + * + * @param zones The logical zones to resume + * @param parent The object to notify when the zones have resumed + **/ +void resumeLogicalZones(LogicalZones *zones, VDOCompletion *parent); + +/** + * Get the ID of a logical zone's thread. + * + * @param zone The zone + * + * @return The zone's thread ID + **/ +ThreadID getLogicalZoneThreadID(const LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the portion of the block map for this zone. + * + * @param zone The zone + * + * @return The block map zone + **/ +BlockMapZone *getBlockMapForZone(const LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the logical lock map for this zone. + * + * @param zone The zone + * + * @return The logical lock map for the zone + **/ +IntMap *getLBNLockMap(const LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the next-highest-numbered logical zone, or NULL if the + * zone is the highest-numbered zone in its VDO. + * + * @param zone The logical zone to query + * + * @return The logical zone whose zone number is one greater than the given + * zone, or NULL if there is no such zone + **/ +LogicalZone *getNextLogicalZone(const LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Increment the flush generation in a logical zone. + * + * @param zone The logical zone + * @param expectedGeneration The expected value of the flush generation + * before the increment + **/ +void incrementFlushGeneration(LogicalZone *zone, + SequenceNumber expectedGeneration); + +/** + * Get the oldest flush generation which is locked by a logical zone. + * + * @param zone The logical zone + * + * @return The oldest generation locked by the zone + **/ +SequenceNumber getOldestLockedGeneration(const LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Acquire the shared lock on a flush generation by a write DataVIO. 
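+ *
+ * (Illustrative sketch, not from the original header: a write DataVIO on its
+ * logical zone thread would typically pair this call with
+ * releaseFlushGenerationLock() once its write has completed.)
+ *
+ *   int result = acquireFlushGenerationLock(dataVIO);  // on the zone thread
+ *   if (result == VDO_SUCCESS) {
+ *     // ... carry out the write for this flush generation ...
+ *     releaseFlushGenerationLock(dataVIO);  // may let pending flushes finish
+ *   }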
+ * + * @param dataVIO The DataVIO + * + * @return VDO_SUCCESS or an error code + **/ +int acquireFlushGenerationLock(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Release the shared lock on a flush generation held by a write DataVIO. If + * there are pending flushes, and this DataVIO completes the oldest generation + * active in this zone, an attempt will be made to finish any flushes which may + * now be complete. + * + * @param dataVIO The DataVIO whose lock is to be released + **/ +void releaseFlushGenerationLock(DataVIO *dataVIO); + +/** + * Get the selector for deciding which physical zone should be allocated from + * next for activities in a logical zone. + * + * @param zone The logical zone of the operation which needs an allocation + * + * @return The allocation selector for this zone + **/ +AllocationSelector *getAllocationSelector(LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Dump information about a logical zone to the log for debugging, in a + * thread-unsafe fashion. + * + * @param zone The zone to dump + **/ +void dumpLogicalZone(const LogicalZone *zone); + +#endif // LOGICAL_ZONE_H diff --git a/vdo/base/lz4.c b/vdo/base/lz4.c new file mode 100644 index 0000000..1114aa8 --- /dev/null +++ b/vdo/base/lz4.c @@ -0,0 +1,886 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/lz4.c#2 $ + */ + +// Get the memcpy fixup from common.h. +#include "common.h" + +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-2012, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + - LZ4 source repository : http://code.google.com/p/lz4/ +*/ +/* + * With authors permission dual licensed as BSD/GPL for linux kernel + * + * Origin: http://lz4.googlecode.com/svn/trunk + * Revision: 88 + */ + +//************************************** +// Tuning parameters +//************************************** +// MEMORY_USAGE : +// Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) +// Increasing memory usage improves compression ratio +// Reduced memory usage can improve speed, due to cache effect +// Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache +#define MEMORY_USAGE 14 + +// NOTCOMPRESSIBLE_DETECTIONLEVEL : +// Decreasing this value will make the algorithm skip faster data segments considered "incompressible" +// This may decrease compression ratio dramatically, but will be faster on incompressible data +// Increasing this value will make the algorithm search more before declaring a segment "incompressible" +// This could improve compression a bit, but will be slower on incompressible data +// The default value (6) is recommended +#define NOTCOMPRESSIBLE_DETECTIONLEVEL 6 + +// BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE : +// This will provide a small boost to performance for big endian cpu, but the resulting compressed stream will be incompatible with little-endian CPU. +// You can set this option to 1 in situations where data will remain within closed environment +// This option is useless on Little_Endian CPU (such as x86) +//#define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 + + + +//************************************** +// CPU Feature Detection +//************************************** +// 32 or 64 bits ? +#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) || defined(__amd64) || defined(__ppc64__) || defined(_WIN64) || defined(__LP64__) || defined(_LP64) ) // Detects 64 bits mode +# define LZ4_ARCH64 1 +#else +# define LZ4_ARCH64 0 +#endif + +// Little Endian or Big Endian ? +// GCC normally defines these three macros (and PDP-endian which we ignore). +#if !defined(__ORDER_LITTLE_ENDIAN__) || !defined(__ORDER_BIG_ENDIAN__) \ + || !defined(__BYTE_ORDER__) +#error "GCC byte order macros not defined?" +#endif +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# define LZ4_BIG_ENDIAN 1 +#elif __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ +# error "fix byte order check" +#endif + +// Unaligned memory access is automatically enabled for "common" CPU, such as x86. 
+// For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected +// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance +#if defined(__ARM_FEATURE_UNALIGNED) +# define LZ4_FORCE_UNALIGNED_ACCESS 1 +#endif + +// Define this parameter if your target system or compiler does not support hardware bit count +#if defined(_MSC_VER) && defined(_WIN32_WCE) // Visual Studio for Windows CE does not support Hardware bit count +# define LZ4_FORCE_SW_BITCOUNT +#endif + + +//************************************** +// Compiler Options +//************************************** +#if __STDC_VERSION__ >= 199901L // C99 +/* "restrict" is a known keyword */ +#else +# define restrict // Disable restrict +#endif + +#define _GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifdef _MSC_VER // Visual Studio +# include // For Visual 2005 +# if LZ4_ARCH64 // 64-bit +# pragma intrinsic(_BitScanForward64) // For Visual 2005 +# pragma intrinsic(_BitScanReverse64) // For Visual 2005 +# else +# pragma intrinsic(_BitScanForward) // For Visual 2005 +# pragma intrinsic(_BitScanReverse) // For Visual 2005 +# endif +#endif + +#ifdef _MSC_VER +# define lz4_bswap16(x) _byteswap_ushort(x) +#else +# define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8))) +#endif + +#if (_GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +//************************************** +// Includes +//************************************** +#ifdef __KERNEL__ +# include // for memset +#else /* __KERNEL__ */ +# include // for malloc +# include // for memset +#endif /* __KERNEL__ */ +#include "lz4.h" + + +//************************************** +// Basic Types +//************************************** +#if defined(_MSC_VER) // Visual Studio does not support 'stdint' natively +# define BYTE unsigned __int8 +# define U16 unsigned __int16 +# define U32 unsigned __int32 +# define S32 __int32 +# define U64 unsigned __int64 +#else +# ifdef __KERNEL__ +# include +# else /* __KERNEL__ */ +# include +# endif /* __KERNEL__ */ +# define BYTE uint8_t +# define U16 uint16_t +# define U32 uint32_t +# define S32 int32_t +# define U64 uint64_t +#endif + +#ifndef LZ4_FORCE_UNALIGNED_ACCESS +# pragma pack(push, 1) +#endif + +typedef struct _U16_S { U16 v; } U16_S; +typedef struct _U32_S { U32 v; } U32_S; +typedef struct _U64_S { U64 v; } U64_S; + +#ifndef LZ4_FORCE_UNALIGNED_ACCESS +# pragma pack(pop) +#endif + +#define A64(x) (((U64_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A16(x) (((U16_S *)(x))->v) + + +//************************************** +// Constants +//************************************** +#define MINMATCH 4 + +#define HASH_LOG (MEMORY_USAGE-2) +#define HASHTABLESIZE (1 << HASH_LOG) +#define HASH_MASK (HASHTABLESIZE - 1) + +#define SKIPSTRENGTH (NOTCOMPRESSIBLE_DETECTIONLEVEL>2?NOTCOMPRESSIBLE_DETECTIONLEVEL:2) +#define STACKLIMIT 13 +#define HEAPMODE (HASH_LOG>STACKLIMIT) // Defines if memory is allocated into the stack (local variable), or into the heap (malloc()). 
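+// Worked example (annotation added for clarity, not in the original source):
+// with MEMORY_USAGE 14, HASH_LOG is 12, so HASHTABLESIZE is 1 << 12 == 4096
+// entries, and HEAPMODE is (12 > 13) == 0, meaning the compression hash table
+// is a stack-allocated local rather than a malloc()'d buffer.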
+#define COPYLENGTH 8 +#define LASTLITERALS 5 +#define MFLIMIT (COPYLENGTH+MINMATCH) +#define MINLENGTH (MFLIMIT+1) + +#define MAXD_LOG 16 +#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) + +#define ML_BITS 4 +#define ML_MASK ((1U<> ((MINMATCH*8)-HASH_LOG)) +#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p)) +#define LZ4_WILDCOPY(s,d,e) do { LZ4_COPYPACKET(s,d) } while (d>3); + #elif defined(__GNUC__) && (_GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll(val) >> 3); + #else + int r; + if (!(val>>32)) { r=4; } else { r=0; val>>=32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; + #endif +#else + #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64( &r, val ); + return (int)(r>>3); + #elif defined(__GNUC__) && (_GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll(val) >> 3); + #else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -val) * 0x0218A392CDABBD3F)) >> 58]; + #endif +#endif +} + +#else + +static inline int LZ4_NbCommonBytes (register U32 val) +{ +#if defined(LZ4_BIG_ENDIAN) + #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, val ); + return (int)(r>>3); + #elif defined(__GNUC__) && (_GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz(val) >> 3); + #else + int r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; + #endif +#else + #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward( &r, val ); + return (int)(r>>3); + #elif defined(__GNUC__) && (_GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz(val) >> 3); + #else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; + #endif +#endif +} + +#endif + + + +//****************************** +// Compression functions +//****************************** + +// LZ4_compressCtx : +// ----------------- +// Compress 'isize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'. +// If it cannot achieve it, compression will stop, and result of the function will be zero. 
+// return : the number of bytes written in buffer 'dest', or 0 if the compression fails + +static inline int LZ4_compressCtx(void** ctx, + const char* source, + char* dest, + int isize, + int maxOutputSize) +{ +#if HEAPMODE + struct refTables *srt = (struct refTables *) (*ctx); + HTYPE* HashTable; +#else + HTYPE HashTable[HASHTABLESIZE] = {0}; +#endif + + const BYTE* ip = (BYTE*) source; + INITBASE(base); + const BYTE* anchor = ip; + const BYTE* const iend = ip + isize; + const BYTE* const mflimit = iend - MFLIMIT; +#define matchlimit (iend - LASTLITERALS) + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + maxOutputSize; + + int len, length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + + // Init + if (isizehashTable); + memset((void*)HashTable, 0, sizeof(srt->hashTable)); +#else + (void) ctx; +#endif + + + // First Byte + HashTable[LZ4_HASH_VALUE(ip)] = ip - base; + ip++; forwardH = LZ4_HASH_VALUE(ip); + + // Main Loop + for ( ; ; ) + { + int findMatchAttempts = (1U << skipStrength) + 3; + const BYTE* forwardIp = ip; + const BYTE* ref; + BYTE* token; + + // Find a match + do { + U32 h = forwardH; + int step = findMatchAttempts++ >> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if (unlikely(forwardIp > mflimit)) { goto _last_literals; } + + forwardH = LZ4_HASH_VALUE(forwardIp); + ref = base + HashTable[h]; + HashTable[h] = ip - base; + + } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); + + // Catch up + while ((ip>anchor) && (ref>(BYTE*)source) && unlikely(ip[-1]==ref[-1])) { ip--; ref--; } + + // Encode Literal length + length = (int)(ip - anchor); + token = op++; + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + (length>>8) > oend)) return 0; // Check output limit +#ifdef _MSC_VER + if (length>=(int)RUN_MASK) + { + int len = length-RUN_MASK; + *token=(RUN_MASK<254) + { + do { *op++ = 255; len -= 255; } while (len>254); + *op++ = (BYTE)len; + memcpy(op, anchor, length); + op += length; + goto _next_match; + } + else + *op++ = (BYTE)len; + } + else *token = (length<=(int)RUN_MASK) { *token=(RUN_MASK< 254 ; len-=255) *op++ = 255; *op++ = (BYTE)len; } + else *token = (length<>8) > oend)) return 0; // Check output limit + if (len>=(int)ML_MASK) { *token+=ML_MASK; len-=ML_MASK; for(; len > 509 ; len-=510) { *op++ = 255; *op++ = 255; } if (len > 254) { len-=255; *op++ = 255; } *op++ = (BYTE)len; } + else *token += len; + + // Test end of chunk + if (ip > mflimit) { anchor = ip; break; } + + // Fill table + HashTable[LZ4_HASH_VALUE(ip-2)] = ip - 2 - base; + + // Test next position + ref = base + HashTable[LZ4_HASH_VALUE(ip)]; + HashTable[LZ4_HASH_VALUE(ip)] = ip - base; + if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { token = op++; *token=0; goto _next_match; } + + // Prepare next loop + anchor = ip++; + forwardH = LZ4_HASH_VALUE(ip); + } + +_last_literals: + // Encode Last Literals + { + int lastRun = (int)(iend - anchor); + if (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize) return 0; + if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK< 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } + else *op++ = (lastRun<> ((MINMATCH*8)-HASHLOG64K)) +#define LZ4_HASH64K_VALUE(p) LZ4_HASH64K_FUNCTION(A32(p)) +static inline int LZ4_compress64kCtx(void** ctx, + const char* source, + char* dest, + int isize, + int maxOutputSize) +{ +#if HEAPMODE + struct refTables *srt = (struct refTables *) (*ctx); + U16* HashTable; +#else + U16 HashTable[HASH64KTABLESIZE] = {0}; +#endif + + const BYTE* ip = 
(BYTE*) source; + const BYTE* anchor = ip; + const BYTE* const base = ip; + const BYTE* const iend = ip + isize; + const BYTE* const mflimit = iend - MFLIMIT; +#define matchlimit (iend - LASTLITERALS) + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + maxOutputSize; + + int len, length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + + // Init + if (isizehashTable); + memset((void*)HashTable, 0, sizeof(srt->hashTable)); +#else + (void) ctx; +#endif + + + // First Byte + ip++; forwardH = LZ4_HASH64K_VALUE(ip); + + // Main Loop + for ( ; ; ) + { + int findMatchAttempts = (1U << skipStrength) + 3; + const BYTE* forwardIp = ip; + const BYTE* ref; + BYTE* token; + + // Find a match + do { + U32 h = forwardH; + int step = findMatchAttempts++ >> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if (forwardIp > mflimit) { goto _last_literals; } + + forwardH = LZ4_HASH64K_VALUE(forwardIp); + ref = base + HashTable[h]; + HashTable[h] = (U16)(ip - base); + + } while (A32(ref) != A32(ip)); + + // Catch up + while ((ip>anchor) && (ref>(BYTE*)source) && (ip[-1]==ref[-1])) { ip--; ref--; } + + // Encode Literal length + length = (int)(ip - anchor); + token = op++; + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + (length>>8) > oend)) return 0; // Check output limit +#ifdef _MSC_VER + if (length>=(int)RUN_MASK) + { + int len = length-RUN_MASK; + *token=(RUN_MASK<254) + { + do { *op++ = 255; len -= 255; } while (len>254); + *op++ = (BYTE)len; + memcpy(op, anchor, length); + op += length; + goto _next_match; + } + else + *op++ = (BYTE)len; + } + else *token = (length<=(int)RUN_MASK) { *token=(RUN_MASK< 254 ; len-=255) *op++ = 255; *op++ = (BYTE)len; } + else *token = (length<>8) > oend)) return 0; // Check output limit + if (len>=(int)ML_MASK) { *token+=ML_MASK; len-=ML_MASK; for(; len > 509 ; len-=510) { *op++ = 255; *op++ = 255; } if (len > 254) { len-=255; *op++ = 255; } *op++ = (BYTE)len; } + else *token += len; + + // Test end of chunk + if (ip > mflimit) { anchor = ip; break; } + + // Fill table + HashTable[LZ4_HASH64K_VALUE(ip-2)] = (U16)(ip - 2 - base); + + // Test next position + ref = base + HashTable[LZ4_HASH64K_VALUE(ip)]; + HashTable[LZ4_HASH64K_VALUE(ip)] = (U16)(ip - base); + if (A32(ref) == A32(ip)) { token = op++; *token=0; goto _next_match; } + + // Prepare next loop + anchor = ip++; + forwardH = LZ4_HASH64K_VALUE(ip); + } + +_last_literals: + // Encode Last Literals + { + int lastRun = (int)(iend - anchor); + if (op + lastRun + 1 + (lastRun-RUN_MASK+255)/255 > oend) return 0; + if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK< 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } + else *op++ = (lastRun<>ML_BITS)) == RUN_MASK) { size_t len; for (;(len=*ip++)==255;length+=255){} length += len; } + + // copy literals + cpy = op+length; + if (unlikely(cpy>oend-COPYLENGTH)) + { + if (cpy != oend) goto _output_error; // Error : not enough place for another match (min 4) + 5 literals + memcpy(op, ip, length); + ip += length; + break; // EOF + } + LZ4_WILDCOPY(ip, op, cpy); ip -= (op-cpy); op = cpy; + + // get offset + LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; + if (unlikely(ref < (BYTE* const)dest)) goto _output_error; // Error : offset create reference outside destination buffer + + // get matchlength + if ((length=(token&ML_MASK)) == ML_MASK) { for (;*ip==255;length+=255) {ip++;} length += *ip++; } + + // copy repeated sequence + if (unlikely((op-ref)oend-COPYLENGTH) + { + if (cpy > oend) goto _output_error; // Error : request to write beyond destination 
buffer + LZ4_SECURECOPY(ref, op, (oend-COPYLENGTH)); + while(op>ML_BITS)) == RUN_MASK) { int s=255; while ((ipoend-COPYLENGTH) || (ip+length>iend-COPYLENGTH)) + { + if (cpy > oend) goto _output_error; // Error : writes beyond output buffer + if (ip+length != iend) goto _output_error; // Error : LZ4 format requires to consume all input at this stage + memcpy(op, ip, length); + op += length; + break; // Necessarily EOF, due to parsing restrictions + } + LZ4_WILDCOPY(ip, op, cpy); ip -= (op-cpy); op = cpy; + + // get offset + LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; + if (ref < (BYTE* const)dest) goto _output_error; // Error : offset creates reference outside of destination buffer + + // get matchlength + if ((length=(token&ML_MASK)) == ML_MASK) { while (ipoend-COPYLENGTH) + { + if (cpy > oend) goto _output_error; // Error : request to write outside of destination buffer + LZ4_SECURECOPY(ref, op, (oend-COPYLENGTH)); + while(op 0) && ((n & (n - 1)) == 0); +} + +/** + * Efficiently calculate the base-2 logarithm of a number truncated to an + * integer value. + * + * This also happens to be the bit index of the highest-order non-zero bit in + * the binary representation of the number, which can easily be used to + * calculate the bit shift corresponding to a bit mask or an array capacity, + * or to calculate the binary floor or ceiling (next lowest or highest power + * of two). + * + * @param n The input value + * + * @return the integer log2 of the value, or -1 if the value is zero + **/ +static inline int logBaseTwo(uint64_t n) +{ + if (n == 0) { + return -1; + } + // Many CPUs, including x86, directly support this calculation, so use the + // GCC function for counting the number of leading high-order zero bits. + return 63 - __builtin_clzll(n); +} + +/** + * Find the minimum of two physical block numbers. + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber minBlock(PhysicalBlockNumber a, + PhysicalBlockNumber b) +{ + return (a < b) ? a : b; +} + +/** + * Find the maximum of two physical block numbers. + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber maxBlock(PhysicalBlockNumber a, + PhysicalBlockNumber b) +{ + return (a > b) ? a : b; +} + +/** + * Find the minimum of two block counts. + **/ +__attribute__((warn_unused_result)) +static inline BlockCount minBlockCount(BlockCount a, BlockCount b) +{ + return (a < b) ? a : b; +} + +/** + * Find the maximum of two block counts. + **/ +__attribute__((warn_unused_result)) +static inline BlockCount maxBlockCount(BlockCount a, BlockCount b) +{ + return (a > b) ? a : b; +} + +/** + * Find the minimum of two sequence numbers. + **/ +__attribute__((warn_unused_result)) +static inline SequenceNumber minSequenceNumber(SequenceNumber a, + SequenceNumber b) +{ + return (a < b) ? a : b; +} + +/** + * Return the minimum of two page counts. + **/ +__attribute__((warn_unused_result)) +static inline PageCount minPageCount(PageCount a, PageCount b) +{ + return (a < b) ? a : b; +} + +/** + * Return the maximum of two page counts. + **/ +__attribute__((warn_unused_result)) +static inline PageCount maxPageCount(PageCount a, PageCount b) +{ + return (a > b) ? a : b; +} + +/** + * Round upward towards the nearest multiple of quantum. 
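+ *
+ * For example (illustrative): roundUpToMultipleSizeT(1000, 4096) returns
+ * 4096, and roundUpToMultipleSizeT(8192, 4096) returns 8192, since 8192 is
+ * already a multiple of the quantum.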
+ * + * @param number a number + * @param quantum the quantum + * + * @return the least multiple of quantum not less than number + **/ +__attribute__((warn_unused_result)) +static inline size_t roundUpToMultipleSizeT(size_t number, size_t quantum) +{ + return number + quantum - 1 - ((number + quantum - 1) % quantum); +} + +/** + * Round upward towards the nearest multiple of quantum for uint64_t + * + * @param number a number + * @param quantum the quantum + * + * @return the least multiple of quantum not less than number + **/ +__attribute__((warn_unused_result)) +static inline uint64_t roundUpToMultipleUInt64T(uint64_t number, + uint64_t quantum) +{ + return number + quantum - 1 - ((number + quantum - 1) % quantum); +} + +/** + * Check whether the given value is between the lower and upper bounds, + * within a cyclic range of values from 0 to (modulus - 1). The value + * and both bounds must be smaller than the modulus. + * + * @param lower The lowest value to accept + * @param value The value to check + * @param upper The highest value to accept + * @param modulus The size of the cyclic space, no more than 2^15 + * + * @return true if the value is in range + **/ +static inline bool inCyclicRange(uint16_t lower, + uint16_t value, + uint16_t upper, + uint16_t modulus) +{ + if (value < lower) { + value += modulus; + } + if (upper < lower) { + upper += modulus; + } + return (value <= upper); +} + +/** + * Compute the number of buckets of a given size which are required to hold a + * given number of objects. + * + * @param objectCount The number of objects to hold + * @param bucketSize The size of a bucket + * + * @return The number of buckets required + **/ +static inline uint64_t computeBucketCount(uint64_t objectCount, + uint64_t bucketSize) +{ + uint64_t quotient = objectCount / bucketSize; + if ((objectCount % bucketSize) > 0) { + ++quotient; + } + return quotient; +} + +#endif // NUM_UTILS_H diff --git a/vdo/base/packedRecoveryJournalBlock.h b/vdo/base/packedRecoveryJournalBlock.h new file mode 100644 index 0000000..b592225 --- /dev/null +++ b/vdo/base/packedRecoveryJournalBlock.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/packedRecoveryJournalBlock.h#3 $ + */ + +#ifndef PACKED_RECOVERY_JOURNAL_BLOCK_H +#define PACKED_RECOVERY_JOURNAL_BLOCK_H + +#include "numeric.h" + +#include "constants.h" +#include "recoveryJournalEntry.h" +#include "types.h" + +typedef struct { + SequenceNumber blockMapHead; // Block map head sequence number + SequenceNumber slabJournalHead; // Slab journal head sequence number + SequenceNumber sequenceNumber; // Sequence number for this block + Nonce nonce; // A given VDO instance's nonce + BlockCount logicalBlocksUsed; // Count of logical blocks in use + BlockCount blockMapDataBlocks; // Count of allocated block map pages + JournalEntryCount entryCount; // Number of entries written + uint8_t checkByte; // The protection check byte + uint8_t recoveryCount; // The number of recoveries completed + VDOMetadataType metadataType; // Metadata type +} RecoveryBlockHeader; + +/** + * The packed, on-disk representation of a recovery journal block header. + * All fields are kept in little-endian byte order. + **/ +typedef union __attribute__((packed)) { + struct __attribute__((packed)) { + /** Block map head 64-bit sequence number */ + byte blockMapHead[8]; + + /** Slab journal head 64-bit sequence number */ + byte slabJournalHead[8]; + + /** The 64-bit sequence number for this block */ + byte sequenceNumber[8]; + + /** A given VDO instance's 64-bit nonce */ + byte nonce[8]; + + /** 8-bit metadata type (should always be one for the recovery journal) */ + uint8_t metadataType; + + /** 16-bit count of the entries encoded in the block */ + byte entryCount[2]; + + /** 64-bit count of the logical blocks used when this block was opened */ + byte logicalBlocksUsed[8]; + + /** 64-bit count of the block map blocks used when this block was opened */ + byte blockMapDataBlocks[8]; + + /** The protection check byte */ + uint8_t checkByte; + + /** The number of recoveries completed */ + uint8_t recoveryCount; + } fields; + + // A raw view of the packed encoding. + uint8_t raw[8 + 8 + 8 + 8 + 1 + 2 + 8 + 8 + 1 + 1]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining packed entries in GDB. + struct __attribute__((packed)) { + SequenceNumber blockMapHead; + SequenceNumber slabJournalHead; + SequenceNumber sequenceNumber; + Nonce nonce; + VDOMetadataType metadataType; + JournalEntryCount entryCount; + BlockCount logicalBlocksUsed; + BlockCount blockMapDataBlocks; + uint8_t checkByte; + uint8_t recoveryCount; + } littleEndian; +#endif +} PackedJournalHeader; + +typedef struct { + /** The protection check byte */ + uint8_t checkByte; + + /** The number of recoveries completed */ + uint8_t recoveryCount; + + /** The number of entries in this sector */ + uint8_t entryCount; + + /** Journal entries for this sector */ + PackedRecoveryJournalEntry entries[]; +} __attribute__((packed)) PackedJournalSector; + +enum { + // Allowing more than 311 entries in each block changes the math + // concerning the amortization of metadata writes and recovery speed. 
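+ // (Added illustration, assuming 512-byte sectors within a 4 KB journal
+ // block: the block header occupies the first sector, and the remaining
+ // seven sectors hold the entries, so 311 must fit within the per-sector
+ // capacities computed below.)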
+ RECOVERY_JOURNAL_ENTRIES_PER_BLOCK = 311, + /** The number of entries in each sector (except the last) when filled */ + RECOVERY_JOURNAL_ENTRIES_PER_SECTOR + = ((VDO_SECTOR_SIZE - sizeof(PackedJournalSector)) + / sizeof(PackedRecoveryJournalEntry)), + /** The number of entries in the last sector when a block is full */ + RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR + = (RECOVERY_JOURNAL_ENTRIES_PER_BLOCK + % RECOVERY_JOURNAL_ENTRIES_PER_SECTOR), +}; + +/** + * Find the recovery journal sector from the block header and sector number. + * + * @param header The header of the recovery journal block + * @param sectorNumber The index of the sector (1-based) + * + * @return A packed recovery journal sector + **/ +__attribute__((warn_unused_result)) +static inline +PackedJournalSector *getJournalBlockSector(PackedJournalHeader *header, + int sectorNumber) +{ + char *sectorData = ((char *) header) + (VDO_SECTOR_SIZE * sectorNumber); + return (PackedJournalSector *) sectorData; +} + +/** + * Generate the packed representation of a recovery block header. + * + * @param header The header containing the values to encode + * @param packed The header into which to pack the values + **/ +static inline void packRecoveryBlockHeader(const RecoveryBlockHeader *header, + PackedJournalHeader *packed) +{ + storeUInt64LE(packed->fields.blockMapHead, header->blockMapHead); + storeUInt64LE(packed->fields.slabJournalHead, header->slabJournalHead); + storeUInt64LE(packed->fields.sequenceNumber, header->sequenceNumber); + storeUInt64LE(packed->fields.nonce, header->nonce); + storeUInt64LE(packed->fields.logicalBlocksUsed, header->logicalBlocksUsed); + storeUInt64LE(packed->fields.blockMapDataBlocks, header->blockMapDataBlocks); + storeUInt16LE(packed->fields.entryCount, header->entryCount); + + packed->fields.checkByte = header->checkByte; + packed->fields.recoveryCount = header->recoveryCount; + packed->fields.metadataType = header->metadataType; +} + +/** + * Decode the packed representation of a recovery block header. + * + * @param packed The packed header to decode + * @param header The header into which to unpack the values + **/ +static inline void unpackRecoveryBlockHeader(const PackedJournalHeader *packed, + RecoveryBlockHeader *header) +{ + *header = (RecoveryBlockHeader) { + .blockMapHead = getUInt64LE(packed->fields.blockMapHead), + .slabJournalHead = getUInt64LE(packed->fields.slabJournalHead), + .sequenceNumber = getUInt64LE(packed->fields.sequenceNumber), + .nonce = getUInt64LE(packed->fields.nonce), + .logicalBlocksUsed = getUInt64LE(packed->fields.logicalBlocksUsed), + .blockMapDataBlocks = getUInt64LE(packed->fields.blockMapDataBlocks), + .entryCount = getUInt16LE(packed->fields.entryCount), + .checkByte = packed->fields.checkByte, + .recoveryCount = packed->fields.recoveryCount, + .metadataType = packed->fields.metadataType, + }; +} + +#endif // PACKED_RECOVERY_JOURNAL_BLOCK_H diff --git a/vdo/base/packer.c b/vdo/base/packer.c new file mode 100644 index 0000000..efb4dd4 --- /dev/null +++ b/vdo/base/packer.c @@ -0,0 +1,1023 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/packer.c#8 $ + */ + +#include "packerInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminState.h" +#include "allocatingVIO.h" +#include "allocationSelector.h" +#include "compressionState.h" +#include "dataVIO.h" +#include "hashLock.h" +#include "pbnLock.h" +#include "vdo.h" +#include "vdoInternal.h" + +/** + * Check that we are on the packer thread. + * + * @param packer The packer + * @param caller The function which is asserting + **/ +static inline void assertOnPackerThread(Packer *packer, const char *caller) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == packer->threadID), + "%s() called from packer thread", caller); +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static inline InputBin *inputBinFromRingNode(RingNode *node) +{ + STATIC_ASSERT(offsetof(InputBin, ring) == 0); + return (InputBin *) node; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static inline OutputBin *outputBinFromRingNode(RingNode *node) +{ + STATIC_ASSERT(offsetof(OutputBin, ring) == 0); + return (OutputBin *) node; +} + +/**********************************************************************/ +InputBin *nextBin(const Packer *packer, InputBin *bin) +{ + if (bin->ring.next == &packer->inputBins) { + return NULL; + } else { + return inputBinFromRingNode(bin->ring.next); + } +} + +/**********************************************************************/ +InputBin *getFullestBin(const Packer *packer) +{ + if (isRingEmpty(&packer->inputBins)) { + return NULL; + } else { + return inputBinFromRingNode(packer->inputBins.next); + } +} + +/** + * Insert an input bin to the list, which is in ascending order of free space. + * Since all bins are already in the list, this actually moves the bin to the + * correct position in the list. + * + * @param packer The packer + * @param bin The input bin to move to its sorted position + **/ +static void insertInSortedList(Packer *packer, InputBin *bin) +{ + for (InputBin *activeBin = getFullestBin(packer); + activeBin != NULL; + activeBin = nextBin(packer, activeBin)) { + if (activeBin->freeSpace > bin->freeSpace) { + pushRingNode(&activeBin->ring, &bin->ring); + return; + } + } + + pushRingNode(&packer->inputBins, &bin->ring); +} + +/** + * Allocate an input bin and put it into the packer's list. + * + * @param packer The packer + **/ +__attribute__((warn_unused_result)) +static int makeInputBin(Packer *packer) +{ + InputBin *bin; + int result = ALLOCATE_EXTENDED(InputBin, MAX_COMPRESSION_SLOTS, VIO *, + __func__, &bin); + if (result != VDO_SUCCESS) { + return result; + } + + bin->freeSpace = packer->binDataSize; + initializeRing(&bin->ring); + pushRingNode(&packer->inputBins, &bin->ring); + return VDO_SUCCESS; +} + +/** + * Push an output bin onto the stack of idle bins. 
+ * + * @param packer The packer + * @param bin The output bin + **/ +static void pushOutputBin(Packer *packer, OutputBin *bin) +{ + ASSERT_LOG_ONLY(!hasWaiters(&bin->outgoing), + "idle output bin has no waiters"); + packer->idleOutputBins[packer->idleOutputBinCount++] = bin; +} + +/** + * Pop an output bin off the end of the stack of idle bins. + * + * @param packer The packer + * + * @return an idle output bin, or NULL if there are no idle bins + **/ +__attribute__((warn_unused_result)) +static OutputBin *popOutputBin(Packer *packer) +{ + if (packer->idleOutputBinCount == 0) { + return NULL; + } + + size_t index = --packer->idleOutputBinCount; + OutputBin *bin = packer->idleOutputBins[index]; + packer->idleOutputBins[index] = NULL; + return bin; +} + +/** + * Allocate a new output bin and push it onto the packer's stack of idle bins. + * + * @param packer The packer + * @param layer The physical layer that will receive the compressed block + * writes from the output bin + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int makeOutputBin(Packer *packer, PhysicalLayer *layer) +{ + OutputBin *output; + int result = ALLOCATE(1, OutputBin, __func__, &output); + if (result != VDO_SUCCESS) { + return result; + } + + // Add the bin to the stack even before it's fully initialized so it will + // be freed even if we fail to initialize it below. + initializeRing(&output->ring); + pushRingNode(&packer->outputBins, &output->ring); + pushOutputBin(packer, output); + + result = ALLOCATE_EXTENDED(CompressedBlock, packer->binDataSize, char, + "compressed block", &output->block); + if (result != VDO_SUCCESS) { + return result; + } + + return layer->createCompressedWriteVIO(layer, output, (char *) output->block, + &output->writer); +} + +/** + * Free an idle output bin and null out the reference to it. + * + * @param binPtr The reference to the output bin to free + **/ +static void freeOutputBin(OutputBin **binPtr) +{ + OutputBin *bin = *binPtr; + if (bin == NULL) { + return; + } + + unspliceRingNode(&bin->ring); + + VIO *vio = allocatingVIOAsVIO(bin->writer); + freeVIO(&vio); + FREE(bin->block); + FREE(bin); + *binPtr = NULL; +} + +/**********************************************************************/ +int makePacker(PhysicalLayer *layer, + BlockCount inputBinCount, + BlockCount outputBinCount, + const ThreadConfig *threadConfig, + Packer **packerPtr) +{ + Packer *packer; + int result = ALLOCATE_EXTENDED(Packer, outputBinCount, + OutputBin *, __func__, &packer); + if (result != VDO_SUCCESS) { + return result; + } + + packer->threadID = getPackerZoneThread(threadConfig); + packer->binDataSize = VDO_BLOCK_SIZE - sizeof(CompressedBlockHeader); + packer->size = inputBinCount; + packer->maxSlots = MAX_COMPRESSION_SLOTS; + packer->outputBinCount = outputBinCount; + initializeRing(&packer->inputBins); + initializeRing(&packer->outputBins); + + result = makeAllocationSelector(threadConfig->physicalZoneCount, + packer->threadID, &packer->selector); + if (result != VDO_SUCCESS) { + freePacker(&packer); + return result; + } + + for (BlockCount i = 0; i < inputBinCount; i++) { + int result = makeInputBin(packer); + if (result != VDO_SUCCESS) { + freePacker(&packer); + return result; + } + } + + /* + * The canceled bin can hold up to half the number of user VIOs. Every + * canceled VIO in the bin must have a canceler for which it is waiting, and + * any canceler will only have canceled one lock holder at a time. 
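+ * Since each canceler is itself a distinct in-flight user VIO, those two
+ * constraints together bound the bin's population to half of the user VIOs.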
+ */ + result = ALLOCATE_EXTENDED(InputBin, MAXIMUM_USER_VIOS / 2, VIO *, __func__, + &packer->canceledBin); + if (result != VDO_SUCCESS) { + freePacker(&packer); + return result; + } + + for (BlockCount i = 0; i < outputBinCount; i++) { + int result = makeOutputBin(packer, layer); + if (result != VDO_SUCCESS) { + freePacker(&packer); + return result; + } + } + + *packerPtr = packer; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freePacker(Packer **packerPtr) +{ + Packer *packer = *packerPtr; + if (packer == NULL) { + return; + } + + InputBin *input; + while ((input = getFullestBin(packer)) != NULL) { + unspliceRingNode(&input->ring); + FREE(input); + } + + FREE(packer->canceledBin); + + OutputBin *output; + while ((output = popOutputBin(packer)) != NULL) { + freeOutputBin(&output); + } + + freeAllocationSelector(&packer->selector); + FREE(packer); + *packerPtr = NULL; +} + +/** + * Get the Packer from a DataVIO. + * + * @param dataVIO The DataVIO + * + * @return The Packer from the VDO to which the DataVIO belongs + **/ +static inline Packer *getPackerFromDataVIO(DataVIO *dataVIO) +{ + return getVDOFromDataVIO(dataVIO)->packer; +} + +/**********************************************************************/ +bool isSufficientlyCompressible(DataVIO *dataVIO) +{ + Packer *packer = getPackerFromDataVIO(dataVIO); + return (dataVIO->compression.size < packer->binDataSize); +} + +/**********************************************************************/ +ThreadID getPackerThreadID(Packer *packer) +{ + return packer->threadID; +} + +/**********************************************************************/ +PackerStatistics getPackerStatistics(const Packer *packer) +{ + /* + * This is called from getVDOStatistics(), which is called from outside the + * packer thread. These are just statistics with no semantics that could + * rely on memory order, so unfenced reads are sufficient. + */ + return (PackerStatistics) { + .compressedFragmentsWritten = relaxedLoad64(&packer->fragmentsWritten), + .compressedBlocksWritten = relaxedLoad64(&packer->blocksWritten), + .compressedFragmentsInPacker = relaxedLoad64(&packer->fragmentsPending), + }; +} + +/** + * Abort packing a DataVIO. + * + * @param dataVIO The DataVIO to abort + **/ +static void abortPacking(DataVIO *dataVIO) +{ + setCompressionDone(dataVIO); + relaxedAdd64(&getPackerFromDataVIO(dataVIO)->fragmentsPending, -1); + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + continueDataVIO(dataVIO, VDO_SUCCESS); +} + +/** + * This continues the VIO completion without packing the VIO. + * + * @param waiter The wait queue entry of the VIO to continue + * @param unused An argument required so this function may be called + * from notifyAllWaiters + **/ +static void continueVIOWithoutPacking(Waiter *waiter, + void *unused __attribute__((unused))) +{ + abortPacking(waiterAsDataVIO(waiter)); +} + +/** + * Check whether the packer has drained. + * + * @param packer The packer + **/ +static void checkForDrainComplete(Packer *packer) +{ + if (isDraining(&packer->state) + && (packer->canceledBin->slotsUsed == 0) + && (packer->idleOutputBinCount == packer->outputBinCount)) { + finishDraining(&packer->state); + } +} + +/**********************************************************************/ +static void writePendingBatches(Packer *packer); + +/** + * Ensure that a completion is running on the packer thread. 
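+ * If it is running elsewhere, the completion's callback is re-dispatched to
+ * the packer thread and false is returned so the current invocation can
+ * simply return.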
+ * + * @param completion The compressed write VIO + * + * @return true if the completion is on the packer thread + **/ +__attribute__((warn_unused_result)) +static bool switchToPackerThread(VDOCompletion *completion) +{ + VIO *vio = asVIO(completion); + ThreadID threadID = vio->vdo->packer->threadID; + if (completion->callbackThreadID == threadID) { + return true; + } + + completion->callbackThreadID = threadID; + invokeCallback(completion); + return false; +} + +/** + * Finish processing an output bin whose write has completed. If there was + * an error, any DataVIOs waiting on the bin write will be notified. + * + * @param packer The packer which owns the bin + * @param bin The bin which has finished + **/ +static void finishOutputBin(Packer *packer, OutputBin *bin) +{ + if (hasWaiters(&bin->outgoing)) { + notifyAllWaiters(&bin->outgoing, continueVIOWithoutPacking, NULL); + } else { + // No waiters implies no error, so the compressed block was written. + relaxedAdd64(&packer->fragmentsPending, -bin->slotsUsed); + relaxedAdd64(&packer->fragmentsWritten, bin->slotsUsed); + relaxedAdd64(&packer->blocksWritten, 1); + } + + bin->slotsUsed = 0; + pushOutputBin(packer, bin); +} + +/** + * This finishes the bin write process after the bin is written to disk. This + * is the VIO callback function registered by writeOutputBin(). + * + * @param completion The compressed write VIO + **/ +static void completeOutputBin(VDOCompletion *completion) +{ + if (!switchToPackerThread(completion)) { + return; + } + + VIO *vio = asVIO(completion); + if (completion->result != VDO_SUCCESS) { + updateVIOErrorStats(vio, + "Completing compressed write VIO for physical block %" + PRIu64 " with error", + vio->physical); + } + + Packer *packer = vio->vdo->packer; + finishOutputBin(packer, completion->parent); + writePendingBatches(packer); + checkForDrainComplete(packer); +} + +/** + * Implements WaiterCallback. Continues the DataVIO waiter. + **/ +static void continueWaiter(Waiter *waiter, + void *context __attribute__((unused))) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + continueDataVIO(dataVIO, VDO_SUCCESS); +} + +/** + * Implements WaiterCallback. Updates the DataVIO waiter to refer to its slot + * in the compressed block, gives the DataVIO a share of the PBN lock on that + * block, and reserves a reference count increment on the lock. + **/ +static void shareCompressedBlock(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + OutputBin *bin = context; + + dataVIO->newMapped = (ZonedPBN) { + .pbn = bin->writer->allocation, + .zone = bin->writer->zone, + .state = getStateForSlot(dataVIO->compression.slot), + }; + dataVIOAsVIO(dataVIO)->physical = dataVIO->newMapped.pbn; + + shareCompressedWriteLock(dataVIO, bin->writer->allocationLock); + + // Wait again for all the waiters to get a share. + int result = enqueueWaiter(&bin->outgoing, waiter); + // Cannot fail since this waiter was just dequeued. + ASSERT_LOG_ONLY(result == VDO_SUCCESS, "impossible enqueueWaiter error"); +} + +/** + * Finish a compressed block write. This callback is registered in + * continueAfterAllocation(). + * + * @param completion The compressed write completion + **/ +static void finishCompressedWrite(VDOCompletion *completion) +{ + OutputBin *bin = completion->parent; + assertInPhysicalZone(bin->writer); + + if (completion->result != VDO_SUCCESS) { + releaseAllocationLock(bin->writer); + // Invokes completeOutputBin() on the packer thread, which will deal with + // the waiters. 
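+ // (finishOutputBin() will see that waiters are still queued and notify
+ // them of the failure via continueVIOWithoutPacking().)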
+ vioDoneCallback(completion); + return; + } + + // First give every DataVIO/HashLock a share of the PBN lock to ensure it + // can't be released until they've all done their incRefs. + notifyAllWaiters(&bin->outgoing, shareCompressedBlock, bin); + + // The waiters now hold the (downgraded) PBN lock. + bin->writer->allocationLock = NULL; + + // Invokes the callbacks registered before entering the packer. + notifyAllWaiters(&bin->outgoing, continueWaiter, NULL); + + // Invokes completeOutputBin() on the packer thread. + vioDoneCallback(completion); +} + +/** + * Continue the write path for a compressed write AllocatingVIO now that block + * allocation is complete (the AllocatingVIO may or may not have actually + * received an allocation). + * + * @param allocatingVIO The AllocatingVIO which has finished the allocation + * process + **/ +static void continueAfterAllocation(AllocatingVIO *allocatingVIO) +{ + VIO *vio = allocatingVIOAsVIO(allocatingVIO); + VDOCompletion *completion = vioAsCompletion(vio); + if (allocatingVIO->allocation == ZERO_BLOCK) { + completion->requeue = true; + setCompletionResult(completion, VDO_NO_SPACE); + vioDoneCallback(completion); + return; + } + + setPhysicalZoneCallback(allocatingVIO, finishCompressedWrite, + THIS_LOCATION("$F(meta);cb=finishCompressedWrite")); + completion->layer->writeCompressedBlock(allocatingVIO); +} + +/** + * Launch an output bin. + * + * @param packer The packer which owns the bin + * @param bin The output bin to launch + **/ +static void launchCompressedWrite(Packer *packer, OutputBin *bin) +{ + if (isReadOnly(getVDOFromAllocatingVIO(bin->writer)->readOnlyNotifier)) { + finishOutputBin(packer, bin); + return; + } + + VIO *vio = allocatingVIOAsVIO(bin->writer); + resetCompletion(vioAsCompletion(vio)); + vio->callback = completeOutputBin; + vio->priority = VIO_PRIORITY_COMPRESSED_DATA; + allocateDataBlock(bin->writer, packer->selector, VIO_COMPRESSED_WRITE_LOCK, + continueAfterAllocation); +} + +/** + * Consume from the pending queue the next batch of VIOs that can be packed + * together in a single compressed block. VIOs that have been mooted since + * being placed in the pending queue will not be returned. + * + * @param packer The packer + * @param batch The counted array to fill with the next batch of VIOs + **/ +static void getNextBatch(Packer *packer, OutputBatch *batch) +{ + BlockSize spaceRemaining = packer->binDataSize; + batch->slotsUsed = 0; + + DataVIO *dataVIO; + while ((dataVIO = waiterAsDataVIO(getFirstWaiter(&packer->batchedDataVIOs))) + != NULL) { + // If there's not enough space for the next DataVIO, the batch is done. + if ((dataVIO->compression.size > spaceRemaining) + || (batch->slotsUsed == packer->maxSlots)) { + break; + } + + // Remove the next DataVIO from the queue and put it in the output batch. + dequeueNextWaiter(&packer->batchedDataVIOs); + batch->slots[batch->slotsUsed++] = dataVIO; + spaceRemaining -= dataVIO->compression.size; + } +} + +/** + * Pack the next batch of compressed VIOs from the batched queue into an + * output bin and write the output bin. + * + * @param packer The packer + * @param output The output bin to fill + * + * @return true if a write was issued for the output bin + **/ +__attribute__((warn_unused_result)) +static bool writeNextBatch(Packer *packer, OutputBin *output) +{ + OutputBatch batch; + getNextBatch(packer, &batch); + + if (batch.slotsUsed == 0) { + // The pending queue must now be empty (there may have been mooted VIOs). 
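+ // Returning false lets writePendingBatches() push the unused output bin
+ // back onto the idle stack.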
+ return false; + } + + // If the batch contains only a single VIO, then we save nothing by saving + // the compressed form. Continue processing the single VIO in the batch. + if (batch.slotsUsed == 1) { + abortPacking(batch.slots[0]); + return false; + } + + resetCompressedBlockHeader(&output->block->header); + + size_t spaceUsed = 0; + for (SlotNumber slot = 0; slot < batch.slotsUsed; slot++) { + DataVIO *dataVIO = batch.slots[slot]; + dataVIO->compression.slot = slot; + putCompressedBlockFragment(output->block, slot, spaceUsed, + dataVIO->compression.data, + dataVIO->compression.size); + spaceUsed += dataVIO->compression.size; + + int result = enqueueDataVIO(&output->outgoing, dataVIO, + THIS_LOCATION(NULL)); + if (result != VDO_SUCCESS) { + abortPacking(dataVIO); + continue; + } + + output->slotsUsed += 1; + } + + launchCompressedWrite(packer, output); + return true; +} + +/** + * Put a DataVIO in a specific InputBin in which it will definitely fit. + * + * @param bin The bin in which to put the DataVIO + * @param dataVIO The DataVIO to add + **/ +static void addToInputBin(InputBin *bin, DataVIO *dataVIO) +{ + dataVIO->compression.bin = bin; + dataVIO->compression.slot = bin->slotsUsed; + bin->incoming[bin->slotsUsed++] = dataVIO; +} + +/** + * Start a new batch of VIOs in an InputBin, moving the existing batch, if + * any, to the queue of pending batched VIOs in the packer. + * + * @param packer The packer + * @param bin The bin to prepare + **/ +static void startNewBatch(Packer *packer, InputBin *bin) +{ + // Move all the DataVIOs in the current batch to the batched queue so they + // will get packed into the next free output bin. + for (SlotNumber slot = 0; slot < bin->slotsUsed; slot++) { + DataVIO *dataVIO = bin->incoming[slot]; + dataVIO->compression.bin = NULL; + + if (!mayWriteCompressedDataVIO(dataVIO)) { + /* + * Compression of this DataVIO was canceled while it was waiting; put it + * in the canceled bin so it can be rendezvous with the canceling + * DataVIO. + */ + addToInputBin(packer->canceledBin, dataVIO); + continue; + } + + int result = enqueueDataVIO(&packer->batchedDataVIOs, dataVIO, + THIS_LOCATION(NULL)); + if (result != VDO_SUCCESS) { + // Impossible but we're required to check the result from enqueue. + abortPacking(dataVIO); + } + } + + // The bin is now empty. + bin->slotsUsed = 0; + bin->freeSpace = packer->binDataSize; +} + +/** + * Add a DataVIO to a bin's incoming queue, handle logical space change, and + * call physical space processor. + * + * @param packer The packer + * @param bin The bin to which to add the the DataVIO + * @param dataVIO The DataVIO to add to the bin's queue + **/ +static void addDataVIOToInputBin(Packer *packer, + InputBin *bin, + DataVIO *dataVIO) +{ + // If the selected bin doesn't have room, start a new batch to make room. + if (bin->freeSpace < dataVIO->compression.size) { + startNewBatch(packer, bin); + } + + addToInputBin(bin, dataVIO); + bin->freeSpace -= dataVIO->compression.size; + + // If we happen to exactly fill the bin, start a new input batch. + if ((bin->slotsUsed == packer->maxSlots) || (bin->freeSpace == 0)) { + startNewBatch(packer, bin); + } + + // Now that we've finished changing the free space, restore the sort order. + insertInSortedList(packer, bin); +} + +/** + * Move DataVIOs in pending batches from the batchedDataVIOs to all free output + * bins, issuing writes for the output bins as they are packed. 
This will loop + * until either the pending queue is drained or all output bins are busy + * writing a compressed block. + * + * @param packer The packer + **/ +static void writePendingBatches(Packer *packer) +{ + if (packer->writingBatches) { + /* + * We've attempted to re-enter this function recursively due to completion + * handling, which can lead to kernel stack overflow as in VDO-1340. It's + * perfectly safe to break the recursion and do nothing since we know any + * pending batches will eventually be handled by the earlier call. + */ + return; + } + + // Record that we are in this function for the above check. IMPORTANT: never + // return from this function without clearing this flag. + packer->writingBatches = true; + + OutputBin *output; + while (hasWaiters(&packer->batchedDataVIOs) + && ((output = popOutputBin(packer)) != NULL)) { + if (!writeNextBatch(packer, output)) { + // We didn't use the output bin to write, so push it back on the stack. + pushOutputBin(packer, output); + } + } + + packer->writingBatches = false; +} + +/** + * Select the input bin that should be used to pack the compressed data in a + * DataVIO with other DataVIOs. + * + * @param packer The packer + * @param dataVIO The DataVIO + **/ +__attribute__((warn_unused_result)) +static InputBin *selectInputBin(Packer *packer, DataVIO *dataVIO) +{ + // First best fit: select the bin with the least free space that has enough + // room for the compressed data in the DataVIO. + InputBin *fullestBin = getFullestBin(packer); + for (InputBin *bin = fullestBin; bin != NULL; bin = nextBin(packer, bin)) { + if (bin->freeSpace >= dataVIO->compression.size) { + return bin; + } + } + + /* + * None of the bins have enough space for the DataVIO. We're not allowed to + * create new bins, so we have to overflow one of the existing bins. It's + * pretty intuitive to select the fullest bin, since that "wastes" the least + * amount of free space in the compressed block. But if the space currently + * used in the fullest bin is smaller than the compressed size of the + * incoming block, it seems wrong to force that bin to write when giving up + * on compressing the incoming DataVIO would likewise "waste" the the least + * amount of free space. + */ + if (dataVIO->compression.size + >= (packer->binDataSize - fullestBin->freeSpace)) { + return NULL; + } + + // The fullest bin doesn't have room, but writing it out and starting a new + // batch with the incoming DataVIO will increase the packer's free space. + return fullestBin; +} + +/**********************************************************************/ +void attemptPacking(DataVIO *dataVIO) +{ + Packer *packer = getPackerFromDataVIO(dataVIO); + assertOnPackerThread(packer, __func__); + + VIOCompressionState state = getCompressionState(dataVIO); + int result = ASSERT((state.status == VIO_COMPRESSING), + "attempt to pack DataVIO not ready for packing, state: " + "%u", + state.status); + if (result != VDO_SUCCESS) { + return; + } + + /* + * Increment whether or not this DataVIO will be packed or not since + * abortPacking() always decrements the counter. + */ + relaxedAdd64(&packer->fragmentsPending, 1); + + // If packing of this DataVIO is disallowed for administrative reasons, give + // up before making any state changes. 
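+ // (DataVIOs from generations older than the packer's current flush
+ // generation must not be held in the packer; see
+ // incrementPackerFlushGeneration().)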
+ if (!isNormal(&packer->state) + || (dataVIO->flushGeneration < packer->flushGeneration)) { + abortPacking(dataVIO); + return; + } + + /* + * The check of mayBlockInPacker() here will set the DataVIO's compression + * state to VIO_PACKING if the DataVIO is allowed to be compressed (if it has + * already been canceled, we'll fall out here). Once the DataVIO is in the + * VIO_PACKING state, it must be guaranteed to be put in an input bin before + * any more requests can be processed by the packer thread. Otherwise, a + * canceling DataVIO could attempt to remove the canceled DataVIO from the + * packer and fail to rendezvous with it (VDO-2809). We must also make sure + * that we will actually bin the DataVIO and not give up on it as being + * larger than the space used in the fullest bin. Hence we must call + * selectInputBin() before calling mayBlockInPacker() (VDO-2826). + */ + InputBin *bin = selectInputBin(packer, dataVIO); + if ((bin == NULL) || !mayBlockInPacker(dataVIO)) { + abortPacking(dataVIO); + return; + } + + addDataVIOToInputBin(packer, bin, dataVIO); + writePendingBatches(packer); +} + +/** + * Force a pending write for all non-empty bins on behalf of a flush or + * suspend. + * + * @param packer The packer being flushed + **/ +static void writeAllNonEmptyBins(Packer *packer) +{ + for (InputBin *bin = getFullestBin(packer); + bin != NULL; + bin = nextBin(packer, bin)) { + startNewBatch(packer, bin); + // We don't need to re-sort the bin here since this loop will make every + // bin have the same amount of free space, so every ordering is sorted. + } + + writePendingBatches(packer); +} + +/**********************************************************************/ +void flushPacker(Packer *packer) +{ + assertOnPackerThread(packer, __func__); + if (isNormal(&packer->state)) { + writeAllNonEmptyBins(packer); + } +} + +/* + * This method is only exposed for unit tests and should not normally be called + * directly; use removeLockHolderFromPacker() instead. + */ +void removeFromPacker(DataVIO *dataVIO) +{ + InputBin *bin = dataVIO->compression.bin; + ASSERT_LOG_ONLY((bin != NULL), "DataVIO in packer has an input bin"); + + SlotNumber slot = dataVIO->compression.slot; + bin->slotsUsed--; + if (slot < bin->slotsUsed) { + bin->incoming[slot] = bin->incoming[bin->slotsUsed]; + bin->incoming[slot]->compression.slot = slot; + } + + dataVIO->compression.bin = NULL; + dataVIO->compression.slot = 0; + + Packer *packer = getPackerFromDataVIO(dataVIO); + if (bin != packer->canceledBin) { + bin->freeSpace += dataVIO->compression.size; + insertInSortedList(packer, bin); + } + + abortPacking(dataVIO); + checkForDrainComplete(packer); +} + +/**********************************************************************/ +void removeLockHolderFromPacker(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInPackerZone(dataVIO); + + DataVIO *lockHolder = dataVIO->compression.lockHolder; + dataVIO->compression.lockHolder = NULL; + removeFromPacker(lockHolder); +} + +/**********************************************************************/ +void incrementPackerFlushGeneration(Packer *packer) +{ + assertOnPackerThread(packer, __func__); + packer->flushGeneration++; + flushPacker(packer); +} + +/** + * Initiate a drain. + * + * Implements AdminInitiator. 
+ **/ +static void initiateDrain(AdminState *state) +{ + Packer *packer = container_of(state, Packer, state); + writeAllNonEmptyBins(packer); + checkForDrainComplete(packer); +} + +/**********************************************************************/ +void drainPacker(Packer *packer, VDOCompletion *completion) +{ + assertOnPackerThread(packer, __func__); + startDraining(&packer->state, ADMIN_STATE_SUSPENDING, completion, + initiateDrain); +} + +/**********************************************************************/ +void resumePacker(Packer *packer, VDOCompletion *parent) +{ + assertOnPackerThread(packer, __func__); + finishCompletion(parent, resumeIfQuiescent(&packer->state)); +} + +/**********************************************************************/ +void resetSlotCount(Packer *packer, CompressedFragmentCount slots) +{ + if (slots > MAX_COMPRESSION_SLOTS) { + return; + } + + packer->maxSlots = slots; +} + +/**********************************************************************/ +static void dumpInputBin(const InputBin *bin, bool canceled) +{ + if (bin->slotsUsed == 0) { + // Don't dump empty input bins. + return; + } + + logInfo(" %sBin slotsUsed=%u freeSpace=%zu", + (canceled ? "Canceled" : "Input"), bin->slotsUsed, bin->freeSpace); + + // XXX dump VIOs in bin->incoming? The VIOs should have been dumped from the + // VIO pool. Maybe just dump their addresses so it's clear they're here? +} + +/**********************************************************************/ +static void dumpOutputBin(const OutputBin *bin) +{ + size_t count = countWaiters(&bin->outgoing); + if (bin->slotsUsed == 0) { + // Don't dump empty output bins. + return; + } + + logInfo(" OutputBin contains %zu outgoing waiters", count); + + // XXX dump VIOs in bin->outgoing? The VIOs should have been dumped from the + // VIO pool. Maybe just dump their addresses so it's clear they're here? + + // XXX dump writer VIO? +} + +/**********************************************************************/ +void dumpPacker(const Packer *packer) +{ + logInfo("Packer"); + logInfo(" flushGeneration=%llu state %s writingBatches=%s", + packer->flushGeneration, getAdminStateName(&packer->state), + boolToString(packer->writingBatches)); + + logInfo(" inputBinCount=%llu", packer->size); + for (InputBin *bin = getFullestBin(packer); + bin != NULL; + bin = nextBin(packer, bin)) { + dumpInputBin(bin, false); + } + + dumpInputBin(packer->canceledBin, true); + + logInfo(" outputBinCount=%zu idleOutputBinCount=%zu", + packer->outputBinCount, packer->idleOutputBinCount); + const RingNode *head = &packer->outputBins; + for (RingNode *node = head->next; node != head; node = node->next) { + dumpOutputBin(outputBinFromRingNode(node)); + } +} diff --git a/vdo/base/packer.h b/vdo/base/packer.h new file mode 100644 index 0000000..6661552 --- /dev/null +++ b/vdo/base/packer.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/packer.h#3 $ + */ + +#ifndef PACKER_H +#define PACKER_H + +#include "completion.h" +#include "physicalLayer.h" +#include "statistics.h" +#include "threadConfig.h" +#include "types.h" + +enum { + DEFAULT_PACKER_INPUT_BINS = 16, + DEFAULT_PACKER_OUTPUT_BINS = 256, +}; + +typedef struct packer Packer; + +/** + * Make a new block packer. + * + * @param [in] layer The physical layer to which compressed blocks + * will be written + * @param [in] inputBinCount The number of partial bins to keep in memory + * @param [in] outputBinCount The number of compressed blocks that can be + * written concurrently + * @param [in] threadConfig The thread configuration of the VDO + * @param [out] packerPtr A pointer to hold the new packer + * + * @return VDO_SUCCESS or an error + **/ +int makePacker(PhysicalLayer *layer, + BlockCount inputBinCount, + BlockCount outputBinCount, + const ThreadConfig *threadConfig, + Packer **packerPtr) + __attribute__((warn_unused_result)); + +/** + * Free a block packer and null out the reference to it. + * + * @param packerPtr A pointer to the packer to free + **/ +void freePacker(Packer **packerPtr); + +/** + * Check whether the compressed data in a DataVIO will fit in a packer bin. + * + * @param dataVIO The DataVIO + * + * @return true if the DataVIO will fit in a bin + **/ +bool isSufficientlyCompressible(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Get the thread ID of the packer's zone. + * + * @param packer The packer + * + * @return The packer's thread ID + **/ +ThreadID getPackerThreadID(Packer *packer); + +/** + * Get the current statistics from the packer. + * + * @param packer The packer to query + * + * @return a copy of the current statistics for the packer + **/ +PackerStatistics getPackerStatistics(const Packer *packer) + __attribute__((warn_unused_result)); + +/** + * Attempt to rewrite the data in this DataVIO as part of a compressed block. + * + * @param dataVIO The DataVIO to pack + **/ +void attemptPacking(DataVIO *dataVIO); + +/** + * Request that the packer flush asynchronously. All bins with at least two + * compressed data blocks will be written out, and any solitary pending VIOs + * will be released from the packer. While flushing is in progress, any VIOs + * submitted to attemptPacking() will be continued immediately without + * attempting to pack them. + * + * @param packer The packer to flush + **/ +void flushPacker(Packer *packer); + +/** + * Remove a lock holder from the packer. + * + * @param completion The DataVIO which needs a lock held by a DataVIO in the + * packer. The dataVIO's compressedVIO.lockHolder field will + * point to the DataVIO to remove. + **/ +void removeLockHolderFromPacker(VDOCompletion *completion); + +/** + * Increment the flush generation in the packer. This will also cause the + * packer to flush so that any VIOs from previous generations will exit the + * packer. + * + * @param packer The packer + **/ +void incrementPackerFlushGeneration(Packer *packer); + +/** + * Drain the packer by preventing any more VIOs from entering the packer and + * then flushing. 
+ * + * @param packer The packer to drain + * @param completion The completion to finish when the packer has drained + **/ +void drainPacker(Packer *packer, VDOCompletion *completion); + +/** + * Resume a packer which has been suspended. + * + * @param packer The packer to resume + * @param parent The completion to finish when the packer has resumed + * + * @return VDO_SUCCESS or an error + **/ +void resumePacker(Packer *packer, VDOCompletion *parent); + +/** + * Dump the packer, in a thread-unsafe fashion. + * + * @param packer The packer + **/ +void dumpPacker(const Packer *packer); + +#endif /* PACKER_H */ diff --git a/vdo/base/packerInternals.h b/vdo/base/packerInternals.h new file mode 100644 index 0000000..e5aa500 --- /dev/null +++ b/vdo/base/packerInternals.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/packerInternals.h#4 $ + */ + +#ifndef PACKER_INTERNALS_H +#define PACKER_INTERNALS_H + +#include "packer.h" + +#include "atomic.h" + +#include "adminState.h" +#include "compressedBlock.h" +#include "header.h" +#include "types.h" +#include "waitQueue.h" + +/** + * Each InputBin holds an incomplete batch of DataVIOs that only partially fill + * a compressed block. The InputBins are kept in a ring sorted by the amount of + * unused space so the first bin with enough space to hold a newly-compressed + * DataVIO can easily be found. When the bin fills up or is flushed, the + * incoming DataVIOs are moved to the Packer's batchedDataVIOs queue, from + * which they will eventually be routed to an idle OutputBin. + * + * There is one special input bin which is used to hold DataVIOs which have + * been canceled and removed from their input bin by the packer. These DataVIOs + * need to wait for the canceller to rendezvous with them (VDO-2809) and so + * they sit in this special bin. + **/ +struct inputBin { + /** List links for Packer.sortedBins */ + RingNode ring; + /** The number of items in the bin */ + SlotNumber slotsUsed; + /** The number of compressed block bytes remaining in the current batch */ + size_t freeSpace; + /** The current partial batch of DataVIOs, waiting for more */ + DataVIO *incoming[]; +}; + +/** + * Each OutputBin allows a single compressed block to be packed and written. + * When it is not idle, it holds a batch of DataVIOs that have been packed + * into the compressed block, written asynchronously, and are waiting for the + * write to complete. 
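+ * Once that write finishes, completeOutputBin() returns the bin to the
+ * packer's stack of idle output bins so it can be reused.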
+ **/ +typedef struct { + /** List links for Packer.outputBins */ + RingNode ring; + /** The storage for encoding the compressed block representation */ + CompressedBlock *block; + /** The AllocatingVIO wrapping the compressed block for writing */ + AllocatingVIO *writer; + /** The number of compression slots used in the compressed block */ + SlotNumber slotsUsed; + /** The DataVIOs packed into the block, waiting for the write to complete */ + WaitQueue outgoing; +} OutputBin; + +/** + * A counted array holding a batch of DataVIOs that should be packed into an + * output bin. + **/ +typedef struct { + size_t slotsUsed; + DataVIO *slots[MAX_COMPRESSION_SLOTS]; +} OutputBatch; + +struct packer { + /** The ID of the packer's callback thread */ + ThreadID threadID; + /** The selector for determining which physical zone to allocate from */ + AllocationSelector *selector; + /** The number of input bins */ + BlockCount size; + /** The block size minus header size */ + size_t binDataSize; + /** The number of compression slots */ + size_t maxSlots; + /** A ring of all InputBins, kept sorted by freeSpace */ + RingNode inputBins; + /** A ring of all OutputBins */ + RingNode outputBins; + /** + * A bin to hold DataVIOs which were canceled out of the packer and are + * waiting to rendezvous with the canceling DataVIO. + **/ + InputBin *canceledBin; + + /** The current flush generation */ + SequenceNumber flushGeneration; + + /** The administrative state of the packer */ + AdminState state; + /** True when writing batched DataVIOs */ + bool writingBatches; + + // Atomic counters corresponding to the fields of PackerStatistics: + + /** Number of compressed data items written since startup */ + Atomic64 fragmentsWritten; + /** Number of blocks containing compressed items written since startup */ + Atomic64 blocksWritten; + /** Number of DataVIOs that are pending in the packer */ + Atomic64 fragmentsPending; + + /** Queue of batched DataVIOs waiting to be packed */ + WaitQueue batchedDataVIOs; + + /** The total number of output bins allocated */ + size_t outputBinCount; + /** The number of idle output bins on the stack */ + size_t idleOutputBinCount; + /** The stack of idle output bins (0=bottom) */ + OutputBin *idleOutputBins[]; +}; + +/** + * This returns the first bin in the freeSpace-sorted list. + **/ +InputBin *getFullestBin(const Packer *packer); + +/** + * This returns the next bin in the freeSpace-sorted list. + **/ +InputBin *nextBin(const Packer *packer, InputBin *bin); + +/** + * Change the maxiumum number of compression slots the packer will use. The new + * number of slots must be less than or equal to MAX_COMPRESSION_SLOTS. Bins + * which already have fragments will not be resized until they are next written + * out. + * + * @param packer The packer + * @param slots The new number of slots + **/ +void resetSlotCount(Packer *packer, CompressedFragmentCount slots); + +/** + * Remove a DataVIO from the packer. This method is exposed for testing. + * + * @param dataVIO The DataVIO to remove + **/ +void removeFromPacker(DataVIO *dataVIO); + +#endif /* PACKER_INTERNALS_H */ diff --git a/vdo/base/partitionCopy.c b/vdo/base/partitionCopy.c new file mode 100644 index 0000000..d5fa6de --- /dev/null +++ b/vdo/base/partitionCopy.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/partitionCopy.c#2 $
+ */
+
+#include "partitionCopy.h"
+
+#include "memoryAlloc.h"
+
+#include "completion.h"
+#include "constants.h"
+#include "extent.h"
+#include "numUtils.h"
+
+enum {
+  STRIDE_LENGTH = 2048
+};
+
+/**
+ * A partition copy completion.
+ **/
+typedef struct {
+  /** completion header */
+  VDOCompletion completion;
+  /** the source partition to copy from */
+  Partition *source;
+  /** the target partition to copy to */
+  Partition *target;
+  /** the current in-partition PBN the copy is beginning at */
+  PhysicalBlockNumber currentIndex;
+  /** the last block to copy */
+  PhysicalBlockNumber endingIndex;
+  /** the backing data used by the extent */
+  char *data;
+  /** the extent being used to copy */
+  VDOExtent *extent;
+} CopyCompletion;
+
+/**
+ * Convert a VDOCompletion to a CopyCompletion.
+ *
+ * @param completion The completion to convert
+ *
+ * @return the completion as a CopyCompletion
+ **/
+__attribute__((warn_unused_result))
+static inline
+CopyCompletion *asCopyCompletion(VDOCompletion *completion)
+{
+  STATIC_ASSERT(offsetof(CopyCompletion, completion) == 0);
+  assertCompletionType(completion->type, PARTITION_COPY_COMPLETION);
+  return (CopyCompletion *) completion;
+}
+
+/**********************************************************************/
+int makeCopyCompletion(PhysicalLayer *layer, VDOCompletion **completionPtr)
+{
+  CopyCompletion *copy;
+  int result = ALLOCATE(1, CopyCompletion, __func__, &copy);
+  if (result != VDO_SUCCESS) {
+    return result;
+  }
+  initializeCompletion(&copy->completion, PARTITION_COPY_COMPLETION, layer);
+
+  result = ALLOCATE((VDO_BLOCK_SIZE * STRIDE_LENGTH), char,
+                    "partition copy extent", &copy->data);
+  if (result != VDO_SUCCESS) {
+    VDOCompletion *completion = &copy->completion;
+    freeCopyCompletion(&completion);
+    return result;
+  }
+
+  result = createExtent(layer, VIO_TYPE_PARTITION_COPY, VIO_PRIORITY_HIGH,
+                        STRIDE_LENGTH, copy->data, &copy->extent);
+  if (result != VDO_SUCCESS) {
+    VDOCompletion *completion = &copy->completion;
+    freeCopyCompletion(&completion);
+    return result;
+  }
+
+  *completionPtr = &copy->completion;
+  return VDO_SUCCESS;
+}
+
+/**********************************************************************/
+void freeCopyCompletion(VDOCompletion **completionPtr)
+{
+  if (*completionPtr == NULL) {
+    return;
+  }
+
+  CopyCompletion *copy = asCopyCompletion(*completionPtr);
+  freeExtent(&copy->extent);
+  FREE(copy->data);
+  FREE(copy);
+  *completionPtr = NULL;
+}
+
+/**********************************************************************/
+static void copyPartitionStride(CopyCompletion *copy);
+
+/**
+ * Determine the number of blocks to copy in the current stride.
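+ *
+ * For example (illustrative numbers only): with STRIDE_LENGTH at 2048, a
+ * copy spanning 5000 blocks proceeds in strides of 2048, 2048, and finally
+ * 904 blocks, since each stride is capped at endingIndex - currentIndex.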
+ *
+ * @param copy The copy completion
+ *
+ * @return The number of blocks to copy in the current stride
+ **/
+static inline BlockCount getStrideSize(CopyCompletion *copy)
+{
+  return minBlockCount(STRIDE_LENGTH, copy->endingIndex - copy->currentIndex);
+}
+
+/**
+ * Process a completed write during a partition copy.
+ *
+ * @param completion The extent which has just completed writing
+ **/
+static void completeWriteForCopy(VDOCompletion *completion)
+{
+  CopyCompletion *copy = asCopyCompletion(completion->parent);
+  copy->currentIndex += getStrideSize(copy);
+  if (copy->currentIndex >= copy->endingIndex) {
+    // We're done.
+    finishCompletion(completion->parent, VDO_SUCCESS);
+    return;
+  }
+  copyPartitionStride(copy);
+}
+
+/**
+ * Process a completed read during a partition copy, and launch the
+ * corresponding write to the new partition.
+ *
+ * @param completion The extent which has just completed reading
+ **/
+static void completeReadForCopy(VDOCompletion *completion)
+{
+  CopyCompletion *copy = asCopyCompletion(completion->parent);
+  PhysicalBlockNumber layerStartBlock;
+  int result = translateToPBN(copy->target, copy->currentIndex,
+                              &layerStartBlock);
+  if (result != VDO_SUCCESS) {
+    finishCompletion(completion->parent, result);
+    return;
+  }
+
+  completion->callback = completeWriteForCopy;
+  writePartialMetadataExtent(asVDOExtent(completion), layerStartBlock,
+                             getStrideSize(copy));
+}
+
+/**
+ * Copy a stride from one partition to the new partition.
+ *
+ * @param copy The CopyCompletion
+ **/
+static void copyPartitionStride(CopyCompletion *copy)
+{
+  PhysicalBlockNumber layerStartBlock;
+  int result = translateToPBN(copy->source, copy->currentIndex,
+                              &layerStartBlock);
+  if (result != VDO_SUCCESS) {
+    finishCompletion(&copy->completion, result);
+    return;
+  }
+
+  prepareCompletion(&copy->extent->completion, completeReadForCopy,
+                    finishParentCallback, copy->completion.callbackThreadID,
+                    &copy->completion);
+  readPartialMetadataExtent(copy->extent, layerStartBlock,
+                            getStrideSize(copy));
+}
+
+/**
+ * Verify that the source can be copied to the target safely.
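+ *
+ * "Safely" covers the two checks below: the target must be at least as large
+ * as the source, and the two partitions must not overlap. Overlap is rejected
+ * outright rather than depending on the direction of the stride-by-stride
+ * copy.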
+ *
+ * @param source The source partition
+ * @param target The target partition
+ *
+ * @return VDO_SUCCESS or an error code
+ **/
+static int validatePartitionCopy(Partition *source, Partition *target)
+{
+  BlockCount sourceSize = getFixedLayoutPartitionSize(source);
+  BlockCount targetSize = getFixedLayoutPartitionSize(target);
+
+  PhysicalBlockNumber sourceStart = getFixedLayoutPartitionOffset(source);
+  PhysicalBlockNumber sourceEnd = sourceStart + sourceSize;
+  PhysicalBlockNumber targetStart = getFixedLayoutPartitionOffset(target);
+  PhysicalBlockNumber targetEnd = targetStart + targetSize;
+
+  int result = ASSERT(sourceSize <= targetSize,
+                      "target partition must not be smaller than source"
+                      " partition");
+  if (result != UDS_SUCCESS) {
+    return result;
+  }
+
+  return ASSERT(((sourceEnd <= targetStart) || (targetEnd <= sourceStart)),
+                "target partition must not overlap source partition");
+}
+
+/**********************************************************************/
+void copyPartitionAsync(VDOCompletion *completion,
+                        Partition *source,
+                        Partition *target,
+                        VDOCompletion *parent)
+{
+  int result = validatePartitionCopy(source, target);
+  if (result != VDO_SUCCESS) {
+    finishCompletion(parent, result);
+    return;
+  }
+
+  CopyCompletion *copy = asCopyCompletion(completion);
+  prepareToFinishParent(&copy->completion, parent);
+  copy->source = source;
+  copy->target = target;
+  copy->currentIndex = 0;
+  copy->endingIndex = getFixedLayoutPartitionSize(source);
+  copyPartitionStride(copy);
+}
diff --git a/vdo/base/partitionCopy.h b/vdo/base/partitionCopy.h
new file mode 100644
index 0000000..574ac13
--- /dev/null
+++ b/vdo/base/partitionCopy.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/partitionCopy.h#2 $
+ */
+
+#ifndef PARTITION_COPY_H
+#define PARTITION_COPY_H
+
+#include "fixedLayout.h"
+#include "physicalLayer.h"
+#include "types.h"
+
+/**
+ * Make a copy completion.
+ *
+ * @param [in] layer The layer on which the partitions reside
+ * @param [out] completionPtr A pointer to hold the copy completion
+ *
+ * @return VDO_SUCCESS or an error
+ **/
+int makeCopyCompletion(PhysicalLayer *layer, VDOCompletion **completionPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Free a copy completion and NULL out the reference to it.
+ *
+ * @param completionPtr A pointer to the completion to be freed
+ **/
+void freeCopyCompletion(VDOCompletion **completionPtr);
+
+/**
+ * Copy a partition.
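+ *
+ * A purely illustrative call sequence (the names 'layer', 'source', 'target',
+ * and 'parent' are assumed to already exist in the caller):
+ *
+ *   VDOCompletion *copier;
+ *   int result = makeCopyCompletion(layer, &copier);
+ *   if (result != VDO_SUCCESS) {
+ *     finishCompletion(parent, result);
+ *   } else {
+ *     copyPartitionAsync(copier, source, target, parent);
+ *     // 'parent' is finished when the copy completes or fails; the copier
+ *     // can then be released with freeCopyCompletion(&copier).
+ *   }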
+ * + * @param completion The copy completion to use + * @param source The partition to copy from + * @param target The partition to copy to + * @param parent The parent to finish when the copy is complete + **/ +void copyPartitionAsync(VDOCompletion *completion, + Partition *source, + Partition *target, + VDOCompletion *parent); + +#endif /* PARTITION_COPY_H */ diff --git a/vdo/base/pbnLock.c b/vdo/base/pbnLock.c new file mode 100644 index 0000000..5e9a274 --- /dev/null +++ b/vdo/base/pbnLock.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pbnLock.c#3 $ + */ + +#include "pbnLock.h" + +#include "logger.h" + +#include "blockAllocator.h" +#include "referenceBlock.h" + +struct pbnLockImplementation { + PBNLockType type; + const char *name; + const char *releaseReason; +}; + +/** + * This array must have an entry for every PBNLockType value. + **/ +static const PBNLockImplementation LOCK_IMPLEMENTATIONS[] = { + [VIO_READ_LOCK] = { + .type = VIO_READ_LOCK, + .name = "read", + .releaseReason = "candidate duplicate", + }, + [VIO_WRITE_LOCK] = { + .type = VIO_WRITE_LOCK, + .name = "write", + .releaseReason = "newly allocated", + }, + [VIO_COMPRESSED_WRITE_LOCK] = { + .type = VIO_COMPRESSED_WRITE_LOCK, + .name = "compressed write", + .releaseReason = "failed compression", + }, + [VIO_BLOCK_MAP_WRITE_LOCK] = { + .type = VIO_BLOCK_MAP_WRITE_LOCK, + .name = "block map write", + .releaseReason = "block map write", + }, +}; + +/**********************************************************************/ +static inline bool hasLockType(const PBNLock *lock, PBNLockType type) +{ + return (lock->implementation == &LOCK_IMPLEMENTATIONS[type]); +} + +/**********************************************************************/ +bool isPBNReadLock(const PBNLock *lock) +{ + return hasLockType(lock, VIO_READ_LOCK); +} + +/**********************************************************************/ +static inline void setPBNLockType(PBNLock *lock, PBNLockType type) +{ + lock->implementation = &LOCK_IMPLEMENTATIONS[type]; +} + +/**********************************************************************/ +void initializePBNLock(PBNLock *lock, PBNLockType type) +{ + lock->holderCount = 0; + setPBNLockType(lock, type); +} + +/**********************************************************************/ +void downgradePBNWriteLock(PBNLock *lock) +{ + ASSERT_LOG_ONLY(!isPBNReadLock(lock), + "PBN lock must not already have been downgraded"); + ASSERT_LOG_ONLY(!hasLockType(lock, VIO_BLOCK_MAP_WRITE_LOCK), + "must not downgrade block map write locks"); + ASSERT_LOG_ONLY(lock->holderCount == 1, + "PBN write lock should have one holder but has %u", + lock->holderCount); + if (hasLockType(lock, VIO_WRITE_LOCK)) { + // DataVIO write locks are downgraded in 
place--the writer retains the + // hold on the lock. They've already had a single incRef journaled. + lock->incrementLimit = MAXIMUM_REFERENCE_COUNT - 1; + } else { + // Compressed block write locks are downgraded when they are shared with + // all their hash locks. The writer is releasing its hold on the lock. + lock->holderCount = 0; + lock->incrementLimit = MAXIMUM_REFERENCE_COUNT; + } + setPBNLockType(lock, VIO_READ_LOCK); +} + +/**********************************************************************/ +bool claimPBNLockIncrement(PBNLock *lock) +{ + /* + * Claim the next free reference atomically since hash locks from multiple + * hash zone threads might be concurrently deduplicating against a single + * PBN lock on compressed block. As long as hitting the increment limit will + * lead to the PBN lock being released in a sane time-frame, we won't + * overflow a 32-bit claim counter, allowing a simple add instead of a + * compare-and-swap. + */ + uint32_t claimNumber = atomicAdd32(&lock->incrementsClaimed, 1); + return (claimNumber <= lock->incrementLimit); +} + +/**********************************************************************/ +void assignProvisionalReference(PBNLock *lock) +{ + ASSERT_LOG_ONLY(!lock->hasProvisionalReference, + "lock does not have a provisional reference"); + lock->hasProvisionalReference = true; +} + +/**********************************************************************/ +void unassignProvisionalReference(PBNLock *lock) +{ + lock->hasProvisionalReference = false; +} + +/**********************************************************************/ +void releaseProvisionalReference(PBNLock *lock, + PhysicalBlockNumber lockedPBN, + BlockAllocator *allocator) +{ + if (hasProvisionalReference(lock)) { + releaseBlockReference(allocator, lockedPBN, + lock->implementation->releaseReason); + unassignProvisionalReference(lock); + } +} diff --git a/vdo/base/pbnLock.h b/vdo/base/pbnLock.h new file mode 100644 index 0000000..bd6512b --- /dev/null +++ b/vdo/base/pbnLock.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pbnLock.h#3 $ + */ + +#ifndef PBN_LOCK_H +#define PBN_LOCK_H + +#include "atomic.h" +#include "types.h" + +/** + * The type of a PBN lock. + **/ +typedef enum { + VIO_READ_LOCK = 0, + VIO_WRITE_LOCK, + VIO_COMPRESSED_WRITE_LOCK, + VIO_BLOCK_MAP_WRITE_LOCK, +} PBNLockType; + +typedef struct pbnLockImplementation PBNLockImplementation; + +/** + * A PBN lock. + **/ +struct pbnLock { + /** The implementation of the lock */ + const PBNLockImplementation *implementation; + + /** The number of VIOs holding or sharing this lock */ + VIOCount holderCount; + /** + * The number of compressed block writers holding a share of this lock while + * they are acquiring a reference to the PBN. 
+ **/ + uint8_t fragmentLocks; + + /** + * Whether the locked PBN has been provisionally referenced on behalf of the + * lock holder. + **/ + bool hasProvisionalReference; + + /** + * For read locks, the number of references that were known to be available + * on the locked block at the time the lock was acquired. + **/ + uint8_t incrementLimit; + + /** + * For read locks, the number of DataVIOs that have tried to claim one of + * the available increments during the lifetime of the lock. Each claim will + * first increment this counter, so it can exceed the increment limit. + **/ + Atomic32 incrementsClaimed; +}; + +/** + * Initialize a PBNLock. + * + * @param lock The lock to initialize + * @param type The type of the lock + **/ +void initializePBNLock(PBNLock *lock, PBNLockType type); + +/** + * Check whether a PBNLock is a read lock. + * + * @param lock The lock to check + * + * @return true if the lock is a read lock + **/ +bool isPBNReadLock(const PBNLock *lock) + __attribute__((warn_unused_result)); + +/** + * Downgrade a PBN write lock to a PBN read lock. The lock holder count is + * cleared and the caller is responsible for setting the new count. + * + * @param lock The PBN write lock to downgrade + **/ +void downgradePBNWriteLock(PBNLock *lock); + +/** + * Try to claim one of the available reference count increments on a read + * lock. Claims may be attempted from any thread. A claim is only valid until + * the PBN lock is released. + * + * @param lock The PBN read lock from which to claim an increment + * + * @return true if the claim succeeded, guaranteeing one + * increment can be made without overflowing the PBN's reference count + **/ +bool claimPBNLockIncrement(PBNLock *lock) + __attribute__((warn_unused_result)); + +/** + * Check whether a PBN lock has a provisional reference. + * + * @param lock The PBN lock + **/ +static inline bool hasProvisionalReference(PBNLock *lock) +{ + return ((lock != NULL) && lock->hasProvisionalReference); +} + +/** + * Inform a PBN lock that it is responsible for a provisional reference. + * + * @param lock The PBN lock + **/ +void assignProvisionalReference(PBNLock *lock); + +/** + * Inform a PBN lock that it is no longer responsible for a provisional + * reference. + * + * @param lock The PBN lock + **/ +void unassignProvisionalReference(PBNLock *lock); + +/** + * If the lock is responsible for a provisional reference, release that + * reference. This method is called when the lock is released. + * + * @param lock The lock + * @param lockedPBN The PBN covered by the lock + * @param allocator The block allocator from which to release the reference + **/ +void releaseProvisionalReference(PBNLock *lock, + PhysicalBlockNumber lockedPBN, + BlockAllocator *allocator); + +#endif /* PBN_LOCK_H */ diff --git a/vdo/base/pbnLockPool.c b/vdo/base/pbnLockPool.c new file mode 100644 index 0000000..38e2f32 --- /dev/null +++ b/vdo/base/pbnLockPool.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pbnLockPool.c#2 $ + */ + +#include "pbnLockPool.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "ringNode.h" +#include "pbnLock.h" + +/** + * Unused (idle) PBN locks are kept in a ring. Just like in a malloc + * implementation, the lock structure is unused memory, so we can save a bit + * of space (and not pollute the lock structure proper) by using a union to + * overlay the lock structure with the free list. + **/ +typedef union idlePBNLock { + /** Only used while locks are in the pool */ + RingNode node; + /** Only used while locks are not in the pool */ + PBNLock lock; +} IdlePBNLock; + +/** + * The lock pool is little more than the memory allocated for the locks. + **/ +struct pbnLockPool { + /** The number of locks allocated for the pool */ + size_t capacity; + /** The number of locks currently borrowed from the pool */ + size_t borrowed; + /** A ring containing all idle PBN lock instances */ + RingNode idleRing; + /** The memory for all the locks allocated by this pool */ + IdlePBNLock locks[]; +}; + +/**********************************************************************/ +int makePBNLockPool(size_t capacity, PBNLockPool **poolPtr) +{ + PBNLockPool *pool; + int result = ALLOCATE_EXTENDED(PBNLockPool, capacity, IdlePBNLock, __func__, + &pool); + if (result != VDO_SUCCESS) { + return result; + } + + pool->capacity = capacity; + pool->borrowed = capacity; + initializeRing(&pool->idleRing); + + for (size_t i = 0; i < capacity; i++) { + PBNLock *lock = &pool->locks[i].lock; + returnPBNLockToPool(pool, &lock); + } + + *poolPtr = pool; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freePBNLockPool(PBNLockPool **poolPtr) +{ + if (*poolPtr == NULL) { + return; + } + + PBNLockPool *pool = *poolPtr; + ASSERT_LOG_ONLY(pool->borrowed == 0, + "All PBN locks must be returned to the pool before it is" + " freed, but %zu locks are still on loan", + pool->borrowed); + FREE(pool); + *poolPtr = NULL; +} + +/**********************************************************************/ +int borrowPBNLockFromPool(PBNLockPool *pool, + PBNLockType type, + PBNLock **lockPtr) +{ + if (pool->borrowed >= pool->capacity) { + return logErrorWithStringError(VDO_LOCK_ERROR, + "no free PBN locks left to borrow"); + } + pool->borrowed += 1; + + RingNode *idleNode = popRingNode(&pool->idleRing); + // The lock was zeroed when it was placed in the pool, but the overlapping + // ring pointers are non-zero after a pop. + memset(idleNode, 0, sizeof(*idleNode)); + + STATIC_ASSERT(offsetof(IdlePBNLock, node) == offsetof(IdlePBNLock, lock)); + PBNLock *lock = (PBNLock *) idleNode; + initializePBNLock(lock, type); + + *lockPtr = lock; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void returnPBNLockToPool(PBNLockPool *pool, PBNLock **lockPtr) +{ + // Take what should be the last lock reference from the caller + PBNLock *lock = *lockPtr; + *lockPtr = NULL; + + // A bit expensive, but will promptly catch some use-after-free errors. 
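+  // (Zeroing also leaves holderCount at 0 and the implementation pointer
+  // NULL, so a stale reference to this lock will tend to trip assertions
+  // such as the holder-count check in releasePBNLock() rather than silently
+  // corrupting state.)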
+ memset(lock, 0, sizeof(*lock)); + + RingNode *idleNode = (RingNode *) lock; + initializeRing(idleNode); + pushRingNode(&pool->idleRing, idleNode); + + ASSERT_LOG_ONLY(pool->borrowed > 0, "shouldn't return more than borrowed"); + pool->borrowed -= 1; +} diff --git a/vdo/base/pbnLockPool.h b/vdo/base/pbnLockPool.h new file mode 100644 index 0000000..6853f84 --- /dev/null +++ b/vdo/base/pbnLockPool.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pbnLockPool.h#1 $ + */ + +#ifndef PBN_LOCK_POOL_H +#define PBN_LOCK_POOL_H + +#include "pbnLock.h" +#include "types.h" + +typedef struct pbnLockPool PBNLockPool; + +/** + * Create a new PBN lock pool and all the lock instances it can loan out. + * + * @param [in] capacity The number of PBN locks to allocate for the pool + * @param [out] poolPtr A pointer to receive the new pool + * + * @return a VDO_SUCCESS or an error code + **/ +int makePBNLockPool(size_t capacity, PBNLockPool **poolPtr) + __attribute__((warn_unused_result)); + +/** + * Free a PBN lock pool null out the reference to it. This also frees all all + * the PBN locks it allocated, so the caller must ensure that all locks have + * been returned to the pool. + * + * @param [in,out] poolPtr The reference to the lock pool to free + **/ +void freePBNLockPool(PBNLockPool **poolPtr); + +/** + * Borrow a PBN lock from the pool and initialize it with the provided type. + * Pools do not grow on demand or allocate memory, so this will fail if the + * pool is empty. Borrowed locks are still associated with this pool and must + * be returned to only this pool. + * + * @param [in] pool The pool from which to borrow + * @param [in] type The type with which to initialize the lock + * @param [out] lockPtr A pointer to receive the borrowed lock + * + * @return VDO_SUCCESS, or VDO_LOCK_ERROR if the pool is empty + **/ +int borrowPBNLockFromPool(PBNLockPool *pool, + PBNLockType type, + PBNLock **lockPtr) + __attribute__((warn_unused_result)); + +/** + * Return to the pool a lock that was borrowed from it, and null out the + * caller's reference to it. It must be the last live reference, as if the + * memory were being freed (the lock memory will re-initialized or zeroed). + * + * @param [in] pool The pool from which the lock was borrowed + * @param [in,out] lockPtr The last reference to the lock being returned + **/ +void returnPBNLockToPool(PBNLockPool *pool, PBNLock **lockPtr); + +#endif // PBN_LOCK_POOL_H diff --git a/vdo/base/physicalLayer.c b/vdo/base/physicalLayer.c new file mode 100644 index 0000000..231a3bf --- /dev/null +++ b/vdo/base/physicalLayer.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/physicalLayer.c#1 $ + */ + +#include "physicalLayer.h" + +static PhysicalLayerGetter *physicalLayerGetter; + +/**********************************************************************/ +void registerPhysicalLayerGetter(PhysicalLayerGetter *getter) +{ + physicalLayerGetter = getter; +} + +/**********************************************************************/ +PhysicalLayer *getPhysicalLayer(void) +{ + if (physicalLayerGetter != NULL) { + return (*physicalLayerGetter)(); + } + return NULL; +} diff --git a/vdo/base/physicalLayer.h b/vdo/base/physicalLayer.h new file mode 100644 index 0000000..18d6a20 --- /dev/null +++ b/vdo/base/physicalLayer.h @@ -0,0 +1,427 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/physicalLayer.h#2 $ + */ + +#ifndef PHYSICAL_LAYER_H +#define PHYSICAL_LAYER_H + +#include "types.h" + +static const CRC32Checksum INITIAL_CHECKSUM = 0xffffffff; + +enum { + /* The size of a CRC-32 checksum */ + CHECKSUM_SIZE = sizeof(CRC32Checksum), +}; + +/** + * A function to destroy a physical layer and NULL out the reference to it. + * + * @param layerPtr A pointer to the layer to destroy + **/ +typedef void LayerDestructor(PhysicalLayer **layerPtr); + +/** + * A function to update a running CRC-32 checksum. + * + * @param crc The current value of the crc + * @param buffer The data to add to the checksum + * @param length The length of the data + * + * @return The updated value of the checksum + **/ +typedef uint32_t CRC32Updater(CRC32Checksum crc, + const byte *buffer, + size_t length); + +/** + * A function to report the block count of a physicalLayer. + * + * @param layer The layer + * + * @return The block count of the layer + **/ +typedef BlockCount BlockCountGetter(PhysicalLayer *layer); + +/** + * A function which can allocate a buffer suitable for use in an + * ExtentReader or ExtentWriter. + * + * @param [in] layer The physical layer in question + * @param [in] bytes The size of the buffer, in bytes. 
+ * @param [in] why The occasion for allocating the buffer + * @param [out] bufferPtr A pointer to hold the buffer + * + * @return a success or error code + **/ +typedef int BufferAllocator(PhysicalLayer *layer, + size_t bytes, + const char *why, + char **bufferPtr); + +/** + * A function which can read an extent from a physicalLayer. + * + * @param [in] layer The physical layer from which to read + * @param [in] startBlock The physical block number of the start of the + * extent + * @param [in] blockCount The number of blocks in the extent + * @param [out] buffer A buffer to hold the extent + * @param [out] blocksRead A pointer to hold the number of blocks read (may be + * NULL) + * + * @return a success or error code + **/ +typedef int ExtentReader(PhysicalLayer *layer, + PhysicalBlockNumber startBlock, + size_t blockCount, + char *buffer, + size_t *blocksRead); + +/** + * A function which can write an extent to a physicalLayer. + * + * @param [in] layer The physical layer to which to write + * @param [in] startBlock The physical block number of the start of the + * extent + * @param [in] blockCount The number of blocks in the extent + * @param [in] buffer The buffer which contains the data + * @param [out] blocksWritten A pointer to hold the number of blocks written + * (may be NULL) + * + * @return a success or error code + **/ +typedef int ExtentWriter(PhysicalLayer *layer, + PhysicalBlockNumber startBlock, + size_t blockCount, + char *buffer, + size_t *blocksWritten); + +/** + * A function to allocate a metadata VIO. + * + * @param [in] layer The physical layer + * @param [in] vioType The type of VIO to create + * @param [in] priority The relative priority to assign to the VIOs + * @param [in] parent The parent of this VIO + * @param [in] data The buffer + * @param [out] vioPtr A pointer to hold the new VIO + * + * @return VDO_SUCCESS or an error + **/ +typedef int MetadataVIOCreator(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + char *data, + VIO **vioPtr); + +/** + * A function to allocate an AllocatingVIO for compressed writes. + * + * @param [in] layer The physical layer + * @param [in] parent The parent of this VIO + * @param [in] data The buffer + * @param [out] allocatingVIOPtr A pointer to hold the new AllocatingVIO + * + * @return VDO_SUCCESS or an error + **/ +typedef int CompressedWriteVIOCreator(PhysicalLayer *layer, + void *parent, + char *data, + AllocatingVIO **allocatingVIOPtr); + +/** + * A function to destroy a VIO. The pointer to the VIO will be nulled out. + * + * @param vioPtr A pointer to the VIO to destroy + **/ +typedef void VIODestructor(VIO **vioPtr); + +/** + * A function to zero the contents of a DataVIO. + * + * @param dataVIO The DataVIO to zero + **/ +typedef AsyncDataOperation DataVIOZeroer; + +/** + * A function to copy the contents of a DataVIO into another DataVIO. + * + * @param source The dataVIO to copy from + * @param destination The dataVIO to copy to + **/ +typedef void DataCopier(DataVIO *source, DataVIO *destination); + +/** + * A function to apply a partial write to a DataVIO which has completed the + * read portion of a read-modify-write operation. + * + * @param dataVIO The dataVIO to modify + **/ +typedef AsyncDataOperation DataModifier; + +/** + * A function to asynchronously hash the block data, setting the chunk name of + * the DataVIO. This is asynchronous to allow the computation to be done on + * different threads. 
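+ * (As with the other aliases of AsyncDataOperation in this header, the named
+ * typedef only documents the role of the callback; the underlying function
+ * type is the shared AsyncDataOperation.)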
+ * + * @param dataVIO The DataVIO to hash + **/ +typedef AsyncDataOperation DataHasher; + +/** + * A function to determine whether a block is a duplicate. This function + * expects the 'physical' field of the DataVIO to be set to the physical block + * where the block will be written if it is not a duplicate. If the block does + * turn out to be a duplicate, the DataVIO's 'isDuplicate' field will be set to + * true, and the DataVIO's 'advice' field will be set to the physical block and + * mapping state of the already stored copy of the block. + * + * @param dataVIO The DataVIO containing the block to check. + **/ +typedef AsyncDataOperation DuplicationChecker; + +/** + * A function to verify the duplication advice by examining an already-stored + * data block. This function expects the 'physical' field of the DataVIO to be + * set to the physical block where the block will be written if it is not a + * duplicate, and the 'duplicate' field to be set to the physical block and + * mapping state where a copy of the data may already exist. If the block is + * not a duplicate, the DataVIO's 'isDuplicate' field will be cleared. + * + * @param dataVIO The dataVIO containing the block to check. + **/ +typedef AsyncDataOperation DuplicationVerifier; + +/** + * A function to read a single DataVIO from the layer. + * + * If the DataVIO does not describe a read-modify-write operation, the + * physical layer may safely acknowledge the related user I/O request + * as complete. + * + * @param dataVIO The DataVIO to read + **/ +typedef AsyncDataOperation DataReader; + +/** + * A function to read a single metadata VIO from the layer. + * + * @param vio The vio to read + **/ +typedef AsyncOperation MetadataReader; + +/** + * A function to write a single DataVIO to the layer + * + * @param dataVIO The DataVIO to write + **/ +typedef AsyncDataOperation DataWriter; + +/** + * A function to write a single metadata VIO from the layer. + * + * @param vio The vio to write + **/ +typedef AsyncOperation MetadataWriter; + +/** + * A function to inform the layer that a DataVIO's related I/O request can be + * safely acknowledged as complete, even though the DataVIO itself may have + * further processing to do. + * + * @param dataVIO The DataVIO to acknowledge + **/ +typedef AsyncDataOperation DataAcknowledger; + +/** + * A function to compare the contents of a DataVIO to another DataVIO. + * + * @param first The first DataVIO to compare + * @param second The second DataVIO to compare + * + * @return true if the contents of the two DataVIOs are the same + **/ +typedef bool DataVIOComparator(DataVIO *first, DataVIO *second); + +/** + * A function to compress the data in a DataVIO. + * + * @param dataVIO The DataVIO to compress + **/ +typedef AsyncDataOperation DataCompressor; + +/** + * Update albireo. + * + * @param dataVIO The DataVIO which needs to change the entry for its data + **/ +typedef AsyncDataOperation AlbireoUpdater; + +/** + * A function to finish flush requests + * + * @param vdoFlush The flush requests + **/ +typedef void FlushComplete(VDOFlush **vdoFlush); + +/** + * A function to query the write policy of the layer. + * + * @param layer The layer to query + * + * @return the write policy of the layer + **/ +typedef WritePolicy WritePolicyGetter(PhysicalLayer *layer); + +/** + * A function to create an object that can be enqueued to run in a specified + * thread. The Enqueueable will be put into the 'enqueueable' field of the + * supplied completion. 
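+ *
+ * Purely as an illustration (assuming a 'layer' and a 'completion' whose
+ * callback thread has already been set), this creator is used together with
+ * the Enqueuer and EnqueueableDestructor defined below, roughly as:
+ *
+ *   int result = layer->createEnqueueable(completion);
+ *   if (result == VDO_SUCCESS) {
+ *     layer->enqueue(completion->enqueueable);
+ *   }
+ *   // ...and layer->destroyEnqueueable(&completion->enqueueable) later,
+ *   // when the completion is torn down.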
+ * + * @param completion The completion to invoke the callback of + * + * @return VDO_SUCCESS or an error code + **/ +typedef int EnqueueableCreator(VDOCompletion *completion); + +/** + * A function to destroy and deallocate an Enqueueable object. + * + * @param enqueueablePtr Pointer to the object pointer to be destroyed + **/ +typedef void EnqueueableDestructor(Enqueueable **enqueueablePtr); + +/** + * A function to enqueue the Enqueueable object to run on the thread specified + * by its associated completion. + * + * @param enqueueable The object to be enqueued + **/ +typedef void Enqueuer(Enqueueable *enqueueable); + +/** + * A function to wait for an admin operation to complete. This function should + * not be called from a base-code thread. + * + * @param layer The layer on which to wait + **/ +typedef void OperationWaiter(PhysicalLayer *layer); + +/** + * A function to inform the layer of the result of an admin operation. + * + * @param layer The layer to inform + **/ +typedef void OperationComplete(PhysicalLayer *layer); + +/** + * A function to get the id of the current thread. + * + * @return The id of the current thread + **/ +typedef ThreadID ThreadIDGetter(void); + +/** + * A function to return the physical layer pointer for the current thread. + * + * @return The physical layer pointer + **/ +typedef PhysicalLayer *PhysicalLayerGetter(void); + +/** + * An abstraction representing the underlying physical layer. + **/ +struct physicalLayer { + // Management interface + LayerDestructor *destroy; + + // Synchronous interface + CRC32Updater *updateCRC32; + BlockCountGetter *getBlockCount; + + // Synchronous IO interface + BufferAllocator *allocateIOBuffer; + ExtentReader *reader; + ExtentWriter *writer; + + WritePolicyGetter *getWritePolicy; + + // Synchronous interfaces (vio-based) + MetadataVIOCreator *createMetadataVIO; + CompressedWriteVIOCreator *createCompressedWriteVIO; + VIODestructor *freeVIO; + DataVIOZeroer *zeroDataVIO; + DataCopier *copyData; + DataModifier *applyPartialWrite; + + // Asynchronous interface (vio-based) + DataHasher *hashData; + DuplicationChecker *checkForDuplication; + DuplicationVerifier *verifyDuplication; + DataReader *readData; + DataWriter *writeData; + CompressedWriter *writeCompressedBlock; + MetadataReader *readMetadata; + MetadataWriter *writeMetadata; + MetadataWriter *flush; + DataAcknowledger *acknowledgeDataVIO; + DataVIOComparator *compareDataVIOs; + DataCompressor *compressDataVIO; + AlbireoUpdater *updateAlbireo; + + // Asynchronous interface (other) + FlushComplete *completeFlush; + EnqueueableCreator *createEnqueueable; + EnqueueableDestructor *destroyEnqueueable; + Enqueuer *enqueue; + OperationWaiter *waitForAdminOperation; + OperationComplete *completeAdminOperation; + + // Thread specific interface + ThreadIDGetter *getCurrentThreadID; +}; + +/** + * Register the layer-specific implementation of getPhysicalLayer(). + * + * @param getter The function to be called + **/ +void registerPhysicalLayerGetter(PhysicalLayerGetter *getter); + +/** + * Fetch the physical layer pointer for the current thread. + * + * @return The physical layer pointer + **/ +PhysicalLayer *getPhysicalLayer(void); + +/** + * Get the id of the callback thread on which a completion is current running. 
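+ *
+ * Note that this helper assumes registerPhysicalLayerGetter() has already
+ * been called: before registration getPhysicalLayer() returns NULL, and the
+ * inline function below would dereference a NULL layer.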
+ * + * @return the current thread ID + **/ +static inline ThreadID getCallbackThreadID(void) +{ + return getPhysicalLayer()->getCurrentThreadID(); +} + +#endif // PHYSICAL_LAYER_H diff --git a/vdo/base/physicalZone.c b/vdo/base/physicalZone.c new file mode 100644 index 0000000..accb631 --- /dev/null +++ b/vdo/base/physicalZone.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/physicalZone.c#3 $ + */ + +#include "physicalZone.h" + +#include "memoryAlloc.h" + +#include "blockAllocator.h" +#include "blockMap.h" +#include "completion.h" +#include "constants.h" +#include "dataVIO.h" +#include "flush.h" +#include "hashLock.h" +#include "intMap.h" +#include "pbnLock.h" +#include "pbnLockPool.h" +#include "slabDepot.h" +#include "vdoInternal.h" + +enum { + // Each user DataVIO needs a PBN read lock and write lock, and each packer + // output bin has an AllocatingVIO that needs a PBN write lock. + LOCK_POOL_CAPACITY = 2 * MAXIMUM_USER_VIOS + DEFAULT_PACKER_OUTPUT_BINS, +}; + +struct physicalZone { + /** Which physical zone this is */ + ZoneCount zoneNumber; + /** The thread ID for this zone */ + ThreadID threadID; + /** In progress operations keyed by PBN */ + IntMap *pbnOperations; + /** Pool of unused PBNLock instances */ + PBNLockPool *lockPool; + /** The block allocator for this zone */ + BlockAllocator *allocator; +}; + +/**********************************************************************/ +int makePhysicalZone(VDO *vdo, ZoneCount zoneNumber, PhysicalZone **zonePtr) +{ + PhysicalZone *zone; + int result = ALLOCATE(1, PhysicalZone, __func__, &zone); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeIntMap(LOCK_MAP_CAPACITY, 0, &zone->pbnOperations); + if (result != VDO_SUCCESS) { + freePhysicalZone(&zone); + return result; + } + + result = makePBNLockPool(LOCK_POOL_CAPACITY, &zone->lockPool); + if (result != VDO_SUCCESS) { + freePhysicalZone(&zone); + return result; + } + + zone->zoneNumber = zoneNumber; + zone->threadID = getPhysicalZoneThread(getThreadConfig(vdo), zoneNumber); + zone->allocator = getBlockAllocatorForZone(vdo->depot, zoneNumber); + + *zonePtr = zone; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freePhysicalZone(PhysicalZone **zonePtr) +{ + if (*zonePtr == NULL) { + return; + } + + PhysicalZone *zone = *zonePtr; + freePBNLockPool(&zone->lockPool); + freeIntMap(&zone->pbnOperations); + FREE(zone); + *zonePtr = NULL; +} + +/**********************************************************************/ +ZoneCount getPhysicalZoneNumber(const PhysicalZone *zone) +{ + return zone->zoneNumber; +} + +/**********************************************************************/ +ThreadID getPhysicalZoneThreadID(const 
PhysicalZone *zone) +{ + return zone->threadID; +} + +/**********************************************************************/ +BlockAllocator *getBlockAllocator(const PhysicalZone *zone) +{ + return zone->allocator; +} + +/**********************************************************************/ +PBNLock *getPBNLock(PhysicalZone *zone, PhysicalBlockNumber pbn) +{ + return ((zone == NULL) ? NULL : intMapGet(zone->pbnOperations, pbn)); +} + +/**********************************************************************/ +int attemptPBNLock(PhysicalZone *zone, + PhysicalBlockNumber pbn, + PBNLockType type, + PBNLock **lockPtr) +{ + // Borrow and prepare a lock from the pool so we don't have to do two IntMap + // accesses in the common case of no lock contention. + PBNLock *newLock; + int result = borrowPBNLockFromPool(zone->lockPool, type, &newLock); + if (result != VDO_SUCCESS) { + ASSERT_LOG_ONLY(false, "must always be able to borrow a PBN lock"); + return result; + } + + PBNLock *lock; + result = intMapPut(zone->pbnOperations, pbn, newLock, false, + (void **) &lock); + if (result != VDO_SUCCESS) { + returnPBNLockToPool(zone->lockPool, &newLock); + return result; + } + + if (lock != NULL) { + // The lock is already held, so we don't need the borrowed lock. + returnPBNLockToPool(zone->lockPool, &newLock); + + result = ASSERT(lock->holderCount > 0, + "physical block %llu lock held", pbn); + if (result != VDO_SUCCESS) { + return result; + } + *lockPtr = lock; + } else { + *lockPtr = newLock; + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +void releasePBNLock(PhysicalZone *zone, + PhysicalBlockNumber lockedPBN, + PBNLock **lockPtr) +{ + PBNLock *lock = *lockPtr; + if (lock == NULL) { + return; + } + *lockPtr = NULL; + + ASSERT_LOG_ONLY(lock->holderCount > 0, + "should not be releasing a lock that is not held"); + + lock->holderCount -= 1; + if (lock->holderCount > 0) { + // The lock was shared and is still referenced, so don't release it yet. + return; + } + + PBNLock *holder = intMapRemove(zone->pbnOperations, lockedPBN); + ASSERT_LOG_ONLY((lock == holder), + "physical block lock mismatch for block %llu", + lockedPBN); + + releaseProvisionalReference(lock, lockedPBN, zone->allocator); + + returnPBNLockToPool(zone->lockPool, &lock); +} + +/**********************************************************************/ +void dumpPhysicalZone(const PhysicalZone *zone) +{ + dumpBlockAllocator(zone->allocator); +} diff --git a/vdo/base/physicalZone.h b/vdo/base/physicalZone.h new file mode 100644 index 0000000..2c02bbe --- /dev/null +++ b/vdo/base/physicalZone.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/physicalZone.h#1 $ + */ + +#ifndef PHYSICAL_ZONE_H +#define PHYSICAL_ZONE_H + +#include "pbnLock.h" +#include "types.h" + +/** + * Create a physical zone. + * + * @param [in] vdo The VDO to which the zone will belong + * @param [in] zoneNumber The number of the zone to create + * @param [out] zonePtr A pointer to hold the new PhysicalZone + * + * @return VDO_SUCCESS or an error code + **/ +int makePhysicalZone(VDO *vdo, ZoneCount zoneNumber, PhysicalZone **zonePtr) + __attribute__((warn_unused_result)); + +/** + * Free a physical zone and null out the reference to it. + * + * @param zonePtr A pointer to the zone to free + **/ +void freePhysicalZone(PhysicalZone **zonePtr); + +/** + * Get the zone number of a physical zone. + * + * @param zone The zone + * + * @return The number of the zone + **/ +ZoneCount getPhysicalZoneNumber(const PhysicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the ID of a physical zone's thread. + * + * @param zone The zone + * + * @return The zone's thread ID + **/ +ThreadID getPhysicalZoneThreadID(const PhysicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the block allocator from a physical zone. + * + * @param zone The zone + * + * @return The zone's allocator + **/ +BlockAllocator *getBlockAllocator(const PhysicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the lock on a PBN if one exists. + * + * @param zone The physical zone responsible for the PBN + * @param pbn The physical block number whose lock is desired + * + * @return The lock or NULL if the PBN is not locked + **/ +PBNLock *getPBNLock(PhysicalZone *zone, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Attempt to lock a physical block in the zone responsible for it. If the PBN + * is already locked, the existing lock will be returned. Otherwise, a new + * lock instance will be borrowed from the pool, initialized, and returned. + * The lock owner will be NULL for a new lock acquired by the caller, who is + * responsible for setting that field promptly. The lock owner will be + * non-NULL when there is already an existing lock on the PBN. + * + * @param [in] zone The physical zone responsible for the PBN + * @param [in] pbn The physical block number to lock + * @param [in] type The type with which to initialize a new lock + * @param [out] lockPtr A pointer to receive the lock, existing or new + * + * @return VDO_SUCCESS or an error + **/ +int attemptPBNLock(PhysicalZone *zone, + PhysicalBlockNumber pbn, + PBNLockType type, + PBNLock **lockPtr) + __attribute__((warn_unused_result)); + +/** + * Release a physical block lock if it is held, return it to the lock pool, + * and null out the caller's reference to it. It must be the last live + * reference, as if the memory were being freed (the lock memory will + * re-initialized or zeroed). + * + * @param [in] zone The physical zone in which the lock was obtained + * @param [in] lockedPBN The physical block number to unlock + * @param [in,out] lockPtr The last reference to the lock being released + **/ +void releasePBNLock(PhysicalZone *zone, + PhysicalBlockNumber lockedPBN, + PBNLock **lockPtr); + +/** + * Dump information about a physical zone to the log for debugging. 
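+ * In the implementation in this change, the dump simply forwards to
+ * dumpBlockAllocator() for the zone's allocator.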
+ * + * @param zone The zone to dump + **/ +void dumpPhysicalZone(const PhysicalZone *zone); + +#endif // PHYSICAL_ZONE_H diff --git a/vdo/base/pointerMap.c b/vdo/base/pointerMap.c new file mode 100644 index 0000000..395f266 --- /dev/null +++ b/vdo/base/pointerMap.c @@ -0,0 +1,633 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pointerMap.c#1 $ + */ + +/** + * Hash table implementation of a map from integers to pointers, implemented + * using the Hopscotch Hashing algorithm by Herlihy, Shavit, and Tzafrir (see + * http://en.wikipedia.org/wiki/Hopscotch_hashing). This implementation does + * not contain any of the locking/concurrency features of the algorithm, just + * the collision resolution scheme. + * + * Hopscotch Hashing is based on hashing with open addressing and linear + * probing. All the entries are stored in a fixed array of buckets, with no + * dynamic allocation for collisions. Unlike linear probing, all the entries + * that hash to a given bucket are stored within a fixed neighborhood starting + * at that bucket. Chaining is effectively represented as a bit vector + * relative to each bucket instead of as pointers or explicit offsets. + * + * When an empty bucket cannot be found within a given neighborhood, + * subsequent neighborhoods are searched, and one or more entries will "hop" + * into those neighborhoods. When this process works, an empty bucket will + * move into the desired neighborhood, allowing the entry to be added. When + * that process fails (typically when the buckets are around 90% full), the + * table must be resized and the all entries rehashed and added to the + * expanded table. + * + * Unlike linear probing, the number of buckets that must be searched in the + * worst case has a fixed upper bound (the size of the neighborhood). Those + * entries occupy a small number of memory cache lines, leading to improved + * use of the cache (fewer misses on both successful and unsuccessful + * searches). Hopscotch hashing outperforms linear probing at much higher load + * factors, so even with the increased memory burden for maintaining the hop + * vectors, less memory is needed to achieve that performance. Hopscotch is + * also immune to "contamination" from deleting entries since entries are + * genuinely removed instead of being replaced by a placeholder. + * + * The published description of the algorithm used a bit vector, but the paper + * alludes to an offset scheme which is used by this implementation. Since the + * entries in the neighborhood are within N entries of the hash bucket at the + * start of the neighborhood, a pair of small offset fields each log2(N) bits + * wide is all that's needed to maintain the hops as a linked list. In order + * to encode "no next hop" (i.e. 
NULL) as the natural initial value of zero, + * the offsets are biased by one (i.e. 0 => NULL, 1 => offset=0, 2 => + * offset=1, etc.) We can represent neighborhoods of up to 255 entries with + * just 8+8=16 bits per entry. The hop list is sorted by hop offset so the + * first entry in the list is always the bucket closest to the start of the + * neighborhood. + * + * While individual accesses tend to be very fast, the table resize operations + * are very very expensive. If an upper bound on the latency of adding an + * entry to the table is needed, we either need to ensure the table is + * pre-sized to be large enough so no resize is ever needed, or we'll need to + * develop an approach to incrementally resize the table. + **/ + +#include "pointerMap.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "permassert.h" + +enum { + DEFAULT_CAPACITY = 16, // the number of neighborhoods in a new table + NEIGHBORHOOD = 255, // the number of buckets in each neighborhood + MAX_PROBES = 1024, // limit on the number of probes for a free bucket + NULL_HOP_OFFSET = 0, // the hop offset value terminating the hop list + DEFAULT_LOAD = 75 // a compromise between memory use and performance +}; + +/** + * Buckets are packed together to reduce memory usage and improve cache + * efficiency. It would be tempting to encode the hop offsets separately and + * maintain alignment of key/value pairs, but it's crucial to keep the hop + * fields near the buckets that they use them so they'll tend to share cache + * lines. + **/ +typedef struct __attribute__((packed)) bucket { + uint8_t firstHop; // the biased offset of the first entry in the hop list + // of the neighborhood that hashes to this bucket + uint8_t nextHop; // the biased offset of the next bucket in the hop list + + const void *key; // the key stored in this bucket + void *value; // the value stored in this bucket (NULL if empty) +} Bucket; + +/** + * The concrete definition of the opaque PointerMap type. To avoid having to + * wrap the neighborhoods of the last entries back around to the start of the + * bucket array, we allocate a few more buckets at the end of the array + * instead, which is why capacity and bucketCount are different. + **/ +struct pointerMap { + /** the number of entries stored in the map */ + size_t size; + /** the number of neighborhoods in the map */ + size_t capacity; + /** the number of buckets in the bucket array */ + size_t bucketCount; + /** the array of hash buckets */ + Bucket *buckets; + /** the function for comparing keys for equality */ + PointerKeyComparator *comparator; + /** the function for getting a hash code from a key */ + PointerKeyHasher *hasher; +}; + +/** + * Initialize a PointerMap. + * + * @param map the map to initialize + * @param capacity the initial capacity of the map + * + * @return UDS_SUCCESS or an error code + **/ +static int allocateBuckets(PointerMap *map, size_t capacity) +{ + map->size = 0; + map->capacity = capacity; + + // Allocate NEIGHBORHOOD - 1 extra buckets so the last bucket can have a + // full neighborhood without have to wrap back around to element zero. 
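+  // (With NEIGHBORHOOD = 255, that is 254 extra buckets beyond the capacity.)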
+ map->bucketCount = capacity + (NEIGHBORHOOD - 1); + return ALLOCATE(map->bucketCount, Bucket, "PointerMap buckets", + &map->buckets); +} + +/**********************************************************************/ +int makePointerMap(size_t initialCapacity, + unsigned int initialLoad, + PointerKeyComparator comparator, + PointerKeyHasher hasher, + PointerMap **mapPtr) +{ + // Use the default initial load if the caller did not specify one. + if (initialLoad == 0) { + initialLoad = DEFAULT_LOAD; + } + if (initialLoad > 100) { + return UDS_INVALID_ARGUMENT; + } + + PointerMap *map; + int result = ALLOCATE(1, PointerMap, "PointerMap", &map); + if (result != UDS_SUCCESS) { + return result; + } + + map->hasher = hasher; + map->comparator = comparator; + + // Use the default capacity if the caller did not specify one. + size_t capacity = (initialCapacity > 0) ? initialCapacity : DEFAULT_CAPACITY; + + // Scale up the capacity by the specified initial load factor. + // (i.e to hold 1000 entries at 80% load we need a capacity of 1250) + capacity = capacity * 100 / initialLoad; + + result = allocateBuckets(map, capacity); + if (result != UDS_SUCCESS) { + freePointerMap(&map); + return result; + } + + *mapPtr = map; + return UDS_SUCCESS; +} + +/** + * Free the bucket array for the map. + * + * @param map the map whose bucket array is to be freed + **/ +static void freeBuckets(PointerMap *map) +{ + FREE(map->buckets); + map->buckets = NULL; +} + +/**********************************************************************/ +void freePointerMap(PointerMap **mapPtr) +{ + if (*mapPtr != NULL) { + freeBuckets(*mapPtr); + FREE(*mapPtr); + *mapPtr = NULL; + } +} + +/**********************************************************************/ +size_t pointerMapSize(const PointerMap *map) +{ + return map->size; +} + +/** + * Convert a biased hop offset within a neighborhood to a pointer to the + * bucket it references. + * + * @param neighborhood the first bucket in the neighborhood + * @param hopOffset the biased hop offset to the desired bucket + * + * @return NULL if hopOffset is zero, otherwise a pointer to + * the bucket in the neighborhood at hopOffset - 1 + **/ +static Bucket *dereferenceHop(Bucket *neighborhood, unsigned int hopOffset) +{ + if (hopOffset == NULL_HOP_OFFSET) { + return NULL; + } + + STATIC_ASSERT(NULL_HOP_OFFSET == 0); + return &neighborhood[hopOffset - 1]; +} + +/** + * Add a bucket into the hop list for the neighborhood, inserting it into the + * list so the hop list remains sorted by hop offset. + * + * @param neighborhood the first bucket in the neighborhood + * @param newBucket the bucket to add to the hop list + **/ +static void insertInHopList(Bucket *neighborhood, Bucket *newBucket) +{ + // Zero indicates a NULL hop offset, so bias the hop offset by one. + int hopOffset = 1 + (newBucket - neighborhood); + + // Handle the special case of adding a bucket at the start of the list. + int nextHop = neighborhood->firstHop; + if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) { + newBucket->nextHop = nextHop; + neighborhood->firstHop = hopOffset; + return; + } + + // Search the hop list for the insertion point that maintains the sort + // order. + for (;;) { + Bucket *bucket = dereferenceHop(neighborhood, nextHop); + nextHop = bucket->nextHop; + + if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) { + newBucket->nextHop = nextHop; + bucket->nextHop = hopOffset; + return; + } + } +} + +/** + * Select and return the hash bucket for a given search key. 
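+ * The bucket returned is the first bucket of the neighborhood in which any
+ * entry for the key must reside.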
+ * + * @param map the map to search + * @param key the mapping key + **/ +static Bucket *selectBucket(const PointerMap *map, const void *key) +{ + /* + * Scale the 32-bit hash to a bucket index by treating it as a binary + * fraction and multiplying that by the capacity. If the hash is uniformly + * distributed over [0 .. 2^32-1], then (hash * capacity / 2^32) should be + * uniformly distributed over [0 .. capacity-1]. The multiply and shift is + * much faster than a divide (modulus) on X86 CPUs. + */ + uint64_t hash = map->hasher(key); + return &map->buckets[(hash * map->capacity) >> 32]; +} + +/** + * Search the hop list associated with given hash bucket for a given search + * key. If the key is found, returns a pointer to the entry (bucket or + * collision), otherwise returns NULL. + * + * @param [in] map the map being searched + * @param [in] bucket the map bucket to search for the key + * @param [in] key the mapping key + * @param [out] previousPtr if not NULL, a pointer in which to + * store the bucket in the list preceding the one + * that had the matching key + * + * @return an entry that matches the key, or NULL if not found + **/ +static Bucket *searchHopList(PointerMap *map, + Bucket *bucket, + const void *key, + Bucket **previousPtr) +{ + Bucket *previous = NULL; + unsigned int nextHop = bucket->firstHop; + while (nextHop != NULL_HOP_OFFSET) { + // Check the neighboring bucket indexed by the offset for the desired key. + Bucket *entry = dereferenceHop(bucket, nextHop); + if ((entry->value != NULL) && map->comparator(key, entry->key)) { + if (previousPtr != NULL) { + *previousPtr = previous; + } + return entry; + } + nextHop = entry->nextHop; + previous = entry; + } + return NULL; +} + +/**********************************************************************/ +void *pointerMapGet(PointerMap *map, const void *key) +{ + Bucket *match = searchHopList(map, selectBucket(map, key), key, NULL); + return ((match != NULL) ? match->value : NULL); +} + +/** + * Increase the number of hash buckets and rehash all the existing entries, + * storing them in the new buckets. + * + * @param map the map to resize + **/ +static int resizeBuckets(PointerMap *map) +{ + // Copy the top-level map data to the stack. + PointerMap oldMap = *map; + + // Re-initialize the map to be empty and 50% larger. + size_t newCapacity = map->capacity / 2 * 3; + logInfo("%s: attempting resize from %zu to %zu, current size=%zu", + __func__, map->capacity, newCapacity, map->size); + int result = allocateBuckets(map, newCapacity); + if (result != UDS_SUCCESS) { + *map = oldMap; + return result; + } + + // Populate the new hash table from the entries in the old bucket array. + for (size_t i = 0; i < oldMap.bucketCount; i++) { + Bucket *entry = &oldMap.buckets[i]; + if (entry->value == NULL) { + continue; + } + + result = pointerMapPut(map, entry->key, entry->value, true, NULL); + if (result != UDS_SUCCESS) { + // Destroy the new partial map and restore the map from the stack. + freeBuckets(map); + *map = oldMap; + return result; + } + } + + // Destroy the old bucket array. + freeBuckets(&oldMap); + return UDS_SUCCESS; +} + +/** + * Probe the bucket array starting at the given bucket for the next empty + * bucket, returning a pointer to it. NULL will be returned if + * the search reaches the end of the bucket array or if the number of linear + * probes exceeds a specified limit. 
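+ * (The only caller in this file passes MAX_PROBES, so at most 1024 buckets
+ * are examined per call.)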
+ * + * @param map the map containing the buckets to search + * @param bucket the bucket at which to start probing + * @param maxProbes the maximum number of buckets to search + * + * @return the next empty bucket, or NULL if the search failed + **/ +static Bucket *findEmptyBucket(PointerMap *map, + Bucket *bucket, + unsigned int maxProbes) +{ + // Limit the search to either the nearer of the end of the bucket array or a + // fixed distance beyond the initial bucket. + size_t remaining = &map->buckets[map->bucketCount] - bucket; + Bucket *sentinel = &bucket[minSizeT(remaining, maxProbes)]; + + for (Bucket *entry = bucket; entry < sentinel; entry++) { + if (entry->value == NULL) { + return entry; + } + } + return NULL; +} + +/** + * Move an empty bucket closer to the start of the bucket array. This searches + * the neighborhoods that contain the empty bucket for a non-empty bucket + * closer to the start of the array. If such a bucket is found, this swaps the + * two buckets by moving the entry to the empty bucket. + * + * @param map the map containing the bucket + * @param hole the empty bucket to fill with an entry that precedes it in one + * of its enclosing neighborhoods + * + * @return the bucket that was vacated by moving its entry to the provided + * hole, or NULL if no entry could be moved + **/ +static Bucket *moveEmptyBucket(PointerMap *map __attribute__((unused)), + Bucket *hole) +{ + /* + * Examine every neighborhood that the empty bucket is part of, starting + * with the one in which it is the last bucket. No boundary check is needed + * for the negative array arithmetic since this function is only called when + * hole is at least NEIGHBORHOOD cells deeper into the array than a valid + * bucket. + */ + for (Bucket *bucket = &hole[1 - NEIGHBORHOOD]; bucket < hole; bucket++) { + // Find the entry that is nearest to the bucket, which means it will be + // nearest to the hash bucket whose neighborhood is full. + Bucket *newHole = dereferenceHop(bucket, bucket->firstHop); + if (newHole == NULL) { + // There are no buckets in this neighborhood that are in use by this one + // (they must all be owned by overlapping neighborhoods). + continue; + } + + // Skip this bucket if its first entry is actually further away than the + // hole that we're already trying to fill. + if (hole < newHole) { + continue; + } + + /* + * We've found an entry in this neighborhood that we can "hop" further + * away, moving the hole closer to the hash bucket, if not all the way + * into its neighborhood. + */ + + // The entry that will be the new hole is the first bucket in the list, + // so setting firstHop is all that's needed remove it from the list. + bucket->firstHop = newHole->nextHop; + newHole->nextHop = NULL_HOP_OFFSET; + + // Move the entry into the original hole. + hole->key = newHole->key; + hole->value = newHole->value; + newHole->value = NULL; + + // Insert the filled hole into the hop list for the neighborhood. + insertInHopList(bucket, hole); + return newHole; + } + + // We couldn't find an entry to relocate to the hole. + return NULL; +} + +/** + * Find and update any existing mapping for a given key, returning the value + * associated with the key in the provided pointer. 
+ * + * @param [in] map the PointerMap to attempt to modify + * @param [in] neighborhood the first bucket in the neighborhood that + * would contain the search key + * @param [in] key the key with which to associate the new value + * @param [in] newValue the value to be associated with the key + * @param [in] update whether to overwrite an existing value + * @param [out] oldValuePtr a pointer in which to store the old value + * (unmodified if no mapping was found) + * + * @return true if the map contains a mapping for the key + * false if it does not + **/ +static bool updateMapping(PointerMap *map, + Bucket *neighborhood, + const void *key, + void *newValue, + bool update, + void **oldValuePtr) +{ + Bucket *bucket = searchHopList(map, neighborhood, key, NULL); + if (bucket == NULL) { + // There is no bucket containing the key in the neighborhood. + return false; + } + + // Return the value of the current mapping (if desired) and update the + // mapping with the new value (if desired). + if (oldValuePtr != NULL) { + *oldValuePtr = bucket->value; + } + if (update) { + // We're dropping the old key pointer on the floor here, assuming it's a + // property of the value or that it's otherwise safe to just forget. + bucket->key = key; + bucket->value = newValue; + } + return true; +} + +/** + * Find an empty bucket in a specified neighborhood for a new mapping or + * attempt to re-arrange mappings so there is such a bucket. This operation + * may fail (returning NULL) if an empty bucket is not available or could not + * be relocated to the neighborhood. + * + * @param map the PointerMap to search or modify + * @param neighborhood the first bucket in the neighborhood in which + * an empty bucket is needed for a new mapping + * + * @return a pointer to an empty bucket in the desired neighborhood, or + * NULL if a vacancy could not be found or arranged + **/ +static Bucket *findOrMakeVacancy(PointerMap *map, Bucket *neighborhood) +{ + // Probe within and beyond the neighborhood for the first empty bucket. + Bucket *hole = findEmptyBucket(map, neighborhood, MAX_PROBES); + + // Keep trying until the empty bucket is in the bucket's neighborhood or we + // are unable to move it any closer by swapping it with a filled bucket. + while (hole != NULL) { + int distance = hole - neighborhood; + if (distance < NEIGHBORHOOD) { + // We've found or relocated an empty bucket close enough to the initial + // hash bucket to be referenced by its hop vector. + return hole; + } + + // The nearest empty bucket isn't within the neighborhood that must + // contain the new entry, so try to swap it with bucket that is closer. + hole = moveEmptyBucket(map, hole); + } + + return NULL; +} + +/**********************************************************************/ +int pointerMapPut(PointerMap *map, + const void *key, + void *newValue, + bool update, + void **oldValuePtr) +{ + if (newValue == NULL) { + return UDS_INVALID_ARGUMENT; + } + + // Select the bucket at the start of the neighborhood that must contain any + // entry for the provided key. + Bucket *neighborhood = selectBucket(map, key); + + // Check whether the neighborhood already contains an entry for the key, in + // which case we optionally update it, returning the old value. + if (updateMapping(map, neighborhood, key, newValue, update, oldValuePtr)) { + return UDS_SUCCESS; + } + + /* + * Find an empty bucket in the desired neighborhood for the new entry or + * re-arrange entries in the map so there is such a bucket. 
This operation + * will usually succeed; the loop body will only be executed on the rare + * occasions that we have to resize the map. + */ + Bucket *bucket; + while ((bucket = findOrMakeVacancy(map, neighborhood)) == NULL) { + /* + * There is no empty bucket in which to put the new entry in the current + * map, so we're forced to allocate a new bucket array with a larger + * capacity, re-hash all the entries into those buckets, and try again (a + * very expensive operation for large maps). + */ + int result = resizeBuckets(map); + if (result != UDS_SUCCESS) { + return result; + } + + // Resizing the map invalidates all pointers to buckets, so recalculate + // the neighborhood pointer. + neighborhood = selectBucket(map, key); + } + + // Put the new entry in the empty bucket, adding it to the neighborhood. + bucket->key = key; + bucket->value = newValue; + insertInHopList(neighborhood, bucket); + map->size += 1; + + // There was no existing entry, so there was no old value to be returned. + if (oldValuePtr != NULL) { + *oldValuePtr = NULL; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +void *pointerMapRemove(PointerMap *map, const void *key) +{ + // Select the bucket to search and search it for an existing entry. + Bucket *bucket = selectBucket(map, key); + Bucket *previous; + Bucket *victim = searchHopList(map, bucket, key, &previous); + + if (victim == NULL) { + // There is no matching entry to remove. + return NULL; + } + + // We found an entry to remove. Save the mapped value to return later and + // empty the bucket. + map->size -= 1; + void *value = victim->value; + victim->value = NULL; + victim->key = 0; + + // The victim bucket is now empty, but it still needs to be spliced out of + // the hop list. + if (previous == NULL) { + // The victim is the head of the list, so swing firstHop. + bucket->firstHop = victim->nextHop; + } else { + previous->nextHop = victim->nextHop; + } + victim->nextHop = NULL_HOP_OFFSET; + + return value; +} diff --git a/vdo/base/pointerMap.h b/vdo/base/pointerMap.h new file mode 100644 index 0000000..1bd0bd2 --- /dev/null +++ b/vdo/base/pointerMap.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pointerMap.h#1 $ + */ + +#ifndef POINTER_MAP_H +#define POINTER_MAP_H + +#include "common.h" + +/** + * PointerMap associates pointer values (void *) with the data + * referenced by pointer keys (void *). NULL pointer + * values are not supported. A NULL key value is supported when + * the instance's key comparator and hasher functions support it. 
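+ *
+ * A minimal usage sketch (illustrative only; compareKeys(), hashKey(), and
+ * example() are hypothetical client code, not part of this interface, and
+ * strcmp() comes from <string.h>):
+ *
+ *   static bool compareKeys(const void *thisKey, const void *thatKey)
+ *   {
+ *     return (strcmp((const char *) thisKey, (const char *) thatKey) == 0);
+ *   }
+ *
+ *   static uint32_t hashKey(const void *key)
+ *   {
+ *     // FNV-1a, purely for illustration
+ *     uint32_t hash = 2166136261u;
+ *     for (const char *s = key; *s != '\0'; s++) {
+ *       hash = (hash ^ (uint8_t) *s) * 16777619u;
+ *     }
+ *     return hash;
+ *   }
+ *
+ *   static int example(void *value)   // value must be non-NULL
+ *   {
+ *     PointerMap *map;
+ *     int result = makePointerMap(0, 0, compareKeys, hashKey, &map);
+ *     if (result != UDS_SUCCESS) {
+ *       return result;
+ *     }
+ *     result = pointerMapPut(map, "alpha", value, true, NULL);
+ *     // On success, pointerMapGet(map, "alpha") now returns value.
+ *     freePointerMap(&map);
+ *     return result;
+ *   }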
+ *
+ * The map is implemented as a hash table, which should provide constant-time
+ * insert, query, and remove operations, although the insert may occasionally
+ * grow the table, which is linear in the number of entries in the map. The
+ * table will grow as needed to hold new entries, but will not shrink as
+ * entries are removed.
+ *
+ * The key and value pointers passed to the map are retained and used by the
+ * map, but are not owned by the map. Freeing the map does not attempt to free
+ * the pointers. The client is entirely responsible for the memory management
+ * of the keys and values. The current interface and implementation assume
+ * that keys will be properties of the values, or that keys will not be memory
+ * managed, or that keys will not need to be freed as a result of being
+ * replaced when a key is re-mapped.
+ **/
+
+typedef struct pointerMap PointerMap;
+
+/**
+ * The prototype of functions that compare the referents of two pointer keys
+ * for equality. If two keys are equal, then both keys must have the same
+ * hash code associated with them by the hasher function defined below.
+ *
+ * @param thisKey  The first element to compare
+ * @param thatKey  The second element to compare
+ *
+ * @return true if and only if the referents of the two
+ *         key pointers are to be treated as the same key by the map
+ **/
+typedef bool PointerKeyComparator(const void *thisKey, const void *thatKey);
+
+/**
+ * The prototype of functions that get or calculate a hash code associated
+ * with the referent of a pointer key. The hash code must be uniformly
+ * distributed over all uint32_t values. The hash code associated with a given
+ * key must not change while the key is in the map. If the comparator function
+ * says two keys are equal, then this function must return the same hash code
+ * for both keys. This function may be called many times for a key while an
+ * entry is stored for it in the map.
+ *
+ * @param key  The pointer key to hash
+ *
+ * @return the hash code for the key
+ **/
+typedef uint32_t PointerKeyHasher(const void *key);
+
+/**
+ * Allocate and initialize a PointerMap.
+ *
+ * @param [in]  initialCapacity  The number of entries the map should
+ *                               initially be capable of holding (zero tells
+ *                               the map to use its own small default)
+ * @param [in]  initialLoad      The load factor of the map, expressed as an
+ *                               integer percentage (typically in the range
+ *                               50 to 90, with zero telling the map to use
+ *                               its own default)
+ * @param [in]  comparator       The function to use to compare the referents
+ *                               of two pointer keys for equality
+ * @param [in]  hasher           The function to use to obtain the hash code
+ *                               associated with each pointer key
+ * @param [out] mapPtr           A pointer to hold the new PointerMap
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int makePointerMap(size_t initialCapacity,
+                   unsigned int initialLoad,
+                   PointerKeyComparator comparator,
+                   PointerKeyHasher hasher,
+                   PointerMap **mapPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Free a PointerMap and null out the reference to it. NOTE: The map does not
+ * own the pointer keys and values stored in the map and they are not freed by
+ * this call.
+ *
+ * @param [in,out] mapPtr The reference to the PointerMap to free
+ **/
+void freePointerMap(PointerMap **mapPtr);
+
+/**
+ * Get the number of entries stored in a PointerMap.
+ * + * @param map The PointerMap to query + * + * @return the number of entries in the map + **/ +size_t pointerMapSize(const PointerMap *map); + +/** + * Retrieve the value associated with a given key from the PointerMap. + * + * @param map The PointerMap to query + * @param key The key to look up (may be NULL if the + * comparator and hasher functions support it) + * + * @return the value associated with the given key, or NULL + * if the key is not mapped to any value + **/ +void *pointerMapGet(PointerMap *map, const void *key); + +/** + * Try to associate a value (a pointer) with an integer in a PointerMap. + * If the map already contains a mapping for the provided key, the old value is + * only replaced with the specified value if update is true. In either case + * the old value is returned. If the map does not already contain a value for + * the specified key, the new value is added regardless of the value of update. + * + * If the value stored in the map is updated, then the key stored in the map + * will also be updated with the key provided by this call. The old key will + * not be returned due to the memory managment assumptions described in the + * interface header comment. + * + * @param [in] map The PointerMap to attempt to modify + * @param [in] key The key with which to associate the new value + * (may be NULL if the comparator and + * hasher functions support it) + * @param [in] newValue The value to be associated with the key + * @param [in] update Whether to overwrite an existing value + * @param [out] oldValuePtr A pointer in which to store either the old value + * (if the key was already mapped) or + * NULL if the map did not contain the + * key; NULL may be provided if the + * caller does not need to know the old value + * + * @return UDS_SUCCESS or an error code + **/ +int pointerMapPut(PointerMap *map, + const void *key, + void *newValue, + bool update, + void **oldValuePtr) + __attribute__((warn_unused_result)); + +/** + * Remove the mapping for a given key from the PointerMap. + * + * @param map The PointerMap from which to remove the mapping + * @param key The key whose mapping is to be removed (may be NULL + * if the comparator and hasher functions support it) + * + * @return the value that was associated with the key, or + * NULL if it was not mapped + **/ +void *pointerMapRemove(PointerMap *map, const void *key); + +#endif /* POINTER_MAP_H */ diff --git a/vdo/base/priorityTable.c b/vdo/base/priorityTable.c new file mode 100644 index 0000000..deb423b --- /dev/null +++ b/vdo/base/priorityTable.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/priorityTable.c#1 $ + */ + +#include "priorityTable.h" + +#include "errors.h" +#include "memoryAlloc.h" +#include "numUtils.h" + +#include "statusCodes.h" + +/** We use a single 64-bit search vector, so the maximum priority is 63 */ +enum { MAX_PRIORITY = 63 }; + +/** + * All the entries with the same priority are queued in a circular list in a + * bucket for that priority. The table is essentially an array of buckets. + **/ +typedef struct bucket { + /** The head of a queue of table entries, all having the same priority */ + RingNode queue; + /** The priority of all the entries in this bucket */ + unsigned int priority; +} Bucket; + +/** + * A priority table is an array of buckets, indexed by priority. New entries + * are added to the end of the queue in the appropriate bucket. The dequeue + * operation finds the highest-priority non-empty bucket by searching a bit + * vector represented as a single 8-byte word, which is very fast with + * compiler and CPU support. + **/ +struct priorityTable { + /** The maximum priority of entries that may be stored in this table */ + unsigned int maxPriority; + /** A bit vector flagging all buckets that are currently non-empty */ + uint64_t searchVector; + /** The array of all buckets, indexed by priority */ + Bucket buckets[]; +}; + +/** + * Convert a queue head to to the bucket that contains it. + * + * @param head The bucket queue ring head pointer to convert + * + * @return the enclosing bucket + **/ +static inline Bucket *asBucket(RingNode *head) +{ + STATIC_ASSERT(offsetof(Bucket, queue) == 0); + return (Bucket *) head; +} + +/**********************************************************************/ +int makePriorityTable(unsigned int maxPriority, PriorityTable **tablePtr) +{ + if (maxPriority > MAX_PRIORITY) { + return UDS_INVALID_ARGUMENT; + } + + PriorityTable *table; + int result = ALLOCATE_EXTENDED(PriorityTable, maxPriority + 1, Bucket, + __func__, &table); + if (result != VDO_SUCCESS) { + return result; + } + + for (unsigned int priority = 0; priority <= maxPriority; priority++) { + Bucket *bucket = &table->buckets[priority]; + bucket->priority = priority; + initializeRing(&bucket->queue); + } + + table->maxPriority = maxPriority; + table->searchVector = 0; + + *tablePtr = table; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freePriorityTable(PriorityTable **tablePtr) +{ + PriorityTable *table = *tablePtr; + if (table == NULL) { + return; + } + + // Unlink the buckets from any entries still in the table so the entries + // won't be left with dangling pointers to freed memory. + resetPriorityTable(table); + + FREE(table); + *tablePtr = NULL; +} + +/**********************************************************************/ +void resetPriorityTable(PriorityTable *table) +{ + table->searchVector = 0; + for (unsigned int priority = 0; priority <= table->maxPriority; priority++) { + unspliceRingNode(&table->buckets[priority].queue); + } +} + +/**********************************************************************/ +void priorityTableEnqueue(PriorityTable *table, + unsigned int priority, + RingNode *entry) +{ + ASSERT_LOG_ONLY((priority <= table->maxPriority), + "entry priority must be valid for the table"); + + // Append the entry to the queue in the specified bucket. + pushRingNode(&table->buckets[priority].queue, entry); + + // Flag the bucket in the search vector since it must be non-empty. 
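+  // (For example, an entry queued at priority 3 sets bit 3 of the vector.)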
+ table->searchVector |= (1ULL << priority); +} + +/**********************************************************************/ +static inline void markBucketEmpty(PriorityTable *table, Bucket *bucket) +{ + table->searchVector &= ~(1ULL << bucket->priority); +} + +/**********************************************************************/ +RingNode *priorityTableDequeue(PriorityTable *table) +{ + // Find the highest priority non-empty bucket by finding the highest-order + // non-zero bit in the search vector. + int topPriority = logBaseTwo(table->searchVector); + + if (topPriority < 0) { + // All buckets are empty. + return NULL; + } + + // Dequeue the first entry in the bucket. + Bucket *bucket = &table->buckets[topPriority]; + RingNode *entry = unspliceRingNode(bucket->queue.next); + + // Clear the bit in the search vector if the bucket has been emptied. + if (isRingEmpty(&bucket->queue)) { + markBucketEmpty(table, bucket); + } + + return entry; +} + +/**********************************************************************/ +void priorityTableRemove(PriorityTable *table, RingNode *entry) +{ + // We can't guard against calls where the entry is on a ring for a different + // table, but it's easy to deal with an entry not in any table or ring. + if (isRingEmpty(entry)) { + return; + } + + // Remove the entry from the bucket ring, remembering a pointer to another + // entry in the ring. + RingNode *nextNode = entry->next; + unspliceRingNode(entry); + + // If the rest of the ring is now empty, the next node must be the ring head + // in the bucket and we can use it to update the search vector. + if (isRingEmpty(nextNode)) { + markBucketEmpty(table, asBucket(nextNode)); + } +} + +/**********************************************************************/ +bool isPriorityTableEmpty(PriorityTable *table) +{ + return (table->searchVector == 0); +} diff --git a/vdo/base/priorityTable.h b/vdo/base/priorityTable.h new file mode 100644 index 0000000..d48a570 --- /dev/null +++ b/vdo/base/priorityTable.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/priorityTable.h#2 $ + */ + +#ifndef PRIORITY_TABLE_H +#define PRIORITY_TABLE_H + +#include "ringNode.h" + +/** + * A PriorityTable is a simple implementation of a priority queue for entries + * with priorities that are small non-negative integer values. It implements + * the obvious priority queue operations of enqueuing an entry and dequeuing + * an entry with the maximum priority. It also supports removing an arbitrary + * entry. The priority of an entry already in the table can be changed by + * removing it and re-enqueuing it with a different priority. All operations + * have O(1) complexity. 
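+ *
+ * A minimal usage sketch (illustrative only; the Item type and example() are
+ * hypothetical client code), relying on the embedded RingNode link described
+ * below:
+ *
+ *   typedef struct {
+ *     RingNode node;       // link used while the item is queued in a table
+ *     int      payload;
+ *   } Item;
+ *
+ *   static void example(void)
+ *   {
+ *     PriorityTable *table;
+ *     if (makePriorityTable(7, &table) != VDO_SUCCESS) {
+ *       return;
+ *     }
+ *     Item item = { .payload = 1 };
+ *     initializeRing(&item.node);
+ *     priorityTableEnqueue(table, 5, &item.node);
+ *     RingNode *dequeued = priorityTableDequeue(table);
+ *     ASSERT_LOG_ONLY((dequeued == &item.node), "dequeued the only entry");
+ *     freePriorityTable(&table);
+ *   }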
+ * + * The links for the table entries must be embedded in the entries themselves. + * RingNode is used to link entries in the table and no wrapper type is + * declared, so an existing RingNode link in an object can also be used to + * queue it in a PriorityTable, assuming the field is not used for anything + * else while so queued. + * + * The table is implemented as an array of queues (circular lists) indexed by + * priority, along with a hint for which queues are non-empty. Steven Skiena + * calls a very similar structure a "bounded height priority queue", but given + * the resemblance to a hash table, "priority table" seems both shorter and + * more apt, if somewhat novel. + **/ + +typedef struct priorityTable PriorityTable; + +/** + * Allocate and initialize a new PriorityTable. + * + * @param [in] maxPriority The maximum priority value for table entries + * @param [out] tablePtr A pointer to hold the new table + * + * @return VDO_SUCCESS or an error code + **/ +int makePriorityTable(unsigned int maxPriority, PriorityTable **tablePtr) + __attribute__((warn_unused_result)); + +/** + * Free a PriorityTable and null out the reference to it. NOTE: The table does + * not own the entries stored in it and they are not freed by this call. + * + * @param [in,out] tablePtr The reference to the table to free + **/ +void freePriorityTable(PriorityTable **tablePtr); + +/** + * Add a new entry to the priority table, appending it to the queue for + * entries with the specified priority. + * + * @param table The table in which to store the entry + * @param priority The priority of the entry + * @param entry The RingNode embedded in the entry to store in the table + * (the caller must have initialized it) + **/ +void priorityTableEnqueue(PriorityTable *table, + unsigned int priority, + RingNode *entry); + +/** + * Reset a priority table, leaving it in the same empty state as when newly + * constructed. NOTE: The table does not own the entries stored in it and they + * are not freed (or even unlinked from each other) by this call. + * + * @param table The table to reset + **/ +void resetPriorityTable(PriorityTable *table); + +/** + * Find the highest-priority entry in the table, remove it from the table, and + * return it. If there are multiple entries with the same priority, the one + * that has been in the table with that priority the longest will be returned. + * + * @param table The priority table from which to remove an entry + * + * @return the dequeued entry, or NULL if the table is currently empty + **/ +RingNode *priorityTableDequeue(PriorityTable *table) + __attribute__((warn_unused_result)); + +/** + * Remove a specified entry from its priority table. + * + * @param table The table from which to remove the entry + * @param entry The entry to remove from the table + **/ +void priorityTableRemove(PriorityTable *table, RingNode *entry); + +/** + * Return whether the priority table is empty. + * + * @param table The table to check + * + * @return true if the table is empty + **/ +bool isPriorityTableEmpty(PriorityTable *table) + __attribute__((warn_unused_result)); + +#endif /* PRIORITY_TABLE_H */ diff --git a/vdo/base/readOnlyNotifier.c b/vdo/base/readOnlyNotifier.c new file mode 100644 index 0000000..ba837ac --- /dev/null +++ b/vdo/base/readOnlyNotifier.c @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/readOnlyNotifier.c#5 $ + */ + +#include "readOnlyNotifier.h" + +#include "atomic.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "completion.h" +#include "physicalLayer.h" +#include "threadConfig.h" + +/** + * A ReadOnlyNotifier has a single completion which is used to perform + * read-only notifications, however, enterReadOnlyMode() may be called from any + * base thread. A pair of atomic fields are used to control the read-only mode + * entry process. The first field holds the read-only error. The second is the + * state field, which may hold any of the four special values enumerated here. + * + * When enterReadOnlyMode() is called from some base thread, a compare-and-swap + * is done on the readOnlyError, setting it to the supplied error if the value + * was VDO_SUCCESS. If this fails, some other thread has already intiated + * read-only entry or scheduled a pending entry, so the call exits. Otherwise, + * a compare-and-swap is done on the state, setting it to NOTIFYING if the + * value was MAY_NOTIFY. If this succeeds, the caller initiates the + * notification. If this failed due to notifications being disallowed, the + * notifier will be in the MAY_NOT_NOTIFY state but readOnlyError will not be + * VDO_SUCCESS. This configuration will indicate to allowReadOnlyModeEntry() + * that there is a pending notification to perform. + **/ +enum { + /** Notifications are allowed but not in progress */ + MAY_NOTIFY = 0, + /** A notification is in progress */ + NOTIFYING, + /** Notifications are not allowed */ + MAY_NOT_NOTIFY, + /** A notification has completed */ + NOTIFIED, +}; + +/** + * An object to be notified when the VDO enters read-only mode + **/ +typedef struct readOnlyListener ReadOnlyListener; + +struct readOnlyListener { + /** The listener */ + void *listener; + /** The method to call to notifiy the listener */ + ReadOnlyNotification *notify; + /** A pointer to the next listener */ + ReadOnlyListener *next; +}; + +/** + * Data associated with each base code thread. + **/ +typedef struct threadData { + /** + * Each thread maintains its own notion of whether the VDO is read-only so + * that the read-only state can be checked from any base thread without + * worrying about synchronization or thread safety. This does mean that + * knowledge of the VDO going read-only does not occur simultaneously across + * the VDO's threads, but that does not seem to cause any problems. + */ + bool isReadOnly; + /** + * A list of objects waiting to be notified on this thread that the VDO has + * entered read-only mode. 
+ **/ + ReadOnlyListener *listeners; +} ThreadData; + +struct readOnlyNotifier { + /** The completion for entering read-only mode */ + VDOCompletion completion; + /** A completion waiting for notifications to be drained or enabled */ + VDOCompletion *waiter; + /** The code of the error which put the VDO into read-only mode */ + Atomic32 readOnlyError; + /** The current state of the notifier (values described above) */ + Atomic32 state; + /** The thread config of the VDO */ + const ThreadConfig *threadConfig; + /** The array of per-thread data */ + ThreadData threadData[]; +}; + +/** + * Convert a generic VDOCompletion to a ReadOnlyNotifier. + * + * @param completion The completion to convert + * + * @return The completion as a ReadOnlyNotifier + **/ +static inline ReadOnlyNotifier *asNotifier(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(ReadOnlyNotifier, completion) == 0); + assertCompletionType(completion->type, READ_ONLY_MODE_COMPLETION); + return (ReadOnlyNotifier *) completion; +} + +/**********************************************************************/ +int makeReadOnlyNotifier(bool isReadOnly, + const ThreadConfig *threadConfig, + PhysicalLayer *layer, + ReadOnlyNotifier **notifierPtr) +{ + ReadOnlyNotifier *notifier; + int result = ALLOCATE_EXTENDED(ReadOnlyNotifier, + threadConfig->baseThreadCount, ThreadData, + __func__, ¬ifier); + if (result != VDO_SUCCESS) { + return result; + } + + notifier->threadConfig = threadConfig; + if (isReadOnly) { + atomicStore32(¬ifier->readOnlyError, (uint32_t) VDO_READ_ONLY); + atomicStore32(¬ifier->state, NOTIFIED); + } else { + atomicStore32(¬ifier->state, MAY_NOTIFY); + } + result = initializeEnqueueableCompletion(¬ifier->completion, + READ_ONLY_MODE_COMPLETION, layer); + if (result != VDO_SUCCESS) { + freeReadOnlyNotifier(¬ifier); + return result; + } + + for (ThreadCount id = 0; id < threadConfig->baseThreadCount; id++) { + notifier->threadData[id].isReadOnly = isReadOnly; + } + + *notifierPtr = notifier; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeReadOnlyNotifier(ReadOnlyNotifier **notifierPtr) +{ + ReadOnlyNotifier *notifier = *notifierPtr; + if (notifier == NULL) { + return; + } + + for (ThreadCount id = 0; id < notifier->threadConfig->baseThreadCount; + id++) { + ThreadData *threadData = ¬ifier->threadData[id]; + ReadOnlyListener *listener = threadData->listeners; + while (listener != NULL) { + ReadOnlyListener *toFree = listener; + listener = listener->next; + FREE(toFree); + } + } + + destroyEnqueueable(¬ifier->completion); + FREE(notifier); + *notifierPtr = NULL; +} + +/** + * Check that a function was called on the admin thread. 
+ * + * @param notifier The notifier + * @param caller The name of the function (for logging) + **/ +static void assertOnAdminThread(ReadOnlyNotifier *notifier, const char *caller) +{ + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((getAdminThread(notifier->threadConfig) == threadID), + "%s called on admin thread", caller); +} + + +/**********************************************************************/ +void waitUntilNotEnteringReadOnlyMode(ReadOnlyNotifier *notifier, + VDOCompletion *parent) +{ + if (notifier == NULL) { + finishCompletion(parent, VDO_SUCCESS); + return; + } + + assertOnAdminThread(notifier, __func__); + if (notifier->waiter != NULL) { + finishCompletion(parent, VDO_COMPONENT_BUSY); + return; + } + + uint32_t state = atomicLoad32(¬ifier->state); + if ((state == MAY_NOT_NOTIFY) || (state == NOTIFIED)) { + // Notifications are already done or disallowed. + completeCompletion(parent); + return; + } + + if (compareAndSwap32(¬ifier->state, MAY_NOTIFY, MAY_NOT_NOTIFY)) { + // A notification was not in progress, and now they are disallowed. + completeCompletion(parent); + return; + } + + /* + * A notification is in progress, so wait for it to finish. There is no race + * here since the notification can't finish while the admin thread is in this + * method. + */ + notifier->waiter = parent; +} + +/** + * Complete the process of entering read only mode. + * + * @param completion The read-only mode completion + **/ +static void finishEnteringReadOnlyMode(VDOCompletion *completion) +{ + ReadOnlyNotifier *notifier = asNotifier(completion); + assertOnAdminThread(notifier, __func__); + atomicStore32(¬ifier->state, NOTIFIED); + + VDOCompletion *waiter = notifier->waiter; + if (waiter != NULL) { + notifier->waiter = NULL; + finishCompletion(waiter, completion->result); + } +} + +/** + * Inform each thread that the VDO is in read-only mode. + * + * @param completion The read-only mode completion + **/ +static void makeThreadReadOnly(VDOCompletion *completion) +{ + ThreadID threadID = completion->callbackThreadID; + ReadOnlyNotifier *notifier = asNotifier(completion); + ReadOnlyListener *listener = completion->parent; + if (listener == NULL) { + // This is the first call on this thread + ThreadData *threadData = ¬ifier->threadData[threadID]; + threadData->isReadOnly = true; + listener = threadData->listeners; + if (threadID == 0) { + // Note: This message must be recognizable by Permabit::UserMachine. 
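+      // Only thread 0, the first thread in the notification chain, logs the
+      // error that sent the VDO into read-only mode.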
+ logErrorWithStringError((int) atomicLoad32(¬ifier->readOnlyError), + "Unrecoverable error, entering read-only mode"); + } + } else { + // We've just finished notifying a listener + listener = listener->next; + } + + if (listener != NULL) { + // We have a listener to notify + prepareCompletion(completion, makeThreadReadOnly, makeThreadReadOnly, + threadID, listener); + listener->notify(listener->listener, completion); + return; + } + + // We're done with this thread + if (++threadID >= notifier->threadConfig->baseThreadCount) { + // There are no more threads + prepareCompletion(completion, finishEnteringReadOnlyMode, + finishEnteringReadOnlyMode, + getAdminThread(notifier->threadConfig), NULL); + } else { + prepareCompletion(completion, makeThreadReadOnly, makeThreadReadOnly, + threadID, NULL); + } + + invokeCallback(completion); +} + +/**********************************************************************/ +void allowReadOnlyModeEntry(ReadOnlyNotifier *notifier, VDOCompletion *parent) +{ + assertOnAdminThread(notifier, __func__); + if (notifier->waiter != NULL) { + finishCompletion(parent, VDO_COMPONENT_BUSY); + return; + } + + if (!compareAndSwap32(¬ifier->state, MAY_NOT_NOTIFY, MAY_NOTIFY)) { + // Notifications were already allowed or complete + completeCompletion(parent); + return; + } + + if ((int) atomicLoad32(¬ifier->readOnlyError) == VDO_SUCCESS) { + // We're done + completeCompletion(parent); + return; + } + + // There may have been a pending notification + if (!compareAndSwap32(¬ifier->state, MAY_NOTIFY, NOTIFYING)) { + /* + * There wasn't, the error check raced with a thread calling + * enterReadOnlyMode() after we set the state to MAY_NOTIFY. It has already + * started the notification. + */ + completeCompletion(parent); + return; + } + + // Do the pending notification. + notifier->waiter = parent; + makeThreadReadOnly(¬ifier->completion); +} + +/**********************************************************************/ +void enterReadOnlyMode(ReadOnlyNotifier *notifier, int errorCode) +{ + ThreadData *threadData = ¬ifier->threadData[getCallbackThreadID()]; + if (threadData->isReadOnly) { + // This thread has already gone read-only. + return; + } + + // Record for this thread that the VDO is read-only. + threadData->isReadOnly = true; + + if (!compareAndSwap32(¬ifier->readOnlyError, (uint32_t) VDO_SUCCESS, + (uint32_t) errorCode)) { + // The notifier is already aware of a read-only error + return; + } + + if (compareAndSwap32(¬ifier->state, MAY_NOTIFY, NOTIFYING)) { + // Initiate a notification starting on the lowest numbered thread. 
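+    // makeThreadReadOnly() re-launches itself on each successive base thread
+    // and then finishes on the admin thread via finishEnteringReadOnlyMode().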
+ launchCallback(¬ifier->completion, makeThreadReadOnly, 0); + } +} + +/**********************************************************************/ +bool isReadOnly(ReadOnlyNotifier *notifier) +{ + return notifier->threadData[getCallbackThreadID()].isReadOnly; +} + +/**********************************************************************/ +bool isOrWillBeReadOnly(ReadOnlyNotifier *notifier) +{ + return (((int) relaxedLoad32(¬ifier->readOnlyError)) != VDO_SUCCESS); +} + +/**********************************************************************/ +int registerReadOnlyListener(ReadOnlyNotifier *notifier, + void *listener, + ReadOnlyNotification *notification, + ThreadID threadID) +{ + ReadOnlyListener *readOnlyListener; + int result = ALLOCATE(1, ReadOnlyListener, __func__, &readOnlyListener); + if (result != VDO_SUCCESS) { + return result; + } + + ThreadData *threadData = ¬ifier->threadData[threadID]; + *readOnlyListener = (ReadOnlyListener) { + .listener = listener, + .notify = notification, + .next = threadData->listeners, + }; + + threadData->listeners = readOnlyListener; + return VDO_SUCCESS; +} diff --git a/vdo/base/readOnlyNotifier.h b/vdo/base/readOnlyNotifier.h new file mode 100644 index 0000000..b5eb322 --- /dev/null +++ b/vdo/base/readOnlyNotifier.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/readOnlyNotifier.h#3 $ + */ + +/* + * A ReadOnlyNotifier is responsible for propogating the fact that the VDO + * has encountered an unrecoverable error to all base threads. It also persists + * the read-only state to the super block. + * + * The notifier also provides the ability to wait for any notifications to be + * complete in order to not cause super block write races when shutting down + * the VDO. + */ + +#ifndef READ_ONLY_NOTIFIER_H +#define READ_ONLY_NOTIFIER_H + +#include "completion.h" + +/** + * A function to notify a listener that the VDO has gone read-only. + * + * @param listener The object to notify + * @param parent The completion to notify in order to acknowledge the + * notification + **/ +typedef void ReadOnlyNotification(void *listener, VDOCompletion *parent); + +/** + * Create a read-only notifer. + * + * @param [in] isReadOnly Whether the VDO is already read-only + * @param [in] threadConfig The thread configuration of the VDO + * @param [in] layer The physical layer of the VDO + * @param [out] notifierPtr A pointer to receive the new notifier + * + * @return VDO_SUCCESS or an error + **/ +int makeReadOnlyNotifier(bool isReadOnly, + const ThreadConfig *threadConfig, + PhysicalLayer *layer, + ReadOnlyNotifier **notifierPtr) + __attribute__((warn_unused_result)); + +/** + * Free a ReadOnlyNotifier and null out the reference to it. 
+ * + * @param notifierPtr The reference to the notifier to free + **/ +void freeReadOnlyNotifier(ReadOnlyNotifier **notifierPtr); + +/** + * Wait until no read-only notifications are in progress and prevent any + * subsequent notifications. Notifications may be re-enabled by calling + * allowReadOnlyModeEntry(). + * + * @param notifier The read-only notifier on which to wait + * @param parent The completion to notify when no threads are entering + * read-only mode + **/ +void waitUntilNotEnteringReadOnlyMode(ReadOnlyNotifier *notifier, + VDOCompletion *parent); + +/** + * Allow the notifier to put the VDO into read-only mode, reversing the effects + * of waitUntilNotEnteringReadOnlyMode(). If some thread tried to put the VDO + * into read-only mode while notifications were disallowed, it will be done + * when this method is called. If that happens, the parent will not be notified + * until the VDO has actually entered read-only mode and attempted to save the + * super block. + * + *

This method may only be called from the admin thread. + * + * @param notifier The notifier + * @param parent The object to notify once the operation is complete + **/ +void allowReadOnlyModeEntry(ReadOnlyNotifier *notifier, + VDOCompletion *parent); + +/** + * Put a VDO into read-only mode and save the read-only state in the super + * block. This method is a no-op if the VDO is already read-only. + * + * @param notifier The read-only notifier of the VDO + * @param errorCode The error which caused the VDO to enter read-only + * mode + **/ +void enterReadOnlyMode(ReadOnlyNotifier *notifier, int errorCode); + +/** + * Check whether the VDO is read-only. This method may be called from any + * thread, as opposed to examining the VDO's state field which is only safe + * to check from the admin thread. + * + * @param notifier The read-only notifier of the VDO + * + * @return true if the VDO is read-only + **/ +bool isReadOnly(ReadOnlyNotifier *notifier) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO is or will be read-only (i.e. some thread has started + * the process of entering read-only mode, but not all threads have been + * notified yet). This method should only be called in cases where the expense + * of reading atomic state is not a problem. It was introduced in order to allow + * suppresion of spurious error messages resulting from VIO cleanup racing with + * read-only notification. + * + * @param notifier The read-only notifier of the VDO + * + * @return true if the VDO has started (and possibly finished) + * the process of entering read-only mode + **/ +bool isOrWillBeReadOnly(ReadOnlyNotifier *notifier) + __attribute__((warn_unused_result)); + +/** + * Register a listener to be notified when the VDO goes read-only. + * + * @param notifier The notifier to register with + * @param listener The object to notify + * @param notification The function to call to send the notification + * @param threadID The id of the thread on which to send the notification + * + * @return VDO_SUCCESS or an error + **/ +int registerReadOnlyListener(ReadOnlyNotifier *notifier, + void *listener, + ReadOnlyNotification *notification, + ThreadID threadID); + +#endif /* READ_ONLY_NOTIFIER_H */ diff --git a/vdo/base/readOnlyRebuild.c b/vdo/base/readOnlyRebuild.c new file mode 100644 index 0000000..7e9df0c --- /dev/null +++ b/vdo/base/readOnlyRebuild.c @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/readOnlyRebuild.c#9 $ + */ + +#include "readOnlyRebuild.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMapInternals.h" +#include "blockMapRecovery.h" +#include "completion.h" +#include "numUtils.h" +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournalInternals.h" +#include "recoveryUtils.h" +#include "referenceCountRebuild.h" +#include "slabDepot.h" +#include "vdoInternal.h" +#include "vdoPageCache.h" + +typedef struct { + /** The completion header */ + VDOCompletion completion; + /** A sub task completion */ + VDOCompletion subTaskCompletion; + /** The VDO in question */ + VDO *vdo; + /** A buffer to hold the data read off disk */ + char *journalData; + /** The entry data for the block map rebuild */ + NumberedBlockMapping *entries; + /** The number of entries in the entry array */ + size_t entryCount; + /** The sequence number of the first valid block of the journal (if known) */ + SequenceNumber head; + /** The sequence number of the last valid block of the journal (if known) */ + SequenceNumber tail; + /** The number of logical blocks in use */ + BlockCount logicalBlocksUsed; + /** The number of allocated block map pages */ + BlockCount blockMapDataBlocks; +} ReadOnlyRebuildCompletion; + +/** + * Convert a generic completion to a ReadOnlyRebuildCompletion. + * + * @param completion The completion to convert + * + * @return the journal rebuild completion + **/ +__attribute__((warn_unused_result)) +static inline ReadOnlyRebuildCompletion * +asReadOnlyRebuildCompletion(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(ReadOnlyRebuildCompletion, completion) == 0); + assertCompletionType(completion->type, READ_ONLY_REBUILD_COMPLETION); + return (ReadOnlyRebuildCompletion *) completion; +} + +/** + * Free a rebuild completion and all underlying structures. + * + * @param rebuildPtr A pointer to the rebuild completion to free + */ +static void freeRebuildCompletion(ReadOnlyRebuildCompletion **rebuildPtr) +{ + ReadOnlyRebuildCompletion *rebuild = *rebuildPtr; + if (rebuild == NULL) { + return; + } + + destroyEnqueueable(&rebuild->subTaskCompletion); + FREE(rebuild->journalData); + FREE(rebuild->entries); + FREE(rebuild); + *rebuildPtr = NULL; +} + +/** + * Allocate and initialize a read only rebuild completion. + * + * @param [in] vdo The VDO in question + * @param [out] rebuildPtr A pointer to return the created rebuild completion + * + * @return VDO_SUCCESS or an error code + **/ +static int makeRebuildCompletion(VDO *vdo, + ReadOnlyRebuildCompletion **rebuildPtr) +{ + ReadOnlyRebuildCompletion *rebuild; + int result = ALLOCATE(1, ReadOnlyRebuildCompletion, __func__, &rebuild); + if (result != VDO_SUCCESS) { + return result; + } + + initializeCompletion(&rebuild->completion, READ_ONLY_REBUILD_COMPLETION, + vdo->layer); + + result = initializeEnqueueableCompletion(&rebuild->subTaskCompletion, + SUB_TASK_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + freeRebuildCompletion(&rebuild); + return result; + } + + rebuild->vdo = vdo; + *rebuildPtr = rebuild; + return VDO_SUCCESS; +} + +/** + * Clean up the rebuild process, whether or not it succeeded, by freeing the + * rebuild completion and notifying the parent of the outcome. 
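+ * (This also turns off rebuild mode in the block map's page cache before
+ * notifying the parent.)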
+ * + * @param completion The rebuild completion + **/ +static void completeRebuild(VDOCompletion *completion) +{ + VDOCompletion *parent = completion->parent; + int result = completion->result; + ReadOnlyRebuildCompletion *rebuild = asReadOnlyRebuildCompletion(completion); + VDO *vdo = rebuild->vdo; + setVDOPageCacheRebuildMode(getBlockMap(vdo)->zones[0].pageCache, false); + freeRebuildCompletion(&rebuild); + finishCompletion(parent, result); +} + +/** + * Finish rebuilding, free the rebuild completion and notify the parent. + * + * @param completion The rebuild completion + **/ +static void finishRebuild(VDOCompletion *completion) +{ + ReadOnlyRebuildCompletion *rebuild = asReadOnlyRebuildCompletion(completion); + initializeRecoveryJournalPostRebuild(rebuild->vdo->recoveryJournal, + rebuild->vdo->completeRecoveries, + rebuild->tail, + rebuild->logicalBlocksUsed, + rebuild->blockMapDataBlocks); + logInfo("Read-only rebuild complete"); + completeRebuild(completion); +} + +/** + * Handle a rebuild error. + * + * @param completion The rebuild completion + **/ +static void abortRebuild(VDOCompletion *completion) +{ + logInfo("Read-only rebuild aborted"); + completeRebuild(completion); +} + +/** + * Abort a rebuild if there is an error. + * + * @param result The result to check + * @param rebuild The journal rebuild completion + * + * @return true if the result was an error + **/ +__attribute__((warn_unused_result)) +static bool abortRebuildOnError(int result, + ReadOnlyRebuildCompletion *rebuild) +{ + if (result == VDO_SUCCESS) { + return false; + } + + finishCompletion(&rebuild->completion, result); + return true; +} + +/** + * Clean up after finishing the reference count rebuild. This callback is + * registered in launchReferenceCountRebuild(). + * + * @param completion The sub-task completion + **/ +static void finishReferenceCountRebuild(VDOCompletion *completion) +{ + ReadOnlyRebuildCompletion *rebuild = completion->parent; + VDO *vdo = rebuild->vdo; + assertOnAdminThread(vdo, __func__); + if (vdo->loadState != VDO_REBUILD_FOR_UPGRADE) { + // A "rebuild" for upgrade should not increment this count. + vdo->completeRecoveries++; + } + + logInfo("Saving rebuilt state"); + prepareToFinishParent(completion, &rebuild->completion); + drainSlabDepot(vdo->depot, ADMIN_STATE_REBUILDING, completion); +} + +/** + * Rebuild the reference counts from the block map now that all journal entries + * have been applied to the block map. This callback is registered in + * applyJournalEntries(). + * + * @param completion The sub-task completion + **/ +static void launchReferenceCountRebuild(VDOCompletion *completion) +{ + ReadOnlyRebuildCompletion *rebuild = completion->parent; + VDO *vdo = rebuild->vdo; + + // We must allocate RefCounts before we can rebuild them. + int result = allocateSlabRefCounts(vdo->depot); + if (abortRebuildOnError(result, rebuild)) { + return; + } + + prepareCompletion(completion, finishReferenceCountRebuild, + finishParentCallback, getAdminThread(getThreadConfig(vdo)), + completion->parent); + rebuildReferenceCounts(vdo, completion, &rebuild->logicalBlocksUsed, + &rebuild->blockMapDataBlocks); +} + +/** + * Append an array of recovery journal entries from a journal block sector to + * the array of numbered mappings in the rebuild completion, numbering each + * entry in the order they are appended. 
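+ *
+ * Only increment entries are transcribed; decrement entries need not be
+ * replayed here, since the reference counts are later rebuilt directly from
+ * the block map (see launchReferenceCountRebuild()), so only the final
+ * mapping for each slot matters.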
+ *
+ * @param rebuild     The journal rebuild completion
+ * @param sector      The recovery journal sector with entries
+ * @param entryCount  The number of entries to append
+ **/
+static void appendSectorEntries(ReadOnlyRebuildCompletion *rebuild,
+                                PackedJournalSector       *sector,
+                                JournalEntryCount          entryCount)
+{
+  for (JournalEntryCount i = 0; i < entryCount; i++) {
+    RecoveryJournalEntry entry
+      = unpackRecoveryJournalEntry(&sector->entries[i]);
+    int result = validateRecoveryJournalEntry(rebuild->vdo, &entry);
+    if (result != VDO_SUCCESS) {
+      // When recovering from read-only mode, ignore damaged entries.
+      continue;
+    }
+
+    if (isIncrementOperation(entry.operation)) {
+      rebuild->entries[rebuild->entryCount] = (NumberedBlockMapping) {
+        .blockMapSlot  = entry.slot,
+        .blockMapEntry = packPBN(entry.mapping.pbn, entry.mapping.state),
+        .number        = rebuild->entryCount,
+      };
+      rebuild->entryCount++;
+    }
+  }
+}
+
+/**
+ * Create an array of all valid journal entries, in order, and store
+ * it in the rebuild completion.
+ *
+ * @param rebuild  The journal rebuild completion
+ *
+ * @return VDO_SUCCESS or an error code
+ **/
+static int extractJournalEntries(ReadOnlyRebuildCompletion *rebuild)
+{
+  VDO             *vdo      = rebuild->vdo;
+  RecoveryJournal *journal  = vdo->recoveryJournal;
+  SequenceNumber   first    = rebuild->head;
+  SequenceNumber   last     = rebuild->tail;
+  BlockCount       maxCount = ((last - first + 1) * journal->entriesPerBlock);
+
+  // Allocate a NumberedBlockMapping array large enough to transcribe every
+  // PackedRecoveryJournalEntry from every valid journal block.
+  int result = ALLOCATE(maxCount, NumberedBlockMapping, __func__,
+                        &rebuild->entries);
+  if (result != VDO_SUCCESS) {
+    return result;
+  }
+
+  for (SequenceNumber i = first; i <= last; i++) {
+    PackedJournalHeader *packedHeader
+      = getJournalBlockHeader(journal, rebuild->journalData, i);
+    RecoveryBlockHeader header;
+    unpackRecoveryBlockHeader(packedHeader, &header);
+
+    if (!isExactRecoveryJournalBlock(journal, &header, i)) {
+      // This block is invalid, so skip it.
+      continue;
+    }
+
+    // Don't extract more than the expected maximum entries per block.
+    JournalEntryCount blockEntries = minBlock(journal->entriesPerBlock,
+                                              header.entryCount);
+    for (uint8_t j = 1; j < SECTORS_PER_BLOCK; j++) {
+      // Stop when all entries counted in the header are applied or skipped.
+      if (blockEntries == 0) {
+        break;
+      }
+
+      PackedJournalSector *sector = getJournalBlockSector(packedHeader, j);
+      if (!isValidRecoveryJournalSector(&header, sector)) {
+        blockEntries -= minBlock(blockEntries,
+                                 RECOVERY_JOURNAL_ENTRIES_PER_SECTOR);
+        continue;
+      }
+
+      // Don't extract more than the expected maximum entries per sector.
+      JournalEntryCount sectorEntries
+        = minBlock(sector->entryCount, RECOVERY_JOURNAL_ENTRIES_PER_SECTOR);
+      // Only extract as many as the block header calls for.
+      sectorEntries = minBlock(sectorEntries, blockEntries);
+      appendSectorEntries(rebuild, sector, sectorEntries);
+      // Even if the sector wasn't full, count it as full when counting up
+      // to the entry count the block header claims.
+      blockEntries -= minBlock(blockEntries,
+                               RECOVERY_JOURNAL_ENTRIES_PER_SECTOR);
+    }
+  }
+
+  return VDO_SUCCESS;
+}
+
+/**
+ * Determine the limits of the valid recovery journal and apply all
+ * valid entries to the block map. This callback is registered in
+ * loadJournal().
+ * + * @param completion The sub-task completion + **/ +static void applyJournalEntries(VDOCompletion *completion) +{ + ReadOnlyRebuildCompletion *rebuild + = asReadOnlyRebuildCompletion(completion->parent); + VDO *vdo = rebuild->vdo; + + logInfo("Finished reading recovery journal"); + assertOnLogicalZoneThread(vdo, 0, __func__); + + bool foundEntries = findHeadAndTail(vdo->recoveryJournal, + rebuild->journalData, &rebuild->tail, + &rebuild->head, NULL); + if (foundEntries) { + int result = extractJournalEntries(rebuild); + if (abortRebuildOnError(result, rebuild)) { + return; + } + } + + // Suppress block map errors. + setVDOPageCacheRebuildMode(getBlockMap(vdo)->zones[0].pageCache, true); + + // Play the recovery journal into the block map. + prepareCompletion(completion, launchReferenceCountRebuild, + finishParentCallback, completion->callbackThreadID, + completion->parent); + recoverBlockMap(vdo, rebuild->entryCount, rebuild->entries, completion); +} + +/** + * Begin loading the journal. + * + * @param completion The sub task completion + **/ +static void loadJournal(VDOCompletion *completion) +{ + ReadOnlyRebuildCompletion *rebuild + = asReadOnlyRebuildCompletion(completion->parent); + VDO *vdo = rebuild->vdo; + assertOnLogicalZoneThread(vdo, 0, __func__); + + prepareCompletion(completion, applyJournalEntries, finishParentCallback, + completion->callbackThreadID, completion->parent); + loadJournalAsync(vdo->recoveryJournal, completion, &rebuild->journalData); +} + +/**********************************************************************/ +void launchRebuild(VDO *vdo, VDOCompletion *parent) +{ + // Note: These messages must be recognizable by Permabit::VDODeviceBase. + if (vdo->loadState == VDO_REBUILD_FOR_UPGRADE) { + logWarning("Rebuilding reference counts for upgrade"); + } else { + logWarning("Rebuilding reference counts to clear read-only mode"); + vdo->readOnlyRecoveries++; + } + + ReadOnlyRebuildCompletion *rebuild; + int result = makeRebuildCompletion(vdo, &rebuild); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + VDOCompletion *completion = &rebuild->completion; + prepareCompletion(completion, finishRebuild, abortRebuild, + parent->callbackThreadID, parent); + + VDOCompletion *subTaskCompletion = &rebuild->subTaskCompletion; + prepareCompletion(subTaskCompletion, loadJournal, finishParentCallback, + getLogicalZoneThread(getThreadConfig(vdo), 0), + completion); + loadSlabDepot(vdo->depot, ADMIN_STATE_LOADING_FOR_REBUILD, + subTaskCompletion, NULL); +} diff --git a/vdo/base/readOnlyRebuild.h b/vdo/base/readOnlyRebuild.h new file mode 100644 index 0000000..9f40ce6 --- /dev/null +++ b/vdo/base/readOnlyRebuild.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/readOnlyRebuild.h#1 $ + */ + +#ifndef READ_ONLY_REBUILD_H +#define READ_ONLY_REBUILD_H + +#include "completion.h" +#include "vdo.h" + +/** + * Construct a ReadOnlyRebuildCompletion and launch it. Apply all valid journal + * block entries to all VDO structures. Must be launched from logical zone 0. + * + * @param vdo The VDO to rebuild + * @param parent The completion to notify when the rebuild is complete + **/ +void launchRebuild(VDO *vdo, VDOCompletion *parent); + +#endif // READ_ONLY_REBUILD_H diff --git a/vdo/base/recoveryJournal.c b/vdo/base/recoveryJournal.c new file mode 100644 index 0000000..c44053c --- /dev/null +++ b/vdo/base/recoveryJournal.c @@ -0,0 +1,1403 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournal.c#30 $ + */ + +#include "recoveryJournal.h" +#include "recoveryJournalInternals.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMap.h" +#include "constants.h" +#include "dataVIO.h" +#include "extent.h" +#include "header.h" +#include "numUtils.h" +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournalBlock.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "waitQueue.h" + +typedef struct { + SequenceNumber journalStart; // Sequence number to start the journal + BlockCount logicalBlocksUsed; // Number of logical blocks used by VDO + BlockCount blockMapDataBlocks; // Number of block map pages allocated +} __attribute__((packed)) RecoveryJournalState7_0; + +static const Header RECOVERY_JOURNAL_HEADER_7_0 = { + .id = RECOVERY_JOURNAL, + .version = { + .majorVersion = 7, + .minorVersion = 0, + }, + .size = sizeof(RecoveryJournalState7_0), +}; + +static const uint64_t RECOVERY_COUNT_MASK = 0xff; + +enum { + /* + * The number of reserved blocks must be large enough to prevent a + * new recovery journal block write from overwriting a block which + * appears to still be a valid head block of the journal. Currently, + * that means reserving enough space for all 2048 VIOs, or 8 blocks. + */ + RECOVERY_JOURNAL_RESERVED_BLOCKS = 8, +}; + +/**********************************************************************/ +const char *getJournalOperationName(JournalOperation operation) +{ + switch (operation) { + case DATA_DECREMENT: + return "data decrement"; + + case DATA_INCREMENT: + return "data increment"; + + case BLOCK_MAP_DECREMENT: + return "block map decrement"; + + case BLOCK_MAP_INCREMENT: + return "block map increment"; + + default: + return "unknown journal operation"; + } +} + +/** + * Get a block from the end of the free list. 
+ * + * @param journal The journal + * + * @return The block or NULL if the list is empty + **/ +static RecoveryJournalBlock *popFreeList(RecoveryJournal *journal) +{ + return blockFromRingNode(popRingNode(&journal->freeTailBlocks)); +} + +/** + * Get a block from the end of the active list. + * + * @param journal The journal + * + * @return The block or NULL if the list is empty + **/ +static RecoveryJournalBlock *popActiveList(RecoveryJournal *journal) +{ + return blockFromRingNode(popRingNode(&journal->activeTailBlocks)); +} + +/** + * Assert that we are running on the journal thread. + * + * @param journal The journal + * @param functionName The function doing the check (for logging) + **/ +static void assertOnJournalThread(RecoveryJournal *journal, + const char *functionName) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == journal->threadID), + "%s() called on journal thread", functionName); +} + +/** + * WaiterCallback implementation invoked whenever a DataVIO is to be released + * from the journal, either because its entry was committed to disk, + * or because there was an error. + **/ +static void continueWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + dataVIOAddTraceRecord(dataVIO, + THIS_LOCATION("$F($j-$js);" + "cb=continueJournalWaiter($j-$js)")); + int waitResult = *((int *) context); + continueDataVIO(dataVIO, waitResult); +} + +/** + * Check whether the journal has any waiters on any blocks. + * + * @param journal The journal in question + * + * @return true if any block has a waiter + **/ +static inline bool hasBlockWaiters(RecoveryJournal *journal) +{ + // Either the first active tail block (if it exists) has waiters, + // or no active tail block has waiters. + if (isRingEmpty(&journal->activeTailBlocks)) { + return false; + } + + RecoveryJournalBlock *block + = blockFromRingNode(journal->activeTailBlocks.next); + return (hasWaiters(&block->entryWaiters) + || hasWaiters(&block->commitWaiters)); +} + +/**********************************************************************/ +static void recycleJournalBlocks(RecoveryJournal *block); +static void recycleJournalBlock(RecoveryJournalBlock *block); +static void notifyCommitWaiters(RecoveryJournal *journal); + +/** + * Check whether the journal has drained. + * + * @param journal The journal which may have just drained + **/ +static void checkForDrainComplete(RecoveryJournal *journal) +{ + int result = VDO_SUCCESS; + if (isReadOnly(journal->readOnlyNotifier)) { + result = VDO_READ_ONLY; + /* + * Clean up any full active blocks which were not written due to being + * in read-only mode. + * + * XXX: This would probably be better as a short-circuit in writeBlock(). + */ + notifyCommitWaiters(journal); + recycleJournalBlocks(journal); + + // Release any DataVIOs waiting to be assigned entries. 
notifyAllWaiters(&journal->decrementWaiters, continueWaiter, &result);
+    notifyAllWaiters(&journal->incrementWaiters, continueWaiter, &result);
+  }
+
+  if (!isDraining(&journal->state)
+      || journal->reaping || hasBlockWaiters(journal)
+      || hasWaiters(&journal->incrementWaiters)
+      || hasWaiters(&journal->decrementWaiters)
+      || !suspendLockCounter(journal->lockCounter)) {
+    return;
+  }
+
+  if (isSaving(&journal->state)) {
+    if (journal->activeBlock != NULL) {
+      ASSERT_LOG_ONLY(((result == VDO_READ_ONLY)
+                       || !isRecoveryBlockDirty(journal->activeBlock)),
+                      "journal being saved has clean active block");
+      recycleJournalBlock(journal->activeBlock);
+    }
+
+    ASSERT_LOG_ONLY(isRingEmpty(&journal->activeTailBlocks),
+                    "all blocks in a journal being saved must be inactive");
+  }
+
+  finishDrainingWithResult(&journal->state, result);
+}
+
+/**
+ * Notify a recovery journal that the VDO has gone read-only.
+ *
+ *
Implements ReadOnlyNotification. + * + * @param listener The journal + * @param parent The completion to notify in order to acknowledge the + * notification + **/ +static void notifyRecoveryJournalOfReadOnlyMode(void *listener, + VDOCompletion *parent) +{ + checkForDrainComplete(listener); + completeCompletion(parent); +} + +/** + * Put the journal in read-only mode. All attempts to add entries after + * this function is called will fail. All VIOs waiting for commits will be + * awakened with an error. + * + * @param journal The journal which has failed + * @param errorCode The error result triggering this call + **/ +static void enterJournalReadOnlyMode(RecoveryJournal *journal, int errorCode) +{ + enterReadOnlyMode(journal->readOnlyNotifier, errorCode); + checkForDrainComplete(journal); +} + +/**********************************************************************/ +SequenceNumber getCurrentJournalSequenceNumber(RecoveryJournal *journal) +{ + return journal->tail; +} + +/** + * Get the head of the recovery journal, which is the lowest sequence number of + * the block map head and the slab journal head. + * + * @param journal The journal + * + * @return the head of the journal + **/ +static inline SequenceNumber getRecoveryJournalHead(RecoveryJournal *journal) +{ + return minSequenceNumber(journal->blockMapHead, journal->slabJournalHead); +} + +/** + * Compute the recovery count byte for a given recovery count. + * + * @param recoveryCount The recovery count + * + * @return The byte corresponding to the recovery count + **/ +__attribute__((warn_unused_result)) +static inline uint8_t computeRecoveryCountByte(uint64_t recoveryCount) +{ + return (uint8_t) (recoveryCount & RECOVERY_COUNT_MASK); +} + +/** + * Check whether the journal is over the threshold, and if so, force the oldest + * slab journal tail block to commit. + * + * @param journal The journal + **/ +static void checkSlabJournalCommitThreshold(RecoveryJournal *journal) +{ + BlockCount currentLength = journal->tail - journal->slabJournalHead; + if (currentLength > journal->slabJournalCommitThreshold) { + journal->events.slabJournalCommitsRequested++; + commitOldestSlabJournalTailBlocks(journal->depot, + journal->slabJournalHead); + } +} + +/**********************************************************************/ +static void reapRecoveryJournal(RecoveryJournal *journal); +static void assignEntries(RecoveryJournal *journal); + +/** + * Finish reaping the journal. + * + * @param journal The journal being reaped + **/ +static void finishReaping(RecoveryJournal *journal) +{ + SequenceNumber oldHead = getRecoveryJournalHead(journal); + journal->blockMapHead = journal->blockMapReapHead; + journal->slabJournalHead = journal->slabJournalReapHead; + BlockCount blocksReaped = getRecoveryJournalHead(journal) - oldHead; + journal->availableSpace += blocksReaped * journal->entriesPerBlock; + journal->reaping = false; + checkSlabJournalCommitThreshold(journal); + assignEntries(journal); + checkForDrainComplete(journal); +} + +/** + * Finish reaping the journal after flushing the lower layer. This is the + * callback registered in reapRecoveryJournal(). + * + * @param completion The journal's flush VIO + **/ +static void completeReaping(VDOCompletion *completion) +{ + RecoveryJournal *journal = completion->parent; + finishReaping(journal); + + // Try reaping again in case more locks were released while flush was out. + reapRecoveryJournal(journal); +} + +/** + * Handle an error when flushing the lower layer due to reaping. 
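+ *
+ * Reaping is abandoned and the journal is forced into read-only mode, since
+ * the reap heads cannot safely be advanced without a successful flush.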
+ * + * @param completion The journal's flush VIO + **/ +static void handleFlushError(VDOCompletion *completion) +{ + RecoveryJournal *journal = completion->parent; + journal->reaping = false; + enterJournalReadOnlyMode(journal, completion->result); +} + +/** + * Set all journal fields appropriately to start journaling from the current + * active block. + * + * @param journal The journal to be reset based on its active block + **/ +static void initializeJournalState(RecoveryJournal *journal) +{ + journal->appendPoint.sequenceNumber = journal->tail; + journal->lastWriteAcknowledged = journal->tail; + journal->blockMapHead = journal->tail; + journal->slabJournalHead = journal->tail; + journal->blockMapReapHead = journal->tail; + journal->slabJournalReapHead = journal->tail; + journal->blockMapHeadBlockNumber + = getRecoveryJournalBlockNumber(journal, journal->blockMapHead); + journal->slabJournalHeadBlockNumber + = getRecoveryJournalBlockNumber(journal, journal->slabJournalHead); +} + +/**********************************************************************/ +BlockCount getRecoveryJournalLength(BlockCount journalSize) +{ + BlockCount reservedBlocks = journalSize / 4; + if (reservedBlocks > RECOVERY_JOURNAL_RESERVED_BLOCKS) { + reservedBlocks = RECOVERY_JOURNAL_RESERVED_BLOCKS; + } + return (journalSize - reservedBlocks); +} + +/** + * Attempt to reap the journal now that all the locks on some journal block + * have been released. This is the callback registered with the lock counter. + * + * @param completion The lock counter completion + **/ +static void reapRecoveryJournalCallback(VDOCompletion *completion) +{ + RecoveryJournal *journal = (RecoveryJournal *) completion->parent; + // The acknowledgement must be done before reaping so that there is no + // race between acknowledging the notification and unlocks wishing to notify. + acknowledgeUnlock(journal->lockCounter); + + if (isQuiescing(&journal->state)) { + // Don't start reaping when the journal is trying to quiesce. Do check if + // this notification is the last thing the drain is waiting on. + checkForDrainComplete(journal); + return; + } + + reapRecoveryJournal(journal); + checkSlabJournalCommitThreshold(journal); +} + +/********************************************************************** + * Set the journal's tail sequence number. + * + * @param journal The journal whose tail is to be set + * @param tail The new tail value + **/ +static void setJournalTail(RecoveryJournal *journal, SequenceNumber tail) +{ + // VDO does not support sequence numbers above 1 << 48 in the slab journal. 
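+  // (Presumably because a journal point is packed together with a 16-bit
+  // entry count into a single 64-bit field, leaving 48 bits for the
+  // sequence number.)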
+ if (tail >= (1ULL << 48)) { + enterJournalReadOnlyMode(journal, VDO_JOURNAL_OVERFLOW); + } + + journal->tail = tail; +} + +/**********************************************************************/ +int makeRecoveryJournal(Nonce nonce, + PhysicalLayer *layer, + Partition *partition, + uint64_t recoveryCount, + BlockCount journalSize, + BlockCount tailBufferSize, + ReadOnlyNotifier *readOnlyNotifier, + const ThreadConfig *threadConfig, + RecoveryJournal **journalPtr) +{ + RecoveryJournal *journal; + int result = ALLOCATE(1, RecoveryJournal, __func__, &journal); + if (result != VDO_SUCCESS) { + return result; + } + + initializeRing(&journal->freeTailBlocks); + initializeRing(&journal->activeTailBlocks); + initializeWaitQueue(&journal->pendingWrites); + + journal->threadID = getJournalZoneThread(threadConfig); + journal->partition = partition; + journal->nonce = nonce; + journal->recoveryCount = computeRecoveryCountByte(recoveryCount); + journal->size = journalSize; + journal->readOnlyNotifier = readOnlyNotifier; + journal->tail = 1; + journal->slabJournalCommitThreshold = (journalSize * 2) / 3; + initializeJournalState(journal); + + journal->entriesPerBlock = RECOVERY_JOURNAL_ENTRIES_PER_BLOCK; + BlockCount journalLength = getRecoveryJournalLength(journalSize); + journal->availableSpace = journal->entriesPerBlock * journalLength; + + // Only make the tail buffer and VIO in normal operation since the formatter + // doesn't need them. + if (layer->createMetadataVIO != NULL) { + for (BlockCount i = 0; i < tailBufferSize; i++) { + RecoveryJournalBlock *block; + result = makeRecoveryBlock(layer, journal, &block); + if (result != VDO_SUCCESS) { + freeRecoveryJournal(&journal); + return result; + } + + pushRingNode(&journal->freeTailBlocks, &block->ringNode); + } + + result = makeLockCounter(layer, journal, reapRecoveryJournalCallback, + journal->threadID, threadConfig->logicalZoneCount, + threadConfig->physicalZoneCount, journal->size, + &journal->lockCounter); + if (result != VDO_SUCCESS) { + freeRecoveryJournal(&journal); + return result; + } + + result = ALLOCATE(VDO_BLOCK_SIZE, char, "journal flush data", + &journal->unusedFlushVIOData); + if (result != VDO_SUCCESS) { + freeRecoveryJournal(&journal); + return result; + } + + result = createVIO(layer, VIO_TYPE_RECOVERY_JOURNAL, VIO_PRIORITY_HIGH, + journal, journal->unusedFlushVIOData, + &journal->flushVIO); + if (result != VDO_SUCCESS) { + freeRecoveryJournal(&journal); + return result; + } + + result = registerReadOnlyListener(readOnlyNotifier, journal, + notifyRecoveryJournalOfReadOnlyMode, + journal->threadID); + if (result != VDO_SUCCESS) { + freeRecoveryJournal(&journal); + return result; + } + + journal->flushVIO->completion.callbackThreadID = journal->threadID; + } + + *journalPtr = journal; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeRecoveryJournal(RecoveryJournal **journalPtr) +{ + RecoveryJournal *journal = *journalPtr; + if (journal == NULL) { + return; + } + + freeLockCounter(&journal->lockCounter); + freeVIO(&journal->flushVIO); + FREE(journal->unusedFlushVIOData); + + // XXX: eventually, the journal should be constructed in a quiescent state + // which requires opening before use. 
+ if (!isQuiescent(&journal->state)) { + ASSERT_LOG_ONLY(isRingEmpty(&journal->activeTailBlocks), + "journal being freed has no active tail blocks"); + } else if (!isSaved(&journal->state) + && !isRingEmpty(&journal->activeTailBlocks)) { + logWarning("journal being freed has uncommited entries"); + } + + RecoveryJournalBlock *block; + while ((block = popActiveList(journal)) != NULL) { + freeRecoveryBlock(&block); + } + + while ((block = popFreeList(journal)) != NULL) { + freeRecoveryBlock(&block); + } + + FREE(journal); + *journalPtr = NULL; +} + +/**********************************************************************/ +void setRecoveryJournalPartition(RecoveryJournal *journal, + Partition *partition) +{ + journal->partition = partition; +} + +/**********************************************************************/ +void initializeRecoveryJournalPostRecovery(RecoveryJournal *journal, + uint64_t recoveryCount, + SequenceNumber tail) +{ + setJournalTail(journal, tail + 1); + journal->recoveryCount = computeRecoveryCountByte(recoveryCount); + initializeJournalState(journal); +} + +/**********************************************************************/ +void initializeRecoveryJournalPostRebuild(RecoveryJournal *journal, + uint64_t recoveryCount, + SequenceNumber tail, + BlockCount logicalBlocksUsed, + BlockCount blockMapDataBlocks) +{ + initializeRecoveryJournalPostRecovery(journal, recoveryCount, tail); + journal->logicalBlocksUsed = logicalBlocksUsed; + journal->blockMapDataBlocks = blockMapDataBlocks; +} + +/**********************************************************************/ +BlockCount getJournalBlockMapDataBlocksUsed(RecoveryJournal *journal) +{ + return journal->blockMapDataBlocks; +} + +/**********************************************************************/ +void setJournalBlockMapDataBlocksUsed(RecoveryJournal *journal, + BlockCount pages) +{ + journal->blockMapDataBlocks = pages; +} + +/**********************************************************************/ +ThreadID getRecoveryJournalThreadID(RecoveryJournal *journal) +{ + return journal->threadID; +} + +/**********************************************************************/ +void openRecoveryJournal(RecoveryJournal *journal, + SlabDepot *depot, + BlockMap *blockMap) +{ + journal->depot = depot; + journal->blockMap = blockMap; + journal->state.state = ADMIN_STATE_NORMAL_OPERATION; +} + +/**********************************************************************/ +size_t getRecoveryJournalEncodedSize(void) +{ + return ENCODED_HEADER_SIZE + sizeof(RecoveryJournalState7_0); +} + +/**********************************************************************/ +int encodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) +{ + SequenceNumber journalStart; + if (isSaved(&journal->state)) { + // If the journal is saved, we should start one past the active block + // (since the active block is not guaranteed to be empty). + journalStart = journal->tail; + } else { + // When we're merely suspended or have gone read-only, we must record the + // first block that might have entries that need to be applied. 
+ journalStart = getRecoveryJournalHead(journal); + } + + int result = encodeHeader(&RECOVERY_JOURNAL_HEADER_7_0, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + size_t initialLength = contentLength(buffer); + + result = putUInt64LEIntoBuffer(buffer, journalStart); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, journal->logicalBlocksUsed); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, journal->blockMapDataBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + size_t encodedSize = contentLength(buffer) - initialLength; + return ASSERT(RECOVERY_JOURNAL_HEADER_7_0.size == encodedSize, + "encoded recovery journal component size" + " must match header size"); +} + +/** + * Decode recovery journal component state version 7.0 from a buffer. + * + * @param buffer A buffer positioned at the start of the encoding + * @param state The state structure to receive the decoded values + * + * @return UDS_SUCCESS or an error code + **/ +static int decodeRecoveryJournalState_7_0(Buffer *buffer, + RecoveryJournalState7_0 *state) +{ + size_t initialLength = contentLength(buffer); + + SequenceNumber journalStart; + int result = getUInt64LEFromBuffer(buffer, &journalStart); + if (result != UDS_SUCCESS) { + return result; + } + + BlockCount logicalBlocksUsed; + result = getUInt64LEFromBuffer(buffer, &logicalBlocksUsed); + if (result != UDS_SUCCESS) { + return result; + } + + BlockCount blockMapDataBlocks; + result = getUInt64LEFromBuffer(buffer, &blockMapDataBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + *state = (RecoveryJournalState7_0) { + .journalStart = journalStart, + .logicalBlocksUsed = logicalBlocksUsed, + .blockMapDataBlocks = blockMapDataBlocks, + }; + + size_t decodedSize = initialLength - contentLength(buffer); + return ASSERT(RECOVERY_JOURNAL_HEADER_7_0.size == decodedSize, + "decoded slab depot component size must match header size"); +} + +/**********************************************************************/ +int decodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) +{ + Header header; + int result = decodeHeader(buffer, &header); + if (result != VDO_SUCCESS) { + return result; + } + + result = validateHeader(&RECOVERY_JOURNAL_HEADER_7_0, &header, + true, __func__); + if (result != VDO_SUCCESS) { + return result; + } + + RecoveryJournalState7_0 state; + result = decodeRecoveryJournalState_7_0(buffer, &state); + if (result != VDO_SUCCESS) { + return result; + } + + // Update recovery journal in-memory information. + setJournalTail(journal, state.journalStart); + journal->logicalBlocksUsed = state.logicalBlocksUsed; + journal->blockMapDataBlocks = state.blockMapDataBlocks; + initializeJournalState(journal); + + // XXX: this is a hack until we make initial resume of a VDO a real resume + journal->state.state = ADMIN_STATE_SUSPENDED; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int decodeSodiumRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) +{ + // Sodium uses version 7.0, same as head, currently. + return decodeRecoveryJournal(journal, buffer); +} + +/** + * Advance the tail of the journal. 
+ * + * @param journal The journal whose tail should be advanced + * + * @return true if the tail was advanced + **/ +static bool advanceTail(RecoveryJournal *journal) +{ + journal->activeBlock = popFreeList(journal); + if (journal->activeBlock == NULL) { + return false; + } + + pushRingNode(&journal->activeTailBlocks, &journal->activeBlock->ringNode); + initializeRecoveryBlock(journal->activeBlock); + setJournalTail(journal, journal->tail + 1); + advanceBlockMapEra(journal->blockMap, journal->tail); + return true; +} + +/** + * Check whether there is space to make a given type of entry. + * + * @param journal The journal to check + * @param increment Set to true if the desired entry is an + * increment + * + * @return true if there is space in the journal to make an + * entry of the specified type + **/ +static bool checkForEntrySpace(RecoveryJournal *journal, bool increment) +{ + if (increment) { + return ((journal->availableSpace - journal->pendingDecrementCount) > 1); + } + + return (journal->availableSpace > 0); +} + +/** + * Prepare the currently active block to receive an entry and check whether + * an entry of the given type may be assigned at this time. + * + * @param journal The journal receiving an entry + * @param increment Set to true if the desired entry is an + * increment + * + * @return true if there is space in the journal to store an + * entry of the specified type + **/ +static bool prepareToAssignEntry(RecoveryJournal *journal, bool increment) +{ + if (!checkForEntrySpace(journal, increment)) { + if (!increment) { + // There must always be room to make a decrement entry. + logError("No space for decrement entry in recovery journal"); + enterJournalReadOnlyMode(journal, VDO_RECOVERY_JOURNAL_FULL); + } + return false; + } + + if (isRecoveryBlockFull(journal->activeBlock) && !advanceTail(journal)) { + return false; + } + + if (!isRecoveryBlockEmpty(journal->activeBlock)) { + return true; + } + + if ((journal->tail - getRecoveryJournalHead(journal)) > journal->size) { + // Cannot use this block since the journal is full. + journal->events.diskFull++; + return false; + } + + /* + * Don't allow the new block to be reaped until all of its entries have been + * committed to the block map and until the journal block has been fully + * committed as well. Because the block map update is done only after any + * slab journal entries have been made, the per-entry lock for the block map + * entry serves to protect those as well. + */ + initializeLockCount(journal->lockCounter, journal->activeBlock->blockNumber, + journal->entriesPerBlock + 1); + return true; +} + +/**********************************************************************/ +static void writeBlocks(RecoveryJournal *journal); + +/** + * Queue a block for writing. The block is expected to be full. If the block + * is currently writing, this is a noop as the block will be queued for + * writing when the write finishes. The block must not currently be queued + * for writing. 
+ * + * @param journal The journal in question + * @param block The block which is now ready to write + **/ +static void scheduleBlockWrite(RecoveryJournal *journal, + RecoveryJournalBlock *block) +{ + if (block->committing) { + return; + } + + int result = enqueueWaiter(&journal->pendingWrites, &block->writeWaiter); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(journal, result); + return; + } + + PhysicalLayer *layer = vioAsCompletion(journal->flushVIO)->layer; + if ((layer->getWritePolicy(layer) == WRITE_POLICY_ASYNC)) { + /* + * At the end of adding entries, or discovering this partial block + * is now full and ready to rewrite, we will call writeBlocks() and + * write a whole batch. + */ + return; + } + writeBlocks(journal); +} + +/** + * Release a reference to a journal block. + * + * @param block The journal block from which to release a reference + **/ +static void releaseJournalBlockReference(RecoveryJournalBlock *block) +{ + releaseJournalZoneReference(block->journal->lockCounter, block->blockNumber); +} + +/** + * Implements WaiterCallback. Assign an entry waiter to the active block. + **/ +static void assignEntry(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + RecoveryJournalBlock *block = (RecoveryJournalBlock *) context; + RecoveryJournal *journal = block->journal; + + // Record the point at which we will make the journal entry. + dataVIO->recoveryJournalPoint = (JournalPoint) { + .sequenceNumber = block->sequenceNumber, + .entryCount = block->entryCount, + }; + + switch (dataVIO->operation.type) { + case DATA_INCREMENT: + if (dataVIO->operation.state != MAPPING_STATE_UNMAPPED) { + journal->logicalBlocksUsed++; + } + journal->pendingDecrementCount++; + break; + + case DATA_DECREMENT: + if (dataVIO->operation.state != MAPPING_STATE_UNMAPPED) { + journal->logicalBlocksUsed--; + } + + // Per-entry locks need not be held for decrement entries since the lock + // held for the incref entry will protect this entry as well. + releaseJournalBlockReference(block); + ASSERT_LOG_ONLY((journal->pendingDecrementCount != 0), + "decrement follows increment"); + journal->pendingDecrementCount--; + break; + + case BLOCK_MAP_INCREMENT: + journal->blockMapDataBlocks++; + break; + + default: + logError("Invalid journal operation %u", dataVIO->operation.type); + enterJournalReadOnlyMode(journal, VDO_NOT_IMPLEMENTED); + continueDataVIO(dataVIO, VDO_NOT_IMPLEMENTED); + return; + } + + journal->availableSpace--; + int result = enqueueRecoveryBlockEntry(block, dataVIO); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(journal, result); + continueDataVIO(dataVIO, result); + } + + if (isRecoveryBlockFull(block)) { + // The block is full, so we can write it anytime henceforth. If it is + // already committing, we'll queue it for writing when it comes back. + scheduleBlockWrite(journal, block); + } + + // Force out slab journal tail blocks when threshold is reached. 
+ checkSlabJournalCommitThreshold(journal); +} + +/**********************************************************************/ +static bool assignEntriesFromQueue(RecoveryJournal *journal, + WaitQueue *queue, + bool increment) +{ + while (hasWaiters(queue)) { + if (!prepareToAssignEntry(journal, increment)) { + return false; + } + + notifyNextWaiter(queue, assignEntry, journal->activeBlock); + } + + return true; +} + +/**********************************************************************/ +static void assignEntries(RecoveryJournal *journal) +{ + if (journal->addingEntries) { + // Protect against re-entrancy. + return; + } + + journal->addingEntries = true; + if (assignEntriesFromQueue(journal, &journal->decrementWaiters, false)) { + assignEntriesFromQueue(journal, &journal->incrementWaiters, true); + } + + // Now that we've finished with entries, see if we have a batch of blocks to + // write. + writeBlocks(journal); + journal->addingEntries = false; +} + +/** + * Prepare an in-memory journal block to be reused now that it has been fully + * committed. + * + * @param block The block to be recycled + **/ +static void recycleJournalBlock(RecoveryJournalBlock *block) +{ + RecoveryJournal *journal = block->journal; + pushRingNode(&journal->freeTailBlocks, &block->ringNode); + + // Release any unused entry locks. + for (BlockCount i = block->entryCount; i < journal->entriesPerBlock; i++) { + releaseJournalBlockReference(block); + } + + // Release our own lock against reaping now that the block is completely + // committed, or we're giving up because we're in read-only mode. + if (block->entryCount > 0) { + releaseJournalBlockReference(block); + } + + if (block == journal->activeBlock) { + journal->activeBlock = NULL; + } +} + +/** + * WaiterCallback implementation invoked whenever a VIO is to be released + * from the journal because its entry was committed to disk. + **/ +static void continueCommittedWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + RecoveryJournal *journal = (RecoveryJournal *) context; + ASSERT_LOG_ONLY(beforeJournalPoint(&journal->commitPoint, + &dataVIO->recoveryJournalPoint), + "DataVIOs released from recovery journal in order. " + "Recovery journal point is (%llu, %" PRIu16 "), " + "but commit waiter point is (%llu, %" PRIu16 ")", + journal->commitPoint.sequenceNumber, + journal->commitPoint.entryCount, + dataVIO->recoveryJournalPoint.sequenceNumber, + dataVIO->recoveryJournalPoint.entryCount); + journal->commitPoint = dataVIO->recoveryJournalPoint; + + int result + = (isReadOnly(journal->readOnlyNotifier) ? VDO_READ_ONLY : VDO_SUCCESS); + continueWaiter(waiter, &result); +} + +/** + * Notify any VIOs whose entries have now committed. + * + * @param journal The recovery journal to update + **/ +static void notifyCommitWaiters(RecoveryJournal *journal) +{ + if (isRingEmpty(&journal->activeTailBlocks)) { + return; + } + + for (RingNode *node = journal->activeTailBlocks.next; + node != &journal->activeTailBlocks; + node = node->next) { + RecoveryJournalBlock *block = blockFromRingNode(node); + + if (block->committing) { + return; + } + + notifyAllWaiters(&block->commitWaiters, continueCommittedWaiter, journal); + if (isReadOnly(journal->readOnlyNotifier)) { + notifyAllWaiters(&block->entryWaiters, continueCommittedWaiter, journal); + } else if (isRecoveryBlockDirty(block) || !isRecoveryBlockFull(block)) { + // Stop at partially-committed or partially-filled blocks. 
+ return; + } + } +} + +/** + * Recycle any journal blocks which have been fully committed. + * + * @param journal The recovery journal to update + **/ +static void recycleJournalBlocks(RecoveryJournal *journal) +{ + while (!isRingEmpty(&journal->activeTailBlocks)) { + RecoveryJournalBlock *block + = blockFromRingNode(journal->activeTailBlocks.next); + + if (block->committing) { + // Don't recycle committing blocks. + return; + } + + if (!isReadOnly(journal->readOnlyNotifier) + && (isRecoveryBlockDirty(block) + || !isRecoveryBlockFull(block))) { + // Don't recycle partially written or partially full + // blocks, except in read-only mode. + return; + } + recycleJournalBlock(block); + } +} + +/** + * Handle post-commit processing. This is the callback registered by + * writeBlock(). If more entries accumulated in the block being committed while + * the commit was in progress, another commit will be initiated. + * + * @param completion The completion of the VIO writing this block + **/ +static void completeWrite(VDOCompletion *completion) +{ + RecoveryJournalBlock *block = completion->parent; + RecoveryJournal *journal = block->journal; + assertOnJournalThread(journal, __func__); + + journal->pendingWriteCount -= 1; + journal->events.blocks.committed += 1; + journal->events.entries.committed += block->entriesInCommit; + block->uncommittedEntryCount -= block->entriesInCommit; + block->entriesInCommit = 0; + block->committing = false; + + // If this block is the latest block to be acknowledged, record that fact. + if (block->sequenceNumber > journal->lastWriteAcknowledged) { + journal->lastWriteAcknowledged = block->sequenceNumber; + } + + RecoveryJournalBlock *lastActiveBlock + = blockFromRingNode(journal->activeTailBlocks.next); + ASSERT_LOG_ONLY((block->sequenceNumber >= lastActiveBlock->sequenceNumber), + "completed journal write is still active"); + + notifyCommitWaiters(journal); + + // Is this block now full? Reaping, and adding entries, might have already + // sent it off for rewriting; else, queue it for rewrite. + if (isRecoveryBlockDirty(block) && isRecoveryBlockFull(block)) { + scheduleBlockWrite(journal, block); + } + + recycleJournalBlocks(journal); + writeBlocks(journal); + + checkForDrainComplete(journal); +} + +/**********************************************************************/ +static void handleWriteError(VDOCompletion *completion) +{ + RecoveryJournalBlock *block = completion->parent; + RecoveryJournal *journal = block->journal; + logErrorWithStringError(completion->result, + "cannot write recovery journal block %llu", + block->sequenceNumber); + enterJournalReadOnlyMode(journal, completion->result); + completeWrite(completion); +} + +/** + * Issue a block for writing. Implements WaiterCallback. + **/ +static void writeBlock(Waiter *waiter, void *context __attribute__((unused))) +{ + RecoveryJournalBlock *block = blockFromWaiter(waiter); + if (isReadOnly(block->journal->readOnlyNotifier)) { + return; + } + + int result = commitRecoveryBlock(block, completeWrite, handleWriteError); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(block->journal, result); + } +} + +/** + * Attempt to commit blocks, according to write policy. 
+ * + * @param journal The recovery journal + **/ +static void writeBlocks(RecoveryJournal *journal) +{ + assertOnJournalThread(journal, __func__); + /* + * In sync and async-unsafe modes, we call this function each time we queue + * a full block on pending writes; in addition, in all cases we call this + * function after adding entries to the journal and finishing a block write. + * Thus, when this function terminates we must either have no VIOs waiting + * in the journal or have some outstanding IO to provide a future wakeup. + * + * In all modes, if there are no outstanding writes and some unwritten + * entries, we must issue a block, even if it's the active block and it + * isn't full. Otherwise, in sync/async-unsafe modes, we want to issue + * all full blocks every time; since we call it each time we fill a block, + * this is equivalent to issuing every full block as soon as its full. In + * async mode, we want to only issue full blocks if there are no + * pending writes. + */ + + PhysicalLayer *layer = vioAsCompletion(journal->flushVIO)->layer; + if ((layer->getWritePolicy(layer) != WRITE_POLICY_ASYNC) + || (journal->pendingWriteCount == 0)) { + // Write all the full blocks. + notifyAllWaiters(&journal->pendingWrites, writeBlock, NULL); + } + + // Do we need to write the active block? Only if we have no outstanding + // writes, even after issuing all of the full writes. + if ((journal->pendingWriteCount == 0) + && canCommitRecoveryBlock(journal->activeBlock)) { + writeBlock(&journal->activeBlock->writeWaiter, NULL); + } +} + +/**********************************************************************/ +void addRecoveryJournalEntry(RecoveryJournal *journal, DataVIO *dataVIO) +{ + assertOnJournalThread(journal, __func__); + if (!isNormal(&journal->state)) { + continueDataVIO(dataVIO, VDO_INVALID_ADMIN_STATE); + return; + } + + if (isReadOnly(journal->readOnlyNotifier)) { + continueDataVIO(dataVIO, VDO_READ_ONLY); + return; + } + + bool increment = isIncrementOperation(dataVIO->operation.type); + ASSERT_LOG_ONLY((!increment || (dataVIO->recoverySequenceNumber == 0)), + "journal lock not held for increment"); + + advanceJournalPoint(&journal->appendPoint, journal->entriesPerBlock); + int result = enqueueDataVIO((increment + ? &journal->incrementWaiters + : &journal->decrementWaiters), dataVIO, + THIS_LOCATION("$F($j-$js);io=journal($j-$js)")); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(journal, result); + continueDataVIO(dataVIO, result); + return; + } + + assignEntries(journal); +} + +/** + * Conduct a sweep on a recovery journal to reclaim unreferenced blocks. + * + * @param journal The recovery journal + **/ +static void reapRecoveryJournal(RecoveryJournal *journal) +{ + if (journal->reaping) { + // We already have an outstanding reap in progress. We need to wait for it + // to finish. + return; + } + + if (isQuiescent(&journal->state)) { + // We are supposed to not do IO. Don't botch it by reaping. + return; + } + + // Start reclaiming blocks only when the journal head has no references. Then + // stop when a block is referenced. 
+ while ((journal->blockMapReapHead < journal->lastWriteAcknowledged) + && !isLocked(journal->lockCounter, journal->blockMapHeadBlockNumber, + ZONE_TYPE_LOGICAL)) { + journal->blockMapReapHead++; + if (++journal->blockMapHeadBlockNumber == journal->size) { + journal->blockMapHeadBlockNumber = 0; + } + } + + while ((journal->slabJournalReapHead < journal->lastWriteAcknowledged) + && !isLocked(journal->lockCounter, + journal->slabJournalHeadBlockNumber, + ZONE_TYPE_PHYSICAL)) { + journal->slabJournalReapHead++; + if (++journal->slabJournalHeadBlockNumber == journal->size) { + journal->slabJournalHeadBlockNumber = 0; + } + } + + if ((journal->blockMapReapHead == journal->blockMapHead) + && (journal->slabJournalReapHead == journal->slabJournalHead)) { + // Nothing happened. + return; + } + + PhysicalLayer *layer = vioAsCompletion(journal->flushVIO)->layer; + if (layer->getWritePolicy(layer) != WRITE_POLICY_SYNC) { + /* + * If the block map head will advance, we must flush any block map page + * modified by the entries we are reaping. If the slab journal head will + * advance, we must flush the slab summary update covering the slab journal + * that just released some lock. + * + * In sync mode, this is unnecessary because we won't record these numbers + * on disk until the next journal block write, and in sync mode every + * journal block write is preceded by a flush, which does the block map + * page and slab summary update flushing itself. + */ + journal->reaping = true; + launchFlush(journal->flushVIO, completeReaping, handleFlushError); + return; + } + + finishReaping(journal); +} + +/**********************************************************************/ +void acquireRecoveryJournalBlockReference(RecoveryJournal *journal, + SequenceNumber sequenceNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + if (sequenceNumber == 0) { + return; + } + + BlockCount blockNumber + = getRecoveryJournalBlockNumber(journal, sequenceNumber); + acquireLockCountReference(journal->lockCounter, blockNumber, zoneType, + zoneID); +} + +/**********************************************************************/ +void releaseRecoveryJournalBlockReference(RecoveryJournal *journal, + SequenceNumber sequenceNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + if (sequenceNumber == 0) { + return; + } + + BlockCount blockNumber + = getRecoveryJournalBlockNumber(journal, sequenceNumber); + releaseLockCountReference(journal->lockCounter, blockNumber, zoneType, + zoneID); +} + +/**********************************************************************/ +void releasePerEntryLockFromOtherZone(RecoveryJournal *journal, + SequenceNumber sequenceNumber) +{ + if (sequenceNumber == 0) { + return; + } + + BlockCount blockNumber + = getRecoveryJournalBlockNumber(journal, sequenceNumber); + releaseJournalZoneReferenceFromOtherZone(journal->lockCounter, blockNumber); +} + +/** + * Initiate a drain. + * + * Implements AdminInitiator. 
+ **/ +static void initiateDrain(AdminState *state) +{ + checkForDrainComplete(container_of(state, RecoveryJournal, state)); +} + +/**********************************************************************/ +void drainRecoveryJournal(RecoveryJournal *journal, + AdminStateCode operation, + VDOCompletion *parent) +{ + assertOnJournalThread(journal, __func__); + startDraining(&journal->state, operation, parent, initiateDrain); +} + +/**********************************************************************/ +void resumeRecoveryJournal(RecoveryJournal *journal, VDOCompletion *parent) +{ + assertOnJournalThread(journal, __func__); + bool saved = isSaved(&journal->state); + setCompletionResult(parent, resumeIfQuiescent(&journal->state)); + + if (isReadOnly(journal->readOnlyNotifier)) { + finishCompletion(parent, VDO_READ_ONLY); + return; + } + + if (saved) { + initializeJournalState(journal); + } + + if (resumeLockCounter(journal->lockCounter)) { + // We might have missed a notification. + reapRecoveryJournal(journal); + } + + completeCompletion(parent); +} + +/**********************************************************************/ +BlockCount getJournalLogicalBlocksUsed(const RecoveryJournal *journal) +{ + return journal->logicalBlocksUsed; +} + +/**********************************************************************/ +RecoveryJournalStatistics +getRecoveryJournalStatistics(const RecoveryJournal *journal) +{ + return journal->events; +} + +/**********************************************************************/ +void dumpRecoveryJournalStatistics(const RecoveryJournal *journal) +{ + RecoveryJournalStatistics stats = getRecoveryJournalStatistics(journal); + logInfo("Recovery Journal"); + logInfo(" blockMapHead=%llu slabJournalHead=%" PRIu64 + " lastWriteAcknowledged=%llu tail=%" PRIu64 + " blockMapReapHead=%llu slabJournalReapHead=%" PRIu64 + " diskFull=%llu slabJournalCommitsRequested=%" PRIu64 + " incrementWaiters=%zu decrementWaiters=%zu", + journal->blockMapHead, journal->slabJournalHead, + journal->lastWriteAcknowledged, journal->tail, + journal->blockMapReapHead, journal->slabJournalReapHead, + stats.diskFull, stats.slabJournalCommitsRequested, + countWaiters(&journal->incrementWaiters), + countWaiters(&journal->decrementWaiters)); + logInfo(" entries: started=%llu written=%llu committed=%" + PRIu64, + stats.entries.started, stats.entries.written, + stats.entries.committed); + logInfo(" blocks: started=%llu written=%llu committed=%" + PRIu64, + stats.blocks.started, stats.blocks.written, + stats.blocks.committed); + + logInfo(" active blocks:"); + const RingNode *head = &journal->activeTailBlocks; + for (RingNode *node = head->next; node != head; node = node->next) { + dumpRecoveryBlock(blockFromRingNode(node)); + } +} diff --git a/vdo/base/recoveryJournal.h b/vdo/base/recoveryJournal.h new file mode 100644 index 0000000..8ae7de0 --- /dev/null +++ b/vdo/base/recoveryJournal.h @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournal.h#5 $ + */ + +#ifndef RECOVERY_JOURNAL_H +#define RECOVERY_JOURNAL_H + +#include "buffer.h" + +#include "adminState.h" +#include "completion.h" +#include "fixedLayout.h" +#include "flush.h" +#include "readOnlyNotifier.h" +#include "statistics.h" +#include "trace.h" +#include "types.h" + +/** + * The RecoveryJournal provides a log of all block mapping changes + * which have not yet been stably written to the block map. It exists + * to help provide resiliency guarantees by allowing synchronous + * writes to be acknowledged as soon as the corresponding journal + * entry is committed instead of having to wait for the block map + * update. For asynchronous writes, the journal aids in meeting the + * five second data loss window by ensuring that writes will not be + * lost as long as they are committed to the journal before the window + * expires. This should be less work than committing all of the + * required block map pages. + * + * The journal consists of a set of on-disk blocks arranged as a + * circular log with monotonically increasing sequence numbers. Three + * sequence numbers serve to define the active extent of the + * journal. The 'head' is the oldest active block in the journal. The + * 'tail' is the end of the half-open interval containing the active + * blocks. 'active' is the number of the block actively receiving + * entries. In an empty journal, head == active == tail. Once any + * entries are added, tail = active + 1, and head may be any value in + * the interval [tail - size, active]. + * + * The journal also contains a set of in-memory blocks which are used + * to buffer up entries until they can be committed. In general the + * number of in-memory blocks ('tailBufferCount') will be less than + * the on-disk size. Each in-memory block is also a VDOCompletion. + * Each in-memory block has a VDOExtent which is used to commit that + * block to disk. The extent's data is a PackedJournalBlock (which is a + * formatted journal block). In addition each in-memory block has a + * buffer which is used to accumulate entries while a partial commit + * of the block is in progress. In-memory blocks are kept on two + * rings. Free blocks live on the 'freeTailBlocks' ring. When a block + * becomes active (see below) it is moved to the 'activeTailBlocks' + * ring. When a block is fully committed, it is moved back to the + * 'freeTailBlocks' ring. + * + * When entries are added to the journal, they are added to the active + * in-memory block, as indicated by the 'activeBlock' field. If the + * caller wishes to wait for the entry to be committed, the requesting + * VIO will be attached to the in-memory block to which the caller's + * entry was added. If the caller does wish to wait, or if the entry + * filled the active block, an attempt will be made to commit that + * block to disk. If there is already another commit in progress, the + * attempt will be ignored and then automatically retried when the + * in-progress commit completes. If there is no commit in progress, + * any VIOs waiting on the block are transferred to the extent. The + * extent is then written, automatically waking all of the waiters + * when it completes. 
When the extent completes, any entries which + * accumulated in the block are copied to the extent's data buffer. + * + * Finally, the journal maintains a set of counters, one for each on + * disk journal block. These counters are used as locks to prevent + * premature reaping of journal blocks. Each time a new sequence + * number is used, the counter for the corresponding block is + * incremented. The counter is subsequently decremented when that + * block is filled and then committed for the last time. This prevents + * blocks from being reaped while they are still being updated. The + * counter is also incremented once for each entry added to a block, + * and decremented once each time the block map is updated in memory + * for that request. This prevents blocks from being reaped while + * their VIOs are still active. Finally, each in-memory block map page + * tracks the oldest journal block that contains entries corresponding to + * uncommitted updates to that block map page. Each time an in-memory block + * map page is updated, it checks if the journal block for the VIO + * is earlier than the one it references, in which case it increments + * the count on the earlier journal block and decrements the count on the + * later journal block, maintaining a lock on the oldest journal block + * containing entries for that page. When a block map page has been flushed + * from the cache, the counter for the journal block it references is + * decremented. Whenever the counter for the head block goes to 0, the + * head is advanced until it comes to a block whose counter is not 0 + * or until it reaches the active block. This is the mechanism for + * reclaiming journal space on disk. + * + * If there is no in-memory space when a VIO attempts to add an entry, + * the VIO will be attached to the 'commitCompletion' and will be + * woken the next time a full block has committed. If there is no + * on-disk space when a VIO attempts to add an entry, the VIO will be + * attached to the 'reapCompletion', and will be woken the next time a + * journal block is reaped. + **/ + +/** + * Return whether a given JournalOperation is an increment type. + * + * @param operation The operation in question + * + * @return true if the type is an increment type + **/ +static inline bool isIncrementOperation(JournalOperation operation) +{ + return ((operation == DATA_INCREMENT) || (operation == BLOCK_MAP_INCREMENT)); +} + +/** + * Get the name of a journal operation. + * + * @param operation The operation to name + * + * @return The name of the operation + **/ +const char *getJournalOperationName(JournalOperation operation) + __attribute__((warn_unused_result)); + +/** + * Create a recovery journal. 
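+ *
+ * Note: on-disk journal block numbers are derived by masking the low-order
+ * bits of a sequence number (see getRecoveryJournalBlockNumber() in
+ * recoveryJournalInternals.h), which assumes the on-disk journal size is a
+ * power of two.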
+ * + * @param [in] nonce the nonce of the VDO + * @param [in] layer the physical layer for the journal + * @param [in] partition the partition for the journal + * @param [in] recoveryCount The VDO's number of completed recoveries + * @param [in] journalSize the number of blocks in the journal on disk + * @param [in] tailBufferSize the number of blocks for tail buffer + * @param [in] readOnlyNotifier the read-only mode notifier + * @param [in] threadConfig the thread configuration of the VDO + * @param [out] journalPtr the pointer to hold the new recovery journal + * + * @return a success or error code + **/ +int makeRecoveryJournal(Nonce nonce, + PhysicalLayer *layer, + Partition *partition, + uint64_t recoveryCount, + BlockCount journalSize, + BlockCount tailBufferSize, + ReadOnlyNotifier *readOnlyNotifier, + const ThreadConfig *threadConfig, + RecoveryJournal **journalPtr) + __attribute__((warn_unused_result)); + +/** + * Free a recovery journal and null out the reference to it. + * + * @param [in,out] journalPtr The reference to the recovery journal to free + **/ +void freeRecoveryJournal(RecoveryJournal **journalPtr); + +/** + * Move the backing partition pointer of the recovery journal. + * Assumes that the data in the old and the new partitions is identical. + * + * @param journal the journal being moved + * @param partition the new journal partition + **/ +void setRecoveryJournalPartition(RecoveryJournal *journal, + Partition *partition); + +/** + * Initialize the journal after a recovery. + * + * @param journal The journal in question + * @param recoveryCount The number of completed recoveries + * @param tail The new tail block sequence number + **/ +void initializeRecoveryJournalPostRecovery(RecoveryJournal *journal, + uint64_t recoveryCount, + SequenceNumber tail); + +/** + * Initialize the journal after a rebuild. + * + * @param journal The journal in question + * @param recoveryCount The number of completed recoveries + * @param tail The new tail block sequence number + * @param logicalBlocksUsed The new number of logical blocks used + * @param blockMapDataBlocks The new number of block map data blocks + **/ +void initializeRecoveryJournalPostRebuild(RecoveryJournal *journal, + uint64_t recoveryCount, + SequenceNumber tail, + BlockCount logicalBlocksUsed, + BlockCount blockMapDataBlocks); + +/** + * Get the number of block map pages, allocated from data blocks, currently + * in use. + * + * @param journal The journal in question + * + * @return The number of block map pages allocated from slabs + **/ +BlockCount getJournalBlockMapDataBlocksUsed(RecoveryJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Set the number of block map pages, allocated from data blocks, currently + * in use. + * + * @param journal The journal in question + * @param pages The number of block map pages allocated from slabs + **/ +void setJournalBlockMapDataBlocksUsed(RecoveryJournal *journal, + BlockCount pages); + +/** + * Get the ID of a recovery journal's thread. + * + * @param journal The journal to query + * + * @return The ID of the journal's thread. + **/ +ThreadID getRecoveryJournalThreadID(RecoveryJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Prepare the journal for new entries. 
+ * + * @param journal The journal in question + * @param depot The slab depot for this VDO + * @param blockMap The block map for this VDO + **/ +void openRecoveryJournal(RecoveryJournal *journal, + SlabDepot *depot, + BlockMap *blockMap); + +/** + * Obtain the recovery journal's current sequence number. Exposed only so + * the block map can be initialized therefrom. + * + * @param journal The journal in question + * + * @return the sequence number of the tail block + **/ +SequenceNumber getCurrentJournalSequenceNumber(RecoveryJournal *journal); + +/** + * Get the number of usable recovery journal blocks. + * + * @param journalSize The size of the recovery journal in blocks + * + * @return the number of recovery journal blocks usable for entries + **/ +BlockCount getRecoveryJournalLength(BlockCount journalSize) + __attribute__((warn_unused_result)); + +/** + * Get the size of the encoded state of a recovery journal. + * + * @return the encoded size of the journal's state + **/ +size_t getRecoveryJournalEncodedSize(void) + __attribute__((warn_unused_result)); + +/** + * Encode the state of a recovery journal. + * + * @param journal the recovery journal + * @param buffer the buffer to encode into + * + * @return VDO_SUCCESS or an error code + **/ +int encodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Decode the state of a recovery journal saved in a buffer. + * + * @param journal the recovery journal + * @param buffer the buffer containing the saved state + * + * @return VDO_SUCCESS or an error code + **/ +int decodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Decode the state of a Sodium recovery journal saved in a buffer. + * + * @param journal the recovery journal + * @param buffer the buffer containing the saved state + * + * @return VDO_SUCCESS or an error code + **/ +int decodeSodiumRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Add an entry to a recovery journal. This method is asynchronous. The DataVIO + * will not be called back until the entry is committed to the on-disk journal. + * + * @param journal The journal in which to make an entry + * @param dataVIO The DataVIO for which to add the entry. The entry will be + * taken from the logical and newMapped fields of the + * DataVIO. The DataVIO's recoverySequenceNumber field will + * be set to the sequence number of the journal block in + * which the entry was made. + **/ +void addRecoveryJournalEntry(RecoveryJournal *journal, DataVIO *dataVIO); + +/** + * Acquire a reference to a recovery journal block from somewhere other than + * the journal itself. + * + * @param journal The recovery journal + * @param sequenceNumber The journal sequence number of the referenced block + * @param zoneType The type of the zone making the adjustment + * @param zoneID The ID of the zone making the adjustment + **/ +void acquireRecoveryJournalBlockReference(RecoveryJournal *journal, + SequenceNumber sequenceNumber, + ZoneType zoneType, + ZoneCount zoneID); + + +/** + * Release a reference to a recovery journal block from somewhere other than + * the journal itself. If this is the last reference for a given zone type, + * an attempt will be made to reap the journal. 
+ *
+ * @param journal         The recovery journal
+ * @param sequenceNumber  The journal sequence number of the referenced block
+ * @param zoneType        The type of the zone making the adjustment
+ * @param zoneID          The ID of the zone making the adjustment
+ **/
+void releaseRecoveryJournalBlockReference(RecoveryJournal *journal,
+                                          SequenceNumber   sequenceNumber,
+                                          ZoneType         zoneType,
+                                          ZoneCount        zoneID);
+
+/**
+ * Release a single per-entry reference count for a recovery journal block.
+ * This method may be called from any zone (but shouldn't be called from the
+ * journal zone as it would be inefficient).
+ *
+ * @param journal         The recovery journal
+ * @param sequenceNumber  The journal sequence number of the referenced block
+ **/
+void releasePerEntryLockFromOtherZone(RecoveryJournal *journal,
+                                      SequenceNumber   sequenceNumber);
+
+/**
+ * Drain recovery journal I/O. All uncommitted entries will be written out.
+ *
+ * @param journal    The journal to drain
+ * @param operation  The drain operation (suspend or save)
+ * @param parent     The completion to finish once the journal is drained
+ **/
+void drainRecoveryJournal(RecoveryJournal *journal,
+                          AdminStateCode   operation,
+                          VDOCompletion   *parent);
+
+/**
+ * Resume a recovery journal which has been drained.
+ *
+ * @param journal  The journal to resume
+ * @param parent   The completion to finish once the journal is resumed
+ *                 (any error is reported through this completion)
+ **/
+void resumeRecoveryJournal(RecoveryJournal *journal, VDOCompletion *parent);
+
+/**
+ * Get the number of logical blocks in use by the VDO.
+ *
+ * @param journal  the journal
+ *
+ * @return the number of logical blocks in use by the VDO
+ **/
+BlockCount getJournalLogicalBlocksUsed(const RecoveryJournal *journal)
+  __attribute__((warn_unused_result));
+
+/**
+ * Get the current statistics from the recovery journal.
+ *
+ * @param journal  The recovery journal to query
+ *
+ * @return a copy of the current statistics for the journal
+ **/
+RecoveryJournalStatistics
+getRecoveryJournalStatistics(const RecoveryJournal *journal)
+  __attribute__((warn_unused_result));
+
+/**
+ * Dump some current statistics and other debug info from the recovery
+ * journal.
+ *
+ * @param journal  The recovery journal to dump
+ **/
+void dumpRecoveryJournalStatistics(const RecoveryJournal *journal);
+
+#endif // RECOVERY_JOURNAL_H
diff --git a/vdo/base/recoveryJournalBlock.c b/vdo/base/recoveryJournalBlock.c
new file mode 100644
index 0000000..1bbacfc
--- /dev/null
+++ b/vdo/base/recoveryJournalBlock.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournalBlock.c#13 $ + */ + +#include "recoveryJournalBlock.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "dataVIO.h" +#include "fixedLayout.h" +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournalEntry.h" +#include "recoveryJournalInternals.h" +#include "ringNode.h" +#include "vio.h" +#include "waitQueue.h" + +/**********************************************************************/ +int makeRecoveryBlock(PhysicalLayer *layer, + RecoveryJournal *journal, + RecoveryJournalBlock **blockPtr) +{ + // Ensure that a block is large enough to store + // RECOVERY_JOURNAL_ENTRIES_PER_BLOCK entries. + STATIC_ASSERT(RECOVERY_JOURNAL_ENTRIES_PER_BLOCK + <= ((VDO_BLOCK_SIZE - sizeof(PackedJournalHeader)) + / sizeof(PackedRecoveryJournalEntry))); + + RecoveryJournalBlock *block; + int result = ALLOCATE(1, RecoveryJournalBlock, __func__, &block); + if (result != VDO_SUCCESS) { + return result; + } + + // Allocate a full block for the journal block even though not all of the + // space is used since the VIO needs to write a full disk block. + result = ALLOCATE(VDO_BLOCK_SIZE, char, "PackedJournalBlock", &block->block); + if (result != VDO_SUCCESS) { + freeRecoveryBlock(&block); + return result; + } + + result = createVIO(layer, VIO_TYPE_RECOVERY_JOURNAL, VIO_PRIORITY_HIGH, + block, block->block, &block->vio); + if (result != VDO_SUCCESS) { + freeRecoveryBlock(&block); + return result; + } + + block->vio->completion.callbackThreadID = journal->threadID; + initializeRing(&block->ringNode); + block->journal = journal; + + *blockPtr = block; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeRecoveryBlock(RecoveryJournalBlock **blockPtr) +{ + RecoveryJournalBlock *block = *blockPtr; + if (block == NULL) { + return; + } + + FREE(block->block); + freeVIO(&block->vio); + FREE(block); + *blockPtr = NULL; +} + +/** + * Get a pointer to the packed journal block header in the block buffer. + * + * @param block The recovery block + * + * @return The block's header + **/ +static inline +PackedJournalHeader *getBlockHeader(const RecoveryJournalBlock *block) +{ + return (PackedJournalHeader *) block->block; +} + +/** + * Set the current sector of the current block and initialize it. 
+ * + * @param block The block to update + * @param sector A pointer to the first byte of the new sector + **/ +static void setActiveSector(RecoveryJournalBlock *block, void *sector) +{ + block->sector = (PackedJournalSector *) sector; + block->sector->checkByte = getBlockHeader(block)->fields.checkByte; + block->sector->recoveryCount = block->journal->recoveryCount; + block->sector->entryCount = 0; +} + +/**********************************************************************/ +void initializeRecoveryBlock(RecoveryJournalBlock *block) +{ + memset(block->block, 0x0, VDO_BLOCK_SIZE); + + RecoveryJournal *journal = block->journal; + block->sequenceNumber = journal->tail; + block->entryCount = 0; + block->uncommittedEntryCount = 0; + + block->blockNumber = getRecoveryJournalBlockNumber(journal, journal->tail); + + RecoveryBlockHeader unpacked = { + .metadataType = VDO_METADATA_RECOVERY_JOURNAL, + .blockMapDataBlocks = journal->blockMapDataBlocks, + .logicalBlocksUsed = journal->logicalBlocksUsed, + .nonce = journal->nonce, + .recoveryCount = journal->recoveryCount, + .sequenceNumber = journal->tail, + .checkByte = computeRecoveryCheckByte(journal, journal->tail), + }; + PackedJournalHeader *header = getBlockHeader(block); + packRecoveryBlockHeader(&unpacked, header); + + setActiveSector(block, getJournalBlockSector(header, 1)); +} + +/**********************************************************************/ +int enqueueRecoveryBlockEntry(RecoveryJournalBlock *block, DataVIO *dataVIO) +{ + // First queued entry indicates this is a journal block we've just opened + // or a committing block we're extending and will have to write again. + bool newBatch = !hasWaiters(&block->entryWaiters); + + // Enqueue the DataVIO to wait for its entry to commit. + int result = enqueueDataVIO(&block->entryWaiters, dataVIO, + THIS_LOCATION("$F($j-$js)")); + if (result != VDO_SUCCESS) { + return result; + } + + block->entryCount++; + block->uncommittedEntryCount++; + + // Update stats to reflect the journal entry we're going to write. + if (newBatch) { + block->journal->events.blocks.started++; + } + block->journal->events.entries.started++; + + return VDO_SUCCESS; +} + +/** + * Check whether the current sector of a block is full. + * + * @param block The block to check + * + * @return true if the sector is full + **/ +__attribute__((warn_unused_result)) +static bool isSectorFull(const RecoveryJournalBlock *block) +{ + return (block->sector->entryCount == RECOVERY_JOURNAL_ENTRIES_PER_SECTOR); +} + +/** + * Actually add entries from the queue to the given block. + * + * @param block The journal block + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int addQueuedRecoveryEntries(RecoveryJournalBlock *block) +{ + while (hasWaiters(&block->entryWaiters)) { + DataVIO *dataVIO + = waiterAsDataVIO(dequeueNextWaiter(&block->entryWaiters)); + if (dataVIO->operation.type == DATA_INCREMENT) { + // In order to not lose committed sectors of this partial write, we must + // flush before the partial write entries are committed. + block->hasPartialWriteEntry = (block->hasPartialWriteEntry + || dataVIO->isPartialWrite); + /* + * In order to not lose acknowledged writes with the FUA flag set, we + * must issue a flush to cover the data write and also all previous + * journal writes, and we must issue a FUA on the journal write. + */ + block->hasFUAEntry = (block->hasFUAEntry + || vioRequiresFlushAfter(dataVIOAsVIO(dataVIO))); + } + + // Compose and encode the entry. 
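+    // Each packed entry records the block map slot being updated (taken
+    // from the DataVIO's tree lock at its current height), the data
+    // location (PBN and mapping state), and the increment or decrement
+    // operation, in the on-disk PackedRecoveryJournalEntry format.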
+ PackedRecoveryJournalEntry *packedEntry + = &block->sector->entries[block->sector->entryCount++]; + TreeLock *lock = &dataVIO->treeLock; + RecoveryJournalEntry newEntry = { + .mapping = { + .pbn = dataVIO->operation.pbn, + .state = dataVIO->operation.state, + }, + .operation = dataVIO->operation.type, + .slot = lock->treeSlots[lock->height].blockMapSlot, + }; + *packedEntry = packRecoveryJournalEntry(&newEntry); + + if (isIncrementOperation(dataVIO->operation.type)) { + dataVIO->recoverySequenceNumber = block->sequenceNumber; + } + + // Enqueue the DataVIO to wait for its entry to commit. + int result = enqueueDataVIO(&block->commitWaiters, dataVIO, + THIS_LOCATION("$F($j-$js)")); + if (result != VDO_SUCCESS) { + continueDataVIO(dataVIO, result); + return result; + } + + if (isSectorFull(block)) { + setActiveSector(block, (char *) block->sector + VDO_SECTOR_SIZE); + } + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int getRecoveryBlockPBN(RecoveryJournalBlock *block, + PhysicalBlockNumber *pbnPtr) +{ + RecoveryJournal *journal = block->journal; + int result = translateToPBN(journal->partition, block->blockNumber, pbnPtr); + if (result != VDO_SUCCESS) { + logErrorWithStringError(result, + "Error translating recovery journal block " + "number %llu", block->blockNumber); + } + return result; +} + +/**********************************************************************/ +bool canCommitRecoveryBlock(RecoveryJournalBlock *block) +{ + // Cannot commit in read-only mode, if already committing the block, or + // if there are no entries to commit. + return ((block != NULL) + && !block->committing + && hasWaiters(&block->entryWaiters) + && !isReadOnly(block->journal->readOnlyNotifier)); +} + +/**********************************************************************/ +int commitRecoveryBlock(RecoveryJournalBlock *block, + VDOAction *callback, + VDOAction *errorHandler) +{ + int result = ASSERT(canCommitRecoveryBlock(block), "should never call %s" + " when the block can't be committed", __func__); + if (result != VDO_SUCCESS) { + return result; + } + + PhysicalBlockNumber blockPBN; + result = getRecoveryBlockPBN(block, &blockPBN); + if (result != VDO_SUCCESS) { + return result; + } + + block->entriesInCommit = countWaiters(&block->entryWaiters); + result = addQueuedRecoveryEntries(block); + if (result != VDO_SUCCESS) { + return result; + } + + RecoveryJournal *journal = block->journal; + PackedJournalHeader *header = getBlockHeader(block); + + // Update stats to reflect the block and entries we're about to write. + journal->pendingWriteCount += 1; + journal->events.blocks.written += 1; + journal->events.entries.written += block->entriesInCommit; + + storeUInt64LE(header->fields.blockMapHead, journal->blockMapHead); + storeUInt64LE(header->fields.slabJournalHead, journal->slabJournalHead); + storeUInt16LE(header->fields.entryCount, block->entryCount); + + block->committing = true; + + /* + * In sync or async mode, when we are writing an increment entry for a + * request with FUA, or when making the increment entry for a partial + * write, we need to make sure all the data being mapped to by this block + * is stable on disk and also that the recovery journal is stable up to + * the current block, so we must flush before writing. + * + * In sync mode, and for FUA, we also need to make sure that the write we + * are doing is stable, so we issue the write with FUA. 
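+   *
+   * Summarizing the checks below: the write is preceded by a flush unless
+   * the policy is async-unsafe and the block has neither FUA nor
+   * partial-write entries; the write itself is FUA only in sync mode or
+   * when the block has a FUA entry.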
+ */ + PhysicalLayer *layer = vioAsCompletion(block->vio)->layer; + bool fua = (block->hasFUAEntry + || (layer->getWritePolicy(layer) == WRITE_POLICY_SYNC)); + bool flush = (block->hasFUAEntry + || (layer->getWritePolicy(layer) != WRITE_POLICY_ASYNC_UNSAFE) + || block->hasPartialWriteEntry); + block->hasFUAEntry = false; + block->hasPartialWriteEntry = false; + launchWriteMetadataVIOWithFlush(block->vio, blockPBN, callback, errorHandler, + flush, fua); + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void dumpRecoveryBlock(const RecoveryJournalBlock *block) +{ + logInfo(" sequence number %llu; entries %" PRIu16 + "; %s; %zu entry waiters; %zu commit waiters", + block->sequenceNumber, + block->entryCount, + (block->committing ? "committing" : "waiting"), + countWaiters(&block->entryWaiters), + countWaiters(&block->commitWaiters)); +} diff --git a/vdo/base/recoveryJournalBlock.h b/vdo/base/recoveryJournalBlock.h new file mode 100644 index 0000000..f26f8e8 --- /dev/null +++ b/vdo/base/recoveryJournalBlock.h @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournalBlock.h#8 $ + */ + +#ifndef RECOVERY_JOURNAL_BLOCK_H +#define RECOVERY_JOURNAL_BLOCK_H + +#include "permassert.h" + +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournalInternals.h" +#include "ringNode.h" +#include "types.h" +#include "waitQueue.h" + +struct recoveryJournalBlock { + /** The doubly linked pointers for the free or active lists */ + RingNode ringNode; + /** The waiter for the pending full block list */ + Waiter writeWaiter; + /** The journal to which this block belongs */ + RecoveryJournal *journal; + /** A pointer to a block-sized buffer holding the packed block data */ + char *block; + /** A pointer to the current sector in the packed block buffer */ + PackedJournalSector *sector; + /** The VIO for writing this block */ + VIO *vio; + /** The sequence number for this block */ + SequenceNumber sequenceNumber; + /** The location of this block in the on-disk journal */ + PhysicalBlockNumber blockNumber; + /** Whether this block is being committed */ + bool committing; + /** Whether this block has an uncommitted increment for a partial write */ + bool hasPartialWriteEntry; + /** Whether this block has an uncommitted increment for a write with FUA */ + bool hasFUAEntry; + /** The total number of entries in this block */ + JournalEntryCount entryCount; + /** The total number of uncommitted entries (queued or committing) */ + JournalEntryCount uncommittedEntryCount; + /** The number of new entries in the current commit */ + JournalEntryCount entriesInCommit; + /** The queue of VIOs which will make entries for the next commit */ + WaitQueue entryWaiters; + /** The queue of VIOs waiting for the current commit */ + WaitQueue commitWaiters; +}; + +/** + * Return the block associated with a ring node. + * + * @param node The ring node to recast as a block + * + * @return The block + **/ +static inline RecoveryJournalBlock *blockFromRingNode(RingNode *node) +{ + STATIC_ASSERT(offsetof(RecoveryJournalBlock, ringNode) == 0); + return (RecoveryJournalBlock *) node; +} + +/** + * Return the block associated with a waiter + * + * @param waiter The waiter to recast as a block + * + * @return The block + **/ +static inline RecoveryJournalBlock *blockFromWaiter(Waiter *waiter) +{ + return (RecoveryJournalBlock *) + ((uintptr_t) waiter - offsetof(RecoveryJournalBlock, writeWaiter)); +} + +/** + * Check whether a recovery block is dirty, indicating it has any uncommitted + * entries, which includes both entries not written and entries written but + * not yet acknowledged. + * + * @param block The block to check + * + * @return true if the block has any uncommitted entries + **/ +__attribute__((warn_unused_result)) +static inline bool isRecoveryBlockDirty(const RecoveryJournalBlock *block) +{ + return (block->uncommittedEntryCount > 0); +} + +/** + * Check whether a journal block is empty. + * + * @param block The block to check + * + * @return true if the block has no entries + **/ +__attribute__((warn_unused_result)) +static inline bool isRecoveryBlockEmpty(const RecoveryJournalBlock *block) +{ + return (block->entryCount == 0); +} + +/** + * Check whether a journal block is full. + * + * @param block The block to check + * + * @return true if the the block is full + **/ +__attribute__((warn_unused_result)) +static inline bool isRecoveryBlockFull(const RecoveryJournalBlock *block) +{ + return ((block == NULL) + || (block->journal->entriesPerBlock == block->entryCount)); +} + +/** + * Construct a journal block. 
+ * + * @param [in] layer The layer from which to construct VIOs + * @param [in] journal The journal to which the block will belong + * @param [out] blockPtr A pointer to receive the new block + * + * @return VDO_SUCCESS or an error + **/ +int makeRecoveryBlock(PhysicalLayer *layer, + RecoveryJournal *journal, + RecoveryJournalBlock **blockPtr) + __attribute__((warn_unused_result)); + +/** + * Free a tail block and null out the reference to it. + * + * @param blockPtr The reference to the tail block to free + **/ +void freeRecoveryBlock(RecoveryJournalBlock **blockPtr); + +/** + * Initialize the next active recovery journal block. + * + * @param block The journal block to initialize + **/ +void initializeRecoveryBlock(RecoveryJournalBlock *block); + +/** + * Enqueue a DataVIO to asynchronously encode and commit its next recovery + * journal entry in this block. The DataVIO will not be continued until the + * entry is committed to the on-disk journal. The caller is responsible for + * ensuring the block is not already full. + * + * @param block The journal block in which to make an entry + * @param dataVIO The DataVIO to enqueue + * + * @return VDO_SUCCESS or an error code if the DataVIO could not be enqueued + **/ +int enqueueRecoveryBlockEntry(RecoveryJournalBlock *block, DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Attempt to commit a block. If the block is not the oldest block with + * uncommitted entries or if it is already being committed, nothing will be + * done. + * + * @param block The block to write + * @param callback The function to call when the write completes + * @param errorHandler The handler for flush or write errors + * + * @return VDO_SUCCESS, or an error if the write could not be launched + **/ +int commitRecoveryBlock(RecoveryJournalBlock *block, + VDOAction *callback, + VDOAction *errorHandler) + __attribute__((warn_unused_result)); + +/** + * Dump the contents of the recovery block to the log. + * + * @param block The block to dump + **/ +void dumpRecoveryBlock(const RecoveryJournalBlock *block); + +/** + * Check whether a journal block can be committed. + * + * @param block The journal block in question + * + * @return true if the block can be committed now + **/ +bool canCommitRecoveryBlock(RecoveryJournalBlock *block) + __attribute__((warn_unused_result)); + +#endif // RECOVERY_JOURNAL_BLOCK_H diff --git a/vdo/base/recoveryJournalEntry.h b/vdo/base/recoveryJournalEntry.h new file mode 100644 index 0000000..bf2a3e0 --- /dev/null +++ b/vdo/base/recoveryJournalEntry.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournalEntry.h#1 $ + */ + +#ifndef RECOVERY_JOURNAL_ENTRY_H +#define RECOVERY_JOURNAL_ENTRY_H + +#include "numeric.h" + +#include "blockMapEntry.h" +#include "journalPoint.h" +#include "types.h" + +/** + * A recovery journal entry stores two physical locations: a data location + * that is the value of a single mapping in the block map tree, and the + * location of the block map page and and slot that is either acquiring or + * releasing a reference to the data location. The journal entry also stores + * an operation code that says whether the reference is being acquired (an + * increment) or released (a decrement), and whether the mapping is for a + * logical block or for the block map tree itself. + **/ +typedef struct { + BlockMapSlot slot; + DataLocation mapping; + JournalOperation operation; +} RecoveryJournalEntry; + +/** The packed, on-disk representation of a recovery journal entry. */ +typedef union __attribute__((packed)) { + struct __attribute__((packed)) { + /** + * In little-endian bit order: + * Bits 15..12: The four highest bits of the 36-bit physical block number + * of the block map tree page + * Bits 11..2: The 10-bit block map page slot number + * Bits 1..0: The 2-bit JournalOperation of the entry + **/ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned operation : 2; + unsigned slotLow : 6; + unsigned slotHigh : 4; + unsigned pbnHighNibble : 4; +#else + unsigned slotLow : 6; + unsigned operation : 2; + unsigned pbnHighNibble : 4; + unsigned slotHigh : 4; +#endif + + /** + * Bits 47..16: The 32 low-order bits of the block map page PBN, + * in little-endian byte order + **/ + byte pbnLowWord[4]; + + /** + * Bits 87..48: The five-byte block map entry encoding the location that + * was or will be stored in the block map page slot + **/ + BlockMapEntry blockMapEntry; + } fields; + + // A raw view of the packed encoding. + uint8_t raw[11]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining packed entries in GDB. + struct __attribute__((packed)) { + unsigned operation : 2; + unsigned slot : 10; + unsigned pbnHighNibble : 4; + uint32_t pbnLowWord; + BlockMapEntry blockMapEntry; + } littleEndian; +#endif +} PackedRecoveryJournalEntry; + +/** + * Return the packed, on-disk representation of a recovery journal entry. + * + * @param entry The journal entry to pack + * + * @return The packed representation of the journal entry + **/ +static inline PackedRecoveryJournalEntry +packRecoveryJournalEntry(const RecoveryJournalEntry *entry) +{ + PackedRecoveryJournalEntry packed = { + .fields = { + .operation = entry->operation, + .slotLow = entry->slot.slot & 0x3F, + .slotHigh = (entry->slot.slot >> 6) & 0x0F, + .pbnHighNibble = (entry->slot.pbn >> 32) & 0x0F, + .blockMapEntry = packPBN(entry->mapping.pbn, entry->mapping.state), + } + }; + storeUInt32LE(packed.fields.pbnLowWord, entry->slot.pbn & UINT_MAX); + return packed; +} + +/** + * Unpack the on-disk representation of a recovery journal entry. 
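+ *
+ * This is the inverse of packRecoveryJournalEntry() above.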
+ * + * @param entry The recovery journal entry to unpack + * + * @return The unpacked entry + **/ +static inline RecoveryJournalEntry +unpackRecoveryJournalEntry(const PackedRecoveryJournalEntry *entry) +{ + PhysicalBlockNumber low32 = getUInt32LE(entry->fields.pbnLowWord); + PhysicalBlockNumber high4 = entry->fields.pbnHighNibble; + return (RecoveryJournalEntry) { + .operation = entry->fields.operation, + .slot = { + .pbn = ((high4 << 32) | low32), + .slot = (entry->fields.slotLow | (entry->fields.slotHigh << 6)), + }, + .mapping = unpackBlockMapEntry(&entry->fields.blockMapEntry), + }; +} + +#endif // RECOVERY_JOURNAL_ENTRY_H diff --git a/vdo/base/recoveryJournalInternals.h b/vdo/base/recoveryJournalInternals.h new file mode 100644 index 0000000..0266990 --- /dev/null +++ b/vdo/base/recoveryJournalInternals.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournalInternals.h#10 $ + */ + +#ifndef RECOVERY_JOURNAL_INTERNALS_H +#define RECOVERY_JOURNAL_INTERNALS_H + +#include "numeric.h" + +#include "adminState.h" +#include "fixedLayout.h" +#include "journalPoint.h" +#include "lockCounter.h" +#include "recoveryJournal.h" +#include "ringNode.h" +#include "statistics.h" +#include "types.h" +#include "waitQueue.h" + +typedef struct recoveryJournalBlock RecoveryJournalBlock; + +struct recoveryJournal { + /** The thread ID of the journal zone */ + ThreadID threadID; + /** The slab depot which can hold locks on this journal */ + SlabDepot *depot; + /** The block map which can hold locks on this journal */ + BlockMap *blockMap; + /** The queue of VIOs waiting to make increment entries */ + WaitQueue incrementWaiters; + /** The queue of VIOs waiting to make decrement entries */ + WaitQueue decrementWaiters; + /** The number of free entries in the journal */ + uint64_t availableSpace; + /** The number of decrement entries which need to be made */ + VIOCount pendingDecrementCount; + /** + * Whether the journal is adding entries from the increment or + * decrement waiters queues + **/ + bool addingEntries; + /** The notifier for read-only mode */ + ReadOnlyNotifier *readOnlyNotifier; + /** The administrative state of the journal */ + AdminState state; + /** Whether a reap is in progress */ + bool reaping; + /** The partition which holds the journal on disk */ + Partition *partition; + /** The oldest active block in the journal on disk for block map rebuild */ + SequenceNumber blockMapHead; + /** The oldest active block in the journal on disk for slab journal replay */ + SequenceNumber slabJournalHead; + /** The newest block in the journal on disk to which a write has finished */ + SequenceNumber lastWriteAcknowledged; + /** The end of the half-open interval of the active journal */ + SequenceNumber 
tail; + /** The point at which the last entry will have been added */ + JournalPoint appendPoint; + /** The journal point of the VIO most recently released from the journal */ + JournalPoint commitPoint; + /** The nonce of the VDO */ + Nonce nonce; + /** The number of recoveries completed by the VDO */ + uint8_t recoveryCount; + /** The number of entries which fit in a single block */ + JournalEntryCount entriesPerBlock; + /** Unused in-memory journal blocks */ + RingNode freeTailBlocks; + /** In-memory journal blocks with records */ + RingNode activeTailBlocks; + /** A pointer to the active block (the one we are adding entries to now) */ + RecoveryJournalBlock *activeBlock; + /** Journal blocks that need writing */ + WaitQueue pendingWrites; + /** The new block map reap head after reaping */ + SequenceNumber blockMapReapHead; + /** The head block number for the block map rebuild range */ + BlockCount blockMapHeadBlockNumber; + /** The new slab journal reap head after reaping */ + SequenceNumber slabJournalReapHead; + /** The head block number for the slab journal replay range */ + BlockCount slabJournalHeadBlockNumber; + /** The VIO on which we can call flush (less ick, but still ick) */ + VIO *flushVIO; + /** The data block which must live in the VIO in the flush extent */ + char *unusedFlushVIOData; + /** The number of blocks in the on-disk journal */ + BlockCount size; + /** The number of logical blocks that are in-use */ + BlockCount logicalBlocksUsed; + /** The number of block map pages that are allocated */ + BlockCount blockMapDataBlocks; + /** The number of journal blocks written but not yet acknowledged */ + BlockCount pendingWriteCount; + /** The threshold at which slab journal tail blocks will be written out */ + BlockCount slabJournalCommitThreshold; + /** Counters for events in the journal that are reported as statistics */ + RecoveryJournalStatistics events; + /** The locks for each on-disk block */ + LockCounter *lockCounter; +}; + +/** + * Get the physical block number for a given sequence number. + * + * @param journal The journal + * @param sequence The sequence number of the desired block + * + * @return The block number corresponding to the sequence number + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber +getRecoveryJournalBlockNumber(const RecoveryJournal *journal, + SequenceNumber sequence) +{ + // Since journal size is a power of two, the block number modulus can just + // be extracted from the low-order bits of the sequence. + return (sequence & (journal->size - 1)); +} + +/** + * Compute the checkByte for a given sequence number. + * + * @param journal The journal + * @param sequence The sequence number + * + * @return The check byte corresponding to the sequence number + **/ +__attribute__((warn_unused_result)) +static inline uint8_t computeRecoveryCheckByte(const RecoveryJournal *journal, + SequenceNumber sequence) +{ + // The check byte must change with each trip around the journal. + return (((sequence / journal->size) & 0x7F) | 0x80); +} + +#endif // RECOVERY_JOURNAL_INTERNALS_H diff --git a/vdo/base/recoveryUtils.c b/vdo/base/recoveryUtils.c new file mode 100644 index 0000000..44f16ee --- /dev/null +++ b/vdo/base/recoveryUtils.c @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryUtils.c#4 $
+ */
+
+#include "recoveryUtils.h"
+
+#include "logger.h"
+#include "memoryAlloc.h"
+
+#include "completion.h"
+#include "extent.h"
+#include "packedRecoveryJournalBlock.h"
+#include "recoveryJournalEntry.h"
+#include "recoveryJournalInternals.h"
+#include "slabDepot.h"
+#include "vdoInternal.h"
+
+/**
+ * Finish loading the journal by freeing the extent and notifying the parent.
+ * This callback is registered in loadJournalAsync().
+ *
+ * @param completion  The load extent
+ **/
+static void finishJournalLoad(VDOCompletion *completion)
+{
+  int result = completion->result;
+  VDOCompletion *parent = completion->parent;
+  VDOExtent *extent = asVDOExtent(completion);
+  freeExtent(&extent);
+  finishCompletion(parent, result);
+}
+
+/**********************************************************************/
+void loadJournalAsync(RecoveryJournal *journal,
+                      VDOCompletion   *parent,
+                      char           **journalDataPtr)
+{
+  int result = ALLOCATE(journal->size * VDO_BLOCK_SIZE, char, __func__,
+                        journalDataPtr);
+  if (result != VDO_SUCCESS) {
+    finishCompletion(parent, result);
+    return;
+  }
+
+  VDOExtent *extent;
+  result = createExtent(parent->layer, VIO_TYPE_RECOVERY_JOURNAL,
+                        VIO_PRIORITY_METADATA, journal->size,
+                        *journalDataPtr, &extent);
+  if (result != VDO_SUCCESS) {
+    finishCompletion(parent, result);
+    return;
+  }
+
+  prepareCompletion(&extent->completion, finishJournalLoad, finishJournalLoad,
+                    parent->callbackThreadID, parent);
+  readMetadataExtent(extent,
+                     getFixedLayoutPartitionOffset(journal->partition));
+}
+
+/**
+ * Determine whether the given header describes a valid block for the
+ * given journal that could appear at the given offset in the journal.
+ * + * @param journal The journal to use + * @param header The unpacked block header to check + * @param offset An offset indicating where the block was in the journal + * + * @return True if the header matches + **/ +__attribute__((warn_unused_result)) +static bool isCongruentRecoveryJournalBlock(RecoveryJournal *journal, + const RecoveryBlockHeader *header, + PhysicalBlockNumber offset) +{ + PhysicalBlockNumber expectedOffset + = getRecoveryJournalBlockNumber(journal, header->sequenceNumber); + return ((expectedOffset == offset) + && isValidRecoveryJournalBlock(journal, header)); +} + +/**********************************************************************/ +bool findHeadAndTail(RecoveryJournal *journal, + char *journalData, + SequenceNumber *tailPtr, + SequenceNumber *blockMapHeadPtr, + SequenceNumber *slabJournalHeadPtr) +{ + SequenceNumber highestTail = journal->tail; + SequenceNumber blockMapHeadMax = 0; + SequenceNumber slabJournalHeadMax = 0; + bool foundEntries = false; + for (PhysicalBlockNumber i = 0; i < journal->size; i++) { + PackedJournalHeader *packedHeader + = getJournalBlockHeader(journal, journalData, i); + RecoveryBlockHeader header; + unpackRecoveryBlockHeader(packedHeader, &header); + + if (!isCongruentRecoveryJournalBlock(journal, &header, i)) { + // This block is old, unformatted, or doesn't belong at this location. + continue; + } + + if (header.sequenceNumber >= highestTail) { + foundEntries = true; + highestTail = header.sequenceNumber; + } + if (header.blockMapHead > blockMapHeadMax) { + blockMapHeadMax = header.blockMapHead; + } + if (header.slabJournalHead > slabJournalHeadMax) { + slabJournalHeadMax = header.slabJournalHead; + } + } + + *tailPtr = highestTail; + if (!foundEntries) { + return false; + } + + *blockMapHeadPtr = blockMapHeadMax; + if (slabJournalHeadPtr != NULL) { + *slabJournalHeadPtr = slabJournalHeadMax; + } + return true; +} + +/**********************************************************************/ +int validateRecoveryJournalEntry(const VDO *vdo, + const RecoveryJournalEntry *entry) +{ + if ((entry->slot.pbn >= vdo->config.physicalBlocks) + || (entry->slot.slot >= BLOCK_MAP_ENTRIES_PER_PAGE) + || !isValidLocation(&entry->mapping) + || !isPhysicalDataBlock(vdo->depot, entry->mapping.pbn)) { + return logErrorWithStringError(VDO_CORRUPT_JOURNAL, "Invalid entry:" + " (%llu, %" PRIu16 ") to %" PRIu64 + " (%s) is not within bounds", + entry->slot.pbn, entry->slot.slot, + entry->mapping.pbn, + getJournalOperationName(entry->operation)); + } + + if ((entry->operation == BLOCK_MAP_INCREMENT) + && (isCompressed(entry->mapping.state) + || (entry->mapping.pbn == ZERO_BLOCK))) { + return logErrorWithStringError(VDO_CORRUPT_JOURNAL, "Invalid entry:" + " (%llu, %" PRIu16 ") to %" PRIu64 + " (%s) is not a valid tree mapping", + entry->slot.pbn, entry->slot.slot, + entry->mapping.pbn, + getJournalOperationName(entry->operation)); + } + + return VDO_SUCCESS; +} diff --git a/vdo/base/recoveryUtils.h b/vdo/base/recoveryUtils.h new file mode 100644 index 0000000..6778af9 --- /dev/null +++ b/vdo/base/recoveryUtils.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryUtils.h#5 $ + */ + +#ifndef RECOVERY_UTILS_H +#define RECOVERY_UTILS_H + +#include "constants.h" +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournalEntry.h" +#include "recoveryJournalInternals.h" +#include "types.h" + +/** + * Get the block header for a block at a position in the journal data. + * + * @param journal The recovery journal + * @param journalData The recovery journal data + * @param sequence The sequence number + * + * @return A pointer to a packed recovery journal block header. + **/ +__attribute__((warn_unused_result)) +static inline +PackedJournalHeader *getJournalBlockHeader(RecoveryJournal *journal, + char *journalData, + SequenceNumber sequence) +{ + off_t blockOffset = (getRecoveryJournalBlockNumber(journal, sequence) + * VDO_BLOCK_SIZE); + return (PackedJournalHeader *) &journalData[blockOffset]; +} + +/** + * Determine whether the given header describes a valid block for the + * given journal. A block is not valid if it is unformatted, or if it + * is older than the last successful recovery or reformat. + * + * @param journal The journal to use + * @param header The unpacked block header to check + * + * @return True if the header is valid + **/ +__attribute__((warn_unused_result)) +static inline +bool isValidRecoveryJournalBlock(const RecoveryJournal *journal, + const RecoveryBlockHeader *header) +{ + return ((header->metadataType == VDO_METADATA_RECOVERY_JOURNAL) + && (header->nonce == journal->nonce) + && (header->recoveryCount == journal->recoveryCount)); +} + +/** + * Determine whether the given header describes the exact block indicated. + * + * @param journal The journal to use + * @param header The unpacked block header to check + * @param sequence The expected sequence number + * + * @return True if the block matches + **/ +__attribute__((warn_unused_result)) +static inline +bool isExactRecoveryJournalBlock(const RecoveryJournal *journal, + const RecoveryBlockHeader *header, + SequenceNumber sequence) +{ + return ((header->sequenceNumber == sequence) + && isValidRecoveryJournalBlock(journal, header)); +} + +/** + * Determine whether the header of the given sector could describe a + * valid sector for the given journal block header. + * + * @param header The unpacked block header to compare against + * @param sector The packed sector to check + * + * @return True if the sector matches the block header + **/ +__attribute__((warn_unused_result)) +static inline +bool isValidRecoveryJournalSector(const RecoveryBlockHeader *header, + const PackedJournalSector *sector) +{ + return ((header->checkByte == sector->checkByte) + && (header->recoveryCount == sector->recoveryCount)); +} + +/** + * Load the journal data off the disk. 
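+ *
+ * The entire on-disk journal (journal->size blocks) is read into a single
+ * newly allocated buffer.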
+ * + * @param [in] journal The recovery journal to load + * @param [in] parent The completion to notify when the load is + * complete + * @param [out] journalDataPtr A pointer to the journal data buffer (it is the + * caller's responsibility to free this buffer) + **/ +void loadJournalAsync(RecoveryJournal *journal, + VDOCompletion *parent, + char **journalDataPtr); + +/** + * Find the tail and the head of the journal by searching for the highest + * sequence number in a block with a valid nonce, and the highest head value + * among the blocks with valid nonces. + * + * @param [in] journal The recovery journal + * @param [in] journalData The journal data read from disk + * @param [out] tailPtr A pointer to return the tail found, or if + * no higher block is found, the value + * currently in the journal + * @param [out] blockMapHeadPtr A pointer to return the block map head + * @param [out] slabJournalHeadPtr An optional pointer to return the slab + * journal head + * + * @return True if there were valid journal blocks + **/ +bool findHeadAndTail(RecoveryJournal *journal, + char *journalData, + SequenceNumber *tailPtr, + SequenceNumber *blockMapHeadPtr, + SequenceNumber *slabJournalHeadPtr); + +/** + * Validate a recovery journal entry. + * + * @param vdo The VDO + * @param entry The entry to validate + * + * @return VDO_SUCCESS or an error + **/ +int validateRecoveryJournalEntry(const VDO *vdo, + const RecoveryJournalEntry *entry) + __attribute__((warn_unused_result)); + +#endif // RECOVERY_UTILS_H diff --git a/vdo/base/refCounts.c b/vdo/base/refCounts.c new file mode 100644 index 0000000..daf04c4 --- /dev/null +++ b/vdo/base/refCounts.c @@ -0,0 +1,1451 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/refCounts.c#9 $ + */ + +#include "refCounts.h" +#include "refCountsInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "permassert.h" + +#include "adminState.h" +#include "blockAllocatorInternals.h" +#include "completion.h" +#include "extent.h" +#include "header.h" +#include "journalPoint.h" +#include "numUtils.h" +#include "pbnLock.h" +#include "readOnlyNotifier.h" +#include "referenceBlock.h" +#include "referenceOperation.h" +#include "slab.h" +#include "slabJournal.h" +#include "slabJournalInternals.h" +#include "slabSummary.h" +#include "statusCodes.h" +#include "stringUtils.h" +#include "vdo.h" +#include "vioPool.h" +#include "waitQueue.h" + +static const uint64_t BYTES_PER_WORD = sizeof(uint64_t); +static const bool NORMAL_OPERATION = true; + +/** + * Return the RefCounts from the RefCounts waiter. 
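+ *
+ * The waiter, when not NULL, is expected to be the slabSummaryWaiter field
+ * embedded in a RefCounts; the containing structure is recovered by
+ * subtracting that field's offset.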
+ *
+ * @param waiter  The waiter to convert
+ *
+ * @return The RefCounts
+ **/
+__attribute__((warn_unused_result))
+static inline RefCounts *refCountsFromWaiter(Waiter *waiter)
+{
+  if (waiter == NULL) {
+    return NULL;
+  }
+  return (RefCounts *)
+    ((uintptr_t) waiter - offsetof(RefCounts, slabSummaryWaiter));
+}
+
+/**
+ * Convert the index of a reference counter back to the block number of the
+ * physical block for which it is counting references. The index is assumed to
+ * be valid and in-range.
+ *
+ * @param refCounts  The reference counts object
+ * @param index      The array index of the reference counter
+ *
+ * @return the physical block number corresponding to the index
+ **/
+static PhysicalBlockNumber indexToPBN(const RefCounts *refCounts,
+                                      uint64_t index)
+{
+  return (refCounts->slab->start + index);
+}
+
+/**
+ * Convert a block number to the index of a reference counter for that block.
+ * Out of range values are pinned to the beginning or one past the end of the
+ * array.
+ *
+ * @param refCounts  The reference counts object
+ * @param pbn        The physical block number
+ *
+ * @return the index corresponding to the physical block number
+ **/
+static uint64_t pbnToIndex(const RefCounts *refCounts, PhysicalBlockNumber pbn)
+{
+  if (pbn < refCounts->slab->start) {
+    return 0;
+  }
+  uint64_t index = (pbn - refCounts->slab->start);
+  return minBlock(index, refCounts->blockCount);
+}
+
+/**********************************************************************/
+ReferenceStatus referenceCountToStatus(ReferenceCount count)
+{
+  if (count == EMPTY_REFERENCE_COUNT) {
+    return RS_FREE;
+  } else if (count == 1) {
+    return RS_SINGLE;
+  } else if (count == PROVISIONAL_REFERENCE_COUNT) {
+    return RS_PROVISIONAL;
+  } else {
+    return RS_SHARED;
+  }
+}
+
+/**
+ * Reset the free block search back to the first reference counter
+ * in the first reference block.
+ *
+ * @param refCounts  The RefCounts object containing the search cursor
+ **/
+static void resetSearchCursor(RefCounts *refCounts)
+{
+  SearchCursor *cursor = &refCounts->searchCursor;
+
+  cursor->block = cursor->firstBlock;
+  cursor->index = 0;
+  // Unit tests have slabs with only one reference block (and it's a runt).
+  cursor->endIndex = minBlock(COUNTS_PER_BLOCK, refCounts->blockCount);
+}
+
+/**
+ * Advance the search cursor to the start of the next reference block,
+ * wrapping around to the first reference block if the current block is the
+ * last reference block.
+ *
+ * @param refCounts  The RefCounts object containing the search cursor
+ *
+ * @return true unless the cursor was at the last reference block
+ **/
+static bool advanceSearchCursor(RefCounts *refCounts)
+{
+  SearchCursor *cursor = &refCounts->searchCursor;
+
+  // If we just finished searching the last reference block, then wrap back
+  // around to the start of the array.
+  if (cursor->block == cursor->lastBlock) {
+    resetSearchCursor(refCounts);
+    return false;
+  }
+
+  // We're not already at the end, so advance the cursor to the next block.
+  cursor->block++;
+  cursor->index = cursor->endIndex;
+
+  if (cursor->block == cursor->lastBlock) {
+    // The last reference block will usually be a runt.
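+    // A runt covers fewer than COUNTS_PER_BLOCK counters, so clamp endIndex
+    // to the slab's total number of counters instead of advancing by a full
+    // block's worth.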
+ cursor->endIndex = refCounts->blockCount; + } else { + cursor->endIndex += COUNTS_PER_BLOCK; + } + return true; +} + +/**********************************************************************/ +int makeRefCounts(BlockCount blockCount, + Slab *slab, + PhysicalBlockNumber origin, + ReadOnlyNotifier *readOnlyNotifier, + RefCounts **refCountsPtr) +{ + BlockCount refBlockCount = getSavedReferenceCountSize(blockCount); + RefCounts *refCounts; + int result = ALLOCATE_EXTENDED(RefCounts, refBlockCount, ReferenceBlock, + "ref counts structure", &refCounts); + if (result != UDS_SUCCESS) { + return result; + } + + // Allocate such that the runt slab has a full-length memory array, + // plus a little padding so we can word-search even at the very end. + size_t bytes = ((refBlockCount * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD)); + result = ALLOCATE(bytes, ReferenceCount, "ref counts array", + &refCounts->counters); + if (result != UDS_SUCCESS) { + freeRefCounts(&refCounts); + return result; + } + + refCounts->slab = slab; + refCounts->blockCount = blockCount; + refCounts->freeBlocks = blockCount; + refCounts->origin = origin; + refCounts->referenceBlockCount = refBlockCount; + refCounts->readOnlyNotifier = readOnlyNotifier; + refCounts->statistics = &slab->allocator->refCountStatistics; + refCounts->searchCursor.firstBlock = &refCounts->blocks[0]; + refCounts->searchCursor.lastBlock = &refCounts->blocks[refBlockCount - 1]; + resetSearchCursor(refCounts); + + for (size_t index = 0; index < refBlockCount; index++) { + refCounts->blocks[index] = (ReferenceBlock) { + .refCounts = refCounts, + }; + } + + *refCountsPtr = refCounts; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeRefCounts(RefCounts **refCountsPtr) +{ + RefCounts *refCounts = *refCountsPtr; + if (refCounts == NULL) { + return; + } + + FREE(refCounts->counters); + FREE(refCounts); + *refCountsPtr = NULL; +} + +/** + * Check whether a RefCounts has active I/O. + * + * @param refCounts The RefCounts to check + * + * @return true if there is reference block I/O or a summary + * update in progress + **/ +__attribute__((warn_unused_result)) +static bool hasActiveIO(RefCounts *refCounts) +{ + return ((refCounts->activeCount > 0) || refCounts->updatingSlabSummary); +} + +/**********************************************************************/ +bool areRefCountsActive(RefCounts *refCounts) +{ + if (hasActiveIO(refCounts)) { + return true; + } + + // When not suspending or recovering, the refCounts must be clean. + AdminStateCode code = refCounts->slab->state.state; + return (hasWaiters(&refCounts->dirtyBlocks) + && (code != ADMIN_STATE_SUSPENDING) + && (code != ADMIN_STATE_RECOVERING)); +} + +/**********************************************************************/ +static void enterRefCountsReadOnlyMode(RefCounts *refCounts, int result) +{ + enterReadOnlyMode(refCounts->readOnlyNotifier, result); + checkIfSlabDrained(refCounts->slab); +} + +/** + * Enqueue a block on the dirty queue. + * + * @param block The block to enqueue + **/ +static void enqueueDirtyBlock(ReferenceBlock *block) +{ + int result = enqueueWaiter(&block->refCounts->dirtyBlocks, &block->waiter); + if (result != VDO_SUCCESS) { + // This should never happen. + enterRefCountsReadOnlyMode(block->refCounts, result); + } +} + +/** + * Mark a reference count block as dirty, potentially adding it to the dirty + * queue if it wasn't already dirty. 
+ * + * @param block The reference block to mark as dirty + **/ +static void dirtyBlock(ReferenceBlock *block) +{ + if (block->isDirty) { + return; + } + + block->isDirty = true; + if (block->isWriting) { + // The conclusion of the current write will enqueue the block again. + return; + } + + enqueueDirtyBlock(block); +} + +/**********************************************************************/ +BlockCount getUnreferencedBlockCount(RefCounts *refCounts) +{ + return refCounts->freeBlocks; +} + +/**********************************************************************/ +ReferenceBlock *getReferenceBlock(RefCounts *refCounts, SlabBlockNumber index) +{ + return &refCounts->blocks[index / COUNTS_PER_BLOCK]; +} + +/** + * Get the reference counter that covers the given physical block number. + * + * @param [in] refCounts The refcounts object + * @param [in] pbn The physical block number + * @param [out] counterPtr A pointer to the reference counter + + **/ +static int getReferenceCounter(RefCounts *refCounts, + PhysicalBlockNumber pbn, + ReferenceCount **counterPtr) +{ + SlabBlockNumber index; + int result = slabBlockNumberFromPBN(refCounts->slab, pbn, &index); + if (result != VDO_SUCCESS) { + return result; + } + + *counterPtr = &refCounts->counters[index]; + + return VDO_SUCCESS; +} + +/**********************************************************************/ +uint8_t getAvailableReferences(RefCounts *refCounts, PhysicalBlockNumber pbn) +{ + ReferenceCount *counterPtr = NULL; + int result = getReferenceCounter(refCounts, pbn, &counterPtr); + if (result != VDO_SUCCESS) { + return 0; + } + + if (*counterPtr == PROVISIONAL_REFERENCE_COUNT) { + return (MAXIMUM_REFERENCE_COUNT - 1); + } + + return (MAXIMUM_REFERENCE_COUNT - *counterPtr); +} + +/** + * Increment the reference count for a data block. + * + * @param [in] refCounts The refCounts responsible for the block + * @param [in] block The reference block which contains the + * block being updated + * @param [in] slabBlockNumber The block to update + * @param [in] oldStatus The reference status of the data block + * before this increment + * @param [in] lock The PBNLock associated with this + * increment (may be NULL) + * @param [in,out] counterPtr A pointer to the count for the data block + * @param [out] freeStatusChanged A pointer which will be set to true if + * this update changed the free status of + * the block + * + * @return VDO_SUCCESS or an error + **/ +static int incrementForData(RefCounts *refCounts, + ReferenceBlock *block, + SlabBlockNumber slabBlockNumber, + ReferenceStatus oldStatus, + PBNLock *lock, + ReferenceCount *counterPtr, + bool *freeStatusChanged) +{ + switch (oldStatus) { + case RS_FREE: + *counterPtr = 1; + block->allocatedCount++; + refCounts->freeBlocks--; + *freeStatusChanged = true; + break; + + case RS_PROVISIONAL: + *counterPtr = 1; + *freeStatusChanged = false; + break; + + default: + // Single or shared + if (*counterPtr >= MAXIMUM_REFERENCE_COUNT) { + return logErrorWithStringError(VDO_REF_COUNT_INVALID, + "Incrementing a block already having" + " 254 references (slab %u, offset %" + PRIu32 ")", + refCounts->slab->slabNumber, + slabBlockNumber); + } + (*counterPtr)++; + *freeStatusChanged = false; + } + + if (lock != NULL) { + unassignProvisionalReference(lock); + } + return VDO_SUCCESS; +} + +/** + * Decrement the reference count for a data block. 
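+ * If a read lock is held on the block, decrementing the last reference leaves
+ * the count provisional rather than free, so the block cannot be reallocated
+ * while the lock is held.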
+ * + * @param [in] refCounts The refCounts responsible for the block + * @param [in] block The reference block which contains the + * block being updated + * @param [in] slabBlockNumber The block to update + * @param [in] oldStatus The reference status of the data block + * before this decrement + * @param [in] lock The PBNLock associated with the block + * being decremented (may be NULL) + * @param [in,out] counterPtr A pointer to the count for the data block + * @param [out] freeStatusChanged A pointer which will be set to true if + * this update changed the free status of + * the block + * + * @return VDO_SUCCESS or an error + **/ +static int decrementForData(RefCounts *refCounts, + ReferenceBlock *block, + SlabBlockNumber slabBlockNumber, + ReferenceStatus oldStatus, + PBNLock *lock, + ReferenceCount *counterPtr, + bool *freeStatusChanged) +{ + switch (oldStatus) { + case RS_FREE: + return logErrorWithStringError(VDO_REF_COUNT_INVALID, + "Decrementing free block at offset %" + PRIu32 " in slab %u", slabBlockNumber, + refCounts->slab->slabNumber); + + case RS_PROVISIONAL: + case RS_SINGLE: + if (lock != NULL) { + // There is a read lock on this block, so the block must not become + // unreferenced. + *counterPtr = PROVISIONAL_REFERENCE_COUNT; + *freeStatusChanged = false; + assignProvisionalReference(lock); + } else { + *counterPtr = EMPTY_REFERENCE_COUNT; + block->allocatedCount--; + refCounts->freeBlocks++; + *freeStatusChanged = true; + } + break; + + default: + // Shared + (*counterPtr)--; + *freeStatusChanged = false; + } + + return VDO_SUCCESS; +} + +/** + * Increment the reference count for a block map page. All block map increments + * should be from provisional to MAXIMUM_REFERENCE_COUNT. Since block map blocks + * never dedupe they should never be adjusted from any other state. The + * adjustment always results in MAXIMUM_REFERENCE_COUNT as this value is used to + * prevent dedupe against block map blocks. + * + * @param [in] refCounts The refCounts responsible for the block + * @param [in] block The reference block which contains the + * block being updated + * @param [in] slabBlockNumber The block to update + * @param [in] oldStatus The reference status of the block + * before this increment + * @param [in] lock The PBNLock associated with this + * increment (may be NULL) + * @param [in] normalOperation Whether we are in normal operation vs. 
+ * recovery or rebuild + * @param [in,out] counterPtr A pointer to the count for the block + * @param [out] freeStatusChanged A pointer which will be set to true if + * this update changed the free status of the + * block + * + * @return VDO_SUCCESS or an error + **/ +static int incrementForBlockMap(RefCounts *refCounts, + ReferenceBlock *block, + SlabBlockNumber slabBlockNumber, + ReferenceStatus oldStatus, + PBNLock *lock, + bool normalOperation, + ReferenceCount *counterPtr, + bool *freeStatusChanged) +{ + switch (oldStatus) { + case RS_FREE: + if (normalOperation) { + return logErrorWithStringError(VDO_REF_COUNT_INVALID, + "Incrementing unallocated block map block" + " (slab %u, offset %" PRIu32 ")", + refCounts->slab->slabNumber, + slabBlockNumber); + } + + *counterPtr = MAXIMUM_REFERENCE_COUNT; + block->allocatedCount++; + refCounts->freeBlocks--; + *freeStatusChanged = true; + return VDO_SUCCESS; + + case RS_PROVISIONAL: + if (!normalOperation) { + return logErrorWithStringError(VDO_REF_COUNT_INVALID, + "Block map block had provisional " + "reference during replay" + " (slab %u, offset %" PRIu32 ")", + refCounts->slab->slabNumber, + slabBlockNumber); + } + + *counterPtr = MAXIMUM_REFERENCE_COUNT; + *freeStatusChanged = false; + if (lock != NULL) { + unassignProvisionalReference(lock); + } + return VDO_SUCCESS; + + default: + return logErrorWithStringError(VDO_REF_COUNT_INVALID, + "Incrementing a block map block which is " + "already referenced %u times (slab %u, " + "offset %" PRIu32 ")", + *counterPtr, + refCounts->slab->slabNumber, + slabBlockNumber); + } +} + +/** + * Update the reference count of a block. + * + * @param [in] refCounts The refCounts responsible for the + * block + * @param [in] block The reference block which contains the + * block being updated + * @param [in] slabBlockNumber The block to update + * @param [in] slabJournalPoint The slab journal point at which this + * update is journaled + * @param [in] operation How to update the count + * @param [in] normalOperation Whether we are in normal operation vs. 
+ * recovery or rebuild + * @param [out] freeStatusChanged A pointer which will be set to true if + * this update changed the free status of + * the block + * @param [out] provisionalDecrementPtr A pointer which will be set to true if + * this update was a decrement of a + * provisional reference + * + * @return VDO_SUCCESS or an error + **/ +static int updateReferenceCount(RefCounts *refCounts, + ReferenceBlock *block, + SlabBlockNumber slabBlockNumber, + const JournalPoint *slabJournalPoint, + ReferenceOperation operation, + bool normalOperation, + bool *freeStatusChanged, + bool *provisionalDecrementPtr) +{ + ReferenceCount *counterPtr = &refCounts->counters[slabBlockNumber]; + ReferenceStatus oldStatus = referenceCountToStatus(*counterPtr); + PBNLock *lock = getReferenceOperationPBNLock(operation); + int result; + + switch (operation.type) { + case DATA_INCREMENT: + result = incrementForData(refCounts, block, slabBlockNumber, oldStatus, + lock, counterPtr, freeStatusChanged); + break; + + case DATA_DECREMENT: + result = decrementForData(refCounts, block, slabBlockNumber, oldStatus, + lock, counterPtr, freeStatusChanged); + if ((result == VDO_SUCCESS) && (oldStatus == RS_PROVISIONAL)) { + if (provisionalDecrementPtr != NULL) { + *provisionalDecrementPtr = true; + } + return VDO_SUCCESS; + } + break; + + case BLOCK_MAP_INCREMENT: + result = incrementForBlockMap(refCounts, block, slabBlockNumber, oldStatus, + lock, normalOperation, counterPtr, + freeStatusChanged); + break; + + default: + logError("Unknown reference count operation: %u", operation.type); + enterRefCountsReadOnlyMode(refCounts, VDO_NOT_IMPLEMENTED); + result = VDO_NOT_IMPLEMENTED; + } + + if (result != VDO_SUCCESS) { + return result; + } + + if (isValidJournalPoint(slabJournalPoint)) { + refCounts->slabJournalPoint = *slabJournalPoint; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int adjustReferenceCount(RefCounts *refCounts, + ReferenceOperation operation, + const JournalPoint *slabJournalPoint, + bool *freeStatusChanged) +{ + if (!isSlabOpen(refCounts->slab)) { + return VDO_INVALID_ADMIN_STATE; + } + + SlabBlockNumber slabBlockNumber; + int result = slabBlockNumberFromPBN(refCounts->slab, operation.pbn, + &slabBlockNumber); + if (result != VDO_SUCCESS) { + return result; + } + + ReferenceBlock *block = getReferenceBlock(refCounts, slabBlockNumber); + bool provisionalDecrement = false; + result = updateReferenceCount(refCounts, block, slabBlockNumber, + slabJournalPoint, operation, + NORMAL_OPERATION, freeStatusChanged, + &provisionalDecrement); + if ((result != VDO_SUCCESS) || provisionalDecrement) { + return result; + } + + if (block->isDirty && (block->slabJournalLock > 0)) { + /* + * This block is already dirty and a slab journal entry has been made + * for it since the last time it was clean. We must release the per-entry + * slab journal lock for the entry associated with the update we are now + * doing. + */ + result = ASSERT(isValidJournalPoint(slabJournalPoint), + "Reference count adjustments need slab journal points."); + if (result != VDO_SUCCESS) { + return result; + } + + SequenceNumber entryLock = slabJournalPoint->sequenceNumber; + adjustSlabJournalBlockReference(refCounts->slab->journal, entryLock, -1); + return VDO_SUCCESS; + } + + /* + * This may be the first time we are applying an update for which there + * is a slab journal entry to this block since the block was + * cleaned. 
Therefore, we convert the per-entry slab journal lock to an + * uncommitted reference block lock, if there is a per-entry lock. + */ + if (isValidJournalPoint(slabJournalPoint)) { + block->slabJournalLock = slabJournalPoint->sequenceNumber; + } else { + block->slabJournalLock = 0; + } + + dirtyBlock(block); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int adjustReferenceCountForRebuild(RefCounts *refCounts, + PhysicalBlockNumber pbn, + JournalOperation operation) +{ + SlabBlockNumber slabBlockNumber; + int result = slabBlockNumberFromPBN(refCounts->slab, pbn, &slabBlockNumber); + if (result != VDO_SUCCESS) { + return result; + } + + ReferenceBlock *block = getReferenceBlock(refCounts, slabBlockNumber); + bool unusedFreeStatus; + ReferenceOperation physicalOperation = { + .type = operation, + }; + result = updateReferenceCount(refCounts, block, slabBlockNumber, NULL, + physicalOperation, !NORMAL_OPERATION, + &unusedFreeStatus, NULL); + if (result != VDO_SUCCESS) { + return result; + } + + dirtyBlock(block); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int replayReferenceCountChange(RefCounts *refCounts, + const JournalPoint *entryPoint, + SlabJournalEntry entry) +{ + ReferenceBlock *block = getReferenceBlock(refCounts, entry.sbn); + SectorCount sector + = (entry.sbn % COUNTS_PER_BLOCK) / COUNTS_PER_SECTOR; + if (!beforeJournalPoint(&block->commitPoints[sector], entryPoint)) { + // This entry is already reflected in the existing counts, so do nothing. + return VDO_SUCCESS; + } + + // This entry is not yet counted in the reference counts. + bool unusedFreeStatus; + ReferenceOperation operation = { + .type = entry.operation + }; + int result = updateReferenceCount(refCounts, block, entry.sbn, + entryPoint, operation, !NORMAL_OPERATION, + &unusedFreeStatus, NULL); + if (result != VDO_SUCCESS) { + return result; + } + + dirtyBlock(block); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int getReferenceStatus(RefCounts *refCounts, + PhysicalBlockNumber pbn, + ReferenceStatus *statusPtr) +{ + ReferenceCount *counterPtr = NULL; + int result = getReferenceCounter(refCounts, pbn, &counterPtr); + if (result != VDO_SUCCESS) { + return result; + } + + *statusPtr = referenceCountToStatus(*counterPtr); + return VDO_SUCCESS; +} + +/**********************************************************************/ +bool areEquivalentReferenceCounters(RefCounts *counterA, RefCounts *counterB) +{ + if ((counterA->blockCount != counterB->blockCount) + || (counterA->freeBlocks != counterB->freeBlocks) + || (counterA->referenceBlockCount != counterB->referenceBlockCount)) { + return false; + } + + for (size_t i = 0; i < counterA->referenceBlockCount; i++) { + ReferenceBlock *blockA = &counterA->blocks[i]; + ReferenceBlock *blockB = &counterB->blocks[i]; + if (blockA->allocatedCount != blockB->allocatedCount) { + return false; + } + } + + return (memcmp(counterA->counters, counterB->counters, + sizeof(ReferenceCount) * counterA->blockCount) == 0); +} + +/** + * Find the array index of the first zero byte in word-sized range of + * reference counters. The search does no bounds checking; the function relies + * on the array being sufficiently padded. 
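+ * (The counters array is allocated with two words of trailing padding in
+ * makeRefCounts() precisely so that this is safe.)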
+ * + * @param wordPtr A pointer to the eight counter bytes to check + * @param startIndex The array index corresponding to wordPtr[0] + * @param failIndex The array index to return if no zero byte is found + + * @return the array index of the first zero byte in the word, or + * the value passed as failIndex if no zero byte was found + **/ +static inline SlabBlockNumber findZeroByteInWord(const byte *wordPtr, + SlabBlockNumber startIndex, + SlabBlockNumber failIndex) +{ + uint64_t word = getUInt64LE(wordPtr); + + // This looks like a loop, but GCC will unroll the eight iterations for us. + for (unsigned int offset = 0; offset < BYTES_PER_WORD; offset++) { + // Assumes little-endian byte order, which we have on X86. + if ((word & 0xFF) == 0) { + return (startIndex + offset); + } + word >>= 8; + } + + return failIndex; +} + +/**********************************************************************/ +bool findFreeBlock(const RefCounts *refCounts, + SlabBlockNumber startIndex, + SlabBlockNumber endIndex, + SlabBlockNumber *indexPtr) +{ + SlabBlockNumber zeroIndex; + SlabBlockNumber nextIndex = startIndex; + byte *nextCounter = &refCounts->counters[nextIndex]; + byte *endCounter = &refCounts->counters[endIndex]; + + // Search every byte of the first unaligned word. (Array is padded so + // reading past end is safe.) + zeroIndex = findZeroByteInWord(nextCounter, nextIndex, endIndex); + if (zeroIndex < endIndex) { + *indexPtr = zeroIndex; + return true; + } + + // On architectures where unaligned word access is expensive, this + // would be a good place to advance to an alignment boundary. + nextIndex += BYTES_PER_WORD; + nextCounter += BYTES_PER_WORD; + + // Now we're word-aligned; check an word at a time until we find a word + // containing a zero. (Array is padded so reading past end is safe.) + while (nextCounter < endCounter) { + /* + * The following code is currently an exact copy of the code preceding the + * loop, but if you try to merge them by using a do loop, it runs slower + * because a jump instruction gets added at the start of the iteration. + */ + zeroIndex = findZeroByteInWord(nextCounter, nextIndex, endIndex); + if (zeroIndex < endIndex) { + *indexPtr = zeroIndex; + return true; + } + + nextIndex += BYTES_PER_WORD; + nextCounter += BYTES_PER_WORD; + } + + return false; +} + +/** + * Search the reference block currently saved in the search cursor for a + * reference count of zero, starting at the saved counter index. + * + * @param [in] refCounts The RefCounts object to search + * @param [out] freeIndexPtr A pointer to receive the array index of the + * zero reference count + * + * @return true if an unreferenced counter was found + **/ +static bool searchCurrentReferenceBlock(const RefCounts *refCounts, + SlabBlockNumber *freeIndexPtr) +{ + // Don't bother searching if the current block is known to be full. + return ((refCounts->searchCursor.block->allocatedCount < COUNTS_PER_BLOCK) + && findFreeBlock(refCounts, refCounts->searchCursor.index, + refCounts->searchCursor.endIndex, freeIndexPtr)); +} + +/** + * Search each reference block for a reference count of zero, starting at the + * reference block and counter index saved in the search cursor and searching + * up to the end of the last reference block. The search does not wrap. 
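+ * When the last reference block has been searched, the cursor wraps back to
+ * the first block, but the search itself stops there.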
+ * + * @param [in] refCounts The RefCounts object to search + * @param [out] freeIndexPtr A pointer to receive the array index of the + * zero reference count + * + * @return true if an unreferenced counter was found + **/ +static bool searchReferenceBlocks(RefCounts *refCounts, + SlabBlockNumber *freeIndexPtr) +{ + // Start searching at the saved search position in the current block. + if (searchCurrentReferenceBlock(refCounts, freeIndexPtr)) { + return true; + } + + // Search each reference block up to the end of the slab. + while (advanceSearchCursor(refCounts)) { + if (searchCurrentReferenceBlock(refCounts, freeIndexPtr)) { + return true; + } + } + + return false; +} + +/** + * Do the bookkeeping for making a provisional reference. + * + * @param refCounts The RefCounts + * @param slabBlockNumber The block to reference + **/ +static void makeProvisionalReference(RefCounts *refCounts, + SlabBlockNumber slabBlockNumber) +{ + // Make the initial transition from an unreferenced block to a provisionally + // allocated block. + refCounts->counters[slabBlockNumber] = PROVISIONAL_REFERENCE_COUNT; + + // Account for the allocation. + ReferenceBlock *block = getReferenceBlock(refCounts, slabBlockNumber); + block->allocatedCount++; + refCounts->freeBlocks--; +} + +/**********************************************************************/ +int allocateUnreferencedBlock(RefCounts *refCounts, + PhysicalBlockNumber *allocatedPtr) +{ + if (!isSlabOpen(refCounts->slab)) { + return VDO_INVALID_ADMIN_STATE; + } + + SlabBlockNumber freeIndex; + if (!searchReferenceBlocks(refCounts, &freeIndex)) { + return VDO_NO_SPACE; + } + + ASSERT_LOG_ONLY((refCounts->counters[freeIndex] == EMPTY_REFERENCE_COUNT), + "free block must have refCount of zero"); + makeProvisionalReference(refCounts, freeIndex); + + // Update the search hint so the next search will start at the array + // index just past the free block we just found. + refCounts->searchCursor.index = (freeIndex + 1); + + *allocatedPtr = indexToPBN(refCounts, freeIndex); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int provisionallyReferenceBlock(RefCounts *refCounts, + PhysicalBlockNumber pbn, + PBNLock *lock) +{ + if (!isSlabOpen(refCounts->slab)) { + return VDO_INVALID_ADMIN_STATE; + } + + SlabBlockNumber slabBlockNumber; + int result = slabBlockNumberFromPBN(refCounts->slab, pbn, &slabBlockNumber); + if (result != VDO_SUCCESS) { + return result; + } + + if (refCounts->counters[slabBlockNumber] == EMPTY_REFERENCE_COUNT) { + makeProvisionalReference(refCounts, slabBlockNumber); + if (lock != NULL) { + assignProvisionalReference(lock); + } + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +BlockCount countUnreferencedBlocks(RefCounts *refCounts, + PhysicalBlockNumber startPBN, + PhysicalBlockNumber endPBN) +{ + BlockCount freeBlocks = 0; + SlabBlockNumber startIndex = pbnToIndex(refCounts, startPBN); + SlabBlockNumber endIndex = pbnToIndex(refCounts, endPBN); + for (SlabBlockNumber index = startIndex; index < endIndex; index++) { + if (refCounts->counters[index] == EMPTY_REFERENCE_COUNT) { + freeBlocks++; + } + } + + return freeBlocks; +} + +/** + * Convert a ReferenceBlock's generic wait queue entry back into the + * ReferenceBlock. 
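+ * This depends on the Waiter being the first field of a ReferenceBlock.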
+ * + * @param waiter The wait queue entry to convert + * + * @return The wrapping ReferenceBlock + **/ +static inline ReferenceBlock *waiterAsReferenceBlock(Waiter *waiter) +{ + STATIC_ASSERT(offsetof(ReferenceBlock, waiter) == 0); + return (ReferenceBlock *) waiter; +} + +/** + * WaitCallback to clean dirty reference blocks when resetting. + * + * @param blockWaiter The dirty block + * @param context Unused + **/ +static void +clearDirtyReferenceBlocks(Waiter *blockWaiter, + void *context __attribute__((unused))) +{ + waiterAsReferenceBlock(blockWaiter)->isDirty = false; +} + +/**********************************************************************/ +void resetReferenceCounts(RefCounts *refCounts) +{ + // We can just use memset() since each ReferenceCount is exactly one byte. + STATIC_ASSERT(sizeof(ReferenceCount) == 1); + memset(refCounts->counters, 0, refCounts->blockCount); + refCounts->freeBlocks = refCounts->blockCount; + refCounts->slabJournalPoint = (JournalPoint) { + .sequenceNumber = 0, + .entryCount = 0, + }; + + for (size_t i = 0; i < refCounts->referenceBlockCount; i++) { + refCounts->blocks[i].allocatedCount = 0; + } + + notifyAllWaiters(&refCounts->dirtyBlocks, clearDirtyReferenceBlocks, NULL); +} + +/**********************************************************************/ +BlockCount getSavedReferenceCountSize(BlockCount blockCount) +{ + return computeBucketCount(blockCount, COUNTS_PER_BLOCK); +} + +/** + * A waiter callback that resets the writing state of refCounts. + **/ +static void finishSummaryUpdate(Waiter *waiter, void *context) +{ + RefCounts *refCounts = refCountsFromWaiter(waiter); + refCounts->updatingSlabSummary = false; + + int result = *((int *) context); + if ((result == VDO_SUCCESS) || (result == VDO_READ_ONLY)) { + checkIfSlabDrained(refCounts->slab); + return; + } + + logErrorWithStringError(result, "failed to update slab summary"); + enterRefCountsReadOnlyMode(refCounts, result); +} + +/** + * Update slab summary that the RefCounts is clean. + * + * @param refCounts The RefCounts object that is being written + **/ +static void updateSlabSummaryAsClean(RefCounts *refCounts) +{ + SlabSummaryZone *summary = getSlabSummaryZone(refCounts->slab->allocator); + if (summary == NULL) { + return; + } + + // Update the slab summary to indicate this refCounts is clean. + TailBlockOffset offset + = getSummarizedTailBlockOffset(summary, refCounts->slab->slabNumber); + refCounts->updatingSlabSummary = true; + refCounts->slabSummaryWaiter.callback = finishSummaryUpdate; + updateSlabSummaryEntry(summary, &refCounts->slabSummaryWaiter, + refCounts->slab->slabNumber, offset, true, true, + getSlabFreeBlockCount(refCounts->slab)); +} + +/** + * Handle an I/O error reading or writing a reference count block. + * + * @param completion The VIO doing the I/O as a completion + **/ +static void handleIOError(VDOCompletion *completion) +{ + int result = completion->result; + VIOPoolEntry *entry = completion->parent; + RefCounts *refCounts = ((ReferenceBlock *) entry->parent)->refCounts; + returnVIO(refCounts->slab->allocator, entry); + refCounts->activeCount--; + enterRefCountsReadOnlyMode(refCounts, result); +} + +/** + * After a reference block has written, clean it, release its locks, and return + * its VIO to the pool. 
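+ * If the block was redirtied while its write was in flight, it is requeued;
+ * once there is no reference block I/O and no dirty blocks remain, the slab
+ * summary is updated to record that the RefCounts is clean.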
+ * + * @param completion The VIO that just finished writing + **/ +static void finishReferenceBlockWrite(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + ReferenceBlock *block = entry->parent; + RefCounts *refCounts = block->refCounts; + refCounts->activeCount--; + + // Release the slab journal lock. + adjustSlabJournalBlockReference(refCounts->slab->journal, + block->slabJournalLockToRelease, -1); + returnVIO(refCounts->slab->allocator, entry); + + /* + * We can't clear the isWriting flag earlier as releasing the slab journal + * lock may cause us to be dirtied again, but we don't want to double + * enqueue. + */ + block->isWriting = false; + + if (isReadOnly(refCounts->readOnlyNotifier)) { + checkIfSlabDrained(refCounts->slab); + return; + } + + // Re-queue the block if it was re-dirtied while it was writing. + if (block->isDirty) { + enqueueDirtyBlock(block); + if (isSlabDraining(refCounts->slab)) { + // We must be saving, and this block will otherwise not be relaunched. + saveDirtyReferenceBlocks(refCounts); + } + + return; + } + + // Mark the RefCounts as clean in the slab summary if there are no dirty + // or writing blocks and no summary update in progress. + if (!hasActiveIO(refCounts) && !hasWaiters(&refCounts->dirtyBlocks)) { + updateSlabSummaryAsClean(refCounts); + } +} + +/**********************************************************************/ +ReferenceCount *getReferenceCountersForBlock(ReferenceBlock *block) +{ + size_t blockIndex = block - block->refCounts->blocks; + return &block->refCounts->counters[blockIndex * COUNTS_PER_BLOCK]; +} + +/**********************************************************************/ +void packReferenceBlock(ReferenceBlock *block, void *buffer) +{ + PackedJournalPoint commitPoint; + packJournalPoint(&block->refCounts->slabJournalPoint, &commitPoint); + + PackedReferenceBlock *packed = buffer; + ReferenceCount *counters = getReferenceCountersForBlock(block); + for (SectorCount i = 0; i < SECTORS_PER_BLOCK; i++) { + packed->sectors[i].commitPoint = commitPoint; + memcpy(packed->sectors[i].counts, counters + (i * COUNTS_PER_SECTOR), + (sizeof(ReferenceCount) * COUNTS_PER_SECTOR)); + } +} + +/** + * After a dirty block waiter has gotten a VIO from the VIO pool, copy its + * counters and associated data into the VIO, and launch the write. + * + * @param blockWaiter The waiter of the dirty block + * @param vioContext The VIO returned by the pool + **/ +static void writeReferenceBlock(Waiter *blockWaiter, void *vioContext) +{ + VIOPoolEntry *entry = vioContext; + ReferenceBlock *block = waiterAsReferenceBlock(blockWaiter); + packReferenceBlock(block, entry->buffer); + + size_t blockOffset = (block - block->refCounts->blocks); + PhysicalBlockNumber pbn = (block->refCounts->origin + blockOffset); + block->slabJournalLockToRelease = block->slabJournalLock; + entry->parent = block; + + /* + * Mark the block as clean, since we won't be committing any updates that + * happen after this moment. As long as VIO order is preserved, two + * VIOs updating this block at once will not cause complications. + */ + block->isDirty = false; + + // Flush before writing to ensure that the recovery journal and slab journal + // entries which cover this reference update are stable (VDO-2331). 
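+ // (The flush is requested via the launchWriteMetadataVIOWithFlush() call below.)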
+ relaxedAdd64(&block->refCounts->statistics->blocksWritten, 1); + entry->vio->completion.callbackThreadID + = block->refCounts->slab->allocator->threadID; + launchWriteMetadataVIOWithFlush(entry->vio, pbn, finishReferenceBlockWrite, + handleIOError, true, false); +} + +/** + * Launch the write of a dirty reference block by first acquiring a VIO for it + * from the pool. This can be asynchronous since the writer will have to wait + * if all VIOs in the pool are currently in use. + * + * @param blockWaiter The waiter of the block which is starting to write + * @param context The parent refCounts of the block + **/ +static void launchReferenceBlockWrite(Waiter *blockWaiter, void *context) +{ + RefCounts *refCounts = context; + if (isReadOnly(refCounts->readOnlyNotifier)) { + return; + } + + refCounts->activeCount++; + ReferenceBlock *block = waiterAsReferenceBlock(blockWaiter); + block->isWriting = true; + blockWaiter->callback = writeReferenceBlock; + int result = acquireVIO(refCounts->slab->allocator, blockWaiter); + if (result != VDO_SUCCESS) { + // This should never happen. + refCounts->activeCount--; + enterRefCountsReadOnlyMode(refCounts, result); + } +} + +/**********************************************************************/ +void saveOldestReferenceBlock(RefCounts *refCounts) +{ + notifyNextWaiter(&refCounts->dirtyBlocks, launchReferenceBlockWrite, + refCounts); +} + +/**********************************************************************/ +void saveSeveralReferenceBlocks(RefCounts *refCounts, size_t flushDivisor) +{ + BlockCount dirtyBlockCount = countWaiters(&refCounts->dirtyBlocks); + if (dirtyBlockCount == 0) { + return; + } + + BlockCount blocksToWrite = dirtyBlockCount / flushDivisor; + // Always save at least one block. + if (blocksToWrite == 0) { + blocksToWrite = 1; + } + + for (BlockCount written = 0; written < blocksToWrite; written++) { + saveOldestReferenceBlock(refCounts); + } +} + +/**********************************************************************/ +void saveDirtyReferenceBlocks(RefCounts *refCounts) +{ + notifyAllWaiters(&refCounts->dirtyBlocks, launchReferenceBlockWrite, + refCounts); + checkIfSlabDrained(refCounts->slab); +} + +/**********************************************************************/ +void dirtyAllReferenceBlocks(RefCounts *refCounts) +{ + for (BlockCount i = 0; i < refCounts->referenceBlockCount; i++) { + dirtyBlock(&refCounts->blocks[i]); + } +} + +/** + * Clear the provisional reference counts from a reference block. + * + * @param block The block to clear + **/ +static void clearProvisionalReferences(ReferenceBlock *block) +{ + ReferenceCount *counters = getReferenceCountersForBlock(block); + for (BlockCount j = 0; j < COUNTS_PER_BLOCK; j++) { + if (counters[j] == PROVISIONAL_REFERENCE_COUNT) { + counters[j] = EMPTY_REFERENCE_COUNT; + block->allocatedCount--; + } + } +} + +/** + * Unpack reference counts blocks into the internal memory structure. 
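+ * Unpacking also recomputes the block's allocated count and compares the
+ * sector commit points to detect torn writes.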
+ * + * @param packed The written reference block to be unpacked + * @param block The internal reference block to be loaded + **/ +static void unpackReferenceBlock(PackedReferenceBlock *packed, + ReferenceBlock *block) +{ + RefCounts *refCounts = block->refCounts; + ReferenceCount *counters = getReferenceCountersForBlock(block); + for (SectorCount i = 0; i < SECTORS_PER_BLOCK; i++) { + PackedReferenceSector *sector = &packed->sectors[i]; + unpackJournalPoint(§or->commitPoint, &block->commitPoints[i]); + memcpy(counters + (i * COUNTS_PER_SECTOR), sector->counts, + (sizeof(ReferenceCount) * COUNTS_PER_SECTOR)); + // The slabJournalPoint must be the latest point found in any sector. + if (beforeJournalPoint(&refCounts->slabJournalPoint, + &block->commitPoints[i])) { + refCounts->slabJournalPoint = block->commitPoints[i]; + } + + if ((i > 0) && !areEquivalentJournalPoints(&block->commitPoints[0], + &block->commitPoints[i])) { + size_t blockIndex = block - block->refCounts->blocks; + logWarning("Torn write detected in sector %u of reference block" + " %zu of slab %" PRIu16, + i, blockIndex, block->refCounts->slab->slabNumber); + } + } + + block->allocatedCount = 0; + for (BlockCount i = 0; i < COUNTS_PER_BLOCK; i++) { + if (counters[i] != EMPTY_REFERENCE_COUNT) { + block->allocatedCount++; + } + } +} + +/** + * After a reference block has been read, unpack it. + * + * @param completion The VIO that just finished reading + **/ +static void finishReferenceBlockLoad(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + ReferenceBlock *block = entry->parent; + unpackReferenceBlock((PackedReferenceBlock *) entry->buffer, block); + + RefCounts *refCounts = block->refCounts; + returnVIO(refCounts->slab->allocator, entry); + refCounts->activeCount--; + clearProvisionalReferences(block); + + refCounts->freeBlocks -= block->allocatedCount; + checkIfSlabDrained(block->refCounts->slab); +} + +/** + * After a block waiter has gotten a VIO from the VIO pool, load the block. + * + * @param blockWaiter The waiter of the block to load + * @param vioContext The VIO returned by the pool + **/ +static void loadReferenceBlock(Waiter *blockWaiter, void *vioContext) +{ + VIOPoolEntry *entry = vioContext; + ReferenceBlock *block = waiterAsReferenceBlock(blockWaiter); + size_t blockOffset = (block - block->refCounts->blocks); + PhysicalBlockNumber pbn = (block->refCounts->origin + blockOffset); + entry->parent = block; + + entry->vio->completion.callbackThreadID + = block->refCounts->slab->allocator->threadID; + launchReadMetadataVIO(entry->vio, pbn, finishReferenceBlockLoad, + handleIOError); +} + +/** + * Load reference blocks from the underlying storage into a pre-allocated + * reference counter. + * + * @param refCounts The reference counter to be loaded + **/ +static void loadReferenceBlocks(RefCounts *refCounts) +{ + refCounts->freeBlocks = refCounts->blockCount; + refCounts->activeCount = refCounts->referenceBlockCount; + for (BlockCount i = 0; i < refCounts->referenceBlockCount; i++) { + Waiter *blockWaiter = &refCounts->blocks[i].waiter; + blockWaiter->callback = loadReferenceBlock; + int result = acquireVIO(refCounts->slab->allocator, blockWaiter); + if (result != VDO_SUCCESS) { + // This should never happen. 
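+ // The remaining blocks will never be launched, so remove them from the
+ // active count.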
+ refCounts->activeCount -= (refCounts->referenceBlockCount - i); + enterRefCountsReadOnlyMode(refCounts, result); + return; + } + } +} + +/**********************************************************************/ +void drainRefCounts(RefCounts *refCounts) +{ + Slab *slab = refCounts->slab; + bool save = false; + switch (slab->state.state) { + case ADMIN_STATE_SCRUBBING: + if (mustLoadRefCounts(slab->allocator->summary, slab->slabNumber)) { + loadReferenceBlocks(refCounts); + return; + } + + break; + + case ADMIN_STATE_SAVE_FOR_SCRUBBING: + if (!mustLoadRefCounts(slab->allocator->summary, slab->slabNumber)) { + // These reference counts were never written, so mark them all dirty. + dirtyAllReferenceBlocks(refCounts); + } + save = true; + break; + + case ADMIN_STATE_REBUILDING: + if (shouldSaveFullyBuiltSlab(slab)) { + dirtyAllReferenceBlocks(refCounts); + save = true; + } + break; + + case ADMIN_STATE_SAVING: + save = !isUnrecoveredSlab(slab); + break; + + case ADMIN_STATE_RECOVERING: + case ADMIN_STATE_SUSPENDING: + break; + + default: + notifyRefCountsAreDrained(slab, VDO_SUCCESS); + return; + } + + if (save) { + saveDirtyReferenceBlocks(refCounts); + } +} + +/**********************************************************************/ +void acquireDirtyBlockLocks(RefCounts *refCounts) +{ + dirtyAllReferenceBlocks(refCounts); + for (BlockCount i = 0; i < refCounts->referenceBlockCount; i++) { + refCounts->blocks[i].slabJournalLock = 1; + } + + adjustSlabJournalBlockReference(refCounts->slab->journal, 1, + refCounts->referenceBlockCount); +} + +/**********************************************************************/ +void dumpRefCounts(const RefCounts *refCounts) +{ + // Terse because there are a lot of slabs to dump and syslog is lossy. + logInfo(" refCounts: free=%" PRIu32 "/%" PRIu32 " blocks=%" PRIu32 + " dirty=%zu active=%zu journal@(%llu,%" PRIu16 ")%s", + refCounts->freeBlocks, refCounts->blockCount, + refCounts->referenceBlockCount, + countWaiters(&refCounts->dirtyBlocks), + refCounts->activeCount, + refCounts->slabJournalPoint.sequenceNumber, + refCounts->slabJournalPoint.entryCount, + (refCounts->updatingSlabSummary ? " updating" : "")); +} diff --git a/vdo/base/refCounts.h b/vdo/base/refCounts.h new file mode 100644 index 0000000..f140c8c --- /dev/null +++ b/vdo/base/refCounts.h @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/refCounts.h#7 $ + */ + +#ifndef REF_COUNTS_H +#define REF_COUNTS_H + +#include "completion.h" +#include "journalPoint.h" +#include "slab.h" +#include "types.h" + +/** + * Create a reference counting object. + * + *
* <p>
A reference counting object can keep a reference count for every physical + * block in the VDO configuration. Since we expect the vast majority of the + * blocks to have 0 or 1 reference counts, the structure is optimized for that + * situation. + * + * @param [in] blockCount The number of physical blocks that can be + * referenced + * @param [in] slab The slab of the ref counts object + * @param [in] origin The layer PBN at which to save RefCounts + * @param [in] readOnlyNotifier The context for tracking read-only mode + * @param [out] refCountsPtr The pointer to hold the new ref counts object + * + * @return a success or error code + **/ +int makeRefCounts(BlockCount blockCount, + Slab *slab, + PhysicalBlockNumber origin, + ReadOnlyNotifier *readOnlyNotifier, + RefCounts **refCountsPtr) + __attribute__((warn_unused_result)); + +/** + * Free a reference counting object and null out the reference to it. + * + * @param refCountsPtr The reference to the reference counting object to free + **/ +void freeRefCounts(RefCounts **refCountsPtr); + +/** + * Check whether a RefCounts is active. + * + * @param refCounts The RefCounts to check + **/ +bool areRefCountsActive(RefCounts *refCounts) + __attribute__((warn_unused_result)); + +/** + * Get the stored count of the number of blocks that are currently free. + * + * @param refCounts The RefCounts object + * + * @return the number of blocks with a reference count of zero + **/ +BlockCount getUnreferencedBlockCount(RefCounts *refCounts) + __attribute__((warn_unused_result)); + +/** + * Determine how many times a reference count can be incremented without + * overflowing. + * + * @param refCounts The RefCounts object + * @param pbn The physical block number + * + * @return the number of increments that can be performed + **/ +uint8_t getAvailableReferences(RefCounts *refCounts, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Adjust the reference count of a block. + * + * @param [in] refCounts The refcounts object + * @param [in] operation The operation to perform + * @param [in] slabJournalPoint The slab journal entry for this adjustment + * @param [out] freeStatusChanged A pointer which will be set to true if the + * free status of the block changed + * + * + * @return A success or error code, specifically: + * VDO_REF_COUNT_INVALID if a decrement would result in a negative + * reference count, or an increment in a + * count greater than MAXIMUM_REFS + * + **/ +int adjustReferenceCount(RefCounts *refCounts, + ReferenceOperation operation, + const JournalPoint *slabJournalPoint, + bool *freeStatusChanged) + __attribute__((warn_unused_result)); + +/** + * Adjust the reference count of a block during rebuild. + * + * @param refCounts The refcounts object + * @param pbn The number of the block to adjust + * @param operation The operation to perform on the count + * + * @return VDO_SUCCESS or an error + **/ +int adjustReferenceCountForRebuild(RefCounts *refCounts, + PhysicalBlockNumber pbn, + JournalOperation operation) + __attribute__((warn_unused_result)); + +/** + * Replay the reference count adjustment from a slab journal entry into the + * reference count for a block. The adjustment will be ignored if it was already + * recorded in the reference count. 
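+ * An adjustment is considered already recorded when the commit point of the
+ * sector covering the block is not before the entry's journal point.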
+ * + * @param refCounts The refcounts object + * @param entryPoint The slab journal point for the entry + * @param entry The slab journal entry being replayed + * + * @return VDO_SUCCESS or an error code + **/ +int replayReferenceCountChange(RefCounts *refCounts, + const JournalPoint *entryPoint, + SlabJournalEntry entry) + __attribute__((warn_unused_result)); + +/** + * Check whether two reference counters are equivalent. This method is + * used for unit testing. + * + * @param counterA The first counter to compare + * @param counterB The second counter to compare + * + * @return true if the two counters are equivalent + **/ +bool areEquivalentReferenceCounters(RefCounts *counterA, RefCounts *counterB) + __attribute__((warn_unused_result)); + +/** + * Find a block with a reference count of zero in the range of physical block + * numbers tracked by the reference counter. If a free block is found, that + * block is allocated by marking it as provisionally referenced, and the + * allocated block number is returned. + * + * @param [in] refCounts The reference counters to scan + * @param [out] allocatedPtr A pointer to hold the physical block number of + * the block that was found and allocated + * + * @return VDO_SUCCESS if a free block was found and allocated; + * VDO_NO_SPACE if there are no unreferenced blocks; + * otherwise an error code + **/ +int allocateUnreferencedBlock(RefCounts *refCounts, + PhysicalBlockNumber *allocatedPtr) + __attribute__((warn_unused_result)); + +/** + * Provisionally reference a block if it is unreferenced. + * + * @param refCounts The reference counters + * @param pbn The PBN to reference + * @param lock The PBNLock on the block (may be NULL) + * + * @return VDO_SUCCESS or an error + **/ +int provisionallyReferenceBlock(RefCounts *refCounts, + PhysicalBlockNumber pbn, + PBNLock *lock) + __attribute__((warn_unused_result)); + +/** + * Count all unreferenced blocks in a range [startBlock, endBlock) of physical + * block numbers. + * + * @param refCounts The reference counters to scan + * @param startPBN The physical block number at which to start + * scanning (included in the scan) + * @param endPBN The physical block number at which to stop + * scanning (excluded from the scan) + * + * @return The number of unreferenced blocks + **/ +BlockCount countUnreferencedBlocks(RefCounts *refCounts, + PhysicalBlockNumber startPBN, + PhysicalBlockNumber endPBN) + __attribute__((warn_unused_result)); + +/** + * Get the number of blocks required to save a reference counts state covering + * the specified number of data blocks. + * + * @param blockCount The number of physical data blocks that can be referenced + * + * @return The number of blocks required to save reference counts with the + * given block count + **/ +BlockCount getSavedReferenceCountSize(BlockCount blockCount) + __attribute__((warn_unused_result)); + +/** + * Request a RefCounts save several dirty blocks asynchronously. This function + * currently writes 1 / flushDivisor of the dirty blocks. + * + * @param refCounts The RefCounts object to notify + * @param flushDivisor The inverse fraction of the dirty blocks to write + **/ +void saveSeveralReferenceBlocks(RefCounts *refCounts, size_t flushDivisor); + +/** + * Ask a RefCounts to save all its dirty blocks asynchronously. + * + * @param refCounts The RefCounts object to notify + **/ +void saveDirtyReferenceBlocks(RefCounts *refCounts); + +/** + * Mark all reference count blocks as dirty. 
+ * + * @param refCounts The RefCounts of the reference blocks + **/ +void dirtyAllReferenceBlocks(RefCounts *refCounts); + +/** + * Drain all reference count I/O. Depending upon the type of drain being + * performed (as recorded in the RefCount's Slab), the reference blocks may + * be loaded from disk or dirty reference blocks may be written out. + * + * @param refCounts The reference counts to drain + **/ +void drainRefCounts(RefCounts *refCounts); + +/** + * Mark all reference count blocks dirty and cause them to hold locks on slab + * journal block 1. + * + * @param refCounts The RefCounts of the reference blocks + **/ +void acquireDirtyBlockLocks(RefCounts *refCounts); + +/** + * Dump information about this RefCounts structure. + * + * @param refCounts The RefCounts to dump + **/ +void dumpRefCounts(const RefCounts *refCounts); + +#endif // REF_COUNTS_H diff --git a/vdo/base/refCountsInternals.h b/vdo/base/refCountsInternals.h new file mode 100644 index 0000000..a1bd1db --- /dev/null +++ b/vdo/base/refCountsInternals.h @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/refCountsInternals.h#4 $ + */ + +#ifndef REF_COUNTS_INTERNALS_H +#define REF_COUNTS_INTERNALS_H + +#include "refCounts.h" + +#include "journalPoint.h" +#include "referenceBlock.h" +#include "slab.h" +#include "blockAllocatorInternals.h" +#include "waitQueue.h" + +/** + * Represents the possible status of a block. + **/ +typedef enum referenceStatus { + RS_FREE, // this block is free + RS_SINGLE, // this block is singly-referenced + RS_SHARED, // this block is shared + RS_PROVISIONAL // this block is provisionally allocated +} ReferenceStatus; + +/** + * The SearchCursor represents the saved position of a free block search. + **/ +typedef struct searchCursor { + /** The reference block containing the current search index */ + ReferenceBlock *block; + /** The position at which to start searching for the next free counter */ + SlabBlockNumber index; + /** The position just past the last valid counter in the current block */ + SlabBlockNumber endIndex; + + /** A pointer to the first reference block in the slab */ + ReferenceBlock *firstBlock; + /** A pointer to the last reference block in the slab */ + ReferenceBlock *lastBlock; +} SearchCursor; + +/* + * RefCounts structure + * + * A reference count is maintained for each PhysicalBlockNumber. The vast + * majority of blocks have a very small reference count (usually 0 or 1). + * For references less than or equal to MAXIMUM_REFS (254) the reference count + * is stored in counters[pbn]. 
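+ * A count of EMPTY_REFERENCE_COUNT (0) marks a free block, and
+ * PROVISIONAL_REFERENCE_COUNT (255) marks a block which has been provisionally
+ * allocated (see referenceBlock.h).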
+ * + */ +struct refCounts { + /** The slab of this reference block */ + Slab *slab; + + /** The size of the counters array */ + uint32_t blockCount; + /** The number of free blocks */ + uint32_t freeBlocks; + /** The array of reference counts */ + ReferenceCount *counters; // use ALLOCATE to align data ptr + + /** The saved block pointer and array indexes for the free block search */ + SearchCursor searchCursor; + + /** A list of the dirty blocks waiting to be written out */ + WaitQueue dirtyBlocks; + /** The number of blocks which are currently writing */ + size_t activeCount; + + /** A waiter object for updating the slab summary */ + Waiter slabSummaryWaiter; + /** Whether slab summary update is in progress */ + bool updatingSlabSummary; + + /** The notifier for read-only mode */ + ReadOnlyNotifier *readOnlyNotifier; + /** The refcount statistics, shared by all refcounts in our physical zone */ + AtomicRefCountStatistics *statistics; + /** The layer PBN for the first ReferenceBlock */ + PhysicalBlockNumber origin; + /** The latest slab journal entry this RefCounts has been updated with */ + JournalPoint slabJournalPoint; + + /** The number of reference count blocks */ + uint32_t referenceBlockCount; + /** reference count block array */ + ReferenceBlock blocks[]; +}; + +/** + * Convert a reference count to a reference status. + * + * @param count The count to convert + * + * @return The appropriate reference status + **/ +__attribute__((warn_unused_result)) +ReferenceStatus referenceCountToStatus(ReferenceCount count); + +/** + * Convert a generic VDOCompletion to a RefCounts. + * + * @param completion The completion to convert + * + * @return The completion as a RefCounts + **/ +RefCounts *asRefCounts(VDOCompletion *completion) + __attribute__((warn_unused_result)); + +/** + * Get the reference block that covers the given block index (exposed for + * testing). + * + * @param refCounts The refcounts object + * @param index The block index + **/ +ReferenceBlock *getReferenceBlock(RefCounts *refCounts, SlabBlockNumber index) + __attribute__((warn_unused_result)); + +/** + * Find the reference counters for a given block (exposed for testing). + * + * @param block The ReferenceBlock in question + * + * @return A pointer to the reference counters for this block + **/ +ReferenceCount *getReferenceCountersForBlock(ReferenceBlock *block) + __attribute__((warn_unused_result)); + +/** + * Copy data from a reference block to a buffer ready to be written out + * (exposed for testing). + * + * @param block The block to copy + * @param buffer The char buffer to fill with the packed block + **/ +void packReferenceBlock(ReferenceBlock *block, void *buffer); + +/** + * Get the reference status of a block. Exposed only for unit testing. + * + * @param [in] refCounts The refcounts object + * @param [in] pbn The physical block number + * @param [out] statusPtr Where to put the status of the block + * + * @return A success or error code, specifically: + * VDO_OUT_OF_RANGE if the pbn is out of range. + **/ +int getReferenceStatus(RefCounts *refCounts, + PhysicalBlockNumber pbn, + ReferenceStatus *statusPtr) + __attribute__((warn_unused_result)); + +/** + * Find the first block with a reference count of zero in the specified range + * of reference counter indexes. Exposed for unit testing. 
+ * + * @param [in] refCounts The reference counters to scan + * @param [in] startIndex The array index at which to start scanning + * (included in the scan) + * @param [in] endIndex The array index at which to stop scanning + * (excluded from the scan) + * @param [out] indexPtr A pointer to hold the array index of the free block + * + * @return true if a free block was found in the specified range + **/ +bool findFreeBlock(const RefCounts *refCounts, + SlabBlockNumber startIndex, + SlabBlockNumber endIndex, + SlabBlockNumber *indexPtr) + __attribute__((warn_unused_result)); + +/** + * Request a RefCounts save its oldest dirty block asynchronously. + * + * @param refCounts The RefCounts object to notify + **/ +void saveOldestReferenceBlock(RefCounts *refCounts); + +/** + * Reset all reference counts back to RS_FREE. + * + * @param refCounts The reference counters to reset + **/ +void resetReferenceCounts(RefCounts *refCounts); + +#endif // REF_COUNTS_INTERNALS_H diff --git a/vdo/base/referenceBlock.h b/vdo/base/referenceBlock.h new file mode 100644 index 0000000..8014c3b --- /dev/null +++ b/vdo/base/referenceBlock.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceBlock.h#1 $ + */ + +#ifndef REFERENCE_BLOCK_H +#define REFERENCE_BLOCK_H + +#include "constants.h" +#include "journalPoint.h" +#include "types.h" +#include "waitQueue.h" + +/** + * A type representing a reference count. + **/ +typedef uint8_t ReferenceCount; + +/** + * Special ReferenceCount values. + **/ +enum { + EMPTY_REFERENCE_COUNT = 0, + MAXIMUM_REFERENCE_COUNT = 254, + PROVISIONAL_REFERENCE_COUNT = 255, +}; + +enum { + COUNTS_PER_SECTOR = ((VDO_SECTOR_SIZE - sizeof(PackedJournalPoint)) + / sizeof(ReferenceCount)), + COUNTS_PER_BLOCK = COUNTS_PER_SECTOR * SECTORS_PER_BLOCK, +}; + +/** + * The format of a ReferenceSector on disk. + **/ +typedef struct { + PackedJournalPoint commitPoint; + ReferenceCount counts[COUNTS_PER_SECTOR]; +} __attribute__((packed)) PackedReferenceSector; + +typedef struct { + PackedReferenceSector sectors[SECTORS_PER_BLOCK]; +} PackedReferenceBlock; + +/* + * ReferenceBlock structure + * + * Blocks are used as a proxy, permitting saves of partial refcounts. 
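+ * Each ReferenceBlock covers COUNTS_PER_BLOCK counters and is written to disk
+ * as a PackedReferenceBlock.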
+ **/ +typedef struct { + /** This block waits on the refCounts to tell it to write */ + Waiter waiter; + /** The parent RefCount structure */ + RefCounts *refCounts; + /** The number of references in this block that represent allocations */ + BlockSize allocatedCount; + /** The slab journal block on which this block must hold a lock */ + SequenceNumber slabJournalLock; + /** + * The slab journal block which should be released when this block + * is committed + **/ + SequenceNumber slabJournalLockToRelease; + /** The point up to which each sector is accurate on disk */ + JournalPoint commitPoints[SECTORS_PER_BLOCK]; + /** Whether this block has been modified since it was written to disk */ + bool isDirty; + /** Whether this block is currently writing */ + bool isWriting; +} ReferenceBlock; + +#endif // REFERENCE_BLOCK_H diff --git a/vdo/base/referenceCountRebuild.c b/vdo/base/referenceCountRebuild.c new file mode 100644 index 0000000..a3d91ac --- /dev/null +++ b/vdo/base/referenceCountRebuild.c @@ -0,0 +1,491 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceCountRebuild.c#6 $ + */ + +#include "referenceCountRebuild.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMap.h" +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "forest.h" +#include "constants.h" +#include "numUtils.h" +#include "refCounts.h" +#include "slabDepot.h" +#include "vdoInternal.h" +#include "vdoPageCache.h" + +/** + * A reference count rebuild completion. + * Note that the page completions kept in this structure are not immediately + * freed, so the corresponding pages will be locked down in the page cache + * until the rebuild frees them. 
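+ * The number of page completions, and thus of locked pages, is limited to
+ * half the configured cache size and never exceeds
+ * MAXIMUM_SIMULTANEOUS_BLOCK_MAP_RESTORATION_READS (see
+ * makeRebuildCompletion()).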
+ **/ +typedef struct { + /** completion header */ + VDOCompletion completion; + /** the completion for flushing the block map */ + VDOCompletion subTaskCompletion; + /** the thread on which all block map operations must be done */ + ThreadID logicalThreadID; + /** the admin thread */ + ThreadID adminThreadID; + /** the block map */ + BlockMap *blockMap; + /** the slab depot */ + SlabDepot *depot; + /** whether this recovery has been aborted */ + bool aborted; + /** whether we are currently launching the initial round of requests */ + bool launching; + /** The number of logical blocks observed used */ + BlockCount *logicalBlocksUsed; + /** The number of block map data blocks */ + BlockCount *blockMapDataBlocks; + /** the next page to fetch */ + PageCount pageToFetch; + /** the number of leaf pages in the block map */ + PageCount leafPages; + /** the last slot of the block map */ + BlockMapSlot lastSlot; + /** number of pending (non-ready) requests*/ + PageCount outstanding; + /** number of page completions */ + PageCount pageCount; + /** array of requested, potentially ready page completions */ + VDOPageCompletion pageCompletions[]; +} RebuildCompletion; + +/** + * Convert a VDOCompletion to a RebuildCompletion. + * + * @param completion The completion to convert + * + * @return The completion as a RebuildCompletion + **/ +__attribute__((warn_unused_result)) +static inline RebuildCompletion *asRebuildCompletion(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(RebuildCompletion, completion) == 0); + assertCompletionType(completion->type, REFERENCE_COUNT_REBUILD_COMPLETION); + return (RebuildCompletion *) completion; +} + +/** + * Free a RebuildCompletion and null out the reference to it. + * + * @param completionPtr a pointer to the completion to free + **/ +static void freeRebuildCompletion(VDOCompletion **completionPtr) +{ + VDOCompletion *completion = *completionPtr; + if (completion == NULL) { + return; + } + + RebuildCompletion *rebuild = asRebuildCompletion(completion); + destroyEnqueueable(&rebuild->subTaskCompletion); + destroyEnqueueable(completion); + FREE(rebuild); + *completionPtr = NULL; +} + +/** + * Free the RebuildCompletion and notify the parent that the block map + * rebuild is done. This callback is registered in rebuildBlockMap(). + * + * @param completion The RebuildCompletion + **/ +static void finishRebuild(VDOCompletion *completion) +{ + int result = completion->result; + VDOCompletion *parent = completion->parent; + freeRebuildCompletion(&completion); + finishCompletion(parent, result); +} + +/** + * Make a new rebuild completion. 
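+ * This must be called on the thread for logical zone zero.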
+ * + * @param [in] vdo The VDO + * @param [in] logicalBlocksUsed A pointer to hold the logical blocks used + * @param [in] blockMapDataBlocks A pointer to hold the number of block map + * data blocks + * @param [in] parent The parent of the rebuild completion + * @param [out] rebuildPtr The new block map rebuild completion + * + * @return a success or error code + **/ +static int makeRebuildCompletion(VDO *vdo, + BlockCount *logicalBlocksUsed, + BlockCount *blockMapDataBlocks, + VDOCompletion *parent, + RebuildCompletion **rebuildPtr) +{ + BlockMap *blockMap = getBlockMap(vdo); + PageCount pageCount + = minPageCount(getConfiguredCacheSize(vdo) >> 1, + MAXIMUM_SIMULTANEOUS_BLOCK_MAP_RESTORATION_READS); + + RebuildCompletion *rebuild; + int result = ALLOCATE_EXTENDED(RebuildCompletion, pageCount, + VDOPageCompletion, __func__, &rebuild); + if (result != UDS_SUCCESS) { + return result; + } + + result = initializeEnqueueableCompletion(&rebuild->completion, + REFERENCE_COUNT_REBUILD_COMPLETION, + vdo->layer); + if (result != VDO_SUCCESS) { + VDOCompletion *completion = &rebuild->completion; + freeRebuildCompletion(&completion); + return result; + } + + result = initializeEnqueueableCompletion(&rebuild->subTaskCompletion, + SUB_TASK_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + VDOCompletion *completion = &rebuild->completion; + freeRebuildCompletion(&completion); + return result; + } + + rebuild->blockMap = blockMap; + rebuild->depot = vdo->depot; + rebuild->logicalBlocksUsed = logicalBlocksUsed; + rebuild->blockMapDataBlocks = blockMapDataBlocks; + rebuild->pageCount = pageCount; + rebuild->leafPages = computeBlockMapPageCount(blockMap->entryCount); + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + rebuild->logicalThreadID = getLogicalZoneThread(threadConfig, 0); + rebuild->adminThreadID = getAdminThread(threadConfig); + + ASSERT_LOG_ONLY((getCallbackThreadID() == rebuild->logicalThreadID), + "%s must be called on logical thread %u (not %u)", __func__, + rebuild->logicalThreadID, getCallbackThreadID()); + prepareCompletion(&rebuild->completion, finishRebuild, finishRebuild, + rebuild->logicalThreadID, parent); + + *rebuildPtr = rebuild; + return VDO_SUCCESS; +} + +/** + * Flush the block map now that all the reference counts are rebuilt. This + * callback is registered in finishIfDone(). + * + * @param completion The sub-task completion + **/ +static void flushBlockMapUpdates(VDOCompletion *completion) +{ + logInfo("Flushing block map changes"); + prepareToFinishParent(completion, completion->parent); + drainBlockMap(asRebuildCompletion(completion->parent)->blockMap, + ADMIN_STATE_RECOVERING, completion); +} + +/** + * Check whether the rebuild is done. If it succeeded, continue by flushing the + * block map. + * + * @param rebuild The rebuild completion + * + * @return true if the rebuild is complete + **/ +static bool finishIfDone(RebuildCompletion *rebuild) +{ + if (rebuild->launching || (rebuild->outstanding > 0)) { + return false; + } + + if (rebuild->aborted) { + completeCompletion(&rebuild->completion); + return true; + } + + if (rebuild->pageToFetch < rebuild->leafPages) { + return false; + } + + prepareCompletion(&rebuild->subTaskCompletion, flushBlockMapUpdates, + finishParentCallback, rebuild->adminThreadID, rebuild); + invokeCallback(&rebuild->subTaskCompletion); + return true; +} + +/** + * Record that there has been an error during the rebuild. 
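+ * Only the first error is recorded; setCompletionResult() leaves any previously saved error in place.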
+ * + * @param rebuild The rebuild completion + * @param result The error result to use, if one is not already saved + **/ +static void abortRebuild(RebuildCompletion *rebuild, int result) +{ + rebuild->aborted = true; + setCompletionResult(&rebuild->completion, result); +} + +/** + * Handle an error loading a page. + * + * @param completion The VDOPageCompletion + **/ +static void handlePageLoadError(VDOCompletion *completion) +{ + RebuildCompletion *rebuild = asRebuildCompletion(completion->parent); + rebuild->outstanding--; + abortRebuild(rebuild, completion->result); + releaseVDOPageCompletion(completion); + finishIfDone(rebuild); +} + +/** + * Rebuild reference counts from a block map page. + * + * @param rebuild The rebuild completion + * @param completion The page completion holding the page + * + * @return VDO_SUCCESS or an error + **/ +static int rebuildReferenceCountsFromPage(RebuildCompletion *rebuild, + VDOCompletion *completion) +{ + BlockMapPage *page = dereferenceWritableVDOPage(completion); + int result = ASSERT(page != NULL, "page available"); + if (result != VDO_SUCCESS) { + return result; + } + + if (!isBlockMapPageInitialized(page)) { + return VDO_SUCCESS; + } + + // Remove any bogus entries which exist beyond the end of the logical space. + if (getBlockMapPagePBN(page) == rebuild->lastSlot.pbn) { + for (SlotNumber slot = rebuild->lastSlot.slot; + slot < BLOCK_MAP_ENTRIES_PER_PAGE; slot++) { + DataLocation mapping = unpackBlockMapEntry(&page->entries[slot]); + if (isMappedLocation(&mapping)) { + page->entries[slot] = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + requestVDOPageWrite(completion); + } + } + } + + // Inform the slab depot of all entries on this page. + for (SlotNumber slot = 0; slot < BLOCK_MAP_ENTRIES_PER_PAGE; slot++) { + DataLocation mapping = unpackBlockMapEntry(&page->entries[slot]); + if (!isValidLocation(&mapping)) { + // This entry is invalid, so remove it from the page. + page->entries[slot] = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + requestVDOPageWrite(completion); + continue; + } + + if (!isMappedLocation(&mapping)) { + continue; + } + + (*rebuild->logicalBlocksUsed)++; + if (mapping.pbn == ZERO_BLOCK) { + continue; + } + + if (!isPhysicalDataBlock(rebuild->depot, mapping.pbn)) { + // This is a nonsense mapping. Remove it from the map so we're at least + // consistent and mark the page dirty. + page->entries[slot] = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + requestVDOPageWrite(completion); + continue; + } + + Slab *slab = getSlab(rebuild->depot, mapping.pbn); + int result = adjustReferenceCountForRebuild(slab->referenceCounts, + mapping.pbn, DATA_INCREMENT); + if (result != VDO_SUCCESS) { + logErrorWithStringError(result, + "Could not adjust reference count for PBN" + " %llu, slot %u mapped to PBN %llu", + getBlockMapPagePBN(page), slot, mapping.pbn); + page->entries[slot] = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + requestVDOPageWrite(completion); + } + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void fetchPage(RebuildCompletion *rebuild, VDOCompletion *completion); + +/** + * Process a page which has just been loaded. This callback is registered by + * fetchPage(). 
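+ * The page completion is released here and then immediately reused to fetch the next leaf page that has not yet been requested.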
+ * + * @param completion The VDOPageCompletion for the fetched page + **/ +static void pageLoaded(VDOCompletion *completion) +{ + RebuildCompletion *rebuild = asRebuildCompletion(completion->parent); + rebuild->outstanding--; + + int result = rebuildReferenceCountsFromPage(rebuild, completion); + if (result != VDO_SUCCESS) { + abortRebuild(rebuild, result); + } + + releaseVDOPageCompletion(completion); + if (finishIfDone(rebuild)) { + return; + } + + // Advance progress to the next page, and fetch the next page we + // haven't yet requested. + fetchPage(rebuild, completion); +} + +/** + * Fetch a page from the block map. + * + * @param rebuild the RebuildCompletion + * @param completion the page completion to use + **/ +static void fetchPage(RebuildCompletion *rebuild, VDOCompletion *completion) +{ + while (rebuild->pageToFetch < rebuild->leafPages) { + PhysicalBlockNumber pbn = findBlockMapPagePBN(rebuild->blockMap, + rebuild->pageToFetch++); + if (pbn == ZERO_BLOCK) { + continue; + } + + if (!isPhysicalDataBlock(rebuild->depot, pbn)) { + abortRebuild(rebuild, VDO_BAD_MAPPING); + if (finishIfDone(rebuild)) { + return; + } + continue; + } + + initVDOPageCompletion(((VDOPageCompletion *) completion), + rebuild->blockMap->zones[0].pageCache, + pbn, true, &rebuild->completion, + pageLoaded, handlePageLoadError); + rebuild->outstanding++; + getVDOPageAsync(completion); + return; + } +} + +/** + * Rebuild reference counts from the leaf block map pages now that reference + * counts have been rebuilt from the interior tree pages (which have been + * loaded in the process). This callback is registered in + * rebuildReferenceCounts(). + * + * @param completion The sub-task completion + **/ +static void rebuildFromLeaves(VDOCompletion *completion) +{ + RebuildCompletion *rebuild = asRebuildCompletion(completion->parent); + *rebuild->logicalBlocksUsed = 0; + + // The PBN calculation doesn't work until the tree pages have been loaded, + // so we can't set this value at the start of rebuild. + rebuild->lastSlot = (BlockMapSlot) { + .slot = rebuild->blockMap->entryCount % BLOCK_MAP_ENTRIES_PER_PAGE, + .pbn = findBlockMapPagePBN(rebuild->blockMap, rebuild->leafPages - 1), + }; + + // Prevent any page from being processed until all pages have been launched. + rebuild->launching = true; + for (PageCount i = 0; i < rebuild->pageCount; i++) { + fetchPage(rebuild, &rebuild->pageCompletions[i].completion); + } + rebuild->launching = false; + finishIfDone(rebuild); +} + +/** + * Process a single entry from the block map tree. + * + *
Implements EntryCallback. + * + * @param pbn A pbn which holds a block map tree page + * @param completion The parent completion of the traversal + * + * @return VDO_SUCCESS or an error + **/ +static int processEntry(PhysicalBlockNumber pbn, VDOCompletion *completion) +{ + RebuildCompletion *rebuild = asRebuildCompletion(completion->parent); + if ((pbn == ZERO_BLOCK) || !isPhysicalDataBlock(rebuild->depot, pbn)) { + return logErrorWithStringError(VDO_BAD_CONFIGURATION, + "PBN %llu out of range", + pbn); + } + + Slab *slab = getSlab(rebuild->depot, pbn); + int result = adjustReferenceCountForRebuild(slab->referenceCounts, pbn, + BLOCK_MAP_INCREMENT); + if (result != VDO_SUCCESS) { + return logErrorWithStringError(result, + "Could not adjust reference count for " + "block map tree PBN %llu", + pbn); + } + + (*rebuild->blockMapDataBlocks)++; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void rebuildReferenceCounts(VDO *vdo, + VDOCompletion *parent, + BlockCount *logicalBlocksUsed, + BlockCount *blockMapDataBlocks) +{ + RebuildCompletion *rebuild; + int result = makeRebuildCompletion(vdo, logicalBlocksUsed, + blockMapDataBlocks, parent, &rebuild); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + // Completion chaining from page cache hits can lead to stack overflow + // during the rebuild, so clear out the cache before this rebuild phase. + result = invalidateVDOPageCache(rebuild->blockMap->zones[0].pageCache); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + // First traverse the block map trees. + *rebuild->blockMapDataBlocks = 0; + VDOCompletion *completion = &rebuild->subTaskCompletion; + prepareCompletion(completion, rebuildFromLeaves, finishParentCallback, + rebuild->logicalThreadID, rebuild); + traverseForest(rebuild->blockMap, processEntry, completion); +} diff --git a/vdo/base/referenceCountRebuild.h b/vdo/base/referenceCountRebuild.h new file mode 100644 index 0000000..59363ac --- /dev/null +++ b/vdo/base/referenceCountRebuild.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceCountRebuild.h#1 $ + */ + +#ifndef REFERENCE_COUNT_REBUILD_H +#define REFERENCE_COUNT_REBUILD_H + +#include "types.h" + +/** + * Rebuild the reference counts from the block map (read-only rebuild). 
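+ * The block map tree pages are traversed first, then the leaf pages; the logical block and block map data block tallies are accumulated into the caller's variables as the pages are processed.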
+ * + * @param [in] vdo The VDO + * @param [in] parent The completion to notify when the rebuild is + * complete + * @param [out] logicalBlocksUsed A pointer to hold the logical blocks used + * @param [out] blockMapDataBlocks A pointer to hold the number of block map + * data blocks + **/ +void rebuildReferenceCounts(VDO *vdo, + VDOCompletion *parent, + BlockCount *logicalBlocksUsed, + BlockCount *blockMapDataBlocks); + +#endif // REFERENCE_COUNT_REBUILD_H diff --git a/vdo/base/referenceOperation.c b/vdo/base/referenceOperation.c new file mode 100644 index 0000000..a8ea9a0 --- /dev/null +++ b/vdo/base/referenceOperation.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceOperation.c#1 $ + */ + +#include "referenceOperation.h" + +#include "physicalZone.h" +#include "types.h" + +/**********************************************************************/ +static PBNLock *returnPBNLock(ReferenceOperation operation) +{ + return (PBNLock *) operation.context; +} + +/**********************************************************************/ +void setUpReferenceOperationWithLock(JournalOperation type, + PhysicalBlockNumber pbn, + BlockMappingState state, + PBNLock *lock, + ReferenceOperation *operation) +{ + *operation = (ReferenceOperation) { + .type = type, + .pbn = pbn, + .state = state, + .lockGetter = returnPBNLock, + .context = lock, + }; +} + +/**********************************************************************/ +static PBNLock *lookUpPBNLock(ReferenceOperation operation) +{ + return ((operation.context == NULL) + ? NULL : getPBNLock(operation.context, operation.pbn)); +} + +/**********************************************************************/ +void setUpReferenceOperationWithZone(JournalOperation type, + PhysicalBlockNumber pbn, + BlockMappingState state, + PhysicalZone *zone, + ReferenceOperation *operation) +{ + *operation = (ReferenceOperation) { + .type = type, + .pbn = pbn, + .state = state, + .lockGetter = lookUpPBNLock, + .context = zone, + }; +} diff --git a/vdo/base/referenceOperation.h b/vdo/base/referenceOperation.h new file mode 100644 index 0000000..c846ec6 --- /dev/null +++ b/vdo/base/referenceOperation.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceOperation.h#1 $ + */ + +#ifndef REFERENCE_OPERATION_H +#define REFERENCE_OPERATION_H + +#include "types.h" + +typedef struct referenceOperation ReferenceOperation; + +/** + * Get the PBNLock associated with a ReferenceOperation. + * + * @param operation The ReferenceOperation + * + * @return The PBNLock on the block of a ReferenceOperation or NULL if there + * isn't one + **/ +typedef PBNLock *PBNLockGetter(ReferenceOperation operation); + +/** + * The current operation on a physical block (from the point of view of the + * DataVIO doing the operation) + **/ +struct referenceOperation { + /** The operation being performed */ + JournalOperation type; + /** The PBN of the block being operated on */ + PhysicalBlockNumber pbn; + /** The mapping state of the block being operated on */ + BlockMappingState state; + /** A function to use to get any PBNLock associated with this operation */ + PBNLockGetter *lockGetter; + /** The context to pass to the PBNLockGetter */ + void *context; +}; + +/** + * Get the PBNLock associated with the current ReferenceOperation. + * + * @param operation The reference operation + * + * @return The PBNLock on the block of the current operation or NULL if there + * isn't one + **/ +__attribute__((warn_unused_result)) +static inline +PBNLock *getReferenceOperationPBNLock(ReferenceOperation operation) +{ + return ((operation.lockGetter == NULL) + ? NULL : operation.lockGetter(operation)); +} + +/** + * Set up a ReferenceOperation for which we already have the lock. + * + * @param type The type of operation + * @param pbn The PBN of the block on which to operate + * @param state The mapping state of the block on which to operate + * @param lock The PBNLock to associate with the operation + * @param operation The ReferenceOperation to set up + **/ +void setUpReferenceOperationWithLock(JournalOperation type, + PhysicalBlockNumber pbn, + BlockMappingState state, + PBNLock *lock, + ReferenceOperation *operation); + +/** + * Set up a ReferenceOperation for which we will need to look up the lock later. + * + * @param type The type of operation + * @param pbn The PBN of the block on which to operate + * @param state The mapping state of the block on which to operate + * @param zone The PhysicalZone from which the PBNLock can be retrieved + * when needed + * @param operation The ReferenceOperation to set up + **/ +void setUpReferenceOperationWithZone(JournalOperation type, + PhysicalBlockNumber pbn, + BlockMappingState state, + PhysicalZone *zone, + ReferenceOperation *operation); + +#endif // REFERENCE_OPERATION_H diff --git a/vdo/base/releaseVersions.h b/vdo/base/releaseVersions.h new file mode 100644 index 0000000..7620f17 --- /dev/null +++ b/vdo/base/releaseVersions.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef RELEASE_VERSIONS_H +#define RELEASE_VERSIONS_H + +enum { + OXYGEN_RELEASE_VERSION_NUMBER = 109583, + FLUORINE_RELEASE_VERSION_NUMBER = 115838, + NEON_RELEASE_VERSION_NUMBER = 120965, + SODIUM_RELEASE_VERSION_NUMBER = 127441, + MAGNESIUM_RELEASE_VERSION_NUMBER = 131337, + ALUMINUM_RELEASE_VERSION_NUMBER = 133524, + HEAD_RELEASE_VERSION_NUMBER = 0, + CURRENT_RELEASE_VERSION_NUMBER = ALUMINUM_RELEASE_VERSION_NUMBER, +}; + +#endif /* not RELEASE_VERSIONS_H */ diff --git a/vdo/base/ringNode.h b/vdo/base/ringNode.h new file mode 100644 index 0000000..5f389f4 --- /dev/null +++ b/vdo/base/ringNode.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/ringNode.h#1 $ + */ + +#ifndef RING_NODE_H +#define RING_NODE_H + +#include "types.h" + +/** + * A ring node is a member of a doubly-linked circular list. + * + * Each node is usually embedded within a data structure that contains the + * relevant payload. In addition the ring head is also represented by a + * node where the next field designates the first element of the ring and the + * prev field designates the last. + * + * An empty ring contains next and prev fields that point back to the ring + * head itself. + * + * Typical iteration over a ring, from the front and back: + * + * for (RingNode *n = head->next; n != head; n = n->next) { ... } + * for (RingNode *p = head->prev; p != head; p = p->prev) { ... } + **/ +typedef struct ringNode RingNode; + +struct ringNode { + RingNode *next; + RingNode *prev; +}; + +/** + * Initialize a ring to be empty. + * + * @param head The head of the ring + **/ +static inline void initializeRing(RingNode *head) +{ + head->next = head->prev = head; +} + +/** + * Check whether a ring is empty. + * + * @param head The head of the ring + * + * @return true if the ring is empty + **/ +static inline bool isRingEmpty(const RingNode *head) +{ + return (head->next == head); +} + +/** + * Check whether a ring contains exactly one node. + * + * @param head The head of the ring + * + * @return true if the ring contains exactly one member + **/ +static inline bool isRingSingleton(const RingNode *head) +{ + return (!isRingEmpty(head) && (head->prev == head->next)); +} + +/** + * Unsplice a contiguous chain of at least one node from its ring. 
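+ * For example, if the ring at ``head`` is not empty, unspliceRingChain(head->next, head->prev) empties ``head`` and leaves its former members linked together as a separate ring.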
+ * + * @param first the first entry in the ring to unsplice + * @param last the last entry in the ring to unsplice, + * may be the same as ``first`` + * + * The effect of this is to create two rings, the one designated + * by first through last, and the other consisting of anything remaining. + **/ +static inline void unspliceRingChain(RingNode *first, + RingNode *last) +{ + first->prev->next = last->next; + last->next->prev = first->prev; + first->prev = last; + last->next = first; +} + +/** + * Remove a ring node from its ring. + * + * @param node the ring node + * + * @return the removed node, for convenience + **/ +static inline RingNode *unspliceRingNode(RingNode *node) +{ + unspliceRingChain(node, node); + return node; +} + +/** + * Splice a contiguous chain of at least one node after the specified entry, + * which may be the head of a ring. + * + * @param first the first entry in a contiguous span of nodes + * @param last the last entry in a contiguous span of nodes, + * may be the same as ``first`` + * @param where the entry after which ``first`` through ``last`` + * shall appear + * + * The effect of this is to unsplice first through last (if necessary) and + * insert them after ``where`` so that the previous nodes after ``where`` + * now appear after ``last``. + **/ +static inline void spliceRingChainAfter(RingNode *first, + RingNode *last, + RingNode *where) +{ + if (last->next != first) { + unspliceRingChain(first, last); + } + last->next = where->next; + first->prev = where; + where->next->prev = last; + where->next = first; +} + +/** + * Splice a contiguous chain of at least one node before the specified entry, + * which may be the tail of a list. + * + * @param first the first entry in a contiguous span of nodes + * @param last the last entry in a contiguous span of nodes, + * may be the same as ``first`` + * @param where the entry before which ``first`` through ``last`` + * shall appear + * + * The effect of this is to unsplice first through last (if necessary) and + * insert them before ``where`` so that the previous nodes before ``where`` + * now appear before ``first``. + **/ +static inline void spliceRingChainBefore(RingNode *first, + RingNode *last, + RingNode *where) +{ + if (last->next != first) { + unspliceRingChain(first, last); + } + first->prev = where->prev; + last->next = where; + where->prev->next = first; + where->prev = last; +} + +/** + * Push a single node on the end of a ring. + * + * @param head The ring head + * @param node The node to push + **/ +static inline void pushRingNode(RingNode *head, RingNode *node) +{ + spliceRingChainBefore(node, node, head); +} + +/** + * Pop a single node off the end of a ring. + * + * @param head The ring head + * + * @return NULL if the ring was empty, otherwise the node that was + * removed from the ring (``head->prev``) + **/ +static inline RingNode *popRingNode(RingNode *head) +{ + return (isRingEmpty(head) ? NULL : unspliceRingNode(head->prev)); +} + +/** + * Remove a single node off the front of the list + **/ +static inline RingNode *chopRingNode(RingNode *head) +{ + return (isRingEmpty(head) ? NULL : unspliceRingNode(head->next)); +} + +#endif // RING_NODE_H diff --git a/vdo/base/slab.c b/vdo/base/slab.c new file mode 100644 index 0000000..f2903d6 --- /dev/null +++ b/vdo/base/slab.c @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slab.c#9 $ + */ + +#include "slab.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminState.h" +#include "blockAllocatorInternals.h" +#include "completion.h" +#include "constants.h" +#include "numUtils.h" +#include "pbnLock.h" +#include "recoveryJournal.h" +#include "refCounts.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "slabJournalInternals.h" +#include "slabSummary.h" + +/**********************************************************************/ +int configureSlab(BlockCount slabSize, + BlockCount slabJournalBlocks, + SlabConfig *slabConfig) +{ + if (slabJournalBlocks >= slabSize) { + return VDO_BAD_CONFIGURATION; + } + + /* + * This calculation should technically be a recurrence, but the total number + * of metadata blocks is currently less than a single block of refCounts, so + * we'd gain at most one data block in each slab with more iteration. + */ + BlockCount refBlocks + = getSavedReferenceCountSize(slabSize - slabJournalBlocks); + BlockCount metaBlocks = (refBlocks + slabJournalBlocks); + + // Make sure test code hasn't configured slabs to be too small. + if (metaBlocks >= slabSize) { + return VDO_BAD_CONFIGURATION; + } + + /* + * If the slab size is very small, assume this must be a unit test and + * override the number of data blocks to be a power of two (wasting blocks + * in the slab). Many tests need their dataBlocks fields to be the exact + * capacity of the configured volume, and that used to fall out since they + * use a power of two for the number of data blocks, the slab size was a + * power of two, and every block in a slab was a data block. + * + * XXX Try to figure out some way of structuring testParameters and unit + * tests so this hack isn't needed without having to edit several unit tests + * every time the metadata size changes by one block. + */ + BlockCount dataBlocks = slabSize - metaBlocks; + if ((slabSize < 1024) && !isPowerOfTwo(dataBlocks)) { + dataBlocks = ((BlockCount) 1 << logBaseTwo(dataBlocks)); + } + + /* + * Configure the slab journal thresholds. The flush threshold is 168 of 224 + * blocks in production, or 3/4ths, so we use this ratio for all sizes. + */ + BlockCount flushingThreshold = ((slabJournalBlocks * 3) + 3) / 4; + /* + * The blocking threshold should be far enough from the flushing + * threshold to not produce delays, but far enough from the end of the + * journal to allow multiple successive recovery failures. + */ + BlockCount remaining = slabJournalBlocks - flushingThreshold; + BlockCount blockingThreshold = flushingThreshold + ((remaining * 5) / 7); + /* + * The scrubbing threshold should be at least 2048 entries before the end of + * the journal.
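+ * (With the production 224-block journal, the formulas above give flushingThreshold = 168, remaining = 56, and blockingThreshold = 168 + 40 = 208.)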
+ */ + BlockCount minimalExtraSpace + = 1 + (MAXIMUM_USER_VIOS / SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK); + BlockCount scrubbingThreshold = blockingThreshold; + if (slabJournalBlocks > minimalExtraSpace) { + scrubbingThreshold = slabJournalBlocks - minimalExtraSpace; + } + if (blockingThreshold > scrubbingThreshold) { + blockingThreshold = scrubbingThreshold; + } + + *slabConfig = (SlabConfig) { + .slabBlocks = slabSize, + .dataBlocks = dataBlocks, + .referenceCountBlocks = refBlocks, + .slabJournalBlocks = slabJournalBlocks, + .slabJournalFlushingThreshold = flushingThreshold, + .slabJournalBlockingThreshold = blockingThreshold, + .slabJournalScrubbingThreshold = scrubbingThreshold + }; + return VDO_SUCCESS; +} + +/**********************************************************************/ +PhysicalBlockNumber getSlabJournalStartBlock(const SlabConfig *slabConfig, + PhysicalBlockNumber origin) +{ + return origin + slabConfig->dataBlocks + slabConfig->referenceCountBlocks; +} + +/**********************************************************************/ +int makeSlab(PhysicalBlockNumber slabOrigin, + BlockAllocator *allocator, + PhysicalBlockNumber translation, + RecoveryJournal *recoveryJournal, + SlabCount slabNumber, + bool isNew, + Slab **slabPtr) +{ + Slab *slab; + int result = ALLOCATE(1, Slab, __func__, &slab); + if (result != VDO_SUCCESS) { + return result; + } + + const SlabConfig *slabConfig = getSlabConfig(allocator->depot); + + slab->allocator = allocator; + slab->start = slabOrigin; + slab->end = slab->start + slabConfig->slabBlocks; + slab->slabNumber = slabNumber; + initializeRing(&slab->ringNode); + + slab->refCountsOrigin = slabOrigin + slabConfig->dataBlocks + translation; + slab->journalOrigin = (getSlabJournalStartBlock(slabConfig, slabOrigin) + + translation); + + result = makeSlabJournal(allocator, slab, recoveryJournal, &slab->journal); + if (result != VDO_SUCCESS) { + freeSlab(&slab); + return result; + } + + if (isNew) { + slab->state.state = ADMIN_STATE_NEW; + result = allocateRefCountsForSlab(slab); + if (result != VDO_SUCCESS) { + freeSlab(&slab); + return result; + } + } + + *slabPtr = slab; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int allocateRefCountsForSlab(Slab *slab) +{ + BlockAllocator *allocator = slab->allocator; + const SlabConfig *slabConfig = getSlabConfig(allocator->depot); + + int result = ASSERT(slab->referenceCounts == NULL, + "Slab %u doesn't allocate refcounts twice", + slab->slabNumber); + if (result != VDO_SUCCESS) { + return result; + } + + return makeRefCounts(slabConfig->dataBlocks, slab, slab->refCountsOrigin, + allocator->readOnlyNotifier, &slab->referenceCounts); +} + +/**********************************************************************/ +void freeSlab(Slab **slabPtr) +{ + Slab *slab = *slabPtr; + if (slab == NULL) { + return; + } + + unspliceRingNode(&slab->ringNode); + freeSlabJournal(&slab->journal); + freeRefCounts(&slab->referenceCounts); + FREE(slab); + *slabPtr = NULL; +} + +/**********************************************************************/ +ZoneCount getSlabZoneNumber(Slab *slab) +{ + return slab->allocator->zoneNumber; +} + +/**********************************************************************/ +void markSlabReplaying(Slab *slab) +{ + if (slab->status == SLAB_REBUILT) { + slab->status = SLAB_REPLAYING; + } +} + +/**********************************************************************/ +void markSlabUnrecovered(Slab *slab) +{ + slab->status = SLAB_REQUIRES_SCRUBBING; 
+} + +/**********************************************************************/ +BlockCount getSlabFreeBlockCount(const Slab *slab) +{ + return getUnreferencedBlockCount(slab->referenceCounts); +} + +/**********************************************************************/ +int modifySlabReferenceCount(Slab *slab, + const JournalPoint *journalPoint, + ReferenceOperation operation) +{ + if (slab == NULL) { + return VDO_SUCCESS; + } + + /* + * If the slab is unrecovered, preserve the refCount state and let scrubbing + * correct the refCount. Note that the slab journal has already captured all + * refCount updates. + */ + if (isUnrecoveredSlab(slab)) { + SequenceNumber entryLock = journalPoint->sequenceNumber; + adjustSlabJournalBlockReference(slab->journal, entryLock, -1); + return VDO_SUCCESS; + } + + bool freeStatusChanged; + int result = adjustReferenceCount(slab->referenceCounts, operation, + journalPoint, &freeStatusChanged); + if (result != VDO_SUCCESS) { + return result; + } + + if (freeStatusChanged) { + adjustFreeBlockCount(slab, !isIncrementOperation(operation.type)); + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int acquireProvisionalReference(Slab *slab, + PhysicalBlockNumber pbn, + PBNLock *lock) +{ + if (hasProvisionalReference(lock)) { + return VDO_SUCCESS; + } + + int result = provisionallyReferenceBlock(slab->referenceCounts, pbn, lock); + if (result != VDO_SUCCESS) { + return result; + } + + if (hasProvisionalReference(lock)) { + adjustFreeBlockCount(slab, false); + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int slabBlockNumberFromPBN(Slab *slab, + PhysicalBlockNumber physicalBlockNumber, + SlabBlockNumber *slabBlockNumberPtr) +{ + if (physicalBlockNumber < slab->start) { + return VDO_OUT_OF_RANGE; + } + + uint64_t slabBlockNumber = physicalBlockNumber - slab->start; + if (slabBlockNumber >= getSlabConfig(slab->allocator->depot)->dataBlocks) { + return VDO_OUT_OF_RANGE; + } + + *slabBlockNumberPtr = slabBlockNumber; + return VDO_SUCCESS; +} + +/**********************************************************************/ +bool shouldSaveFullyBuiltSlab(const Slab *slab) +{ + // Write out the refCounts if the slab has written them before, or it has + // any non-zero reference counts, or there are any slab journal blocks. + BlockCount dataBlocks = getSlabConfig(slab->allocator->depot)->dataBlocks; + return (mustLoadRefCounts(slab->allocator->summary, slab->slabNumber) + || (getSlabFreeBlockCount(slab) != dataBlocks) + || !isSlabJournalBlank(slab->journal)); +} + +/** + * Initiate a slab action. + * + * Implements AdminInitiator. 
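+ * (Drain operations mark a scrubbing slab as rebuilding and drain its slab journal and reference counts; load operations decode the slab journal; resume operations requeue the slab; anything else finishes with VDO_INVALID_ADMIN_STATE.)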
+ **/ +static void initiateSlabAction(AdminState *state) +{ + Slab *slab = container_of(state, Slab, state); + if (isDraining(state)) { + if (state->state == ADMIN_STATE_SCRUBBING) { + slab->status = SLAB_REBUILDING; + } + + drainSlabJournal(slab->journal); + + if (slab->referenceCounts != NULL) { + drainRefCounts(slab->referenceCounts); + } + + checkIfSlabDrained(slab); + return; + } + + if (isLoading(state)) { + decodeSlabJournal(slab->journal); + return; + } + + if (isResuming(state)) { + queueSlab(slab); + finishResuming(state); + return; + } + + finishOperationWithResult(state, VDO_INVALID_ADMIN_STATE); +} + +/**********************************************************************/ +void startSlabAction(Slab *slab, + AdminStateCode operation, + VDOCompletion *parent) +{ + startOperationWithWaiter(&slab->state, operation, parent, + initiateSlabAction); +} + +/**********************************************************************/ +void notifySlabJournalIsLoaded(Slab *slab, int result) +{ + if ((result == VDO_SUCCESS) && isCleanLoad(&slab->state)) { + // Since this is a normal or new load, we don't need the memory to read and + // process the recovery journal, so we can allocate reference counts now. + result = allocateRefCountsForSlab(slab); + } + + finishLoadingWithResult(&slab->state, result); +} + +/**********************************************************************/ +bool isSlabOpen(Slab *slab) +{ + return (!isQuiescing(&slab->state) && !isQuiescent(&slab->state)); +} + +/**********************************************************************/ +bool isSlabDraining(Slab *slab) +{ + return isDraining(&slab->state); +} + +/**********************************************************************/ +void checkIfSlabDrained(Slab *slab) +{ + if (isDraining(&slab->state) + && !isSlabJournalActive(slab->journal) + && ((slab->referenceCounts == NULL) + || !areRefCountsActive(slab->referenceCounts))) { + finishDrainingWithResult(&slab->state, + (isReadOnly(slab->allocator->readOnlyNotifier) + ? VDO_READ_ONLY : VDO_SUCCESS)); + } +} + +/**********************************************************************/ +void notifySlabJournalIsDrained(Slab *slab, int result) +{ + if (slab->referenceCounts == NULL) { + // This can happen when shutting down a VDO that was in read-only mode when + // loaded. 
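+ // There are no reference counts to drain in that case, so report them as drained immediately.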
+ notifyRefCountsAreDrained(slab, result); + return; + } + + setOperationResult(&slab->state, result); + drainRefCounts(slab->referenceCounts); +} + +/**********************************************************************/ +void notifyRefCountsAreDrained(Slab *slab, int result) +{ + finishDrainingWithResult(&slab->state, result); +} + +/**********************************************************************/ +bool isSlabResuming(Slab *slab) +{ + return isResuming(&slab->state); +} + +/**********************************************************************/ +void finishScrubbingSlab(Slab *slab) +{ + slab->status = SLAB_REBUILT; + queueSlab(slab); + reopenSlabJournal(slab->journal); +} + +/**********************************************************************/ +static const char *statusToString(SlabRebuildStatus status) +{ + switch (status) { + case SLAB_REBUILT: + return "REBUILT"; + case SLAB_REQUIRES_SCRUBBING: + return "SCRUBBING"; + case SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING: + return "PRIORITY_SCRUBBING"; + case SLAB_REBUILDING: + return "REBUILDING"; + case SLAB_REPLAYING: + return "REPLAYING"; + default: + return "UNKNOWN"; + } +} + +/**********************************************************************/ +void dumpSlab(const Slab *slab) +{ + if (slab->referenceCounts != NULL) { + // Terse because there are a lot of slabs to dump and syslog is lossy. + logInfo("slab %u: P%u, %llu free", + slab->slabNumber, slab->priority, getSlabFreeBlockCount(slab)); + } else { + logInfo("slab %u: status %s", slab->slabNumber, + statusToString(slab->status)); + } + + dumpSlabJournal(slab->journal); + + if (slab->referenceCounts != NULL) { + dumpRefCounts(slab->referenceCounts); + } else { + logInfo("refCounts is null"); + } +} diff --git a/vdo/base/slab.h b/vdo/base/slab.h new file mode 100644 index 0000000..c7f204b --- /dev/null +++ b/vdo/base/slab.h @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slab.h#8 $ + */ + +#ifndef VDO_SLAB_H +#define VDO_SLAB_H + +#include "permassert.h" + +#include "adminState.h" +#include "fixedLayout.h" +#include "journalPoint.h" +#include "referenceOperation.h" +#include "ringNode.h" +#include "types.h" + +typedef uint32_t SlabBlockNumber; + +typedef enum { + SLAB_REBUILT = 0, + SLAB_REPLAYING, + SLAB_REQUIRES_SCRUBBING, + SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING, + SLAB_REBUILDING, +} SlabRebuildStatus; + +/** + * This is the type declaration for the Slab type. (The struct tag is named + * vdoSlab to avoid a conflict with the linux kernel type). A Slab currently + * consists of a run of 2^23 data blocks, but that will soon change to + * dedicate a small number of those blocks for metadata storage for the + * reference counts and slab journal for the slab. 
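+ * Within each slab the data blocks come first, followed by the reference count blocks and then the slab journal blocks (see refCountsOrigin and journalOrigin below).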
+ **/ +struct vdoSlab { + /** A RingNode to queue this slab in a BlockAllocator ring */ + RingNode ringNode; + + /** The BlockAllocator that owns this slab */ + BlockAllocator *allocator; + + /** The reference counts for the data blocks in this slab */ + RefCounts *referenceCounts; + /** The journal for this slab */ + SlabJournal *journal; + + /** The slab number of this slab */ + SlabCount slabNumber; + /** The offset in the allocator partition of the first block in this slab */ + PhysicalBlockNumber start; + /** The offset of the first block past the end of this slab */ + PhysicalBlockNumber end; + /** The starting translated PBN of the slab journal */ + PhysicalBlockNumber journalOrigin; + /** The starting translated PBN of the reference counts */ + PhysicalBlockNumber refCountsOrigin; + + /** The administrative state of the slab */ + AdminState state; + /** The status of the slab */ + SlabRebuildStatus status; + /** Whether the slab was ever queued for scrubbing */ + bool wasQueuedForScrubbing; + + /** The priority at which this slab has been queued for allocation */ + uint8_t priority; +}; + +/** + * Measure and initialize the configuration to use for each slab. + * + * @param [in] slabSize The number of blocks per slab + * @param [in] slabJournalBlocks The number of blocks for the slab journal + * @param [out] slabConfig The slab configuration to initialize + * + * @return VDO_SUCCESS or an error code + **/ +int configureSlab(BlockCount slabSize, + BlockCount slabJournalBlocks, + SlabConfig *slabConfig) + __attribute__((warn_unused_result)); + +/** + * Convert a Slab's RingNode back to the Slab. + * + * @param ringNode The RingNode to convert + * + * @return The RingNode as a Slab + **/ +static inline Slab *slabFromRingNode(RingNode *ringNode) +{ + STATIC_ASSERT(offsetof(Slab, ringNode) == 0); + return (Slab *) ringNode; +} + +/** + * Get the physical block number of the start of the slab journal + * relative to the start block allocator partition. + * + * @param slabConfig The slab configuration of the VDO + * @param origin The first block of the slab + **/ +__attribute__((warn_unused_result)) +PhysicalBlockNumber getSlabJournalStartBlock(const SlabConfig *slabConfig, + PhysicalBlockNumber origin); + +/** + * Construct a new, empty slab. + * + * @param [in] slabOrigin The physical block number within the block + * allocator partition of the first block in the + * slab + * @param [in] allocator The block allocator to which the slab belongs + * @param [in] translation The translation from the depot's partition to + * the physical storage + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [in] slabNumber The slab number of the slab + * @param [in] isNew true if this slab is being + * allocated as part of a resize + * @param [out] slabPtr A pointer to receive the new slab + * + * @return VDO_SUCCESS or an error code + **/ +int makeSlab(PhysicalBlockNumber slabOrigin, + BlockAllocator *allocator, + PhysicalBlockNumber translation, + RecoveryJournal *recoveryJournal, + SlabCount slabNumber, + bool isNew, + Slab **slabPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate the reference counts for a slab. + * + * @param slab The slab whose reference counts need allocation. + * + * @return VDO_SUCCESS or an error code + **/ +int allocateRefCountsForSlab(Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Destroy a slab and null out the reference to it. 
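+ * The slab is unspliced from any allocator ring it is on, and its slab journal and reference counts are freed along with it.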
+ * + * @param slabPtr The reference to the slab to destroy + **/ +void freeSlab(Slab **slabPtr); + +/** + * Get the physical zone number of a slab. + * + * @param slab The slab + * + * @return The number of the slab's physical zone + **/ +ZoneCount getSlabZoneNumber(Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Check whether a slab is unrecovered. + * + * @param slab The slab to check + * + * @return true if the slab is unrecovered + **/ +static inline bool isUnrecoveredSlab(const Slab *slab) +{ + return (slab->status != SLAB_REBUILT); +} + +/** + * Check whether a slab is being replayed into. + * + * @param slab The slab to check + * + * @return true if the slab is replaying + **/ +static inline bool isReplayingSlab(const Slab *slab) +{ + return (slab->status == SLAB_REPLAYING); +} + +/** + * Check whether a slab is being rebuilt. + * + * @param slab The slab to check + * + * @return true if the slab is being rebuilt + **/ +static inline bool slabIsRebuilding(const Slab *slab) +{ + return (slab->status == SLAB_REBUILDING); +} + +/** + * Mark a slab as replaying, during offline recovery. + * + * @param slab The slab to mark + **/ +void markSlabReplaying(Slab *slab); + +/** + * Mark a slab as unrecovered, for online recovery. + * + * @param slab The slab to mark + **/ +void markSlabUnrecovered(Slab *slab); + +/** + * Get the current number of free blocks in a slab. + * + * @param slab The slab to query + * + * @return the number of free blocks in the slab + **/ +BlockCount getSlabFreeBlockCount(const Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Increment or decrement the reference count of a block in a slab. + * + * @param slab The slab containing the block (may be NULL when + * referencing the zero block) + * @param journalPoint The slab journal entry corresponding to this change + * @param operation The operation to perform on the reference count + * + * @return VDO_SUCCESS or an error + **/ +int modifySlabReferenceCount(Slab *slab, + const JournalPoint *journalPoint, + ReferenceOperation operation) + __attribute__((warn_unused_result)); + +/** + * Acquire a provisional reference on behalf of a PBN lock if the block it + * locks is unreferenced. + * + * @param slab The slab which contains the block + * @param pbn The physical block to reference + * @param lock The lock + * + * @return VDO_SUCCESS or an error + **/ +int acquireProvisionalReference(Slab *slab, + PhysicalBlockNumber pbn, + PBNLock *lock) + __attribute__((warn_unused_result)); + +/** + * Determine the index within the slab of a particular physical block number. + * + * @param [in] slab The slab + * @param [in] physicalBlockNumber The physical block number + * @param [out] slabBlockNumberPtr A pointer to the slab block number + * + * @return VDO_SUCCESS or an error code + **/ +int slabBlockNumberFromPBN(Slab *slab, + PhysicalBlockNumber physicalBlockNumber, + SlabBlockNumber *slabBlockNumberPtr) + __attribute__((warn_unused_result)); + +/** + * Check whether the reference counts for a given rebuilt slab should be saved. + * Implements SlabStatusChecker. + * + * @param slab The slab to check + * + * @return true if the slab should be saved + **/ +bool shouldSaveFullyBuiltSlab(const Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Start an administrative operation on a slab. 
+ * + * @param slab The slab to load + * @param operation The type of load to perform + * @param parent The object to notify when the operation is complete + **/ +void startSlabAction(Slab *slab, + AdminStateCode operation, + VDOCompletion *parent); + +/** + * Inform a slab that its journal has been loaded. + * + * @param slab The slab whose journal has been loaded + * @param result The result of the load operation + **/ +void notifySlabJournalIsLoaded(Slab *slab, int result); + +/** + * Check whether a slab is open, i.e. is neither quiescent nor quiescing. + * + * @param slab The slab to check + * + * @return true if the slab is open + **/ +bool isSlabOpen(Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Check whether a slab is currently draining. + * + * @param slab The slab to check + * + * @return true if the slab is performing a drain operation + **/ +bool isSlabDraining(Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Check whether a slab has drained, and if so, send a notification thereof. + * + * @param slab The slab to check + **/ +void checkIfSlabDrained(Slab *slab); + +/** + * Inform a slab that its journal has finished draining. + * + * @param slab The slab whose journal has been drained + * @param result The result of the drain operation + **/ +void notifySlabJournalIsDrained(Slab *slab, int result); + +/** + * Inform a slab that its RefCounts have finished draining. + * + * @param slab The slab whose RefCounts has been drained + * @param result The result of the drain operation + **/ +void notifyRefCountsAreDrained(Slab *slab, int result); + +/** + * Check whether a slab is currently resuming. + * + * @param slab The slab to check + * + * @return true if the slab is performing a resume operation + **/ +bool isSlabResuming(Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Finish scrubbing a slab now that it has been rebuilt by updating its status, + * queueing it for allocation, and reopening its journal. + * + * @param slab The slab whose reference counts have been rebuilt from its + * journal + **/ +void finishScrubbingSlab(Slab *slab); + +/** + * Dump information about a slab to the log for debugging. + * + * @param slab The slab to dump + **/ +void dumpSlab(const Slab *slab); + +#endif // VDO_SLAB_H diff --git a/vdo/base/slabDepot.c b/vdo/base/slabDepot.c new file mode 100644 index 0000000..6c10c29 --- /dev/null +++ b/vdo/base/slabDepot.c @@ -0,0 +1,1145 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabDepot.c#23 $ + */ + +#include "slabDepot.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "actionManager.h" +#include "adminState.h" +#include "blockAllocatorInternals.h" +#include "constants.h" +#include "header.h" +#include "numUtils.h" +#include "readOnlyNotifier.h" +#include "refCounts.h" +#include "slab.h" +#include "slabDepotInternals.h" +#include "slabJournal.h" +#include "slabIterator.h" +#include "slabSummary.h" +#include "threadConfig.h" +#include "types.h" + +typedef struct { + SlabConfig slabConfig; + PhysicalBlockNumber firstBlock; + PhysicalBlockNumber lastBlock; + ZoneCount zoneCount; +} __attribute__((packed)) SlabDepotState2_0; + +static const Header SLAB_DEPOT_HEADER_2_0 = { + .id = SLAB_DEPOT, + .version = { + .majorVersion = 2, + .minorVersion = 0, + }, + .size = sizeof(SlabDepotState2_0), +}; + +/** + * Compute the number of slabs a depot with given parameters would have. + * + * @param firstBlock PBN of the first data block + * @param lastBlock PBN of the last data block + * @param slabSizeShift Exponent for the number of blocks per slab + * + * @return The number of slabs + **/ +__attribute__((warn_unused_result)) +static SlabCount computeSlabCount(PhysicalBlockNumber firstBlock, + PhysicalBlockNumber lastBlock, + unsigned int slabSizeShift) +{ + BlockCount dataBlocks = lastBlock - firstBlock; + return (SlabCount) (dataBlocks >> slabSizeShift); +} + +/**********************************************************************/ +SlabCount calculateSlabCount(SlabDepot *depot) +{ + return computeSlabCount(depot->firstBlock, depot->lastBlock, + depot->slabSizeShift); +} + +/** + * Get an iterator over all the slabs in the depot. + * + * @param depot The depot + * + * @return An iterator over the depot's slabs + **/ +static SlabIterator getSlabIterator(SlabDepot *depot) +{ + return iterateSlabs(depot->slabs, depot->slabCount - 1, 0, 1); +} + +/** + * Allocate a new slab pointer array. Any existing slab pointers will be + * copied into the new array, and slabs will be allocated as needed. The + * newly allocated slabs will not be distributed for use by the block + * allocators. + * + * @param depot The depot + * @param slabCount The number of slabs the depot should have in the new + * array + * + * @return VDO_SUCCESS or an error code + **/ +static int allocateSlabs(SlabDepot *depot, SlabCount slabCount) +{ + int result = ALLOCATE(slabCount, Slab *, "slab pointer array", + &depot->newSlabs); + if (result != VDO_SUCCESS) { + return result; + } + + bool resizing = false; + if (depot->slabs != NULL) { + memcpy(depot->newSlabs, depot->slabs, depot->slabCount * sizeof(Slab *)); + resizing = true; + } + + BlockCount slabSize = getSlabConfig(depot)->slabBlocks; + PhysicalBlockNumber slabOrigin + = depot->firstBlock + (depot->slabCount * slabSize); + + // The translation between allocator partition PBNs and layer PBNs. + BlockCount translation = depot->origin - depot->firstBlock; + depot->newSlabCount = depot->slabCount; + while (depot->newSlabCount < slabCount) { + BlockAllocator *allocator + = depot->allocators[depot->newSlabCount % depot->zoneCount]; + Slab **slabPtr = &depot->newSlabs[depot->newSlabCount]; + result = makeSlab(slabOrigin, allocator, translation, depot->journal, + depot->newSlabCount, resizing, slabPtr); + if (result != VDO_SUCCESS) { + return result; + } + // Increment here to ensure that abandonNewSlabs will clean up correctly. 
+ depot->newSlabCount++; + + slabOrigin += slabSize; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void abandonNewSlabs(SlabDepot *depot) +{ + if (depot->newSlabs == NULL) { + return; + } + for (SlabCount i = depot->slabCount; i < depot->newSlabCount; i++) { + freeSlab(&depot->newSlabs[i]); + } + depot->newSlabCount = 0; + FREE(depot->newSlabs); + depot->newSlabs = NULL; + depot->newSize = 0; +} + +/** + * Get the ID of the thread on which a given allocator operates. + * + *
Implements ZoneThreadGetter. + **/ +static ThreadID getAllocatorThreadID(void *context, ZoneCount zoneNumber) +{ + return getBlockAllocatorForZone(context, zoneNumber)->threadID; +} + +/** + * Prepare to commit oldest tail blocks. + * + *
Implements ActionPreamble. + **/ +static void prepareForTailBlockCommit(void *context, VDOCompletion *parent) +{ + SlabDepot *depot = context; + depot->activeReleaseRequest = depot->newReleaseRequest; + completeCompletion(parent); +} + +/** + * Schedule a tail block commit if necessary. This method should not be called + * directly. Rather, call scheduleDefaultAction() on the depot's action + * manager. + * + *
Implements ActionScheduler, + **/ +static bool scheduleTailBlockCommit(void *context) +{ + SlabDepot *depot = context; + if (depot->newReleaseRequest == depot->activeReleaseRequest) { + return false; + } + + return scheduleAction(depot->actionManager, prepareForTailBlockCommit, + releaseTailBlockLocks, NULL, NULL); +} + +/** + * Allocate those components of the slab depot which are needed only at load + * time, not at format time. + * + * @param depot The depot + * @param nonce The nonce of the VDO + * @param threadConfig The thread config of the VDO + * @param vioPoolSize The size of the VIO pool + * @param layer The physical layer below this depot + * @param summaryPartition The partition which holds the slab summary + * + * @return VDO_SUCCESS or an error + **/ +static int allocateComponents(SlabDepot *depot, + Nonce nonce, + const ThreadConfig *threadConfig, + BlockCount vioPoolSize, + PhysicalLayer *layer, + Partition *summaryPartition) +{ + /* + * If createVIO is NULL, the slab depot is only being used to format + * or audit the VDO. These only require the SuperBlock component, so we can + * just skip allocating all the memory needed for runtime components. + */ + if (layer->createMetadataVIO == NULL) { + return VDO_SUCCESS; + } + + int result = initializeEnqueueableCompletion(&depot->scrubbingCompletion, + SUB_TASK_COMPLETION, layer); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeActionManager(depot->zoneCount, getAllocatorThreadID, + getJournalZoneThread(threadConfig), depot, + scheduleTailBlockCommit, layer, + &depot->actionManager); + if (result != VDO_SUCCESS) { + return result; + } + + depot->origin = depot->firstBlock; + + result = makeSlabSummary(layer, summaryPartition, threadConfig, + depot->slabSizeShift, depot->slabConfig.dataBlocks, + depot->readOnlyNotifier, &depot->slabSummary); + if (result != VDO_SUCCESS) { + return result; + } + + SlabCount slabCount = calculateSlabCount(depot); + if (threadConfig->physicalZoneCount > slabCount) { + return logErrorWithStringError(VDO_BAD_CONFIGURATION, + "%u physical zones exceeds slab count %u", + threadConfig->physicalZoneCount, slabCount); + } + + // Allocate the block allocators. + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + ThreadID threadID = getPhysicalZoneThread(threadConfig, zone); + result = makeBlockAllocator(depot, zone, threadID, nonce, vioPoolSize, + layer, depot->readOnlyNotifier, + &depot->allocators[zone]); + if (result != VDO_SUCCESS) { + return result; + } + } + + // Allocate slabs. + result = allocateSlabs(depot, slabCount); + if (result != VDO_SUCCESS) { + return result; + } + + // Use the new slabs. + for (SlabCount i = depot->slabCount; i < depot->newSlabCount; i++) { + Slab *slab = depot->newSlabs[i]; + registerSlabWithAllocator(slab->allocator, slab); + depot->slabCount++; + } + + depot->slabs = depot->newSlabs; + depot->newSlabs = NULL; + depot->newSlabCount = 0; + + return VDO_SUCCESS; +} + +/** + * Allocate a slab depot. 
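+ * The slab size must be a power of two so that physical block numbers can be mapped to slabs with a shift (slabSizeShift) rather than a division.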
+ * + * @param [in] state The parameters for the new depot + * @param [in] threadConfig The thread config of the VDO + * @param [in] nonce The nonce of the VDO + * @param [in] vioPoolSize The size of the VIO pool + * @param [in] layer The physical layer below this depot + * @param [in] summaryPartition The partition which holds the slab summary + * (if NULL, the depot is format-only) + * @param [in] readOnlyNotifier The context for entering read-only mode + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [out] depotPtr A pointer to hold the depot + * + * @return A success or error code + **/ +__attribute__((warn_unused_result)) +static int allocateDepot(const SlabDepotState2_0 *state, + const ThreadConfig *threadConfig, + Nonce nonce, + BlockCount vioPoolSize, + PhysicalLayer *layer, + Partition *summaryPartition, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *recoveryJournal, + SlabDepot **depotPtr) +{ + // Calculate the bit shift for efficiently mapping block numbers to slabs. + // Using a shift requires that the slab size be a power of two. + BlockCount slabSize = state->slabConfig.slabBlocks; + if (!isPowerOfTwo(slabSize)) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "slab size must be a power of two"); + } + unsigned int slabSizeShift = logBaseTwo(slabSize); + + SlabDepot *depot; + int result = ALLOCATE_EXTENDED(SlabDepot, threadConfig->physicalZoneCount, + BlockAllocator *, __func__, &depot); + if (result != VDO_SUCCESS) { + return result; + } + + depot->oldZoneCount = state->zoneCount; + depot->zoneCount = threadConfig->physicalZoneCount; + depot->slabConfig = state->slabConfig; + depot->readOnlyNotifier = readOnlyNotifier; + depot->firstBlock = state->firstBlock; + depot->lastBlock = state->lastBlock; + depot->slabSizeShift = slabSizeShift; + depot->journal = recoveryJournal; + + result = allocateComponents(depot, nonce, threadConfig, vioPoolSize, + layer, summaryPartition); + if (result != VDO_SUCCESS) { + freeSlabDepot(&depot); + return result; + } + + *depotPtr = depot; + return VDO_SUCCESS; +} + +/** + * Configure the SlabDepot for the specified storage capacity, finding the + * number of data blocks that will fit and still leave room for the depot + * metadata, then return the saved state for that configuration. + * + * @param [in] blockCount The number of blocks in the underlying storage + * @param [in] firstBlock The number of the first block that may be allocated + * @param [in] slabConfig The configuration of a single slab + * @param [in] zoneCount The number of zones the depot will use + * @param [out] state The state structure to be configured + * + * @return VDO_SUCCESS or an error code + **/ +static int configureState(BlockCount blockCount, + PhysicalBlockNumber firstBlock, + SlabConfig slabConfig, + ZoneCount zoneCount, + SlabDepotState2_0 *state) +{ + BlockCount slabSize = slabConfig.slabBlocks; + logDebug("slabDepot configureState(blockCount=%" PRIu64 + ", firstBlock=%llu, slabSize=%llu, zoneCount=%u)", + blockCount, firstBlock, slabSize, zoneCount); + + // We do not allow runt slabs, so we waste up to a slab's worth. 
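+ //
+ // As a purely illustrative example (the numbers here are hypothetical):
+ // with slabSize == 8192 and blockCount == 20000, slabCount below comes out
+ // to 2, those slabs cover 16384 blocks, and the remaining 3616 blocks are
+ // simply left over rather than forming a runt slab.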
+ size_t slabCount = (blockCount / slabSize); + if (slabCount == 0) { + return VDO_NO_SPACE; + } + + if (slabCount > MAX_SLABS) { + return VDO_TOO_MANY_SLABS; + } + + BlockCount totalSlabBlocks = slabCount * slabConfig.slabBlocks; + BlockCount totalDataBlocks = slabCount * slabConfig.dataBlocks; + PhysicalBlockNumber lastBlock = firstBlock + totalSlabBlocks; + + *state = (SlabDepotState2_0) { + .slabConfig = slabConfig, + .firstBlock = firstBlock, + .lastBlock = lastBlock, + .zoneCount = zoneCount, + }; + + logDebug("slabDepot lastBlock=%llu, totalDataBlocks=%" PRIu64 + ", slabCount=%zu, leftOver=%llu", + lastBlock, totalDataBlocks, slabCount, + blockCount - (lastBlock - firstBlock)); + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeSlabDepot(BlockCount blockCount, + PhysicalBlockNumber firstBlock, + SlabConfig slabConfig, + const ThreadConfig *threadConfig, + Nonce nonce, + BlockCount vioPoolSize, + PhysicalLayer *layer, + Partition *summaryPartition, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *recoveryJournal, + SlabDepot **depotPtr) +{ + SlabDepotState2_0 state; + int result = configureState(blockCount, firstBlock, slabConfig, 0, &state); + if (result != VDO_SUCCESS) { + return result; + } + + SlabDepot *depot = NULL; + result = allocateDepot(&state, threadConfig, nonce, vioPoolSize, layer, + summaryPartition, readOnlyNotifier, recoveryJournal, + &depot); + if (result != VDO_SUCCESS) { + return result; + } + + *depotPtr = depot; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeSlabDepot(SlabDepot **depotPtr) +{ + SlabDepot *depot = *depotPtr; + if (depot == NULL) { + return; + } + + abandonNewSlabs(depot); + + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + freeBlockAllocator(&depot->allocators[zone]); + } + + if (depot->slabs != NULL) { + for (SlabCount i = 0; i < depot->slabCount; i++) { + freeSlab(&depot->slabs[i]); + } + } + + FREE(depot->slabs); + freeActionManager(&depot->actionManager); + freeSlabSummary(&depot->slabSummary); + destroyEnqueueable(&depot->scrubbingCompletion); + FREE(depot); + *depotPtr = NULL; +} + +/**********************************************************************/ +size_t getSlabDepotEncodedSize(void) +{ + return ENCODED_HEADER_SIZE + sizeof(SlabDepotState2_0); +} + +/** + * Decode a slab config from a buffer. 
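+ *
+ * The encoding is a fixed sequence of 64-bit little-endian fields, read in
+ * this order: slabBlocks, dataBlocks, referenceCountBlocks,
+ * slabJournalBlocks, slabJournalFlushingThreshold,
+ * slabJournalBlockingThreshold, and slabJournalScrubbingThreshold.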
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param config The config structure to receive the decoded values + * + * @return UDS_SUCCESS or an error code + **/ +static int decodeSlabConfig(Buffer *buffer, SlabConfig *config) +{ + BlockCount count; + int result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->slabBlocks = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->dataBlocks = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->referenceCountBlocks = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->slabJournalBlocks = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->slabJournalFlushingThreshold = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->slabJournalBlockingThreshold = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->slabJournalScrubbingThreshold = count; + + return UDS_SUCCESS; +} + +/** + * Encode a slab config into a buffer. + * + * @param config The config structure to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error code + **/ +static int encodeSlabConfig(const SlabConfig *config, Buffer *buffer) +{ + int result = putUInt64LEIntoBuffer(buffer, config->slabBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->dataBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->referenceCountBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->slabJournalBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->slabJournalFlushingThreshold); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->slabJournalBlockingThreshold); + if (result != UDS_SUCCESS) { + return result; + } + + return putUInt64LEIntoBuffer(buffer, config->slabJournalScrubbingThreshold); +} + +/**********************************************************************/ +int encodeSlabDepot(const SlabDepot *depot, Buffer *buffer) +{ + int result = encodeHeader(&SLAB_DEPOT_HEADER_2_0, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + size_t initialLength = contentLength(buffer); + + result = encodeSlabConfig(&depot->slabConfig, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, depot->firstBlock); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, depot->lastBlock); + if (result != UDS_SUCCESS) { + return result; + } + + /* + * If this depot is currently using 0 zones, it must have been + * synchronously loaded by a tool and is now being saved. We + * did not load and combine the slab summary, so we still need + * to do that next time we load with the old zone count rather + * than 0. 
+ */
+ ZoneCount zonesToRecord = depot->zoneCount;
+ if (depot->zoneCount == 0) {
+ zonesToRecord = depot->oldZoneCount;
+ }
+ result = putByte(buffer, zonesToRecord);
+ if (result != UDS_SUCCESS) {
+ return result;
+ }
+
+ size_t encodedSize = contentLength(buffer) - initialLength;
+ return ASSERT(SLAB_DEPOT_HEADER_2_0.size == encodedSize,
+ "encoded slab depot component size must match header size");
+}
+
+/**
+ * Decode slab depot component state version 2.0 from a buffer.
+ *
+ * @param buffer A buffer positioned at the start of the encoding
+ * @param state The state structure to receive the decoded values
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+static int decodeSlabDepotState_2_0(Buffer *buffer, SlabDepotState2_0 *state)
+{
+ size_t initialLength = contentLength(buffer);
+
+ int result = decodeSlabConfig(buffer, &state->slabConfig);
+ if (result != UDS_SUCCESS) {
+ return result;
+ }
+
+ PhysicalBlockNumber firstBlock;
+ result = getUInt64LEFromBuffer(buffer, &firstBlock);
+ if (result != UDS_SUCCESS) {
+ return result;
+ }
+ state->firstBlock = firstBlock;
+
+ PhysicalBlockNumber lastBlock;
+ result = getUInt64LEFromBuffer(buffer, &lastBlock);
+ if (result != UDS_SUCCESS) {
+ return result;
+ }
+ state->lastBlock = lastBlock;
+
+ result = getByte(buffer, &state->zoneCount);
+ if (result != UDS_SUCCESS) {
+ return result;
+ }
+
+ size_t decodedSize = initialLength - contentLength(buffer);
+ return ASSERT(SLAB_DEPOT_HEADER_2_0.size == decodedSize,
+ "decoded slab depot component size must match header size");
+}
+
+/**********************************************************************/
+int decodeSlabDepot(Buffer *buffer,
+ const ThreadConfig *threadConfig,
+ Nonce nonce,
+ PhysicalLayer *layer,
+ Partition *summaryPartition,
+ ReadOnlyNotifier *readOnlyNotifier,
+ RecoveryJournal *recoveryJournal,
+ SlabDepot **depotPtr)
+{
+ Header header;
+ int result = decodeHeader(buffer, &header);
+ if (result != VDO_SUCCESS) {
+ return result;
+ }
+
+ result = validateHeader(&SLAB_DEPOT_HEADER_2_0, &header, true, __func__);
+ if (result != VDO_SUCCESS) {
+ return result;
+ }
+
+ SlabDepotState2_0 state;
+ result = decodeSlabDepotState_2_0(buffer, &state);
+ if (result != UDS_SUCCESS) {
+ return result;
+ }
+
+ return allocateDepot(&state, threadConfig, nonce, VIO_POOL_SIZE, layer,
+ summaryPartition, readOnlyNotifier, recoveryJournal,
+ depotPtr);
+}
+
+/**********************************************************************/
+int decodeSodiumSlabDepot(Buffer *buffer,
+ const ThreadConfig *threadConfig,
+ Nonce nonce,
+ PhysicalLayer *layer,
+ Partition *summaryPartition,
+ ReadOnlyNotifier *readOnlyNotifier,
+ RecoveryJournal *recoveryJournal,
+ SlabDepot **depotPtr)
+{
+ // Sodium uses version 2.0 of the slab depot state.
+ return decodeSlabDepot(buffer, threadConfig, nonce, layer, summaryPartition, + readOnlyNotifier, recoveryJournal, depotPtr); +} + +/**********************************************************************/ +int allocateSlabRefCounts(SlabDepot *depot) +{ + SlabIterator iterator = getSlabIterator(depot); + while (hasNextSlab(&iterator)) { + int result = allocateRefCountsForSlab(nextSlab(&iterator)); + if (result != VDO_SUCCESS) { + return result; + } + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +BlockAllocator *getBlockAllocatorForZone(SlabDepot *depot, + ZoneCount zoneNumber) +{ + return depot->allocators[zoneNumber]; +} + +/**********************************************************************/ +int getSlabNumber(const SlabDepot *depot, + PhysicalBlockNumber pbn, + SlabCount *slabNumberPtr) +{ + if (pbn < depot->firstBlock) { + return VDO_OUT_OF_RANGE; + } + + SlabCount slabNumber = (pbn - depot->firstBlock) >> depot->slabSizeShift; + if (slabNumber >= depot->slabCount) { + return VDO_OUT_OF_RANGE; + } + + *slabNumberPtr = slabNumber; + return VDO_SUCCESS; +} + +/**********************************************************************/ +Slab *getSlab(const SlabDepot *depot, PhysicalBlockNumber pbn) +{ + if (pbn == ZERO_BLOCK) { + return NULL; + } + + SlabCount slabNumber; + int result = getSlabNumber(depot, pbn, &slabNumber); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(depot->readOnlyNotifier, result); + return NULL; + } + + return depot->slabs[slabNumber]; + +} + +/**********************************************************************/ +SlabJournal *getSlabJournal(const SlabDepot *depot, PhysicalBlockNumber pbn) +{ + Slab *slab = getSlab(depot, pbn); + return ((slab != NULL) ? slab->journal : NULL); +} + +/**********************************************************************/ +uint8_t getIncrementLimit(SlabDepot *depot, PhysicalBlockNumber pbn) +{ + Slab *slab = getSlab(depot, pbn); + if ((slab == NULL) || isUnrecoveredSlab(slab)) { + return 0; + } + + return getAvailableReferences(slab->referenceCounts, pbn); +} + +/**********************************************************************/ +bool isPhysicalDataBlock(const SlabDepot *depot, PhysicalBlockNumber pbn) +{ + if (pbn == ZERO_BLOCK) { + return true; + } + + SlabCount slabNumber; + if (getSlabNumber(depot, pbn, &slabNumber) != VDO_SUCCESS) { + return false; + } + + SlabBlockNumber sbn; + int result = slabBlockNumberFromPBN(depot->slabs[slabNumber], pbn, &sbn); + return (result == VDO_SUCCESS); +} + +/**********************************************************************/ +BlockCount getDepotAllocatedBlocks(const SlabDepot *depot) +{ + BlockCount total = 0; + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + // The allocators are responsible for thread safety. + total += getAllocatedBlocks(depot->allocators[zone]); + } + return total; +} + +/**********************************************************************/ +BlockCount getDepotDataBlocks(const SlabDepot *depot) +{ + // XXX This needs to be thread safe, but resize changes the slab count. It + // does so on the admin thread (our usual caller), so it's usually safe. 
+ return (depot->slabCount * depot->slabConfig.dataBlocks);
+}
+
+/**********************************************************************/
+BlockCount getDepotFreeBlocks(const SlabDepot *depot)
+{
+ /*
+ * We can't ever shrink a volume except when resize fails, and we can't
+ * allocate from the new slabs until after the resize succeeds, so by
+ * getting the number of allocated blocks first, we ensure the allocated
+ * count is always less than the capacity. Doing it in the other order on a
+ * full volume could lose a race with a successful resize, resulting in a
+ * nonsensical negative/underflow result.
+ */
+ BlockCount allocated = getDepotAllocatedBlocks(depot);
+ memoryFence();
+ return (getDepotDataBlocks(depot) - allocated);
+}
+
+/**********************************************************************/
+SlabCount getDepotSlabCount(const SlabDepot *depot)
+{
+ return depot->slabCount;
+}
+
+/**********************************************************************/
+SlabCount getDepotUnrecoveredSlabCount(const SlabDepot *depot)
+{
+ SlabCount total = 0;
+ for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) {
+ // The allocators are responsible for thread safety.
+ total += getUnrecoveredSlabCount(depot->allocators[zone]);
+ }
+ return total;
+}
+
+/**
+ * The preamble of a load operation which loads the slab summary.
+ *
+ *
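+ * In the sequence scheduled by loadSlabDepot() below, this preamble runs
+ * first to load the slab summary, after which loadBlockAllocator() runs in
+ * each physical zone.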

Implements ActionPreamble. + **/ +static void startDepotLoad(void *context, VDOCompletion *parent) +{ + SlabDepot *depot = context; + loadSlabSummary(depot->slabSummary, + getCurrentManagerOperation(depot->actionManager), + depot->oldZoneCount, parent); +} + +/**********************************************************************/ +void loadSlabDepot(SlabDepot *depot, + AdminStateCode operation, + VDOCompletion *parent, + void *context) +{ + if (assertLoadOperation(operation, parent)) { + scheduleOperationWithContext(depot->actionManager, operation, + startDepotLoad, loadBlockAllocator, NULL, + context, parent); + } +} + +/**********************************************************************/ +void prepareToAllocate(SlabDepot *depot, + SlabDepotLoadType loadType, + VDOCompletion *parent) +{ + depot->loadType = loadType; + atomicStore32(&depot->zonesToScrub, depot->zoneCount); + scheduleAction(depot->actionManager, NULL, prepareAllocatorToAllocate, + NULL, parent); +} + +/**********************************************************************/ +void updateSlabDepotSize(SlabDepot *depot) +{ + depot->lastBlock = depot->newLastBlock; +} + +/**********************************************************************/ +int prepareToGrowSlabDepot(SlabDepot *depot, BlockCount newSize) +{ + if ((newSize >> depot->slabSizeShift) <= depot->slabCount) { + return VDO_INCREMENT_TOO_SMALL; + } + + // Generate the depot configuration for the new block count. + SlabDepotState2_0 newState; + int result = configureState(newSize, depot->firstBlock, depot->slabConfig, + depot->zoneCount, &newState); + if (result != VDO_SUCCESS) { + return result; + } + + SlabCount newSlabCount = computeSlabCount(depot->firstBlock, + newState.lastBlock, + depot->slabSizeShift); + if (newSlabCount <= depot->slabCount) { + return logErrorWithStringError(VDO_INCREMENT_TOO_SMALL, + "Depot can only grow"); + } + if (newSlabCount == depot->newSlabCount) { + // Check it out, we've already got all the new slabs allocated! + return VDO_SUCCESS; + } + + abandonNewSlabs(depot); + result = allocateSlabs(depot, newSlabCount); + if (result != VDO_SUCCESS) { + abandonNewSlabs(depot); + return result; + } + + depot->newSize = newSize; + depot->oldLastBlock = depot->lastBlock; + depot->newLastBlock = newState.lastBlock; + + return VDO_SUCCESS; +} + +/** + * Finish registering new slabs now that all of the allocators have received + * their new slabs. + * + *
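+ * In the grow sequence, prepareToGrowSlabDepot() allocates the new slabs,
+ * useNewSlabs() distributes them via registerNewSlabsForAllocator() with
+ * this conclusion, and updateSlabDepotSize() records the new last block.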

Implements ActionConclusion. + **/ +static int finishRegistration(void *context) +{ + SlabDepot *depot = context; + depot->slabCount = depot->newSlabCount; + FREE(depot->slabs); + depot->slabs = depot->newSlabs; + depot->newSlabs = NULL; + depot->newSlabCount = 0; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void useNewSlabs(SlabDepot *depot, VDOCompletion *parent) +{ + ASSERT_LOG_ONLY(depot->newSlabs != NULL, "Must have new slabs to use"); + scheduleOperation(depot->actionManager, ADMIN_STATE_SUSPENDED_OPERATION, + NULL, registerNewSlabsForAllocator, finishRegistration, + parent); +} + +/**********************************************************************/ +void drainSlabDepot(SlabDepot *depot, + AdminStateCode operation, + VDOCompletion *parent) +{ + scheduleOperation(depot->actionManager, operation, NULL, drainBlockAllocator, + NULL, parent); +} + +/**********************************************************************/ +void resumeSlabDepot(SlabDepot *depot, VDOCompletion *parent) +{ + if (isReadOnly(depot->readOnlyNotifier)) { + finishCompletion(parent, VDO_READ_ONLY); + return; + } + + scheduleOperation(depot->actionManager, ADMIN_STATE_RESUMING, NULL, + resumeBlockAllocator, NULL, parent); +} + +/**********************************************************************/ +void commitOldestSlabJournalTailBlocks(SlabDepot *depot, + SequenceNumber recoveryBlockNumber) +{ + if (depot == NULL) { + return; + } + + depot->newReleaseRequest = recoveryBlockNumber; + scheduleDefaultAction(depot->actionManager); +} + +/**********************************************************************/ +const SlabConfig *getSlabConfig(const SlabDepot *depot) +{ + return &depot->slabConfig; +} + +/**********************************************************************/ +SlabSummary *getSlabSummary(const SlabDepot *depot) +{ + return depot->slabSummary; +} + +/**********************************************************************/ +SlabSummaryZone *getSlabSummaryForZone(const SlabDepot *depot, ZoneCount zone) +{ + if (depot->slabSummary == NULL) { + return NULL; + } + return getSummaryForZone(depot->slabSummary, zone); +} + +/**********************************************************************/ +void scrubAllUnrecoveredSlabs(SlabDepot *depot, + void *parent, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID, + VDOCompletion *launchParent) +{ + prepareCompletion(&depot->scrubbingCompletion, callback, errorHandler, + threadID, parent); + scheduleAction(depot->actionManager, NULL, scrubAllUnrecoveredSlabsInZone, + NULL, launchParent); +} + +/**********************************************************************/ +void notifyZoneFinishedScrubbing(VDOCompletion *completion) +{ + SlabDepot *depot = completion->parent; + if (atomicAdd32(&depot->zonesToScrub, -1) == 0) { + // We're the last! + completeCompletion(&depot->scrubbingCompletion); + } +} + +/**********************************************************************/ +bool hasUnrecoveredSlabs(SlabDepot *depot) +{ + return (atomicLoad32(&depot->zonesToScrub) > 0); +} + +/**********************************************************************/ +BlockCount getNewDepotSize(const SlabDepot *depot) +{ + return (depot->newSlabs == NULL) ? 
0 : depot->newSize; +} + +/**********************************************************************/ +bool areEquivalentDepots(SlabDepot *depotA, SlabDepot *depotB) +{ + if ((depotA->firstBlock != depotB->firstBlock) + || (depotA->lastBlock != depotB->lastBlock) + || (depotA->slabCount != depotB->slabCount) + || (depotA->slabSizeShift != depotB->slabSizeShift) + || (getDepotAllocatedBlocks(depotA) + != getDepotAllocatedBlocks(depotB))) { + return false; + } + + for (size_t i = 0; i < depotA->slabCount; i++) { + Slab *slabA = depotA->slabs[i]; + Slab *slabB = depotB->slabs[i]; + if ((slabA->start != slabB->start) + || (slabA->end != slabB->end) + || !areEquivalentReferenceCounters(slabA->referenceCounts, + slabB->referenceCounts)) { + return false; + } + } + + return true; +} + +/**********************************************************************/ +void allocateFromLastSlab(SlabDepot *depot) +{ + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + allocateFromAllocatorLastSlab(depot->allocators[zone]); + } +} + +/**********************************************************************/ +BlockAllocatorStatistics +getDepotBlockAllocatorStatistics(const SlabDepot *depot) +{ + BlockAllocatorStatistics totals; + memset(&totals, 0, sizeof(totals)); + + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + BlockAllocator *allocator = depot->allocators[zone]; + BlockAllocatorStatistics stats = getBlockAllocatorStatistics(allocator); + totals.slabCount += stats.slabCount; + totals.slabsOpened += stats.slabsOpened; + totals.slabsReopened += stats.slabsReopened; + } + + return totals; +} + +/**********************************************************************/ +RefCountsStatistics getDepotRefCountsStatistics(const SlabDepot *depot) +{ + RefCountsStatistics depotStats; + memset(&depotStats, 0, sizeof(depotStats)); + + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + BlockAllocator *allocator = depot->allocators[zone]; + RefCountsStatistics stats = getRefCountsStatistics(allocator); + depotStats.blocksWritten += stats.blocksWritten; + } + + return depotStats; +} + +/**********************************************************************/ +SlabJournalStatistics getDepotSlabJournalStatistics(const SlabDepot *depot) +{ + SlabJournalStatistics depotStats; + memset(&depotStats, 0, sizeof(depotStats)); + + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + BlockAllocator *allocator = depot->allocators[zone]; + SlabJournalStatistics stats = getSlabJournalStatistics(allocator); + depotStats.diskFullCount += stats.diskFullCount; + depotStats.flushCount += stats.flushCount; + depotStats.blockedCount += stats.blockedCount; + depotStats.blocksWritten += stats.blocksWritten; + depotStats.tailBusyCount += stats.tailBusyCount; + } + + return depotStats; +} + +/**********************************************************************/ +void dumpSlabDepot(const SlabDepot *depot) +{ + logInfo("Slab Depot"); + logInfo(" zoneCount=%u oldZoneCount=%u slabCount=%" PRIu32 + " activeReleaseRequest=%llu newReleaseRequest=%llu", + (unsigned int) depot->zoneCount, (unsigned int) depot->oldZoneCount, + depot->slabCount, depot->activeReleaseRequest, + depot->newReleaseRequest); +} diff --git a/vdo/base/slabDepot.h b/vdo/base/slabDepot.h new file mode 100644 index 0000000..b439470 --- /dev/null +++ b/vdo/base/slabDepot.h @@ -0,0 +1,515 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabDepot.h#12 $ + */ + +#ifndef SLAB_DEPOT_H +#define SLAB_DEPOT_H + +#include "buffer.h" + +#include "adminState.h" +#include "completion.h" +#include "fixedLayout.h" +#include "journalPoint.h" +#include "statistics.h" +#include "types.h" +#include "waitQueue.h" + +/** + * A SlabDepot is responsible for managing all of the slabs and block + * allocators of a VDO. It has a single array of slabs in order to eliminate + * the need for additional math in order to compute which physical zone a PBN + * is in. It also has a BlockAllocator per zone. + * + * Load operations are required to be performed on a single thread. Normal + * operations are assumed to be performed in the appropriate zone. Allocations + * and reference count updates must be done from the thread of their physical + * zone. Requests to commit slab journal tail blocks from the recovery journal + * must be done on the journal zone thread. Save operations are required to be + * launched from the same thread as the original load operation. + **/ + +typedef enum { + NORMAL_LOAD, + RECOVERY_LOAD, + REBUILD_LOAD +} SlabDepotLoadType; + +/** + * Calculate the number of slabs a depot would have. + * + * @param depot The depot + * + * @return The number of slabs + **/ +SlabCount calculateSlabCount(SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Create a slab depot. + * + * @param [in] blockCount The number of blocks initially available + * @param [in] firstBlock The number of the first block which may be + * allocated + * @param [in] slabConfig The slab configuration + * @param [in] threadConfig The thread configuration of the VDO + * @param [in] nonce The nonce of the VDO + * @param [in] vioPoolSize The size of the VIO pool + * @param [in] layer The physical layer below this depot + * @param [in] summaryPartition The partition which holds the slab summary + * @param [in] readOnlyNotifier The context for entering read-only mode + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [out] depotPtr A pointer to hold the depot + * + * @return A success or error code + **/ +int makeSlabDepot(BlockCount blockCount, + PhysicalBlockNumber firstBlock, + SlabConfig slabConfig, + const ThreadConfig *threadConfig, + Nonce nonce, + BlockCount vioPoolSize, + PhysicalLayer *layer, + Partition *summaryPartition, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *recoveryJournal, + SlabDepot **depotPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a slab depot and null out the reference to it. + * + * @param depotPtr The reference to the depot to destroy + **/ +void freeSlabDepot(SlabDepot **depotPtr); + +/** + * Get the size of the encoded state of a slab depot. 
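+ *
+ * As an illustrative sketch only (makeBuffer() here stands in for whichever
+ * Buffer constructor the caller actually uses), a caller saving the depot
+ * might reserve exactly this many bytes before encoding:
+ *
+ *   Buffer *buffer;
+ *   int result = makeBuffer(getSlabDepotEncodedSize(), &buffer);
+ *   if (result == UDS_SUCCESS) {
+ *     result = encodeSlabDepot(depot, buffer);
+ *   }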
+ * + * @return The encoded size of the depot's state + **/ +size_t getSlabDepotEncodedSize(void) + __attribute__((warn_unused_result)); + +/** + * Encode the state of a slab depot into a buffer. + * + * @param depot The depot to encode + * @param buffer The buffer to encode into + * + * @return UDS_SUCCESS or an error + **/ +int encodeSlabDepot(const SlabDepot *depot, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Decode the state of a slab depot saved in a buffer. + * + * @param [in] buffer The buffer containing the saved state + * @param [in] threadConfig The thread config of the VDO + * @param [in] nonce The nonce of the VDO + * @param [in] layer The physical layer below this depot + * @param [in] summaryPartition The partition which holds the slab summary + * @param [in] readOnlyNotifier The context for entering read-only mode + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [out] depotPtr A pointer to hold the depot + * + * @return A success or error code + **/ +int decodeSodiumSlabDepot(Buffer *buffer, + const ThreadConfig *threadConfig, + Nonce nonce, + PhysicalLayer *layer, + Partition *summaryPartition, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *recoveryJournal, + SlabDepot **depotPtr) + __attribute__((warn_unused_result)); + +/** + * Decode the state of a slab depot saved in a buffer. + * + * @param [in] buffer The buffer containing the saved state + * @param [in] threadConfig The thread config of the VDO + * @param [in] nonce The nonce of the VDO + * @param [in] layer The physical layer below this depot + * @param [in] summaryPartition The partition which holds the slab summary + * @param [in] readOnlyNotifier The context for entering read-only mode + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [out] depotPtr A pointer to hold the depot + * + * @return A success or error code + **/ +int decodeSlabDepot(Buffer *buffer, + const ThreadConfig *threadConfig, + Nonce nonce, + PhysicalLayer *layer, + Partition *summaryPartition, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *recoveryJournal, + SlabDepot **depotPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate the RefCounts for all slabs in the depot. This method may be called + * only before entering normal operation from the load thread. + * + * @param depot The depot whose RefCounts need allocation + * + * @return VDO_SUCCESS or an error + **/ +int allocateSlabRefCounts(SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the block allocator for a specified physical zone from a depot. + * + * @param depot The depot + * @param zoneNumber The physical zone + * + * @return The block allocator for the specified zone + **/ +BlockAllocator *getBlockAllocatorForZone(SlabDepot *depot, + ZoneCount zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Get the number of the slab that contains a specified block. + * + * @param depot The slab depot + * @param pbn The physical block number + * @param slabNumberPtr A pointer to hold the slab number + * + * @return VDO_SUCCESS or an error + **/ +int getSlabNumber(const SlabDepot *depot, + PhysicalBlockNumber pbn, + SlabCount *slabNumberPtr) + __attribute__((warn_unused_result)); + +/** + * Get the slab object for the slab that contains a specified block. Will put + * the VDO in read-only mode if the PBN is not a valid data block nor the zero + * block. 
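+ *
+ * The containing slab is located arithmetically: because the slab size is a
+ * power of two, the slab number is (pbn - firstBlock) >> slabSizeShift (see
+ * getSlabNumber()). As a purely hypothetical example, with 8192-block slabs
+ * (slabSizeShift == 13) and firstBlock == 1024, pbn 20000 falls in slab
+ * (20000 - 1024) >> 13 == 2.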
+ * + * @param depot The slab depot + * @param pbn The physical block number + * + * @return The slab containing the block, or NULL if the block number is the + * zero block or otherwise out of range + **/ +Slab *getSlab(const SlabDepot *depot, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Get the slab journal for the slab that contains a specified block. + * + * @param depot The slab depot + * @param pbn The physical block number within the block depot partition + * of any block in the slab + * + * @return The slab journal of the slab containing the block, or NULL if the + * block number is for the zero block or otherwise out of range + **/ +SlabJournal *getSlabJournal(const SlabDepot *depot, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Determine how many new references a block can acquire. This method must be + * called from the the physical zone thread of the PBN. + * + * @param depot The slab depot + * @param pbn The physical block number that is being queried + * + * @return the number of available references + **/ +uint8_t getIncrementLimit(SlabDepot *depot, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Determine whether the given PBN refers to a data block. + * + * @param depot The depot + * @param pbn The physical block number to ask about + * + * @return True if the PBN corresponds to a data block + **/ +bool isPhysicalDataBlock(const SlabDepot *depot, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Get the total number of data blocks allocated across all the slabs in the + * depot, which is the total number of blocks with a non-zero reference count. + * This may be called from any thread. + * + * @param depot The slab depot + * + * @return The total number of blocks with a non-zero reference count + **/ +BlockCount getDepotAllocatedBlocks(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the total of the statistics from all the block allocators in the depot. + * + * @param depot The slab depot + * + * @return The statistics from all block allocators in the depot + **/ +BlockAllocatorStatistics +getDepotBlockAllocatorStatistics(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the total number of data blocks in all the slabs in the depot. This may + * be called from any thread. + * + * @param depot The slab depot + * + * @return The total number of data blocks in all slabs + **/ +BlockCount getDepotDataBlocks(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the total number of free blocks remaining in all the slabs in the + * depot, which is the total number of blocks that have a zero reference + * count. This may be called from any thread. + * + * @param depot The slab depot + * + * @return The total number of blocks with a zero reference count + **/ +BlockCount getDepotFreeBlocks(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the total number of slabs in the depot + * + * @param depot The slab depot + * + * @return The total number of slabs + **/ +SlabCount getDepotSlabCount(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the total number of unrecovered slabs in the depot, which is the total + * number of unrecovered slabs from all zones. This may be called from any + * thread. 
+ * + * @param depot The slab depot + * + * @return The total number of slabs that are unrecovered + **/ +SlabCount getDepotUnrecoveredSlabCount(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the aggregated slab journal statistics for the depot. + * + * @param depot The slab depot + * + * @return The aggregated statistics for all slab journals in the depot + **/ +SlabJournalStatistics getDepotSlabJournalStatistics(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the cumulative RefCounts statistics for the depot. + * + * @param depot The slab depot + * + * @return The cumulative statistics for all RefCounts in the depot + **/ +RefCountsStatistics getDepotRefCountsStatistics(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Asynchronously load any slab depot state that isn't included in the + * SuperBlock component. This method may be called only before entering normal + * operation from the load thread. + * + * @param depot The depot to load + * @param operation The type of load to perform + * @param parent The completion to finish when the load is complete + * @param context Additional context for the load operation; may be NULL + **/ +void loadSlabDepot(SlabDepot *depot, + AdminStateCode operation, + VDOCompletion *parent, + void *context); + +/** + * Prepare the slab depot to come online and start allocating blocks. This + * method may be called only before entering normal operation from the load + * thread. It must be called before allocation may proceed. + * + * @param depot The depot to prepare + * @param loadType The load type + * @param parent The completion to finish when the operation is complete + **/ +void prepareToAllocate(SlabDepot *depot, + SlabDepotLoadType loadType, + VDOCompletion *parent); + +/** + * Update the slab depot to reflect its new size in memory. This size is saved + * to disk as part of the super block. + * + * @param depot The depot to update + **/ +void updateSlabDepotSize(SlabDepot *depot); + +/** + * Allocate new memory needed for a resize of a slab depot to the given size. + * + * @param depot The depot to prepare to resize + * @param newSize The number of blocks in the new depot + * + * @return VDO_SUCCESS or an error + **/ +int prepareToGrowSlabDepot(SlabDepot *depot, BlockCount newSize) + __attribute__((warn_unused_result)); + +/** + * Use the new slabs allocated for resize. + * + * @param depot The depot + * @param parent The object to notify when complete + **/ +void useNewSlabs(SlabDepot *depot, VDOCompletion *parent); + +/** + * Abandon any new slabs in this depot, freeing them as needed. + * + * @param depot The depot + **/ +void abandonNewSlabs(SlabDepot *depot); + +/** + * Drain all slab depot I/O. If saving, or flushing, all dirty depot metadata + * will be written out. If saving or suspending, the depot will be left in a + * suspended state. + * + * @param depot The depot to drain + * @param operation The drain operation (flush, rebuild, suspend, or save) + * @param parent The completion to finish when the drain is complete + **/ +void drainSlabDepot(SlabDepot *depot, + AdminStateCode operation, + VDOCompletion *parent); + +/** + * Resume a suspended slab depot. + * + * @param depot The depot to resume + * @param parent The completion to finish when the depot has resumed + **/ +void resumeSlabDepot(SlabDepot *depot, VDOCompletion *parent); + +/** + * Commit all dirty tail blocks which are locking a given recovery journal + * block. 
This method must be called from the journal zone thread. + * + * @param depot The depot + * @param recoveryBlockNumber The sequence number of the recovery journal + * block whose locks should be released + **/ +void commitOldestSlabJournalTailBlocks(SlabDepot *depot, + SequenceNumber recoveryBlockNumber); + +/** + * Get the SlabConfig of a depot. + * + * @param depot The slab depot + * + * @return The slab configuration of the specified depot + **/ +const SlabConfig *getSlabConfig(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the slab summary. + * + * @param depot The slab depot + * + * @return The slab summary + **/ +SlabSummary *getSlabSummary(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the portion of the slab summary for a given physical zone. + * + * @param depot The slab depot + * @param zone The zone + * + * @return The portion of the slab summary for the specified zone + **/ +SlabSummaryZone *getSlabSummaryForZone(const SlabDepot *depot, ZoneCount zone) + __attribute__((warn_unused_result)); + +/** + * Scrub all unrecovered slabs. + * + * @param depot The depot to scrub + * @param parent The object to notify when scrubbing is complete + * @param callback The function to call when scrubbing is complete + * @param errorHandler The handler for scrubbing errors + * @param threadID The thread on which to run the callback + * @param launchParent The object to notify when scrubbing has been launched + * for all zones + **/ +void scrubAllUnrecoveredSlabs(SlabDepot *depot, + void *parent, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID, + VDOCompletion *launchParent); + +/** + * Check whether there are outstanding unrecovered slabs. + * + * @param depot The slab depot + * + * @return Whether there are outstanding unrecovered slabs + **/ +bool hasUnrecoveredSlabs(SlabDepot *depot); + +/** + * Get the physical size to which this depot is prepared to grow. + * + * @param depot The slab depot + * + * @return The new number of blocks the depot will be grown to, or 0 if the + * depot is not prepared to grow + **/ +BlockCount getNewDepotSize(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Dump the slab depot, in a thread-unsafe fashion. + * + * @param depot The slab depot + **/ +void dumpSlabDepot(const SlabDepot *depot); + +#endif // SLAB_DEPOT_H diff --git a/vdo/base/slabDepotInternals.h b/vdo/base/slabDepotInternals.h new file mode 100644 index 0000000..7dfe57b --- /dev/null +++ b/vdo/base/slabDepotInternals.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabDepotInternals.h#13 $ + */ + +#ifndef SLAB_DEPOT_INTERNALS_H +#define SLAB_DEPOT_INTERNALS_H + +#include "slabDepot.h" + +#include "atomic.h" + +#include "actionManager.h" + +struct slabDepot { + ZoneCount zoneCount; + ZoneCount oldZoneCount; + SlabConfig slabConfig; + SlabSummary *slabSummary; + ReadOnlyNotifier *readOnlyNotifier; + ActionManager *actionManager; + + PhysicalBlockNumber firstBlock; + PhysicalBlockNumber lastBlock; + PhysicalBlockNumber origin; + + /** slabSize == (1 << slabSizeShift) */ + unsigned int slabSizeShift; + + /** Determines how slabs should be queued during load */ + SlabDepotLoadType loadType; + + /** The state for notifying slab journals to release recovery journal */ + SequenceNumber activeReleaseRequest; + SequenceNumber newReleaseRequest; + + /** The completion for scrubbing */ + VDOCompletion scrubbingCompletion; + Atomic32 zonesToScrub; + + /** Cached journal pointer for slab creation */ + RecoveryJournal *journal; + + /** Array of pointers to individually allocated slabs */ + Slab **slabs; + /** The number of slabs currently allocated and stored in 'slabs' */ + SlabCount slabCount; + + /** Array of pointers to a larger set of slabs (used during resize) */ + Slab **newSlabs; + /** The number of slabs currently allocated and stored in 'newSlabs' */ + SlabCount newSlabCount; + /** The size that 'newSlabs' was allocated for */ + BlockCount newSize; + + /** The last block before resize, for rollback */ + PhysicalBlockNumber oldLastBlock; + /** The last block after resize, for resize */ + PhysicalBlockNumber newLastBlock; + + /** The block allocators for this depot */ + BlockAllocator *allocators[]; +}; + +/** + * Destroy a slab. + * + * @param slab The slab to destroy + **/ +void destroySlab(Slab *slab); + +/** + * Inform a slab's depot that the slab has been created. + * + * @param slab The slab to register + **/ +void registerSlabWithDepot(Slab *slab); + +/** + * Notify a slab depot that one of its allocators has finished scrubbing slabs. + * This method should only be called if the scrubbing was successful. This + * callback is registered by each block allocator in + * scrubAllUnrecoveredSlabsInZone(). + * + * @param completion A completion whose parent must be a slab depot + **/ +void notifyZoneFinishedScrubbing(VDOCompletion *completion); + +/** + * Check whether two depots are equivalent (i.e. represent the same + * state and have the same reference counter). This method is used for unit + * testing. + * + * @param depotA The first depot to compare + * @param depotB The second depot to compare + * + * @return true if the two depots are equivalent + **/ +bool areEquivalentDepots(SlabDepot *depotA, SlabDepot *depotB) + __attribute__((warn_unused_result)); + +/** + * Start allocating from the highest numbered slab in each zone. + * + * @param depot The depot + **/ +void allocateFromLastSlab(SlabDepot *depot); + +#endif /* SLAB_DEPOT_INTERNALS_H */ diff --git a/vdo/base/slabIterator.h b/vdo/base/slabIterator.h new file mode 100644 index 0000000..e977c2d --- /dev/null +++ b/vdo/base/slabIterator.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabIterator.h#1 $ + */ + +#ifndef SLAB_ITERATOR_H +#define SLAB_ITERATOR_H + +#include "slab.h" +#include "types.h" + +/** + * SlabIterator is a structure for iterating over a set of slabs. + **/ +typedef struct { + Slab **slabs; + Slab *next; + SlabCount end; + SlabCount stride; +} SlabIterator; + +/** + * Return a SlabIterator initialized to iterate over an array of slabs + * with a given stride. Iteration always occurs from higher to lower numbered + * slabs. + * + * @param slabs The array of slabs + * @param start The number of the slab to start iterating from + * @param end The number of the last slab which may be returned + * @param stride The difference in slab number between successive slabs + * + * @return an initialized iterator structure + **/ +static inline SlabIterator iterateSlabs(Slab **slabs, + SlabCount start, + SlabCount end, + SlabCount stride) +{ + return (SlabIterator) { + .slabs = slabs, + .next = (((slabs == NULL) || (start < end)) ? NULL : slabs[start]), + .end = end, + .stride = stride, + }; +} + +/** + * Check whether another Slab would be returned by the iterator. + * + * @param iterator The iterator to poll + * + * @return true if the next call to nextSlab + * will return a Slab + **/ +static inline bool hasNextSlab(const SlabIterator *iterator) +{ + return (iterator->next != NULL); +} + +/** + * Get the next Slab, advancing the iterator. + * + * @param iterator The iterator over the Slab chain + * + * @return the next Slab or NULL if the array of slabs is empty + * or if all the appropriate Slabs have been returned + **/ +static inline Slab *nextSlab(SlabIterator *iterator) +{ + Slab *slab = iterator->next; + if ((slab == NULL) + || (slab->slabNumber < iterator->end + iterator->stride)) { + iterator->next = NULL; + } else { + iterator->next = iterator->slabs[slab->slabNumber - iterator->stride]; + } + return slab; +} + +#endif // SLAB_ITERATOR_H diff --git a/vdo/base/slabJournal.c b/vdo/base/slabJournal.c new file mode 100644 index 0000000..1895f80 --- /dev/null +++ b/vdo/base/slabJournal.c @@ -0,0 +1,1321 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournal.c#18 $ + */ + +#include "slabJournalInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "stringUtils.h" + +#include "adminState.h" +#include "blockAllocatorInternals.h" +#include "dataVIO.h" +#include "recoveryJournal.h" +#include "refCounts.h" +#include "slabDepot.h" +#include "slabSummary.h" + +/** + * Return the slab journal from the resource waiter. + * + * @param waiter The waiter + * + * @return The slab journal + **/ +__attribute__((warn_unused_result)) +static inline SlabJournal *slabJournalFromResourceWaiter(Waiter *waiter) +{ + STATIC_ASSERT(offsetof(SlabJournal, resourceWaiter) == 0); + return (SlabJournal *) waiter; +} + +/** + * Return the slab journal from the flush waiter. + * + * @param waiter The waiter + * + * @return The slab journal + **/ +__attribute__((warn_unused_result)) +static inline SlabJournal *slabJournalFromFlushWaiter(Waiter *waiter) +{ + if (waiter == NULL) { + return NULL; + } + return (SlabJournal *) + ((uintptr_t) waiter - offsetof(SlabJournal, flushWaiter)); +} + +/**********************************************************************/ +SlabJournal *slabJournalFromDirtyNode(RingNode *node) +{ + if (node == NULL) { + return NULL; + } + return (SlabJournal *) ((uintptr_t) node - offsetof(SlabJournal, dirtyNode)); +} + +/** + * Return the slab journal from the slab summary waiter. + * + * @param waiter The waiter + * + * @return The slab journal + **/ +__attribute__((warn_unused_result)) +static inline SlabJournal *slabJournalFromSlabSummaryWaiter(Waiter *waiter) +{ + if (waiter == NULL) { + return NULL; + } + return (SlabJournal *) + ((uintptr_t) waiter - offsetof(SlabJournal, slabSummaryWaiter)); +} + +/** + * Get the physical block number for a given sequence number. + * + * @param journal The journal + * @param sequence The sequence number of the desired block + * + * @return the block number corresponding to the sequence number + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber getBlockNumber(SlabJournal *journal, + SequenceNumber sequence) +{ + TailBlockOffset offset = getSlabJournalBlockOffset(journal, sequence); + return (journal->slab->journalOrigin + offset); +} + +/** + * Get the lock object for a slab journal block by sequence number. + * + * @param journal Slab journal to retrieve from + * @param sequenceNumber Sequence number of the block + * + * @return the lock object for the given sequence number + **/ +__attribute__((warn_unused_result)) +static inline JournalLock *getLock(SlabJournal *journal, + SequenceNumber sequenceNumber) +{ + TailBlockOffset offset = getSlabJournalBlockOffset(journal, sequenceNumber); + return &journal->locks[offset]; +} + +/** + * Check whether the VDO is in read-only mode. + * + * @param journal The journal whose owning VDO should be checked + * + * @return true if the VDO is in read-only mode + **/ +__attribute__((warn_unused_result)) +static inline bool isVDOReadOnly(SlabJournal *journal) +{ + return isReadOnly(journal->slab->allocator->readOnlyNotifier); +} + +/** + * Check whether there are entry waiters which should delay a flush. 
+ *
+ * @param journal The journal to check
+ *
+ * @return true if the slab is not rebuilding and there are entry
+ * waiters which should delay a flush
+ **/
+__attribute__((warn_unused_result))
+static inline bool mustMakeEntriesToFlush(SlabJournal *journal)
+{
+ return (!slabIsRebuilding(journal->slab)
+ && hasWaiters(&journal->entryWaiters));
+}
+
+/**
+ * Check whether a reap is currently in progress.
+ *
+ * @param journal The journal which may be reaping
+ *
+ * @return true if the journal is reaping
+ **/
+__attribute__((warn_unused_result))
+static inline bool isReaping(SlabJournal *journal)
+{
+ return (journal->head != journal->unreapable);
+}
+
+/**********************************************************************/
+bool isSlabJournalActive(SlabJournal *journal)
+{
+ return (mustMakeEntriesToFlush(journal)
+ || isReaping(journal)
+ || journal->waitingToCommit
+ || !isRingEmpty(&journal->uncommittedBlocks)
+ || journal->updatingSlabSummary);
+}
+
+/**
+ * Initialize tail block as a new block.
+ *
+ * @param journal The journal whose tail block is being initialized
+ **/
+static void initializeTailBlock(SlabJournal *journal)
+{
+ SlabJournalBlockHeader *header = &journal->tailHeader;
+ header->sequenceNumber = journal->tail;
+ header->entryCount = 0;
+ header->hasBlockMapIncrements = false;
+}
+
+/**
+ * Set all journal fields appropriately to start journaling.
+ *
+ * @param journal The journal to be reset, based on its tail sequence number
+ **/
+static void initializeJournalState(SlabJournal *journal)
+{
+ journal->unreapable = journal->head;
+ journal->reapLock = getLock(journal, journal->unreapable);
+ journal->nextCommit = journal->tail;
+ journal->summarized = journal->lastSummarized = journal->tail;
+ initializeTailBlock(journal);
+}
+
+/**
+ * Check whether a journal block is full.
+ *
+ * @param journal The slab journal for the block
+ *
+ * @return true if the tail block is full
+ **/
+__attribute__((warn_unused_result))
+static bool blockIsFull(SlabJournal *journal)
+{
+ JournalEntryCount count = journal->tailHeader.entryCount;
+ return (journal->tailHeader.hasBlockMapIncrements
+ ?
(journal->fullEntriesPerBlock == count) + : (journal->entriesPerBlock == count)); +} + +/**********************************************************************/ +static void addEntries(SlabJournal *journal); +static void updateTailBlockLocation(SlabJournal *journal); +static void releaseJournalLocks(Waiter *waiter, void *context); + +/**********************************************************************/ +int makeSlabJournal(BlockAllocator *allocator, + Slab *slab, + RecoveryJournal *recoveryJournal, + SlabJournal **journalPtr) +{ + SlabJournal *journal; + const SlabConfig *slabConfig = getSlabConfig(allocator->depot); + int result = ALLOCATE_EXTENDED(SlabJournal, slabConfig->slabJournalBlocks, + JournalLock, __func__, &journal); + if (result != VDO_SUCCESS) { + return result; + } + + journal->slab = slab; + journal->size = slabConfig->slabJournalBlocks; + journal->flushingThreshold = slabConfig->slabJournalFlushingThreshold; + journal->blockingThreshold = slabConfig->slabJournalBlockingThreshold; + journal->scrubbingThreshold = slabConfig->slabJournalScrubbingThreshold; + journal->entriesPerBlock = SLAB_JOURNAL_ENTRIES_PER_BLOCK; + journal->fullEntriesPerBlock = SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK; + journal->events = &allocator->slabJournalStatistics; + journal->recoveryJournal = recoveryJournal; + journal->summary = getSlabSummaryZone(allocator); + journal->tail = 1; + journal->head = 1; + + journal->flushingDeadline = journal->flushingThreshold; + // Set there to be some time between the deadline and the blocking threshold, + // so that hopefully all are done before blocking. + if ((journal->blockingThreshold - journal->flushingThreshold) > 5) { + journal->flushingDeadline = journal->blockingThreshold - 5; + } + + journal->slabSummaryWaiter.callback = releaseJournalLocks; + + result = ALLOCATE(VDO_BLOCK_SIZE, char, "PackedSlabJournalBlock", + (char **) &journal->block); + if (result != VDO_SUCCESS) { + freeSlabJournal(&journal); + return result; + } + + initializeRing(&journal->dirtyNode); + initializeRing(&journal->uncommittedBlocks); + + journal->tailHeader.nonce = slab->allocator->nonce; + journal->tailHeader.metadataType = VDO_METADATA_SLAB_JOURNAL; + initializeJournalState(journal); + + *journalPtr = journal; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeSlabJournal(SlabJournal **journalPtr) +{ + SlabJournal *journal = *journalPtr; + if (journal == NULL) { + return; + } + + FREE(journal->block); + FREE(journal); + *journalPtr = NULL; +} + +/**********************************************************************/ +bool isSlabJournalBlank(const SlabJournal *journal) +{ + return ((journal != NULL) + && (journal->tail == 1) + && (journal->tailHeader.entryCount == 0)); +} + +/**********************************************************************/ +bool isSlabJournalDirty(const SlabJournal *journal) +{ + return (journal->recoveryLock != 0); +} + +/** + * Put a slab journal on the dirty ring of its allocator in the correct order. 
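+ *
+ * The ring is kept ordered by recovery journal lock, oldest at the head; the
+ * insertion below walks backward from the tail until it finds a journal
+ * whose lock is no newer than the one being added.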
+ *
+ * @param journal The journal to be marked dirty
+ * @param lock The recovery journal lock held by the slab journal
+ **/
+static void markSlabJournalDirty(SlabJournal *journal, SequenceNumber lock)
+{
+ ASSERT_LOG_ONLY(!isSlabJournalDirty(journal), "slab journal was clean");
+
+ journal->recoveryLock = lock;
+ RingNode *dirtyRing = &journal->slab->allocator->dirtySlabJournals;
+ RingNode *node = dirtyRing->prev;
+ while (node != dirtyRing) {
+ SlabJournal *dirtyJournal = slabJournalFromDirtyNode(node);
+ if (dirtyJournal->recoveryLock <= journal->recoveryLock) {
+ break;
+ }
+
+ node = node->prev;
+ }
+
+ pushRingNode(node->next, &journal->dirtyNode);
+}
+
+/**********************************************************************/
+static void markSlabJournalClean(SlabJournal *journal)
+{
+ journal->recoveryLock = 0;
+ unspliceRingNode(&journal->dirtyNode);
+}
+
+/**
+ * Implements WaiterCallback. This callback is invoked on all VIOs waiting
+ * to make slab journal entries after the VDO has gone into read-only mode.
+ **/
+static void abortWaiter(Waiter *waiter,
+ void *context __attribute__((unused)))
+{
+ continueDataVIO(waiterAsDataVIO(waiter), VDO_READ_ONLY);
+}
+
+/**********************************************************************/
+void abortSlabJournalWaiters(SlabJournal *journal)
+{
+ ASSERT_LOG_ONLY((getCallbackThreadID()
+ == journal->slab->allocator->threadID),
+ "abortSlabJournalWaiters() called on correct thread");
+ notifyAllWaiters(&journal->entryWaiters, abortWaiter, journal);
+ checkIfSlabDrained(journal->slab);
+}
+
+/**
+ * Put the journal in read-only mode. All attempts to add entries after
+ * this function is called will fail. All VIOs waiting to make entries
+ * will be awakened with an error. All flushes will complete as soon as all
+ * pending IO is done.
+ *
+ * @param journal The journal which has failed
+ * @param errorCode The error result triggering this call
+ **/
+static void enterJournalReadOnlyMode(SlabJournal *journal, int errorCode)
+{
+ enterReadOnlyMode(journal->slab->allocator->readOnlyNotifier, errorCode);
+ abortSlabJournalWaiters(journal);
+}
+
+/**
+ * Actually advance the head of the journal now that any necessary flushes
+ * are complete.
+ *
+ * @param journal The journal to be reaped
+ **/
+static void finishReaping(SlabJournal *journal)
+{
+ journal->head = journal->unreapable;
+ addEntries(journal);
+ checkIfSlabDrained(journal->slab);
+}
+
+/**********************************************************************/
+static void reapSlabJournal(SlabJournal *journal);
+
+/**
+ * Finish reaping now that we have flushed the lower layer and then try
+ * reaping again in case we deferred reaping due to an outstanding VIO.
+ *
+ * @param completion The flush VIO
+ **/
+static void completeReaping(VDOCompletion *completion)
+{
+ VIOPoolEntry *entry = completion->parent;
+ SlabJournal *journal = entry->parent;
+ returnVIO(journal->slab->allocator, entry);
+ finishReaping(journal);
+ reapSlabJournal(journal);
+}
+
+/**
+ * Handle an error flushing the lower layer.
+ *
+ * @param completion The flush VIO
+ **/
+static void handleFlushError(VDOCompletion *completion)
+{
+ SlabJournal *journal = ((VIOPoolEntry *) completion->parent)->parent;
+ enterJournalReadOnlyMode(journal, completion->result);
+ completeReaping(completion);
+}
+
+/**
+ * Waiter callback for getting a VIO with which to flush the lower layer prior
+ * to reaping.
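+ * When the flush completes, completeReaping() advances the journal head and
+ * retries the reap; a flush error puts the journal into read-only mode and
+ * then completes the reap.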
+ * + * @param waiter The journal as a flush waiter + * @param vioContext The newly acquired flush VIO + **/ +static void flushForReaping(Waiter *waiter, void *vioContext) +{ + SlabJournal *journal = slabJournalFromFlushWaiter(waiter); + VIOPoolEntry *entry = vioContext; + VIO *vio = entry->vio; + + entry->parent = journal; + vio->completion.callbackThreadID = journal->slab->allocator->threadID; + launchFlush(vio, completeReaping, handleFlushError); +} + +/** + * Conduct a reap on a slab journal to reclaim unreferenced blocks. + * + * @param journal The slab journal + **/ +static void reapSlabJournal(SlabJournal *journal) +{ + if (isReaping(journal)) { + // We already have a reap in progress so wait for it to finish. + return; + } + + if (isUnrecoveredSlab(journal->slab) || !isNormal(&journal->slab->state) + || isVDOReadOnly(journal)) { + // We must not reap in the first two cases, and there's no point in + // read-only mode. + return; + } + + /* + * Start reclaiming blocks only when the journal head has no references. Then + * stop when a block is referenced or reap reaches the most recently written + * block, referenced by the slab summary, which has the sequence number just + * before the tail. + */ + bool reaped = false; + while ((journal->unreapable < journal->tail) + && (journal->reapLock->count == 0)) { + reaped = true; + journal->unreapable++; + journal->reapLock++; + if (journal->reapLock == &journal->locks[journal->size]) { + journal->reapLock = &journal->locks[0]; + } + } + + if (!reaped) { + return; + } + + PhysicalLayer *layer = journal->slab->allocator->completion.layer; + if (layer->getWritePolicy(layer) == WRITE_POLICY_SYNC) { + finishReaping(journal); + return; + } + + /* + * In async mode, it is never safe to reap a slab journal block without first + * issuing a flush, regardless of whether a user flush has been received or + * not. In the absence of the flush, the reference block write which released + * the locks allowing the slab journal to reap may not be persisted. Although + * slab summary writes will eventually issue flushes, multiple slab journal + * block writes can be issued while previous slab summary updates have not + * yet been made. Even though those slab journal block writes will be ignored + * if the slab summary update is not persisted, they may still overwrite the + * to-be-reaped slab journal block resulting in a loss of reference count + * updates (VDO-2912). + * + * In sync mode, it is similarly unsafe. However, we cannot possibly make + * those additional slab journal block writes due to the blocking threshold + * and the recovery journal's flush policy of flushing before every block. + * We may make no more than (number of VIOs) entries in slab journals since + * the last recovery journal flush; thus, due to the size of the slab + * journal blocks, the RJ must have flushed the storage no more than one + * slab journal block ago. So we could only overwrite the to-be-reaped block + * if we wrote and flushed the last block in the journal. But the blocking + * threshold prevents that. + */ + journal->flushWaiter.callback = flushForReaping; + int result = acquireVIO(journal->slab->allocator, &journal->flushWaiter); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(journal, result); + return; + } +} + +/** + * This is the callback invoked after a slab summary update completes. It + * is registered in the constructor on behalf of updateTailBlockLocation(). + * + * Implements WaiterCallback. 
+ * + * @param waiter The slab summary waiter that has just been notified + * @param context The result code of the update + **/ +static void releaseJournalLocks(Waiter *waiter, void *context) +{ + SlabJournal *journal = slabJournalFromSlabSummaryWaiter(waiter); + int result = *((int *) context); + if (result != VDO_SUCCESS) { + if (result != VDO_READ_ONLY) { + // Don't bother logging what might be lots of errors if we are already + // in read-only mode. + logErrorWithStringError(result, "failed slab summary update %llu", + journal->summarized); + } + + journal->updatingSlabSummary = false; + enterJournalReadOnlyMode(journal, result); + return; + } + + if (journal->partialWriteInProgress + && (journal->summarized == journal->tail)) { + journal->partialWriteInProgress = false; + addEntries(journal); + } + + SequenceNumber first = journal->lastSummarized; + journal->lastSummarized = journal->summarized; + for (SequenceNumber i = journal->summarized - 1; i >= first; i--) { + // Release the lock the summarized block held on the recovery journal. + // (During replay, recoveryStart will always be 0.) + if (journal->recoveryJournal != NULL) { + ZoneCount zoneNumber = journal->slab->allocator->zoneNumber; + releaseRecoveryJournalBlockReference(journal->recoveryJournal, + getLock(journal, i)->recoveryStart, + ZONE_TYPE_PHYSICAL, + zoneNumber); + + } + + // Release our own lock against reaping for blocks that are committed. + // (This function will not change locks during replay.) + adjustSlabJournalBlockReference(journal, i, -1); + } + + journal->updatingSlabSummary = false; + + reapSlabJournal(journal); + + // Check if the slab summary needs to be updated again. + updateTailBlockLocation(journal); +} + +/** + * Update the tail block location in the slab summary, if necessary. + * + * @param journal The slab journal that is updating its tail block location + **/ +static void updateTailBlockLocation(SlabJournal *journal) +{ + if (journal->updatingSlabSummary || isVDOReadOnly(journal) + || (journal->lastSummarized >= journal->nextCommit)) { + checkIfSlabDrained(journal->slab); + return; + } + + BlockCount freeBlockCount; + if (isUnrecoveredSlab(journal->slab)) { + freeBlockCount = getSummarizedFreeBlockCount(journal->summary, + journal->slab->slabNumber); + } else { + freeBlockCount = getSlabFreeBlockCount(journal->slab); + } + + journal->summarized = journal->nextCommit; + journal->updatingSlabSummary = true; + + /* + * Update slab summary as dirty. + * Slab journal can only reap past sequence number 1 when all the refCounts + * for this slab have been written to the layer. Therefore, indicate that the + * refCounts must be loaded when the journal head has reaped past sequence + * number 1. + */ + TailBlockOffset blockOffset + = getSlabJournalBlockOffset(journal, journal->summarized); + updateSlabSummaryEntry(journal->summary, &journal->slabSummaryWaiter, + journal->slab->slabNumber, blockOffset, + (journal->head > 1), false, freeBlockCount); +} + +/**********************************************************************/ +void reopenSlabJournal(SlabJournal *journal) +{ + ASSERT_LOG_ONLY(journal->tailHeader.entryCount == 0, + "Slab journal's active block empty before reopening"); + journal->head = journal->tail; + initializeJournalState(journal); + + // Ensure no locks are spuriously held on an empty journal. 
+ for (SequenceNumber block = 1; block <= journal->size; block++) { + ASSERT_LOG_ONLY((getLock(journal, block)->count == 0), + "Scrubbed journal's block %llu is not locked", + block); + } + + addEntries(journal); +} + +/**********************************************************************/ +static SequenceNumber getCommittingSequenceNumber(const VIOPoolEntry *entry) +{ + const PackedSlabJournalBlock *block = entry->buffer; + return getUInt64LE(block->header.fields.sequenceNumber); +} + +/** + * Handle post-commit processing. This is the callback registered by + * writeSlabJournalBlock(). + * + * @param completion The write VIO as a completion + **/ +static void completeWrite(VDOCompletion *completion) +{ + int writeResult = completion->result; + VIOPoolEntry *entry = completion->parent; + SlabJournal *journal = entry->parent; + + SequenceNumber committed = getCommittingSequenceNumber(entry); + unspliceRingNode(&entry->node); + returnVIO(journal->slab->allocator, entry); + + if (writeResult != VDO_SUCCESS) { + logErrorWithStringError(writeResult, + "cannot write slab journal block %llu", + committed); + enterJournalReadOnlyMode(journal, writeResult); + return; + } + + relaxedAdd64(&journal->events->blocksWritten, 1); + + if (isRingEmpty(&journal->uncommittedBlocks)) { + // If no blocks are outstanding, then the commit point is at the tail. + journal->nextCommit = journal->tail; + } else { + // The commit point is always the beginning of the oldest incomplete block. + VIOPoolEntry *oldest = asVIOPoolEntry(journal->uncommittedBlocks.next); + journal->nextCommit = getCommittingSequenceNumber(oldest); + } + + updateTailBlockLocation(journal); +} + +/** + * Callback from acquireVIO() registered in commitSlabJournalTail(). + * + * @param waiter The VIO pool waiter which was just notified + * @param vioContext The VIO pool entry for the write + **/ +static void writeSlabJournalBlock(Waiter *waiter, void *vioContext) +{ + SlabJournal *journal = slabJournalFromResourceWaiter(waiter); + VIOPoolEntry *entry = vioContext; + SlabJournalBlockHeader *header = &journal->tailHeader; + + header->head = journal->head; + pushRingNode(&journal->uncommittedBlocks, &entry->node); + packSlabJournalBlockHeader(header, &journal->block->header); + + // Copy the tail block into the VIO. + memcpy(entry->buffer, journal->block, VDO_BLOCK_SIZE); + + int unusedEntries = journal->entriesPerBlock - header->entryCount; + ASSERT_LOG_ONLY(unusedEntries >= 0, "Slab journal block is not overfull"); + if (unusedEntries > 0) { + // Release the per-entry locks for any unused entries in the block we are + // about to write. + adjustSlabJournalBlockReference(journal, header->sequenceNumber, + -unusedEntries); + journal->partialWriteInProgress = !blockIsFull(journal); + } + + PhysicalBlockNumber blockNumber + = getBlockNumber(journal, header->sequenceNumber); + + entry->parent = journal; + entry->vio->completion.callbackThreadID = journal->slab->allocator->threadID; + /* + * This block won't be read in recovery until the slab summary is updated + * to refer to it. The slab summary update does a flush which is sufficient + * to protect us from VDO-2331. + */ + launchWriteMetadataVIO(entry->vio, blockNumber, completeWrite, + completeWrite); + + // Since the write is submitted, the tail block structure can be reused. 
+ journal->tail++; + initializeTailBlock(journal); + journal->waitingToCommit = false; + if (journal->slab->state.state == ADMIN_STATE_WAITING_FOR_RECOVERY) { + finishOperationWithResult(&journal->slab->state, + (isVDOReadOnly(journal) + ? VDO_READ_ONLY : VDO_SUCCESS)); + return; + } + + addEntries(journal); +} + +/**********************************************************************/ +void commitSlabJournalTail(SlabJournal *journal) +{ + if ((journal->tailHeader.entryCount == 0) + && mustMakeEntriesToFlush(journal)) { + // There are no entries at the moment, but there are some waiters, so defer + // initiating the flush until those entries are ready to write. + return; + } + + if (isVDOReadOnly(journal) + || journal->waitingToCommit + || (journal->tailHeader.entryCount == 0)) { + // There is nothing to do since the tail block is empty, or writing, or + // the journal is in read-only mode. + return; + } + + /* + * Since we are about to commit the tail block, this journal no longer + * needs to be on the ring of journals which the recovery journal might + * ask to commit. + */ + markSlabJournalClean(journal); + + journal->waitingToCommit = true; + + journal->resourceWaiter.callback = writeSlabJournalBlock; + int result = acquireVIO(journal->slab->allocator, &journal->resourceWaiter); + if (result != VDO_SUCCESS) { + journal->waitingToCommit = false; + enterJournalReadOnlyMode(journal, result); + return; + } +} + +/**********************************************************************/ +void encodeSlabJournalEntry(SlabJournalBlockHeader *tailHeader, + SlabJournalPayload *payload, + SlabBlockNumber sbn, + JournalOperation operation) +{ + JournalEntryCount entryNumber = tailHeader->entryCount++; + if (operation == BLOCK_MAP_INCREMENT) { + if (!tailHeader->hasBlockMapIncrements) { + memset(payload->fullEntries.entryTypes, 0, + SLAB_JOURNAL_ENTRY_TYPES_SIZE); + tailHeader->hasBlockMapIncrements = true; + } + + payload->fullEntries.entryTypes[entryNumber / 8] + |= ((byte) 1 << (entryNumber % 8)); + } + + packSlabJournalEntry(&payload->entries[entryNumber], sbn, + isIncrementOperation(operation)); +} + +/**********************************************************************/ +SlabJournalEntry decodeSlabJournalEntry(PackedSlabJournalBlock *block, + JournalEntryCount entryCount) +{ + SlabJournalEntry entry + = unpackSlabJournalEntry(&block->payload.entries[entryCount]); + if (block->header.fields.hasBlockMapIncrements + && ((block->payload.fullEntries.entryTypes[entryCount / 8] + & ((byte) 1 << (entryCount % 8))) != 0)) { + entry.operation = BLOCK_MAP_INCREMENT; + } + return entry; +} + +/** + * Actually add an entry to the slab journal, potentially firing off a write + * if a block becomes full. This function is synchronous. 
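+ * Once the entry is encoded and the tail block's recovery point has been
+ * updated, the tail block is committed if this entry filled it.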
+ * + * @param journal The slab journal to append to + * @param pbn The pbn being adjusted + * @param operation The type of entry to make + * @param recoveryPoint The recovery journal point for this entry + **/ +static void addEntry(SlabJournal *journal, + PhysicalBlockNumber pbn, + JournalOperation operation, + const JournalPoint *recoveryPoint) +{ + int result = ASSERT(beforeJournalPoint(&journal->tailHeader.recoveryPoint, + recoveryPoint), + "recovery journal point is monotonically increasing, " + "recovery point: %llu.%u, " + "block recovery point: %llu.%u", + recoveryPoint->sequenceNumber, recoveryPoint->entryCount, + journal->tailHeader.recoveryPoint.sequenceNumber, + journal->tailHeader.recoveryPoint.entryCount); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(journal, result); + return; + } + + PackedSlabJournalBlock *block = journal->block; + if (operation == BLOCK_MAP_INCREMENT) { + result = ASSERT_LOG_ONLY((journal->tailHeader.entryCount + < journal->fullEntriesPerBlock), + "block has room for full entries"); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(journal, result); + return; + } + } + + encodeSlabJournalEntry(&journal->tailHeader, &block->payload, + pbn - journal->slab->start, operation); + journal->tailHeader.recoveryPoint = *recoveryPoint; + if (blockIsFull(journal)) { + commitSlabJournalTail(journal); + } +} + +/**********************************************************************/ +bool attemptReplayIntoSlabJournal(SlabJournal *journal, + PhysicalBlockNumber pbn, + JournalOperation operation, + JournalPoint *recoveryPoint, + VDOCompletion *parent) +{ + // Only accept entries after the current recovery point. + if (!beforeJournalPoint(&journal->tailHeader.recoveryPoint, recoveryPoint)) { + return true; + } + + SlabJournalBlockHeader *header = &journal->tailHeader; + if ((header->entryCount >= journal->fullEntriesPerBlock) + && (header->hasBlockMapIncrements || + (operation == BLOCK_MAP_INCREMENT))) { + // The tail block does not have room for the entry we are attempting + // to add so commit the tail block now. + commitSlabJournalTail(journal); + } + + if (journal->waitingToCommit) { + startOperationWithWaiter(&journal->slab->state, + ADMIN_STATE_WAITING_FOR_RECOVERY, parent, NULL); + return false; + } + + if ((journal->tail - journal->head) >= journal->size) { + /* + * We must have reaped the current head before the crash, since + * the blocked threshold keeps us from having more entries than + * fit in a slab journal; hence we can just advance the head + * (and unreapable block), as needed. + */ + journal->head++; + journal->unreapable++; + } + + markSlabReplaying(journal->slab); + addEntry(journal, pbn, operation, recoveryPoint); + return true; +} + +/** + * Check whether the journal should be saving reference blocks out. + * + * @param journal The journal to check + * + * @return true if the journal should be requesting reference block writes + **/ +static bool requiresFlushing(const SlabJournal *journal) +{ + BlockCount journalLength = (journal->tail - journal->head); + return (journalLength >= journal->flushingThreshold); +} + +/** + * Check whether the journal must be reaped before adding new entries. 
+ * + * @param journal The journal to check + * + * @return true if the journal must be reaped + **/ +static bool requiresReaping(const SlabJournal *journal) +{ + BlockCount journalLength = (journal->tail - journal->head); + return (journalLength >= journal->blockingThreshold); +} + +/**********************************************************************/ +bool requiresScrubbing(const SlabJournal *journal) +{ + BlockCount journalLength = (journal->tail - journal->head); + return (journalLength >= journal->scrubbingThreshold); +} + +/** + * Implements WaiterCallback. This callback is invoked by addEntries() once + * it has determined that we are ready to make another entry in the slab + * journal. + * + * @param waiter The VIO which should make an entry now + * @param context The slab journal to make an entry in + **/ +static void addEntryFromWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + SlabJournal *journal = (SlabJournal *) context; + SlabJournalBlockHeader *header = &journal->tailHeader; + SequenceNumber recoveryBlock = dataVIO->recoveryJournalPoint.sequenceNumber; + + if (header->entryCount == 0) { + /* + * This is the first entry in the current tail block, so get a lock + * on the recovery journal which we will hold until this tail block is + * committed. + */ + getLock(journal, header->sequenceNumber)->recoveryStart = recoveryBlock; + if (journal->recoveryJournal != NULL) { + ZoneCount zoneNumber = journal->slab->allocator->zoneNumber; + acquireRecoveryJournalBlockReference(journal->recoveryJournal, + recoveryBlock, ZONE_TYPE_PHYSICAL, + zoneNumber); + } + markSlabJournalDirty(journal, recoveryBlock); + + // If the slab journal is over the first threshold, tell the refCounts to + // write some reference blocks, but proceed apace. + if (requiresFlushing(journal)) { + relaxedAdd64(&journal->events->flushCount, 1); + BlockCount journalLength = (journal->tail - journal->head); + BlockCount blocksToDeadline = 0; + if (journalLength <= journal->flushingDeadline) { + blocksToDeadline = journal->flushingDeadline - journalLength; + } + saveSeveralReferenceBlocks(journal->slab->referenceCounts, + blocksToDeadline + 1); + } + } + + JournalPoint slabJournalPoint = { + .sequenceNumber = header->sequenceNumber, + .entryCount = header->entryCount, + }; + + addEntry(journal, dataVIO->operation.pbn, dataVIO->operation.type, + &dataVIO->recoveryJournalPoint); + + // Now that an entry has been made in the slab journal, update the + // reference counts. + int result = modifySlabReferenceCount(journal->slab, &slabJournalPoint, + dataVIO->operation); + continueDataVIO(dataVIO, result); +} + +/** + * Check whether the next entry to be made is a block map increment. + * + * @param journal The journal + * + * @return true if the first entry waiter's operation is a block + * map increment + **/ +static inline bool isNextEntryABlockMapIncrement(SlabJournal *journal) +{ + DataVIO *dataVIO = waiterAsDataVIO(getFirstWaiter(&journal->entryWaiters)); + return (dataVIO->operation.type == BLOCK_MAP_INCREMENT); +} + +/** + * Add as many entries as possible from the queue of VIOs waiting to make + * entries. By processing the queue in order, we ensure that slab journal + * entries are made in the same order as recovery journal entries for the + * same increment or decrement. + * + * @param journal The journal to which entries may be added + **/ +static void addEntries(SlabJournal *journal) +{ + if (journal->addingEntries) { + // Protect against re-entrancy. 
+ return;
+ }
+
+ journal->addingEntries = true;
+ while (hasWaiters(&journal->entryWaiters)) {
+ if (journal->partialWriteInProgress || slabIsRebuilding(journal->slab)) {
+ // Don't add entries while rebuilding or while a partial write is
+ // outstanding (VDO-2399).
+ break;
+ }
+
+ SlabJournalBlockHeader *header = &journal->tailHeader;
+ if (journal->waitingToCommit) {
+ // If we are waiting for resources to write the tail block, and the
+ // tail block is full, we can't make another entry.
+ relaxedAdd64(&journal->events->tailBusyCount, 1);
+ break;
+ } else if (isNextEntryABlockMapIncrement(journal)
+ && (header->entryCount >= journal->fullEntriesPerBlock)) {
+ // The tail block does not have room for a block map increment, so
+ // commit it now.
+ commitSlabJournalTail(journal);
+ if (journal->waitingToCommit) {
+ relaxedAdd64(&journal->events->tailBusyCount, 1);
+ break;
+ }
+ }
+
+ // If the slab is over the blocking threshold, make the VIO wait.
+ if (requiresReaping(journal)) {
+ relaxedAdd64(&journal->events->blockedCount, 1);
+ saveDirtyReferenceBlocks(journal->slab->referenceCounts);
+ break;
+ }
+
+ if (header->entryCount == 0) {
+ JournalLock *lock = getLock(journal, header->sequenceNumber);
+ // Check if the on disk slab journal is full. Because of the
+ // blocking and scrubbing thresholds, this should never happen.
+ if (lock->count > 0) {
+ ASSERT_LOG_ONLY((journal->head + journal->size) == journal->tail,
+ "New block has locks, but journal is not full");
+
+ /*
+ * The blocking threshold must let the journal fill up if the new
+ * block has locks; if the blocking threshold is smaller than the
+ * journal size, the new block cannot possibly have locks already.
+ */
+ ASSERT_LOG_ONLY((journal->blockingThreshold >= journal->size),
+ "New block can have locks already iff blocking "
+ "threshold is at the end of the journal");
+
+ relaxedAdd64(&journal->events->diskFullCount, 1);
+ saveDirtyReferenceBlocks(journal->slab->referenceCounts);
+ break;
+ }
+
+ /*
+ * Don't allow the new block to be reaped until all of the reference
+ * count blocks are written and the journal block has been
+ * fully committed as well.
+ */
+ lock->count = journal->entriesPerBlock + 1;
+
+ if (header->sequenceNumber == 1) {
+ /*
+ * This is the first entry in this slab journal, ever. Dirty all of
+ * the reference count blocks. Each will acquire a lock on the
+ * tail block so that the journal won't be reaped until the
+ * reference counts are initialized. The lock acquisition must
+ * be done by the RefCounts since here we don't know how many
+ * reference blocks the RefCounts has.
+ */
+ acquireDirtyBlockLocks(journal->slab->referenceCounts);
+ }
+ }
+
+ notifyNextWaiter(&journal->entryWaiters, addEntryFromWaiter, journal);
+ }
+
+ journal->addingEntries = false;
+
+ // If there are no waiters, and we are flushing or saving, commit the
+ // tail block.
+ if (isSlabDraining(journal->slab) && !isSuspending(&journal->slab->state) + && !hasWaiters(&journal->entryWaiters)) { + commitSlabJournalTail(journal); + } +} + +/**********************************************************************/ +void addSlabJournalEntry(SlabJournal *journal, DataVIO *dataVIO) +{ + if (!isSlabOpen(journal->slab)) { + continueDataVIO(dataVIO, VDO_INVALID_ADMIN_STATE); + return; + } + + if (isVDOReadOnly(journal)) { + continueDataVIO(dataVIO, VDO_READ_ONLY); + return; + } + + int result = enqueueDataVIO(&journal->entryWaiters, dataVIO, + THIS_LOCATION("$F($j-$js)")); + if (result != VDO_SUCCESS) { + continueDataVIO(dataVIO, result); + return; + } + + if (isUnrecoveredSlab(journal->slab) && requiresReaping(journal)) { + increaseScrubbingPriority(journal->slab); + } + + addEntries(journal); +} + +/**********************************************************************/ +void adjustSlabJournalBlockReference(SlabJournal *journal, + SequenceNumber sequenceNumber, + int adjustment) +{ + if (sequenceNumber == 0) { + return; + } + + if (isReplayingSlab(journal->slab)) { + // Locks should not be used during offline replay. + return; + } + + ASSERT_LOG_ONLY((adjustment != 0), "adjustment must be non-zero"); + JournalLock *lock = getLock(journal, sequenceNumber); + if (adjustment < 0) { + ASSERT_LOG_ONLY((-adjustment <= lock->count), + "adjustment %d of lock count %u for slab journal block %" + PRIu64 " must not underflow", adjustment, lock->count, + sequenceNumber); + } + + lock->count += adjustment; + if (lock->count == 0) { + reapSlabJournal(journal); + } +} + +/**********************************************************************/ +bool releaseRecoveryJournalLock(SlabJournal *journal, + SequenceNumber recoveryLock) +{ + if (recoveryLock > journal->recoveryLock) { + ASSERT_LOG_ONLY((recoveryLock < journal->recoveryLock), + "slab journal recovery lock is not older than the recovery" + " journal head"); + return false; + } + + if ((recoveryLock < journal->recoveryLock) || isVDOReadOnly(journal)) { + return false; + } + + // All locks are held by the block which is in progress; write it. + commitSlabJournalTail(journal); + return true; +} + +/**********************************************************************/ +void drainSlabJournal(SlabJournal *journal) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + == journal->slab->allocator->threadID), + "drainSlabJournal() called on correct thread"); + if (isQuiescing(&journal->slab->state)) { + // XXX: we should revisit this assertion since it is no longer clear what + // it is for. + ASSERT_LOG_ONLY((!(slabIsRebuilding(journal->slab) + && hasWaiters(&journal->entryWaiters))), + "slab is recovered or has no waiters"); + } + + switch (journal->slab->state.state) { + case ADMIN_STATE_REBUILDING: + case ADMIN_STATE_SUSPENDING: + case ADMIN_STATE_SAVE_FOR_SCRUBBING: + break; + + default: + commitSlabJournalTail(journal); + } +} + +/** + * Finish the decode process by returning the VIO and notifying the slab that + * we're done. + * + * @param completion The VIO as a completion + **/ +static void finishDecodingJournal(VDOCompletion *completion) +{ + int result = completion->result; + VIOPoolEntry *entry = completion->parent; + SlabJournal *journal = entry->parent; + returnVIO(journal->slab->allocator, entry); + notifySlabJournalIsLoaded(journal->slab, result); +} + +/** + * Set up the in-memory journal state to the state which was written to disk. + * This is the callback registered in readSlabJournalTail(). 
+ * + * @param completion The VIO which was used to read the journal tail + **/ +static void setDecodedState(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + SlabJournal *journal = entry->parent; + PackedSlabJournalBlock *block = entry->buffer; + + SlabJournalBlockHeader header; + unpackSlabJournalBlockHeader(&block->header, &header); + + if ((header.metadataType != VDO_METADATA_SLAB_JOURNAL) + || (header.nonce != journal->slab->allocator->nonce)) { + finishDecodingJournal(completion); + return; + } + + journal->tail = header.sequenceNumber + 1; + + // If the slab is clean, this implies the slab journal is empty, so advance + // the head appropriately. + if (getSummarizedCleanliness(journal->summary, journal->slab->slabNumber)) { + journal->head = journal->tail; + } else { + journal->head = header.head; + } + + journal->tailHeader = header; + initializeJournalState(journal); + finishDecodingJournal(completion); +} + +/** + * This reads the slab journal tail block by using a VIO acquired from the VIO + * pool. This is the success callback from acquireVIOFromPool() when decoding + * the slab journal. + * + * @param waiter The VIO pool waiter which has just been notified + * @param vioContext The VIO pool entry given to the waiter + **/ +static void readSlabJournalTail(Waiter *waiter, void *vioContext) +{ + SlabJournal *journal = slabJournalFromResourceWaiter(waiter); + Slab *slab = journal->slab; + VIOPoolEntry *entry = vioContext; + TailBlockOffset lastCommitPoint + = getSummarizedTailBlockOffset(journal->summary, slab->slabNumber); + entry->parent = journal; + + + // Slab summary keeps the commit point offset, so the tail block is the + // block before that. Calculation supports small journals in unit tests. + TailBlockOffset tailBlock = ((lastCommitPoint == 0) + ? (TailBlockOffset) (journal->size - 1) + : (lastCommitPoint - 1)); + entry->vio->completion.callbackThreadID = slab->allocator->threadID; + launchReadMetadataVIO(entry->vio, slab->journalOrigin + tailBlock, + setDecodedState, finishDecodingJournal); +} + +/**********************************************************************/ +void decodeSlabJournal(SlabJournal *journal) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + == journal->slab->allocator->threadID), + "decodeSlabJournal() called on correct thread"); + Slab *slab = journal->slab; + TailBlockOffset lastCommitPoint + = getSummarizedTailBlockOffset(journal->summary, slab->slabNumber); + if ((lastCommitPoint == 0) + && !mustLoadRefCounts(journal->summary, slab->slabNumber)) { + /* + * This slab claims that it has a tail block at (journal->size - 1), but + * a head of 1. This is impossible, due to the scrubbing threshold, on + * a real system, so don't bother reading the (bogus) data off disk. 
+ */ + ASSERT_LOG_ONLY(((journal->size < 16) + || (journal->scrubbingThreshold < (journal->size - 1))), + "Scrubbing threshold protects against reads of unwritten" + "slab journal blocks"); + notifySlabJournalIsLoaded(slab, VDO_SUCCESS); + return; + } + + journal->resourceWaiter.callback = readSlabJournalTail; + int result = acquireVIO(slab->allocator, &journal->resourceWaiter); + if (result != VDO_SUCCESS) { + notifySlabJournalIsLoaded(slab, result); + } +} + +/**********************************************************************/ +void dumpSlabJournal(const SlabJournal *journal) +{ + logInfo(" slab journal: entryWaiters=%zu waitingToCommit=%s" + " updatingSlabSummary=%s head=%llu unreapable=%" PRIu64 + " tail=%llu nextCommit=%llu summarized=%" PRIu64 + " lastSummarized=%llu recoveryJournalLock=%" PRIu64 + " dirty=%s", countWaiters(&journal->entryWaiters), + boolToString(journal->waitingToCommit), + boolToString(journal->updatingSlabSummary), + journal->head, journal->unreapable, journal->tail, + journal->nextCommit, journal->summarized, journal->lastSummarized, + journal->recoveryLock, + boolToString(isSlabJournalDirty(journal))); + // Given the frequency with which the locks are just a tiny bit off, it + // might be worth dumping all the locks, but that might be too much logging. +} diff --git a/vdo/base/slabJournal.h b/vdo/base/slabJournal.h new file mode 100644 index 0000000..a411711 --- /dev/null +++ b/vdo/base/slabJournal.h @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournal.h#8 $ + */ + +#ifndef SLAB_JOURNAL_H +#define SLAB_JOURNAL_H + +#include "completion.h" +#include "journalPoint.h" +#include "ringNode.h" +#include "types.h" + +/** + * Convert a completion to a SlabJournal. + * + * @param completion The completion to convert + * + * @return The completion as a SlabJournal + **/ +SlabJournal *asSlabJournal(VDOCompletion *completion) + __attribute__((warn_unused_result)); + +/** + * Calculate the number of slab journal entries per block. + * + * @return The number of slab journal entries per block + **/ +size_t getSlabJournalEntriesPerBlock(void) + __attribute__((warn_unused_result)); + +/** + * Obtain a pointer to a SlabJournal structure from a pointer to the + * dirtyRingNode field within it. + * + * @param node The RingNode to convert + * + * @return The RingNode as a SlabJournal + **/ +SlabJournal *slabJournalFromDirtyNode(RingNode *node) + __attribute__((warn_unused_result)); + +/** + * Create a slab journal. 
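+ *
+ * A minimal usage sketch (illustrative only; it assumes an existing
+ * allocator, slab, and recoveryJournal, and abbreviates error handling):
+ *
+ * SlabJournal *journal;
+ * int result = makeSlabJournal(allocator, slab, recoveryJournal, &journal);
+ * if (result != VDO_SUCCESS) {
+ * return result;
+ * }
+ * // ... add entries, commit, and drain as needed ...
+ * freeSlabJournal(&journal);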
+ * + * @param [in] allocator The block allocator which owns this journal + * @param [in] slab The parent slab of the journal + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [out] journalPtr The pointer to hold the new slab journal + * + * @return VDO_SUCCESS or error code + **/ +int makeSlabJournal(BlockAllocator *allocator, + Slab *slab, + RecoveryJournal *recoveryJournal, + SlabJournal **journalPtr) + __attribute__((warn_unused_result)); + +/** + * Free a slab journal and null out the reference to it. + * + * @param journalPtr The reference to the slab journal to free + **/ +void freeSlabJournal(SlabJournal **journalPtr); + +/** + * Check whether a slab journal is blank, meaning it has never had any entries + * recorded in it. + * + * @param journal The journal to query + * + * @return true if the slab journal has never been modified + **/ +bool isSlabJournalBlank(const SlabJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Check whether the slab journal is on the block allocator's ring of dirty + * journals. + * + * @param journal The journal to query + * + * @return true if the journal has been added to the dirty ring + **/ +bool isSlabJournalDirty(const SlabJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Check whether a slab journal is active. + * + * @param journal The slab journal to check + * + * @return true if the journal is active + **/ +bool isSlabJournalActive(SlabJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Abort any VIOs waiting to make slab journal entries. + * + * @param journal The journal to abort + **/ +void abortSlabJournalWaiters(SlabJournal *journal); + +/** + * Reopen a slab journal by emptying it and then adding any pending entries. + * + * @param journal The journal to reopen + **/ +void reopenSlabJournal(SlabJournal *journal); + +/** + * Attempt to replay a recovery journal entry into a slab journal. + * + * @param journal The slab journal to use + * @param pbn The PBN for the entry + * @param operation The type of entry to add + * @param recoveryPoint The recovery journal point corresponding to this entry + * @param parent The completion to notify when there is space to add + * the entry if the entry could not be added immediately + * + * @return true if the entry was added immediately + **/ +bool attemptReplayIntoSlabJournal(SlabJournal *journal, + PhysicalBlockNumber pbn, + JournalOperation operation, + JournalPoint *recoveryPoint, + VDOCompletion *parent) + __attribute__((warn_unused_result)); + +/** + * Add an entry to a slab journal. + * + * @param journal The slab journal to use + * @param dataVIO The DataVIO for which to add the entry + **/ +void addSlabJournalEntry(SlabJournal *journal, DataVIO *dataVIO); + +/** + * Adjust the reference count for a slab journal block. Note that when the + * adjustment is negative, the slab journal will be reaped. + * + * @param journal The slab journal + * @param sequenceNumber The journal sequence number of the referenced block + * @param adjustment Amount to adjust the reference counter + **/ +void adjustSlabJournalBlockReference(SlabJournal *journal, + SequenceNumber sequenceNumber, + int adjustment); + +/** + * Request the slab journal to release the recovery journal lock it may hold on + * a specified recovery journal block. 
+ * + * @param journal The slab journal + * @param recoveryLock The sequence number of the recovery journal block + * whose locks should be released + * + * @return true if the journal does hold a lock on the specified + * block (which it will release) + **/ +bool releaseRecoveryJournalLock(SlabJournal *journal, + SequenceNumber recoveryLock) + __attribute__((warn_unused_result)); + +/** + * Commit the tail block of a slab journal. + * + * @param journal The journal whose tail block should be committed + **/ +void commitSlabJournalTail(SlabJournal *journal); + +/** + * Drain slab journal I/O. Depending upon the type of drain (as recorded in + * the journal's slab), any dirty journal blocks may be written out. + * + * @param journal The journal to drain + **/ +void drainSlabJournal(SlabJournal *journal); + +/** + * Decode the slab journal by reading its tail. + * + * @param journal The journal to decode + **/ +void decodeSlabJournal(SlabJournal *journal); + +/** + * Check to see if the journal should be scrubbed. + * + * @param journal The slab journal + * + * @return true if the journal requires scrubbing + **/ +bool requiresScrubbing(const SlabJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Dump the slab journal. + * + * @param journal The slab journal to dump + **/ +void dumpSlabJournal(const SlabJournal *journal); + +#endif // SLAB_JOURNAL_H diff --git a/vdo/base/slabJournalEraser.c b/vdo/base/slabJournalEraser.c new file mode 100644 index 0000000..7cd6a81 --- /dev/null +++ b/vdo/base/slabJournalEraser.c @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournalEraser.c#1 $ + */ + +#include "slabJournalEraser.h" + +#include "memoryAlloc.h" + +#include "completion.h" +#include "constants.h" +#include "extent.h" +#include "slab.h" +#include "slabDepot.h" + +typedef struct { + VDOCompletion *parent; + VDOExtent *extent; + char *zeroBuffer; + SlabIterator slabs; +} SlabJournalEraser; + +/** + * Free the eraser and finish the parent. + * + * @param eraser The eraser that is done + * @param result The result to return to the parent + **/ +static void finishErasing(SlabJournalEraser *eraser, int result) +{ + VDOCompletion *parent = eraser->parent; + freeExtent(&eraser->extent); + FREE(eraser->zeroBuffer); + FREE(eraser); + finishCompletion(parent, result); +} + +/** + * Finish erasing slab journals with an error. + * + * @param completion A completion whose parent is the eraser + **/ +static void handleErasingError(VDOCompletion *completion) +{ + SlabJournalEraser *eraser = completion->parent; + finishErasing(eraser, eraser->extent->completion.result); +} + +/** + * Erase the next slab journal. 
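+ * Each journal is erased by writing the eraser's zero-filled buffer over the
+ * slab's journal blocks, starting at that slab's journalOrigin.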
+ * + * @param extentCompletion A completion whose parent is the eraser + **/ +static void eraseNextSlabJournal(VDOCompletion *extentCompletion) +{ + SlabJournalEraser *eraser = extentCompletion->parent; + + if (!hasNextSlab(&eraser->slabs)) { + finishErasing(eraser, VDO_SUCCESS); + return; + } + + Slab *slab = nextSlab(&eraser->slabs); + writeMetadataExtent(eraser->extent, slab->journalOrigin); +} + +/**********************************************************************/ +void eraseSlabJournals(SlabDepot *depot, + SlabIterator slabs, + VDOCompletion *parent) +{ + SlabJournalEraser *eraser; + int result = ALLOCATE(1, SlabJournalEraser, __func__, &eraser); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + eraser->parent = parent; + eraser->slabs = slabs; + + BlockCount journalSize = getSlabConfig(depot)->slabJournalBlocks; + result = ALLOCATE(journalSize * VDO_BLOCK_SIZE, char, __func__, + &eraser->zeroBuffer); + if (result != VDO_SUCCESS) { + finishErasing(eraser, result); + return; + } + + result = createExtent(parent->layer, VIO_TYPE_SLAB_JOURNAL, + VIO_PRIORITY_METADATA, journalSize, eraser->zeroBuffer, + &eraser->extent); + if (result != VDO_SUCCESS) { + finishErasing(eraser, result); + return; + } + + VDOCompletion *extentCompletion = &eraser->extent->completion; + prepareCompletion(extentCompletion, eraseNextSlabJournal, + handleErasingError, getCallbackThreadID(), eraser); + eraseNextSlabJournal(extentCompletion); +} diff --git a/vdo/base/slabJournalEraser.h b/vdo/base/slabJournalEraser.h new file mode 100644 index 0000000..215d86f --- /dev/null +++ b/vdo/base/slabJournalEraser.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournalEraser.h#1 $ + */ + +#ifndef SLAB_JOURNAL_ERASER_H +#define SLAB_JOURNAL_ERASER_H + +#include "slabIterator.h" +#include "types.h" + +/** + * Begin erasing slab journals, one at a time. + * + * @param depot The depot from which to erase + * @param slabs The slabs whose journals need erasing + * @param parent The object to notify when complete + **/ +void eraseSlabJournals(SlabDepot *depot, + SlabIterator slabs, + VDOCompletion *parent); + +#endif // SLAB_JOURNAL_ERASER_H diff --git a/vdo/base/slabJournalInternals.h b/vdo/base/slabJournalInternals.h new file mode 100644 index 0000000..ce7eafb --- /dev/null +++ b/vdo/base/slabJournalInternals.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournalInternals.h#8 $ + */ + +#ifndef SLAB_JOURNAL_INTERNALS_H +#define SLAB_JOURNAL_INTERNALS_H + +#include "slabJournal.h" + +#include "numeric.h" + +#include "blockAllocatorInternals.h" +#include "blockMapEntry.h" +#include "journalPoint.h" +#include "slab.h" +#include "slabSummary.h" +#include "statistics.h" +#include "waitQueue.h" + +/** + * Slab journal blocks may have one of two formats, depending upon whether or + * not any of the entries in the block are block map increments. Since the + * steady state for a VDO is that all of the necessary block map pages will + * be allocated, most slab journal blocks will have only data entries. Such + * blocks can hold more entries, hence the two formats. + **/ + +/** A single slab journal entry */ +struct slabJournalEntry { + SlabBlockNumber sbn; + JournalOperation operation; +}; + +/** A single slab journal entry in its on-disk form */ +typedef union { + struct __attribute__((packed)) { + uint8_t offsetLow8; + uint8_t offsetMid8; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned offsetHigh7 : 7; + unsigned increment : 1; +#else + unsigned increment : 1; + unsigned offsetHigh7 : 7; +#endif + } fields; + + // A raw view of the packed encoding. + uint8_t raw[3]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining packed entries in GDB. + struct __attribute__((packed)) { + unsigned offset : 23; + unsigned increment : 1; + } littleEndian; +#endif +} __attribute__((packed)) PackedSlabJournalEntry; + +/** The unpacked representation of the header of a slab journal block */ +typedef struct { + /** Sequence number for head of journal */ + SequenceNumber head; + /** Sequence number for this block */ + SequenceNumber sequenceNumber; + /** The nonce for a given VDO instance */ + Nonce nonce; + /** Recovery journal point for last entry */ + JournalPoint recoveryPoint; + /** Metadata type */ + VDOMetadataType metadataType; + /** Whether this block contains block map increments */ + bool hasBlockMapIncrements; + /** The number of entries in the block */ + JournalEntryCount entryCount; +} SlabJournalBlockHeader; + +/** + * The packed, on-disk representation of a slab journal block header. + * All fields are kept in little-endian byte order. 
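+ *
+ * As the raw view below spells out, the packed header occupies
+ * 8 + 8 + 8 + 8 + 1 + 1 + 2 = 36 bytes on disk.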
+ **/ +typedef union __attribute__((packed)) { + struct __attribute__((packed)) { + /** 64-bit sequence number for head of journal */ + byte head[8]; + /** 64-bit sequence number for this block */ + byte sequenceNumber[8]; + /** Recovery journal point for last entry, packed into 64 bits */ + PackedJournalPoint recoveryPoint; + /** The 64-bit nonce for a given VDO instance */ + byte nonce[8]; + /** 8-bit metadata type (should always be two, for the slab journal) */ + uint8_t metadataType; + /** Whether this block contains block map increments */ + bool hasBlockMapIncrements; + /** 16-bit count of the entries encoded in the block */ + byte entryCount[2]; + } fields; + + // A raw view of the packed encoding. + uint8_t raw[8 + 8 + 8 + 8 + 1 + 1 + 2]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining packed entries in GDB. + struct __attribute__((packed)) { + SequenceNumber head; + SequenceNumber sequenceNumber; + PackedJournalPoint recoveryPoint; + Nonce nonce; + VDOMetadataType metadataType; + bool hasBlockMapIncrements; + JournalEntryCount entryCount; + } littleEndian; +#endif +} PackedSlabJournalBlockHeader; + +enum { + SLAB_JOURNAL_PAYLOAD_SIZE + = VDO_BLOCK_SIZE - sizeof(PackedSlabJournalBlockHeader), + SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK = (SLAB_JOURNAL_PAYLOAD_SIZE * 8) / 25, + SLAB_JOURNAL_ENTRY_TYPES_SIZE = ((SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK - 1) + / 8) + 1, + SLAB_JOURNAL_ENTRIES_PER_BLOCK = (SLAB_JOURNAL_PAYLOAD_SIZE + / sizeof(PackedSlabJournalEntry)), +}; + +/** The payload of a slab journal block which has block map increments */ +typedef struct { + /* The entries themselves */ + PackedSlabJournalEntry entries[SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK]; + /* The bit map indicating which entries are block map increments */ + byte entryTypes[SLAB_JOURNAL_ENTRY_TYPES_SIZE]; +} __attribute__((packed)) FullSlabJournalEntries; + +typedef union { + /* Entries which include block map increments */ + FullSlabJournalEntries fullEntries; + /* Entries which are only data updates */ + PackedSlabJournalEntry entries[SLAB_JOURNAL_ENTRIES_PER_BLOCK]; + /* Ensure the payload fills to the end of the block */ + byte space[SLAB_JOURNAL_PAYLOAD_SIZE]; +} __attribute__((packed)) SlabJournalPayload; + +typedef struct { + PackedSlabJournalBlockHeader header; + SlabJournalPayload payload; +} __attribute__((packed)) PackedSlabJournalBlock; + +typedef struct { + uint16_t count; + SequenceNumber recoveryStart; +} JournalLock; + +struct slabJournal { + /** A waiter object for getting a VIO pool entry */ + Waiter resourceWaiter; + /** A waiter object for updating the slab summary */ + Waiter slabSummaryWaiter; + /** A waiter object for getting an extent with which to flush */ + Waiter flushWaiter; + /** The queue of VIOs waiting to make an entry */ + WaitQueue entryWaiters; + /** The parent slab reference of this journal */ + Slab *slab; + + /** Whether a tail block commit is pending */ + bool waitingToCommit; + /** Whether the journal is updating the slab summary */ + bool updatingSlabSummary; + /** Whether the journal is adding entries from the entryWaiters queue */ + bool addingEntries; + /** Whether a partial write is in progress */ + bool partialWriteInProgress; + + /** The oldest block in the journal on disk */ + SequenceNumber head; + /** The oldest block in the journal which may not be reaped */ + SequenceNumber unreapable; + /** The end of the half-open interval of the active journal */ + 
SequenceNumber tail; + /** The next journal block to be committed */ + SequenceNumber nextCommit; + /** The tail sequence number that is written in the slab summary */ + SequenceNumber summarized; + /** The tail sequence number that was last summarized in slab summary */ + SequenceNumber lastSummarized; + + /** The sequence number of the recovery journal lock */ + SequenceNumber recoveryLock; + + /** + * The number of entries which fit in a single block. Can't use the constant + * because unit tests change this number. + **/ + JournalEntryCount entriesPerBlock; + /** + * The number of full entries which fit in a single block. Can't use the + * constant because unit tests change this number. + **/ + JournalEntryCount fullEntriesPerBlock; + + /** The recovery journal of the VDO (slab journal holds locks on it) */ + RecoveryJournal *recoveryJournal; + + /** The slab summary to update tail block location */ + SlabSummaryZone *summary; + /** The statistics shared by all slab journals in our physical zone */ + AtomicSlabJournalStatistics *events; + /** A ring of the VIO pool entries for outstanding journal block writes */ + RingNode uncommittedBlocks; + + /** + * The current tail block header state. This will be packed into + * the block just before it is written. + **/ + SlabJournalBlockHeader tailHeader; + /** A pointer to a block-sized buffer holding the packed block data */ + PackedSlabJournalBlock *block; + + /** The number of blocks in the on-disk journal */ + BlockCount size; + /** The number of blocks at which to start pushing reference blocks */ + BlockCount flushingThreshold; + /** The number of blocks at which all reference blocks should be writing */ + BlockCount flushingDeadline; + /** The number of blocks at which to wait for reference blocks to write */ + BlockCount blockingThreshold; + /** The number of blocks at which to scrub the slab before coming online */ + BlockCount scrubbingThreshold; + + /** This node is for BlockAllocator to keep a queue of dirty journals */ + RingNode dirtyNode; + + /** The lock for the oldest unreaped block of the journal */ + JournalLock *reapLock; + /** The locks for each on disk block */ + JournalLock locks[]; +}; + +/** + * Get the slab journal block offset of the given sequence number. + * + * @param journal The slab journal + * @param sequence The sequence number + * + * @return the offset corresponding to the sequence number + **/ +__attribute__((warn_unused_result)) +static inline TailBlockOffset +getSlabJournalBlockOffset(SlabJournal *journal, SequenceNumber sequence) +{ + return (sequence % journal->size); +} + +/** + * Encode a slab journal entry (exposed for unit tests). + * + * @param tailHeader The unpacked header for the block + * @param payload The journal block payload to hold the entry + * @param sbn The slab block number of the entry to encode + * @param operation The type of the entry + **/ +void encodeSlabJournalEntry(SlabJournalBlockHeader *tailHeader, + SlabJournalPayload *payload, + SlabBlockNumber sbn, + JournalOperation operation); + +/** + * Decode a slab journal entry. + * + * @param block The journal block holding the entry + * @param entryCount The number of the entry + * + * @return The decoded entry + **/ +SlabJournalEntry decodeSlabJournalEntry(PackedSlabJournalBlock *block, + JournalEntryCount entryCount) + __attribute__((warn_unused_result)); + +/** + * Generate the packed encoding of a slab journal entry. 
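+ *
+ * For example (values chosen purely for illustration): packing sbn 0x000301
+ * with isIncrement set stores offsetLow8 = 0x01 and offsetMid8 = 0x03, and
+ * packs the seven high offset bits (zero here) with the increment flag into
+ * the last byte, so the three raw bytes are { 0x01, 0x03, 0x80 }.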
+ * + * @param packed The entry into which to pack the values + * @param sbn The slab block number of the entry to encode + * @param isIncrement The increment flag + **/ +static inline void packSlabJournalEntry(PackedSlabJournalEntry *packed, + SlabBlockNumber sbn, + bool isIncrement) +{ + packed->fields.offsetLow8 = (sbn & 0x0000FF); + packed->fields.offsetMid8 = (sbn & 0x00FF00) >> 8; + packed->fields.offsetHigh7 = (sbn & 0x7F0000) >> 16; + packed->fields.increment = isIncrement ? 1 : 0; +} + +/** + * Decode the packed representation of a slab journal entry. + * + * @param packed The packed entry to decode + * + * @return The decoded slab journal entry + **/ +__attribute__((warn_unused_result)) +static inline +SlabJournalEntry unpackSlabJournalEntry(const PackedSlabJournalEntry *packed) +{ + SlabJournalEntry entry; + entry.sbn = packed->fields.offsetHigh7; + entry.sbn <<= 8; + entry.sbn |= packed->fields.offsetMid8; + entry.sbn <<= 8; + entry.sbn |= packed->fields.offsetLow8; + entry.operation + = (packed->fields.increment ? DATA_INCREMENT : DATA_DECREMENT); + return entry; +} + +/** + * Generate the packed representation of a slab block header. + * + * @param header The header containing the values to encode + * @param packed The header into which to pack the values + **/ +static inline +void packSlabJournalBlockHeader(const SlabJournalBlockHeader *header, + PackedSlabJournalBlockHeader *packed) +{ + storeUInt64LE(packed->fields.head, header->head); + storeUInt64LE(packed->fields.sequenceNumber, header->sequenceNumber); + storeUInt64LE(packed->fields.nonce, header->nonce); + storeUInt16LE(packed->fields.entryCount, header->entryCount); + + packed->fields.metadataType = header->metadataType; + packed->fields.hasBlockMapIncrements = header->hasBlockMapIncrements; + + packJournalPoint(&header->recoveryPoint, &packed->fields.recoveryPoint); +} + +/** + * Decode the packed representation of a slab block header. + * + * @param packed The packed header to decode + * @param header The header into which to unpack the values + **/ +static inline +void unpackSlabJournalBlockHeader(const PackedSlabJournalBlockHeader *packed, + SlabJournalBlockHeader *header) +{ + *header = (SlabJournalBlockHeader) { + .head = getUInt64LE(packed->fields.head), + .sequenceNumber = getUInt64LE(packed->fields.sequenceNumber), + .nonce = getUInt64LE(packed->fields.nonce), + .entryCount = getUInt16LE(packed->fields.entryCount), + .metadataType = packed->fields.metadataType, + .hasBlockMapIncrements = packed->fields.hasBlockMapIncrements, + }; + unpackJournalPoint(&packed->fields.recoveryPoint, &header->recoveryPoint); +} + +#endif // SLAB_JOURNAL_INTERNALS_H diff --git a/vdo/base/slabScrubber.c b/vdo/base/slabScrubber.c new file mode 100644 index 0000000..e37e9c8 --- /dev/null +++ b/vdo/base/slabScrubber.c @@ -0,0 +1,516 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabScrubber.c#6 $ + */ + +#include "slabScrubberInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminState.h" +#include "blockAllocator.h" +#include "constants.h" +#include "readOnlyNotifier.h" +#include "recoveryJournal.h" +#include "refCounts.h" +#include "refCountsInternals.h" +#include "slab.h" +#include "slabJournalInternals.h" + +/** + * Allocate the buffer and extent used for reading the slab journal when + * scrubbing a slab. + * + * @param scrubber The slab scrubber for which to allocate + * @param layer The physical layer on which the scrubber resides + * @param slabJournalSize The size of a slab journal + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int allocateExtentAndBuffer(SlabScrubber *scrubber, + PhysicalLayer *layer, + BlockCount slabJournalSize) +{ + size_t bufferSize = VDO_BLOCK_SIZE * slabJournalSize; + int result = ALLOCATE(bufferSize, char, __func__, &scrubber->journalData); + if (result != VDO_SUCCESS) { + return result; + } + + return createExtent(layer, VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, + slabJournalSize, scrubber->journalData, + &scrubber->extent); +} + +/**********************************************************************/ +int makeSlabScrubber(PhysicalLayer *layer, + BlockCount slabJournalSize, + ReadOnlyNotifier *readOnlyNotifier, + SlabScrubber **scrubberPtr) +{ + SlabScrubber *scrubber; + int result = ALLOCATE(1, SlabScrubber, __func__, &scrubber); + if (result != VDO_SUCCESS) { + return result; + } + + result = allocateExtentAndBuffer(scrubber, layer, slabJournalSize); + if (result != VDO_SUCCESS) { + freeSlabScrubber(&scrubber); + return result; + } + + initializeCompletion(&scrubber->completion, SLAB_SCRUBBER_COMPLETION, layer); + initializeRing(&scrubber->highPrioritySlabs); + initializeRing(&scrubber->slabs); + scrubber->readOnlyNotifier = readOnlyNotifier; + scrubber->adminState.state = ADMIN_STATE_SUSPENDED; + *scrubberPtr = scrubber; + return VDO_SUCCESS; +} + +/** + * Free the extent and buffer used for reading slab journals. + * + * @param scrubber The scrubber + **/ +static void freeExtentAndBuffer(SlabScrubber *scrubber) +{ + freeExtent(&scrubber->extent); + if (scrubber->journalData != NULL) { + FREE(scrubber->journalData); + scrubber->journalData = NULL; + } +} + +/**********************************************************************/ +void freeSlabScrubber(SlabScrubber **scrubberPtr) +{ + if (*scrubberPtr == NULL) { + return; + } + + SlabScrubber *scrubber = *scrubberPtr; + freeExtentAndBuffer(scrubber); + FREE(scrubber); + *scrubberPtr = NULL; +} + +/** + * Get the next slab to scrub. 
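+ * Slabs on the high-priority queue are returned before those on the + * ordinary queue.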
+ * + * @param scrubber The slab scrubber + * + * @return The next slab to scrub or NULL if there are none + **/ +static Slab *getNextSlab(SlabScrubber *scrubber) +{ + if (!isRingEmpty(&scrubber->highPrioritySlabs)) { + return slabFromRingNode(scrubber->highPrioritySlabs.next); + } + + if (!isRingEmpty(&scrubber->slabs)) { + return slabFromRingNode(scrubber->slabs.next); + } + + return NULL; +} + +/**********************************************************************/ +bool hasSlabsToScrub(SlabScrubber *scrubber) +{ + return (getNextSlab(scrubber) != NULL); +} + +/**********************************************************************/ +SlabCount getScrubberSlabCount(const SlabScrubber *scrubber) +{ + return relaxedLoad64(&scrubber->slabCount); +} + +/**********************************************************************/ +void registerSlabForScrubbing(SlabScrubber *scrubber, + Slab *slab, + bool highPriority) +{ + ASSERT_LOG_ONLY((slab->status != SLAB_REBUILT), + "slab to be scrubbed is unrecovered"); + + if (slab->status != SLAB_REQUIRES_SCRUBBING) { + return; + } + + unspliceRingNode(&slab->ringNode); + if (!slab->wasQueuedForScrubbing) { + relaxedAdd64(&scrubber->slabCount, 1); + slab->wasQueuedForScrubbing = true; + } + + if (highPriority) { + slab->status = SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING; + pushRingNode(&scrubber->highPrioritySlabs, &slab->ringNode); + return; + } + + pushRingNode(&scrubber->slabs, &slab->ringNode); +} + +/** + * Stop scrubbing, either because there are no more slabs to scrub or because + * there's been an error. + * + * @param scrubber The scrubber + **/ +static void finishScrubbing(SlabScrubber *scrubber) +{ + if (!hasSlabsToScrub(scrubber)) { + freeExtentAndBuffer(scrubber); + } + + // Inform whoever is waiting that scrubbing has completed. + completeCompletion(&scrubber->completion); + + bool notify = hasWaiters(&scrubber->waiters); + + // Note that the scrubber has stopped, and inform anyone who might be waiting + // for that to happen. + if (!finishDraining(&scrubber->adminState)) { + scrubber->adminState.state = ADMIN_STATE_SUSPENDED; + } + + /* + * We can't notify waiters until after we've finished draining or they'll + * just requeue. Fortunately if there were waiters, we can't have been freed + * yet. + */ + if (notify) { + notifyAllWaiters(&scrubber->waiters, NULL, NULL); + } +} + +/**********************************************************************/ +static void scrubNextSlab(SlabScrubber *scrubber); + +/** + * Notify the scrubber that a slab has been scrubbed. This callback is + * registered in applyJournalEntries(). + * + * @param completion The slab rebuild completion + **/ +static void slabScrubbed(VDOCompletion *completion) +{ + SlabScrubber *scrubber = completion->parent; + finishScrubbingSlab(scrubber->slab); + relaxedAdd64(&scrubber->slabCount, -1); + scrubNextSlab(scrubber); +} + +/** + * Abort scrubbing due to an error. + * + * @param scrubber The slab scrubber + * @param result The error + **/ +static void abortScrubbing(SlabScrubber *scrubber, int result) +{ + enterReadOnlyMode(scrubber->readOnlyNotifier, result); + setCompletionResult(&scrubber->completion, result); + scrubNextSlab(scrubber); +} + +/** + * Handle errors while rebuilding a slab. + * + * @param completion The slab rebuild completion + **/ +static void handleScrubberError(VDOCompletion *completion) +{ + abortScrubbing(completion->parent, completion->result); +} + +/** + * Apply all the entries in a block to the reference counts. 
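+ * Replay stops at the first entry which is out of bounds or cannot be + * applied.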
+ * + * @param block A block with entries to apply + * @param entryCount The number of entries to apply + * @param blockNumber The sequence number of the block + * @param slab The slab to apply the entries to + * + * @return VDO_SUCCESS or an error code + **/ +static int applyBlockEntries(PackedSlabJournalBlock *block, + JournalEntryCount entryCount, + SequenceNumber blockNumber, + Slab *slab) +{ + JournalPoint entryPoint = { + .sequenceNumber = blockNumber, + .entryCount = 0, + }; + + SlabBlockNumber maxSBN = slab->end - slab->start; + while (entryPoint.entryCount < entryCount) { + SlabJournalEntry entry = decodeSlabJournalEntry(block, + entryPoint.entryCount); + if (entry.sbn > maxSBN) { + // This entry is out of bounds. + return logErrorWithStringError(VDO_CORRUPT_JOURNAL, "Slab journal entry" + " (%llu, %u) had invalid offset" + " %u in slab (size %u blocks)", + blockNumber, entryPoint.entryCount, + entry.sbn, maxSBN); + } + + int result = replayReferenceCountChange(slab->referenceCounts, &entryPoint, + entry); + if (result != VDO_SUCCESS) { + logErrorWithStringError(result, "Slab journal entry (%llu, %u)" + " (%s of offset %" PRIu32 ") could not be" + " applied in slab %u", + blockNumber, entryPoint.entryCount, + getJournalOperationName(entry.operation), + entry.sbn, slab->slabNumber); + return result; + } + entryPoint.entryCount++; + } + + return VDO_SUCCESS; +} + +/** + * Find the relevant extent of the slab journal and apply all valid entries. + * This is a callback registered in startScrubbing(). + * + * @param completion The metadata read extent completion + **/ +static void applyJournalEntries(VDOCompletion *completion) +{ + SlabScrubber *scrubber = completion->parent; + Slab *slab = scrubber->slab; + SlabJournal *journal = slab->journal; + RefCounts *referenceCounts = slab->referenceCounts; + + // Find the boundaries of the useful part of the journal. + SequenceNumber tail = journal->tail; + TailBlockOffset endIndex = getSlabJournalBlockOffset(journal, tail - 1); + char *endData = scrubber->journalData + (endIndex * VDO_BLOCK_SIZE); + PackedSlabJournalBlock *endBlock = (PackedSlabJournalBlock *) endData; + + SequenceNumber head = getUInt64LE(endBlock->header.fields.head); + TailBlockOffset headIndex = getSlabJournalBlockOffset(journal, head); + BlockCount index = headIndex; + + JournalPoint refCountsPoint = referenceCounts->slabJournalPoint; + JournalPoint lastEntryApplied = refCountsPoint; + for (SequenceNumber sequence = head; sequence < tail; sequence++) { + char *blockData = scrubber->journalData + (index * VDO_BLOCK_SIZE); + PackedSlabJournalBlock *block = (PackedSlabJournalBlock *) blockData; + SlabJournalBlockHeader header; + unpackSlabJournalBlockHeader(&block->header, &header); + + if ((header.nonce != slab->allocator->nonce) + || (header.metadataType != VDO_METADATA_SLAB_JOURNAL) + || (header.sequenceNumber != sequence) + || (header.entryCount > journal->entriesPerBlock) + || (header.hasBlockMapIncrements + && (header.entryCount > journal->fullEntriesPerBlock))) { + // The block is not what we expect it to be. 
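+ // Its nonce, metadata type, or sequence number is wrong, or it claims + // more entries than a block with its entry types can hold.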
+ logError("Slab journal block for slab %u was invalid", + slab->slabNumber); + abortScrubbing(scrubber, VDO_CORRUPT_JOURNAL); + return; + } + + int result = applyBlockEntries(block, header.entryCount, sequence, slab); + if (result != VDO_SUCCESS) { + abortScrubbing(scrubber, result); + return; + } + + lastEntryApplied.sequenceNumber = sequence; + lastEntryApplied.entryCount = header.entryCount - 1; + index++; + if (index == journal->size) { + index = 0; + } + } + + // At the end of rebuild, the refCounts should be accurate to the end + // of the journal we just applied. + int result = ASSERT(!beforeJournalPoint(&lastEntryApplied, &refCountsPoint), + "Refcounts are not more accurate than the slab journal"); + if (result != VDO_SUCCESS) { + abortScrubbing(scrubber, result); + return; + } + + // Save out the rebuilt reference blocks. + prepareCompletion(completion, slabScrubbed, handleScrubberError, + completion->callbackThreadID, scrubber); + startSlabAction(slab, ADMIN_STATE_SAVE_FOR_SCRUBBING, completion); +} + +/** + * Read the current slab's journal from disk now that it has been flushed. + * This callback is registered in scrubNextSlab(). + * + * @param completion The scrubber's extent completion + **/ +static void startScrubbing(VDOCompletion *completion) +{ + SlabScrubber *scrubber = completion->parent; + Slab *slab = scrubber->slab; + if (getSummarizedCleanliness(slab->allocator->summary, slab->slabNumber)) { + slabScrubbed(completion); + return; + } + + prepareCompletion(&scrubber->extent->completion, applyJournalEntries, + handleScrubberError, completion->callbackThreadID, + completion->parent); + readMetadataExtent(scrubber->extent, slab->journalOrigin); +} + +/** + * Scrub the next slab if there is one. + * + * @param scrubber The scrubber + **/ +static void scrubNextSlab(SlabScrubber *scrubber) +{ + // Note: this notify call is always safe only because scrubbing can only + // be started when the VDO is quiescent. 
+ notifyAllWaiters(&scrubber->waiters, NULL, NULL); + if (isReadOnly(scrubber->readOnlyNotifier)) { + setCompletionResult(&scrubber->completion, VDO_READ_ONLY); + finishScrubbing(scrubber); + return; + } + + Slab *slab = getNextSlab(scrubber); + if ((slab == NULL) + || (scrubber->highPriorityOnly + && isRingEmpty(&scrubber->highPrioritySlabs))) { + scrubber->highPriorityOnly = false; + finishScrubbing(scrubber); + return; + } + + if (finishDraining(&scrubber->adminState)) { + return; + } + + unspliceRingNode(&slab->ringNode); + scrubber->slab = slab; + VDOCompletion *completion = extentAsCompletion(scrubber->extent); + prepareCompletion(completion, startScrubbing, + handleScrubberError, scrubber->completion.callbackThreadID, + scrubber); + startSlabAction(slab, ADMIN_STATE_SCRUBBING, completion); +} + +/**********************************************************************/ +void scrubSlabs(SlabScrubber *scrubber, + void *parent, + VDOAction *callback, + VDOAction *errorHandler) +{ + resumeIfQuiescent(&scrubber->adminState); + ThreadID threadID = getCallbackThreadID(); + prepareCompletion(&scrubber->completion, callback, errorHandler, threadID, + parent); + if (!hasSlabsToScrub(scrubber)) { + finishScrubbing(scrubber); + return; + } + + scrubNextSlab(scrubber); +} + +/**********************************************************************/ +void scrubHighPrioritySlabs(SlabScrubber *scrubber, + bool scrubAtLeastOne, + VDOCompletion *parent, + VDOAction *callback, + VDOAction *errorHandler) +{ + if (scrubAtLeastOne && isRingEmpty(&scrubber->highPrioritySlabs)) { + Slab *slab = getNextSlab(scrubber); + if (slab != NULL) { + registerSlabForScrubbing(scrubber, slab, true); + } + } + scrubber->highPriorityOnly = true; + scrubSlabs(scrubber, parent, callback, errorHandler); +} + +/**********************************************************************/ +void stopScrubbing(SlabScrubber *scrubber, VDOCompletion *parent) +{ + if (isQuiescent(&scrubber->adminState)) { + completeCompletion(parent); + } else { + startDraining(&scrubber->adminState, ADMIN_STATE_SUSPENDING, parent, NULL); + } +} + +/**********************************************************************/ +void resumeScrubbing(SlabScrubber *scrubber, VDOCompletion *parent) +{ + if (!hasSlabsToScrub(scrubber)) { + completeCompletion(parent); + return; + } + + int result = resumeIfQuiescent(&scrubber->adminState); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + scrubNextSlab(scrubber); + completeCompletion(parent); +} + +/**********************************************************************/ +int enqueueCleanSlabWaiter(SlabScrubber *scrubber, Waiter *waiter) +{ + if (isReadOnly(scrubber->readOnlyNotifier)) { + return VDO_READ_ONLY; + } + + if (isQuiescent(&scrubber->adminState)) { + return VDO_NO_SPACE; + } + + return enqueueWaiter(&scrubber->waiters, waiter); +} + +/**********************************************************************/ +void dumpSlabScrubber(const SlabScrubber *scrubber) +{ + logInfo("slabScrubber slabCount %u waiters %zu %s%s", + getScrubberSlabCount(scrubber), + countWaiters(&scrubber->waiters), + getAdminStateName(&scrubber->adminState), + scrubber->highPriorityOnly ? ", highPriorityOnly " : ""); +} diff --git a/vdo/base/slabScrubber.h b/vdo/base/slabScrubber.h new file mode 100644 index 0000000..ca13e63 --- /dev/null +++ b/vdo/base/slabScrubber.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabScrubber.h#4 $ + */ + +#ifndef SLAB_SCRUBBER_H +#define SLAB_SCRUBBER_H + +#include "completion.h" +#include "types.h" +#include "waitQueue.h" + +/** + * Create a slab scrubber + * + * @param layer The physical layer of the VDO + * @param slabJournalSize The size of a slab journal in blocks + * @param readOnlyNotifier The context for entering read-only mode + * @param scrubberPtr A pointer to hold the scrubber + * + * @return VDO_SUCCESS or an error + **/ +int makeSlabScrubber(PhysicalLayer *layer, + BlockCount slabJournalSize, + ReadOnlyNotifier *readOnlyNotifier, + SlabScrubber **scrubberPtr) + __attribute__((warn_unused_result)); + +/** + * Free a slab scrubber and null out the reference to it. + * + * @param scrubberPtr A pointer to the scrubber to destroy + **/ +void freeSlabScrubber(SlabScrubber **scrubberPtr); + +/** + * Check whether a scrubber has slabs to scrub. + * + * @param scrubber The scrubber to check + * + * @return true if the scrubber has slabs to scrub + **/ +bool hasSlabsToScrub(SlabScrubber *scrubber) + __attribute__((warn_unused_result)); + +/** + * Register a slab with a scrubber. + * + * @param scrubber The scrubber + * @param slab The slab to scrub + * @param highPriority true if the slab should be put on the + * high-priority queue + **/ +void registerSlabForScrubbing(SlabScrubber *scrubber, + Slab *slab, + bool highPriority); + +/** + * Scrub all the slabs which have been registered with a slab scrubber. + * + * @param scrubber The scrubber + * @param parent The object to notify when scrubbing is complete + * @param callback The function to run when scrubbing is complete + * @param errorHandler The handler for scrubbing errors + **/ +void scrubSlabs(SlabScrubber *scrubber, + void *parent, + VDOAction *callback, + VDOAction *errorHandler); + +/** + * Scrub any slabs which have been registered at high priority with a slab + * scrubber. + * + * @param scrubber The scrubber + * @param scrubAtLeastOne true if one slab should always be + * scrubbed, even if there are no high-priority slabs + * (and there is at least one low priority slab) + * @param parent The completion to notify when scrubbing is complete + * @param callback The function to run when scrubbing is complete + * @param errorHandler The handler for scrubbing errors + **/ +void scrubHighPrioritySlabs(SlabScrubber *scrubber, + bool scrubAtLeastOne, + VDOCompletion *parent, + VDOAction *callback, + VDOAction *errorHandler); + +/** + * Tell the scrubber to stop scrubbing after it finishes the slab it is + * currently working on. 
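+ * Scrubbing may be restarted later with resumeScrubbing().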
+ * + * @param scrubber The scrubber to stop + * @param parent The completion to notify when scrubbing has stopped + **/ +void stopScrubbing(SlabScrubber *scrubber, VDOCompletion *parent); + +/** + * Tell the scrubber to resume scrubbing if it has been stopped. + * + * @param scrubber The scrubber to resume + * @param parent The object to notify once scrubbing has resumed + **/ +void resumeScrubbing(SlabScrubber *scrubber, VDOCompletion *parent); + +/** + * Wait for a clean slab. + * + * @param scrubber The scrubber on which to wait + * @param waiter The waiter + * + * @return VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no + * slabs to scrub, and some other error otherwise + **/ +int enqueueCleanSlabWaiter(SlabScrubber *scrubber, Waiter *waiter); + +/** + * Get the number of slabs that are unrecovered or being scrubbed. + * + * @param scrubber The scrubber to query + * + * @return the number of slabs that are unrecovered or being scrubbed + **/ +SlabCount getScrubberSlabCount(const SlabScrubber *scrubber) + __attribute__((warn_unused_result)); + +/** + * Dump information about a slab scrubber to the log for debugging. + * + * @param scrubber The scrubber to dump + **/ +void dumpSlabScrubber(const SlabScrubber *scrubber); + +#endif /* SLAB_SCRUBBER_H */ diff --git a/vdo/base/slabScrubberInternals.h b/vdo/base/slabScrubberInternals.h new file mode 100644 index 0000000..3d3e8cd --- /dev/null +++ b/vdo/base/slabScrubberInternals.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabScrubberInternals.h#5 $ + */ + +#ifndef SLAB_SCRUBBER_INTERNALS_H +#define SLAB_SCRUBBER_INTERNALS_H + +#include "slabScrubber.h" + +#include "adminState.h" +#include "atomic.h" +#include "extent.h" +#include "ringNode.h" + +struct slabScrubber { + VDOCompletion completion; + /** The queue of slabs to scrub first */ + RingNode highPrioritySlabs; + /** The queue of slabs to scrub once there are no highPrioritySlabs */ + RingNode slabs; + /** The queue of VIOs waiting for a slab to be scrubbed */ + WaitQueue waiters; + + // The number of slabs that are unrecovered or being scrubbed. This field is + // modified by the physical zone thread, but is queried by other threads. 
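+ // It is therefore stored as an Atomic64 and accessed with relaxed + // atomic operations.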
+ Atomic64 slabCount; + + /** The administrative state of the scrubber */ + AdminState adminState; + /** Whether to only scrub high-priority slabs */ + bool highPriorityOnly; + /** The context for entering read-only mode */ + ReadOnlyNotifier *readOnlyNotifier; + /** The slab currently being scrubbed */ + Slab *slab; + /** The extent for loading slab journal blocks */ + VDOExtent *extent; + /** A buffer to store the slab journal blocks */ + char *journalData; +}; + +#endif // SLAB_SCRUBBER_INTERNALS_H diff --git a/vdo/base/slabSummary.c b/vdo/base/slabSummary.c new file mode 100644 index 0000000..7021c67 --- /dev/null +++ b/vdo/base/slabSummary.c @@ -0,0 +1,651 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabSummary.c#7 $ + */ + +#include "slabSummary.h" + +#include "memoryAlloc.h" + +#include "adminState.h" +#include "constants.h" +#include "extent.h" +#include "readOnlyNotifier.h" +#include "slabSummaryInternals.h" +#include "threadConfig.h" +#include "types.h" + +// SIZING + +/**********************************************************************/ +static BlockCount getSlabSummaryZoneSize(BlockSize blockSize) +{ + SlabCount entriesPerBlock = blockSize / sizeof(SlabSummaryEntry); + BlockCount blocksNeeded = MAX_SLABS / entriesPerBlock; + return blocksNeeded; +} + +/**********************************************************************/ +BlockCount getSlabSummarySize(BlockSize blockSize) +{ + return getSlabSummaryZoneSize(blockSize) * MAX_PHYSICAL_ZONES; +} + +// FULLNESS HINT COMPUTATION + +/** + * Translate a slab's free block count into a 'fullness hint' that can be + * stored in a SlabSummaryEntry's 7 bits that are dedicated to its free count. + * + * Note: the number of free blocks must be strictly less than 2^23 blocks, + * even though theoretically slabs could contain precisely 2^23 blocks; there + * is an assumption that at least one block is used by metadata. This + * assumption is necessary; otherwise, the fullness hint might overflow. + * The fullness hint formula is roughly (fullness >> 16) & 0x7f, but + * ((1 << 23) >> 16) & 0x7f is the same as (0 >> 16) & 0x7f, namely 0, which + * is clearly a bad hint if it could indicate both 2^23 free blocks or 0 free + * blocks. + * + * @param summary The summary which is being updated + * @param freeBlocks The number of free blocks + * + * @return A fullness hint, which can be stored in 7 bits. + **/ +__attribute__((warn_unused_result)) +static uint8_t computeFullnessHint(SlabSummary *summary, BlockCount freeBlocks) +{ + ASSERT_LOG_ONLY((freeBlocks < (1 << 23)), + "free blocks must be less than 2^23"); + + if (freeBlocks == 0) { + return 0; + } + + BlockCount hint = freeBlocks >> summary->hintShift; + return ((hint == 0) ? 
1 : hint); +} + +/** + * Translate a slab's free block hint into an approximate count, such that + * computeFullnessHint() is the inverse function of getApproximateFreeBlocks() + * (i.e. computeFullnessHint(getApproximateFreeBlocks(x)) == x). + * + * @param summary The summary from which the hint was obtained + * @param freeBlockHint The hint read from the summary + * + * @return An approximation to the free block count + **/ +__attribute__((warn_unused_result)) +static BlockCount getApproximateFreeBlocks(SlabSummary *summary, + uint8_t freeBlockHint) +{ + return ((BlockCount) freeBlockHint) << summary->hintShift; +} + +// MAKE/FREE FUNCTIONS + +/**********************************************************************/ +static void launchWrite(SlabSummaryBlock *summaryBlock); + +/** + * Initialize a SlabSummaryBlock. + * + * @param layer The backing layer + * @param summaryZone The parent SlabSummaryZone + * @param threadID The ID of the thread for the physical zone of this block + * @param entries The entries this block manages + * @param index The index of this block in its zone's summary + * @param slabSummaryBlock The block to initialize + * + * @return VDO_SUCCESS or an error + **/ +static int initializeSlabSummaryBlock(PhysicalLayer *layer, + SlabSummaryZone *summaryZone, + ThreadID threadID, + SlabSummaryEntry *entries, + BlockCount index, + SlabSummaryBlock *slabSummaryBlock) +{ + int result = ALLOCATE(VDO_BLOCK_SIZE, char, __func__, + &slabSummaryBlock->outgoingEntries); + if (result != VDO_SUCCESS) { + return result; + } + + result = createVIO(layer, VIO_TYPE_SLAB_SUMMARY, VIO_PRIORITY_METADATA, + slabSummaryBlock, slabSummaryBlock->outgoingEntries, + &slabSummaryBlock->vio); + if (result != VDO_SUCCESS) { + return result; + } + + slabSummaryBlock->vio->completion.callbackThreadID = threadID; + slabSummaryBlock->zone = summaryZone; + slabSummaryBlock->entries = entries; + slabSummaryBlock->index = index; + return VDO_SUCCESS; +} + +/** + * Create a new, empty SlabSummaryZone object. + * + * @param summary The summary to which the new zone will belong + * @param layer The layer + * @param zoneNumber The number of this zone + * @param threadID The ID of the thread for this zone + * @param entries The buffer to hold the entries in this zone + * + * @return VDO_SUCCESS or an error + **/ +static int makeSlabSummaryZone(SlabSummary *summary, + PhysicalLayer *layer, + ZoneCount zoneNumber, + ThreadID threadID, + SlabSummaryEntry *entries) +{ + int result = ALLOCATE_EXTENDED(SlabSummaryZone, summary->blocksPerZone, + SlabSummaryBlock, __func__, + &summary->zones[zoneNumber]); + if (result != VDO_SUCCESS) { + return result; + } + + SlabSummaryZone *summaryZone = summary->zones[zoneNumber]; + summaryZone->summary = summary; + summaryZone->zoneNumber = zoneNumber; + summaryZone->entries = entries; + + if (layer->createMetadataVIO == NULL) { + // Blocks are only used for writing, and without a createVIO() call, + // we'll never be writing anything. + return VDO_SUCCESS; + } + + // Initialize each block.
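+ // Each block manages the next entriesPerBlock entries of the zone's + // array; the entries pointer is advanced past them once the block has + // been initialized.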
+ for (BlockCount i = 0; i < summary->blocksPerZone; i++) { + result = initializeSlabSummaryBlock(layer, summaryZone, threadID, entries, + i, &summaryZone->summaryBlocks[i]); + if (result != VDO_SUCCESS) { + return result; + } + entries += summary->entriesPerBlock; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeSlabSummary(PhysicalLayer *layer, + Partition *partition, + const ThreadConfig *threadConfig, + unsigned int slabSizeShift, + BlockCount maximumFreeBlocksPerSlab, + ReadOnlyNotifier *readOnlyNotifier, + SlabSummary **slabSummaryPtr) +{ + BlockCount blocksPerZone = getSlabSummaryZoneSize(VDO_BLOCK_SIZE); + SlabCount entriesPerBlock = MAX_SLABS / blocksPerZone; + int result = ASSERT((entriesPerBlock * blocksPerZone) == MAX_SLABS, + "block size must be a multiple of entry size"); + if (result != VDO_SUCCESS) { + return result; + } + + if (partition == NULL) { + // Don't make a slab summary for the formatter since it doesn't need it. + return VDO_SUCCESS; + } + + SlabSummary *summary; + result = ALLOCATE_EXTENDED(SlabSummary, threadConfig->physicalZoneCount, + SlabSummaryZone *, __func__, &summary); + if (result != VDO_SUCCESS) { + return result; + } + + summary->zoneCount = threadConfig->physicalZoneCount; + summary->readOnlyNotifier = readOnlyNotifier; + summary->hintShift = (slabSizeShift > 6) ? (slabSizeShift - 6) : 0; + summary->blocksPerZone = blocksPerZone; + summary->entriesPerBlock = entriesPerBlock; + + size_t totalEntries = MAX_SLABS * MAX_PHYSICAL_ZONES; + size_t entryBytes = totalEntries * sizeof(SlabSummaryEntry); + result = layer->allocateIOBuffer(layer, entryBytes, "summary entries", + (char **) &summary->entries); + if (result != VDO_SUCCESS) { + freeSlabSummary(&summary); + return result; + } + + // Initialize all the entries. + uint8_t hint = computeFullnessHint(summary, maximumFreeBlocksPerSlab); + for (size_t i = 0; i < totalEntries; i++) { + // This default tail block offset must be reflected in + // slabJournal.c::readSlabJournalTail(). + summary->entries[i] = (SlabSummaryEntry) { + .tailBlockOffset = 0, + .fullnessHint = hint, + .loadRefCounts = false, + .isDirty = false, + }; + } + + setSlabSummaryOrigin(summary, partition); + for (ZoneCount zone = 0; zone < summary->zoneCount; zone++) { + result = makeSlabSummaryZone(summary, layer, zone, + getPhysicalZoneThread(threadConfig, zone), + summary->entries + (MAX_SLABS * zone)); + if (result != VDO_SUCCESS) { + freeSlabSummary(&summary); + return result; + } + } + + *slabSummaryPtr = summary; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeSlabSummary(SlabSummary **slabSummaryPtr) +{ + if (*slabSummaryPtr == NULL) { + return; + } + + SlabSummary *summary = *slabSummaryPtr; + for (ZoneCount zone = 0; zone < summary->zoneCount; zone++) { + SlabSummaryZone *summaryZone = summary->zones[zone]; + if (summaryZone != NULL) { + for (BlockCount i = 0; i < summary->blocksPerZone; i++) { + freeVIO(&summaryZone->summaryBlocks[i].vio); + FREE(summaryZone->summaryBlocks[i].outgoingEntries); + } + FREE(summaryZone); + } + } + FREE(summary->entries); + FREE(summary); + *slabSummaryPtr = NULL; +} + +/**********************************************************************/ +SlabSummaryZone *getSummaryForZone(SlabSummary *summary, ZoneCount zone) +{ + return summary->zones[zone]; +} + +// WRITING FUNCTIONALITY + +/** + * Check whether a summary zone has finished draining. 
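+ * The drain cannot finish while any summary block writes are still + * outstanding.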
+ * + * @param summaryZone The zone to check + **/ +static void checkForDrainComplete(SlabSummaryZone *summaryZone) +{ + if (!isDraining(&summaryZone->state) || (summaryZone->writeCount > 0)) { + return; + } + + finishOperationWithResult(&summaryZone->state, + (isReadOnly(summaryZone->summary->readOnlyNotifier) + ? VDO_READ_ONLY : VDO_SUCCESS)); +} + +/** + * Wake all the waiters in a given queue. If the VDO is in read-only mode they + * will be given a VDO_READ_ONLY error code as their context, otherwise they + * will be given VDO_SUCCESS. + * + * @param summaryZone The slab summary which owns the queue + * @param queue The queue to notify + **/ +static void notifyWaiters(SlabSummaryZone *summaryZone, WaitQueue *queue) +{ + int result = (isReadOnly(summaryZone->summary->readOnlyNotifier) + ? VDO_READ_ONLY : VDO_SUCCESS); + notifyAllWaiters(queue, NULL, &result); +} + +/** + * Finish processing a block which attempted to write, whether or not the + * attempt succeeded. + * + * @param block The block + **/ +static void finishUpdatingSlabSummaryBlock(SlabSummaryBlock *block) +{ + notifyWaiters(block->zone, &block->currentUpdateWaiters); + block->writing = false; + block->zone->writeCount--; + if (hasWaiters(&block->nextUpdateWaiters)) { + launchWrite(block); + } else { + checkForDrainComplete(block->zone); + } +} + +/** + * This is the callback for a successful block write. + * + * @param completion The write VIO + **/ +static void finishUpdate(VDOCompletion *completion) +{ + SlabSummaryBlock *block = completion->parent; + atomicAdd64(&block->zone->summary->statistics.blocksWritten, 1); + finishUpdatingSlabSummaryBlock(block); +} + +/** + * Handle an error writing a slab summary block. + * + * @param completion The write VIO + **/ +static void handleWriteError(VDOCompletion *completion) +{ + SlabSummaryBlock *block = completion->parent; + enterReadOnlyMode(block->zone->summary->readOnlyNotifier, + completion->result); + finishUpdatingSlabSummaryBlock(block); +} + +/** + * Write a slab summary block unless it is currently out for writing. + * + * @param [in] block The block that needs to be committed + **/ +static void launchWrite(SlabSummaryBlock *block) +{ + if (block->writing) { + return; + } + + SlabSummaryZone *zone = block->zone; + zone->writeCount++; + transferAllWaiters(&block->nextUpdateWaiters, &block->currentUpdateWaiters); + block->writing = true; + + SlabSummary *summary = zone->summary; + if (isReadOnly(summary->readOnlyNotifier)) { + finishUpdatingSlabSummaryBlock(block); + return; + } + + memcpy(block->outgoingEntries, block->entries, + sizeof(SlabSummaryEntry) * summary->entriesPerBlock); + + // Flush before writing to ensure that the slab journal tail blocks and + // reference updates covered by this summary update are stable (VDO-2332). + PhysicalBlockNumber pbn = (summary->origin + + (summary->blocksPerZone * zone->zoneNumber) + + block->index); + launchWriteMetadataVIOWithFlush(block->vio, pbn, finishUpdate, + handleWriteError, true, false); +} + +/** + * Initiate a drain. + * + * Implements AdminInitiator. 
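+ * + * @param state The AdminState of the summary zone being drained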
+ **/ +static void initiateDrain(AdminState *state) +{ + checkForDrainComplete(container_of(state, SlabSummaryZone, state)); +} + +/**********************************************************************/ +void drainSlabSummaryZone(SlabSummaryZone *summaryZone, + AdminStateCode operation, + VDOCompletion *parent) +{ + startDraining(&summaryZone->state, operation, parent, initiateDrain); +} + +/**********************************************************************/ +void resumeSlabSummaryZone(SlabSummaryZone *summaryZone, VDOCompletion *parent) +{ + finishCompletion(parent, resumeIfQuiescent(&summaryZone->state)); +} + +// READ/UPDATE FUNCTIONS + +/** + * Get the summary block, and offset into it, for storing the summary for a + * slab. + * + * @param summaryZone The SlabSummaryZone being queried + * @param slabNumber The slab whose summary location is sought + * + * @return A pointer to the SlabSummaryEntryBlock containing this + * SlabSummaryEntry + **/ +static SlabSummaryBlock *getSummaryBlockForSlab(SlabSummaryZone *summaryZone, + SlabCount slabNumber) +{ + SlabCount entriesPerBlock = summaryZone->summary->entriesPerBlock; + return &summaryZone->summaryBlocks[slabNumber / entriesPerBlock]; +} + +/**********************************************************************/ +void updateSlabSummaryEntry(SlabSummaryZone *summaryZone, + Waiter *waiter, + SlabCount slabNumber, + TailBlockOffset tailBlockOffset, + bool loadRefCounts, + bool isClean, + BlockCount freeBlocks) +{ + SlabSummaryBlock *block = getSummaryBlockForSlab(summaryZone, slabNumber); + int result; + if (isReadOnly(summaryZone->summary->readOnlyNotifier)) { + result = VDO_READ_ONLY; + } else if (isDraining(&summaryZone->state) + || isQuiescent(&summaryZone->state)) { + result = VDO_INVALID_ADMIN_STATE; + } else { + uint8_t hint = computeFullnessHint(summaryZone->summary, freeBlocks); + SlabSummaryEntry *entry = &summaryZone->entries[slabNumber]; + *entry = (SlabSummaryEntry) { + .tailBlockOffset = tailBlockOffset, + .loadRefCounts = (entry->loadRefCounts || loadRefCounts), + .isDirty = !isClean, + .fullnessHint = hint, + }; + result = enqueueWaiter(&block->nextUpdateWaiters, waiter); + } + + if (result != VDO_SUCCESS) { + waiter->callback(waiter, &result); + return; + } + + launchWrite(block); +} + +/**********************************************************************/ +TailBlockOffset getSummarizedTailBlockOffset(SlabSummaryZone *summaryZone, + SlabCount slabNumber) +{ + return summaryZone->entries[slabNumber].tailBlockOffset; +} + +/**********************************************************************/ +bool mustLoadRefCounts(SlabSummaryZone *summaryZone, SlabCount slabNumber) +{ + return summaryZone->entries[slabNumber].loadRefCounts; +} + +/**********************************************************************/ +bool getSummarizedCleanliness(SlabSummaryZone *summaryZone, + SlabCount slabNumber) +{ + return !summaryZone->entries[slabNumber].isDirty; +} + +/**********************************************************************/ +BlockCount getSummarizedFreeBlockCount(SlabSummaryZone *summaryZone, + SlabCount slabNumber) +{ + SlabSummaryEntry *entry = &summaryZone->entries[slabNumber]; + return getApproximateFreeBlocks(summaryZone->summary, entry->fullnessHint); +} + +/**********************************************************************/ +void getSummarizedRefCountsState(SlabSummaryZone *summaryZone, + SlabCount slabNumber, + size_t *freeBlockHint, + bool *isClean) +{ + SlabSummaryEntry *entry = 
&summaryZone->entries[slabNumber]; + *freeBlockHint = entry->fullnessHint; + *isClean = !entry->isDirty; +} + +/**********************************************************************/ +void getSummarizedSlabStatuses(SlabSummaryZone *summaryZone, + SlabCount slabCount, + SlabStatus *statuses) +{ + for (SlabCount i = 0; i < slabCount; i++) { + statuses[i] = (SlabStatus) { + .slabNumber = i, + .isClean = !summaryZone->entries[i].isDirty, + .emptiness = summaryZone->entries[i].fullnessHint + }; + } +} + +// RESIZE FUNCTIONS + +/**********************************************************************/ +void setSlabSummaryOrigin(SlabSummary *summary, Partition *partition) +{ + summary->origin = getFixedLayoutPartitionOffset(partition); +} + +// COMBINING FUNCTIONS (LOAD) + +/** + * Clean up after saving out the combined slab summary. This callback is + * registered in finishLoadingSummary() and loadSlabSummary(). + * + * @param completion The extent which was used to write the summary data + **/ +static void finishCombiningZones(VDOCompletion *completion) +{ + SlabSummary *summary = completion->parent; + int result = completion->result; + VDOExtent *extent = asVDOExtent(completion); + freeExtent(&extent); + finishLoadingWithResult(&summary->zones[0]->state, result); +} + +/**********************************************************************/ +void combineZones(SlabSummary *summary) +{ + // Combine all the old summary data into the portion of the buffer + // corresponding to the first zone. + ZoneCount zone = 0; + if (summary->zonesToCombine > 1) { + for (SlabCount entryNumber = 0; entryNumber < MAX_SLABS; entryNumber++) { + if (zone != 0) { + memcpy(summary->entries + entryNumber, + summary->entries + (zone * MAX_SLABS) + entryNumber, + sizeof(SlabSummaryEntry)); + } + zone++; + if (zone == summary->zonesToCombine) { + zone = 0; + } + } + } + + // Copy the combined data to each zone's region of the buffer. + for (zone = 1; zone < MAX_PHYSICAL_ZONES; zone++) { + memcpy(summary->entries + (zone * MAX_SLABS), summary->entries, + MAX_SLABS * sizeof(SlabSummaryEntry)); + } +} + +/** + * Combine the slab summary data from all the previously written zones + * and copy the combined summary to each zone's region of the partition. Then + * write the combined summary back out to disk. This callback is registered in + * loadSlabSummary(). + * + * @param completion The extent which was used to read the summary data + **/ +static void finishLoadingSummary(VDOCompletion *completion) +{ + SlabSummary *summary = completion->parent; + VDOExtent *extent = asVDOExtent(completion); + + // Combine the zones so each zone is correct for all slabs. + combineZones(summary); + + // Write the combined summary back out.
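+ // The read extent is reused for the write; its completion is redirected + // to finishCombiningZones(), which frees the extent and finishes the load.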
+ extent->completion.callback = finishCombiningZones; + writeMetadataExtent(extent, summary->origin); +} + +/**********************************************************************/ +void loadSlabSummary(SlabSummary *summary, + AdminStateCode operation, + ZoneCount zonesToCombine, + VDOCompletion *parent) +{ + SlabSummaryZone *zone = summary->zones[0]; + if (!startLoading(&zone->state, operation, parent, NULL)) { + return; + } + + VDOExtent *extent; + BlockCount blocks = summary->blocksPerZone * MAX_PHYSICAL_ZONES; + int result = createExtent(parent->layer, VIO_TYPE_SLAB_SUMMARY, + VIO_PRIORITY_METADATA, blocks, + (char *) summary->entries, &extent); + if (result != VDO_SUCCESS) { + finishLoadingWithResult(&zone->state, result); + return; + } + + if ((operation == ADMIN_STATE_FORMATTING) + || (operation == ADMIN_STATE_LOADING_FOR_REBUILD)) { + prepareCompletion(&extent->completion, finishCombiningZones, + finishCombiningZones, 0, summary); + writeMetadataExtent(extent, summary->origin); + return; + } + + summary->zonesToCombine = zonesToCombine; + prepareCompletion(&extent->completion, finishLoadingSummary, + finishCombiningZones, 0, summary); + readMetadataExtent(extent, summary->origin); +} + +/**********************************************************************/ +SlabSummaryStatistics getSlabSummaryStatistics(const SlabSummary *summary) +{ + const AtomicSlabSummaryStatistics *atoms = &summary->statistics; + return (SlabSummaryStatistics) { + .blocksWritten = atomicLoad64(&atoms->blocksWritten), + }; +} diff --git a/vdo/base/slabSummary.h b/vdo/base/slabSummary.h new file mode 100644 index 0000000..4ce32cb --- /dev/null +++ b/vdo/base/slabSummary.h @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabSummary.h#5 $ + */ + +#ifndef SLAB_SUMMARY_H +#define SLAB_SUMMARY_H + +#include "completion.h" +#include "fixedLayout.h" +#include "slab.h" +#include "statistics.h" +#include "types.h" +#include "waitQueue.h" + +/** + * The SlabSummary provides hints during load and recovery about the state + * of the slabs in order to avoid the need to read the slab journals in their + * entirety before a VDO can come online. + * + * The information in the summary for each slab includes the rough number of + * free blocks (which is used to prioritize scrubbing), the cleanliness of a + * slab (so that clean slabs containing free space will be used on restart), + * and the location of the tail block of the slab's journal. + * + * The SlabSummary has its own partition at the end of the volume which is + * sized to allow for a complete copy of the summary for each of up to 16 + * physical zones. 
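+ * (For example, with a 4KB block size, each summary block holds 2048 of + * the two-byte entries, so one zone's copy of the summary occupies + * MAX_SLABS / 2048 blocks.)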
+ * + * During resize, the SlabSummary moves its backing partition and is saved once + * moved; the SlabSummary is not permitted to overwrite the previous recovery + * journal space. + * + * The SlabSummary does not have its own version information, but relies on the + * master version number. + **/ + +/** + * The offset of a slab journal tail block. + **/ +typedef uint8_t TailBlockOffset; + +/** + * A slab status is a very small structure for use in determining the ordering + * of slabs in the scrubbing process. + **/ +typedef struct slabStatus { + SlabCount slabNumber; + bool isClean; + uint8_t emptiness; +} SlabStatus; + +/** + * Returns the size on disk of the SlabSummary structure. + * + * @param blockSize The block size of the physical layer + * + * @return the blocks required to store the SlabSummary on disk + **/ +BlockCount getSlabSummarySize(BlockSize blockSize) +__attribute__((warn_unused_result)); + +/** + * Create a slab summary. + * + * @param [in] layer The layer + * @param [in] partition The partition to hold the summary + * @param [in] threadConfig The thread config of the VDO + * @param [in] slabSizeShift The number of bits in the slab size + * @param [in] maximumFreeBlocksPerSlab The maximum number of free blocks a + * slab can have + * @param [in] readOnlyNotifier The context for entering read-only + * mode + * @param [out] slabSummaryPtr A pointer to hold the summary + * + * @return VDO_SUCCESS or an error + **/ +int makeSlabSummary(PhysicalLayer *layer, + Partition *partition, + const ThreadConfig *threadConfig, + unsigned int slabSizeShift, + BlockCount maximumFreeBlocksPerSlab, + ReadOnlyNotifier *readOnlyNotifier, + SlabSummary **slabSummaryPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a SlabSummary and NULL out the reference to it. + * + * @param [in,out] slabSummaryPtr A pointer to the SlabSummary to free + **/ +void freeSlabSummary(SlabSummary **slabSummaryPtr); + +/** + * Get the portion of the slab summary for a specified zone. + * + * @param summary The slab summary + * @param zone The zone + * + * @return The portion of the slab summary for the specified zone + **/ +SlabSummaryZone *getSummaryForZone(SlabSummary *summary, ZoneCount zone) + __attribute__((warn_unused_result)); + +/** + * Drain a zone of the slab summary. + * + * @param summaryZone The zone to drain + * @param operation The type of drain to perform + * @param parent The object to notify when the suspend is complete + **/ +void drainSlabSummaryZone(SlabSummaryZone *summaryZone, + AdminStateCode operation, + VDOCompletion *parent); + +/** + * Resume a zone of the slab summary. + * + * @param summaryZone The zone to resume + * @param parent The object to notify when the zone is resumed + **/ +void resumeSlabSummaryZone(SlabSummaryZone *summaryZone, + VDOCompletion *parent); + +/** + * Update the entry for a slab. 
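+ * The waiter is queued on the summary block containing the entry, and a + * write of that block is launched if one is not already in progress.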
+ * + * @param summaryZone The SlabSummaryZone for the zone of the slab + * @param waiter The waiter that is updating the summary + * @param slabNumber The slab number to update + * @param tailBlockOffset The offset of slab journal's tail block + * @param loadRefCounts Whether the refCounts must be loaded from the layer + * on the next load + * @param isClean Whether the slab is clean + * @param freeBlocks The number of free blocks + **/ +void updateSlabSummaryEntry(SlabSummaryZone *summaryZone, + Waiter *waiter, + SlabCount slabNumber, + TailBlockOffset tailBlockOffset, + bool loadRefCounts, + bool isClean, + BlockCount freeBlocks); + +/** + * Get the stored tail block offset for a slab. + * + * @param summaryZone The SlabSummaryZone to use + * @param slabNumber The slab number to get the offset for + * + * @return The tail block offset for the slab + **/ +TailBlockOffset getSummarizedTailBlockOffset(SlabSummaryZone *summaryZone, + SlabCount slabNumber) + __attribute__((warn_unused_result)); + +/** + * Whether refCounts must be loaded from the layer. + * + * @param summaryZone The SlabSummaryZone to use + * @param slabNumber The slab number to get information for + * + * @return Whether refCounts must be loaded + **/ +bool mustLoadRefCounts(SlabSummaryZone *summaryZone, SlabCount slabNumber) + __attribute__((warn_unused_result)); + +/** + * Get the stored cleanliness information for a single slab. + * + * @param summaryZone The SlabSummaryZone to use + * @param slabNumber The slab number to get information for + * + * @return Whether the slab is clean + **/ +bool getSummarizedCleanliness(SlabSummaryZone *summaryZone, + SlabCount slabNumber) + __attribute__((warn_unused_result)); + +/** + * Get the stored emptiness information for a single slab. + * + * @param summaryZone The SlabSummaryZone to use + * @param slabNumber The slab number to get information for + * + * @return An approximation to the free blocks in the slab + **/ +BlockCount getSummarizedFreeBlockCount(SlabSummaryZone *summaryZone, + SlabCount slabNumber) + __attribute__((warn_unused_result)); + +/** + * Get the stored RefCounts state information for a single slab. Used + * in testing only. + * + * @param [in] summaryZone The SlabSummaryZone to use + * @param [in] slabNumber The slab number to get information for + * @param [out] freeBlockHint The approximate number of free blocks + * @param [out] isClean Whether the slab is clean + **/ +void getSummarizedRefCountsState(SlabSummaryZone *summaryZone, + SlabCount slabNumber, + size_t *freeBlockHint, + bool *isClean); + +/** + * Get the stored slab statuses for all slabs in a zone. + * + * @param [in] summaryZone The SlabSummaryZone to use + * @param [in] slabCount The number of slabs to fetch + * @param [in,out] statuses An array of SlabStatuses to populate + **/ +void getSummarizedSlabStatuses(SlabSummaryZone *summaryZone, + SlabCount slabCount, + SlabStatus *statuses); + +/** + * Set the origin of the slab summary relative to the physical layer. + * + * @param summary The SlabSummary to update + * @param partition The slab summary partition + **/ +void setSlabSummaryOrigin(SlabSummary *summary, Partition *partition); + +/** + * Read in all the slab summary data from the slab summary partition, + * combine all the previously used zones into a single zone, and then + * write the combined summary back out to each possible zones' summary + * region. 
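+ * When formatting or loading for rebuild, the in-memory summary is written + * out as-is rather than being read from disk first.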
+ * + * @param summary The summary to load + * @param operation The type of load to perform + * @param zonesToCombine The number of zones to be combined; if set to 0, + * all of the summary will be initialized as new. + * @param parent The parent of this operation + **/ +void loadSlabSummary(SlabSummary *summary, + AdminStateCode operation, + ZoneCount zonesToCombine, + VDOCompletion *parent); + +/** + * Fetch the cumulative statistics for all slab summary zones in a summary. + * + * @param summary The summary in question + * + * @return the cumulative slab summary statistics for the summary + **/ +SlabSummaryStatistics getSlabSummaryStatistics(const SlabSummary *summary) + __attribute__((warn_unused_result)); + +#endif // SLAB_SUMMARY_H diff --git a/vdo/base/slabSummaryInternals.h b/vdo/base/slabSummaryInternals.h new file mode 100644 index 0000000..8ac071c --- /dev/null +++ b/vdo/base/slabSummaryInternals.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabSummaryInternals.h#7 $ + */ + +#ifndef SLAB_SUMMARY_INTERNALS_H +#define SLAB_SUMMARY_INTERNALS_H + +#include "slabSummary.h" + +#include "adminState.h" +#include "atomic.h" + +typedef struct slabSummaryEntry { + /** Bits 7..0: The offset of the tail block within the slab journal */ + TailBlockOffset tailBlockOffset; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + /** Bits 13..8: A hint about the fullness of the slab */ + unsigned int fullnessHint : 6; + /** Bit 14: Whether the refCounts must be loaded from the layer */ + unsigned int loadRefCounts : 1; + /** Bit 15: The believed cleanliness of this slab */ + unsigned int isDirty : 1; +#else + /** Bit 15: The believed cleanliness of this slab */ + unsigned int isDirty : 1; + /** Bit 14: Whether the refCounts must be loaded from the layer */ + unsigned int loadRefCounts : 1; + /** Bits 13..8: A hint about the fullness of the slab */ + unsigned int fullnessHint : 6; +#endif +} __attribute__((packed)) SlabSummaryEntry; + +typedef struct slabSummaryBlock { + /** The zone to which this block belongs */ + SlabSummaryZone *zone; + /** The index of this block in its zone's summary */ + BlockCount index; + /** Whether this block has a write outstanding */ + bool writing; + /** Ring of updates waiting on the outstanding write */ + WaitQueue currentUpdateWaiters; + /** Ring of updates waiting on the next write */ + WaitQueue nextUpdateWaiters; + /** The active SlabSummaryEntry array for this block */ + SlabSummaryEntry *entries; + /** The VIO used to write this block */ + VIO *vio; + /** The packed entries, one block long, backing the VIO */ + char *outgoingEntries; +} SlabSummaryBlock; + +/** + * The statistics for all the slab summary zones owned by this slab summary. 
+ * These fields are all mutated only by their physical zone threads, but are + * read by other threads when gathering statistics for the entire depot. + **/ +typedef struct atomicSlabSummaryStatistics { + /** Number of blocks written */ + Atomic64 blocksWritten; +} AtomicSlabSummaryStatistics; + +struct slabSummaryZone { + /** The summary of which this is a zone */ + SlabSummary *summary; + /** The number of this zone */ + ZoneCount zoneNumber; + /** Count of the number of blocks currently out for writing */ + BlockCount writeCount; + /** The state of this zone */ + AdminState state; + /** The array (owned by the blocks) of all entries */ + SlabSummaryEntry *entries; + /** The array of SlabSummaryEntryBlocks */ + SlabSummaryBlock summaryBlocks[]; +}; + +struct slabSummary { + /** The context for entering read-only mode */ + ReadOnlyNotifier *readOnlyNotifier; + /** The statistics for this slab summary */ + AtomicSlabSummaryStatistics statistics; + /** The start of the slab summary partition relative to the layer */ + PhysicalBlockNumber origin; + /** The number of bits to shift to get a 7-bit fullness hint */ + unsigned int hintShift; + /** The number of blocks (calculated based on MAX_SLABS) */ + BlockCount blocksPerZone; + /** The number of slabs per block (calculated from block size) */ + SlabCount entriesPerBlock; + /** The entries for all of the zones the partition can hold */ + SlabSummaryEntry *entries; + /** The number of zones which were active at the time of the last update */ + ZoneCount zonesToCombine; + /** The current number of active zones */ + ZoneCount zoneCount; + /** The currently active zones */ + SlabSummaryZone *zones[]; +}; + +/** + * Treating the current entries buffer as the on-disk value of all zones, + * update every zone to the correct values for every slab. + * + * @param summary The summary whose entries should be combined + **/ +void combineZones(SlabSummary *summary); + +#endif // SLAB_SUMMARY_INTERNALS_H diff --git a/vdo/base/statistics.h b/vdo/base/statistics.h new file mode 100644 index 0000000..2511076 --- /dev/null +++ b/vdo/base/statistics.h @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef STATISTICS_H +#define STATISTICS_H + +#include "header.h" +#include "types.h" + +enum { + STATISTICS_VERSION = 31, +}; + +typedef struct { + /** The total number of slabs from which blocks may be allocated */ + uint64_t slabCount; + /** The total number of slabs from which blocks have ever been allocated */ + uint64_t slabsOpened; + /** The number of times since loading that a slab has been re-opened */ + uint64_t slabsReopened; +} BlockAllocatorStatistics; + +/** + * Counters for tracking the number of items written (blocks, requests, etc.) + * that keep track of totals at steps in the write pipeline. 
Three counters + * allow the number of buffered, in-memory items and the number of in-flight, + * unacknowledged writes to be derived, while still tracking totals for + * reporting purposes + **/ +typedef struct { + /** The total number of items on which processing has started */ + uint64_t started; + /** The total number of items for which a write operation has been issued */ + uint64_t written; + /** The total number of items for which a write operation has completed */ + uint64_t committed; +} CommitStatistics; + +/** Counters for events in the recovery journal */ +typedef struct { + /** Number of times the on-disk journal was full */ + uint64_t diskFull; + /** Number of times the recovery journal requested slab journal commits. */ + uint64_t slabJournalCommitsRequested; + /** Write/Commit totals for individual journal entries */ + CommitStatistics entries; + /** Write/Commit totals for journal blocks */ + CommitStatistics blocks; +} RecoveryJournalStatistics; + +/** The statistics for the compressed block packer. */ +typedef struct { + /** Number of compressed data items written since startup */ + uint64_t compressedFragmentsWritten; + /** Number of blocks containing compressed items written since startup */ + uint64_t compressedBlocksWritten; + /** Number of VIOs that are pending in the packer */ + uint64_t compressedFragmentsInPacker; +} PackerStatistics; + +/** The statistics for the slab journals. */ +typedef struct { + /** Number of times the on-disk journal was full */ + uint64_t diskFullCount; + /** Number of times an entry was added over the flush threshold */ + uint64_t flushCount; + /** Number of times an entry was added over the block threshold */ + uint64_t blockedCount; + /** Number of times a tail block was written */ + uint64_t blocksWritten; + /** Number of times we had to wait for the tail to write */ + uint64_t tailBusyCount; +} SlabJournalStatistics; + +/** The statistics for the slab summary. */ +typedef struct { + /** Number of blocks written */ + uint64_t blocksWritten; +} SlabSummaryStatistics; + +/** The statistics for the reference counts. */ +typedef struct { + /** Number of reference blocks written */ + uint64_t blocksWritten; +} RefCountsStatistics; + +/** The statistics for the block map. 
 */
+typedef struct {
+  /** number of dirty (resident) pages */
+  uint32_t dirtyPages;
+  /** number of clean (resident) pages */
+  uint32_t cleanPages;
+  /** number of free pages */
+  uint32_t freePages;
+  /** number of pages in failed state */
+  uint32_t failedPages;
+  /** number of pages incoming */
+  uint32_t incomingPages;
+  /** number of pages outgoing */
+  uint32_t outgoingPages;
+  /** how many times free page not avail */
+  uint32_t cachePressure;
+  /** number of getVDOPageAsync() for read */
+  uint64_t readCount;
+  /** number of getVDOPageAsync() for write */
+  uint64_t writeCount;
+  /** number of times pages failed to read */
+  uint64_t failedReads;
+  /** number of times pages failed to write */
+  uint64_t failedWrites;
+  /** number of gets that are reclaimed */
+  uint64_t reclaimed;
+  /** number of gets for outgoing pages */
+  uint64_t readOutgoing;
+  /** number of gets that were already there */
+  uint64_t foundInCache;
+  /** number of gets requiring discard */
+  uint64_t discardRequired;
+  /** number of gets enqueued for their page */
+  uint64_t waitForPage;
+  /** number of gets that have to fetch */
+  uint64_t fetchRequired;
+  /** number of page fetches */
+  uint64_t pagesLoaded;
+  /** number of page saves */
+  uint64_t pagesSaved;
+  /** the number of flushes issued */
+  uint64_t flushCount;
+} BlockMapStatistics;
+
+/** The dedupe statistics from hash locks */
+typedef struct {
+  /** Number of times the UDS advice proved correct */
+  uint64_t dedupeAdviceValid;
+  /** Number of times the UDS advice proved incorrect */
+  uint64_t dedupeAdviceStale;
+  /** Number of writes with the same data as another in-flight write */
+  uint64_t concurrentDataMatches;
+  /** Number of writes whose hash collided with an in-flight write */
+  uint64_t concurrentHashCollisions;
+} HashLockStatistics;
+
+/** Counts of error conditions in VDO. */
+typedef struct {
+  /** number of times VDO got an invalid dedupe advice PBN from UDS */
+  uint64_t invalidAdvicePBNCount;
+  /** number of times a VIO completed with a VDO_NO_SPACE error */
+  uint64_t noSpaceErrorCount;
+  /** number of times a VIO completed with a VDO_READ_ONLY error */
+  uint64_t readOnlyErrorCount;
+} ErrorStatistics;
+
+/** The statistics of the vdo service.
*/ +struct vdoStatistics { + uint32_t version; + uint32_t releaseVersion; + /** Number of blocks used for data */ + uint64_t dataBlocksUsed; + /** Number of blocks used for VDO metadata */ + uint64_t overheadBlocksUsed; + /** Number of logical blocks that are currently mapped to physical blocks */ + uint64_t logicalBlocksUsed; + /** number of physical blocks */ + BlockCount physicalBlocks; + /** number of logical blocks */ + BlockCount logicalBlocks; + /** Size of the block map page cache, in bytes */ + uint64_t blockMapCacheSize; + /** String describing the active write policy of the VDO */ + char writePolicy[15]; + /** The physical block size */ + uint64_t blockSize; + /** Number of times the VDO has successfully recovered */ + uint64_t completeRecoveries; + /** Number of times the VDO has recovered from read-only mode */ + uint64_t readOnlyRecoveries; + /** String describing the operating mode of the VDO */ + char mode[15]; + /** Whether the VDO is in recovery mode */ + bool inRecoveryMode; + /** What percentage of recovery mode work has been completed */ + uint8_t recoveryPercentage; + /** The statistics for the compressed block packer */ + PackerStatistics packer; + /** Counters for events in the block allocator */ + BlockAllocatorStatistics allocator; + /** Counters for events in the recovery journal */ + RecoveryJournalStatistics journal; + /** The statistics for the slab journals */ + SlabJournalStatistics slabJournal; + /** The statistics for the slab summary */ + SlabSummaryStatistics slabSummary; + /** The statistics for the reference counts */ + RefCountsStatistics refCounts; + /** The statistics for the block map */ + BlockMapStatistics blockMap; + /** The dedupe statistics from hash locks */ + HashLockStatistics hashLock; + /** Counts of error conditions */ + ErrorStatistics errors; +}; + +/** + * Get the proc file path for reading VDOStatistics. + * + * @return The proc file path + **/ +static inline const char *getVDOStatisticsProcFile(void) { + return "dedupe_stats"; +} + +#endif /* not STATISTICS_H */ diff --git a/vdo/base/statusCodes.c b/vdo/base/statusCodes.c new file mode 100644 index 0000000..40be3fd --- /dev/null +++ b/vdo/base/statusCodes.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/statusCodes.c#3 $ + */ + +#include "statusCodes.h" + +#include "errors.h" +#include "permassert.h" +#include "threadOnce.h" + +const struct errorInfo vdoStatusList[] = { + { "VDO_NOT_IMPLEMENTED", "Not implemented" }, + { "VDO_OUT_OF_RANGE", "Out of range" }, + { "VDO_REF_COUNT_INVALID", "Reference count would become invalid" }, + { "VDO_NO_SPACE", "Out of space" }, + { "VDO_UNEXPECTED_EOF", "Unexpected EOF on block read" }, + { "VDO_BAD_CONFIGURATION", "Bad configuration option" }, + { "VDO_SOCKET_ERROR", "Socket error" }, + { "VDO_BAD_ALIGNMENT", "Mis-aligned block reference" }, + { "VDO_COMPONENT_BUSY", "Prior operation still in progress" }, + { "VDO_BAD_PAGE", "Corrupt or incorrect page" }, + { "VDO_UNSUPPORTED_VERSION", "Unsupported component version" }, + { "VDO_INCORRECT_COMPONENT", "Component id mismatch in decoder" }, + { "VDO_PARAMETER_MISMATCH", "Parameters have conflicting values" }, + { "VDO_BLOCK_SIZE_TOO_SMALL", "The block size is too small" }, + { "VDO_UNKNOWN_PARTITION", "No partition exists with a given id" }, + { "VDO_PARTITION_EXISTS", "A partition already exists with a given id"}, + { "VDO_NOT_READ_ONLY", "The device is not in read-only mode" }, + { "VDO_INCREMENT_TOO_SMALL", "Physical block growth of too few blocks" }, + { "VDO_CHECKSUM_MISMATCH", "Incorrect checksum" }, + { "VDO_RECOVERY_JOURNAL_FULL", "The recovery journal is full" }, + { "VDO_LOCK_ERROR", "A lock is held incorrectly" }, + { "VDO_READ_ONLY", "The device is in read-only mode" }, + { "VDO_SHUTTING_DOWN", "The device is shutting down" }, + { "VDO_CORRUPT_JOURNAL", "Recovery journal entries corrupted" }, + { "VDO_TOO_MANY_SLABS", "Exceeds maximum number of slabs supported" }, + { "VDO_INVALID_FRAGMENT", "Compressed block fragment is invalid" }, + { "VDO_RETRY_AFTER_REBUILD", "Retry operation after rebuilding finishes" }, + { "VDO_UNKNOWN_COMMAND", "The extended command is not known" }, + { "VDO_COMMAND_ERROR", "Bad extended command parameters" }, + { "VDO_CANNOT_DETERMINE_SIZE", "Cannot determine config sizes to fit" }, + { "VDO_BAD_MAPPING", "Invalid page mapping" }, + { "VDO_READ_CACHE_BUSY", "Read cache has no free slots" }, + { "VDO_BIO_CREATION_FAILED", "Bio creation failed" }, + { "VDO_BAD_MAGIC", "Bad magic number" }, + { "VDO_BAD_NONCE", "Bad nonce" }, + { "VDO_JOURNAL_OVERFLOW", "Journal sequence number overflow" }, + { "VDO_INVALID_ADMIN_STATE", "Invalid operation for current state" }, +}; + +#ifndef __KERNEL__ +static OnceState vdoStatusCodesRegistered = ONCE_STATE_INITIALIZER; +static int statusCodeRegistrationResult; + +/**********************************************************************/ +static void doStatusCodeRegistration(void) +{ + STATIC_ASSERT((VDO_STATUS_CODE_LAST - VDO_STATUS_CODE_BASE) + == COUNT_OF(vdoStatusList)); + + int result = registerErrorBlock("VDO Status", + VDO_STATUS_CODE_BASE, + VDO_STATUS_CODE_BLOCK_END, + vdoStatusList, + sizeof(vdoStatusList)); + /* + * The following test handles cases where libvdo is statically linked + * against both the test modules and the test driver (because multiple + * instances of this module call their own copy of this function + * once each, resulting in multiple calls to registerErrorBlock which + * is shared in libuds). + */ + if (result == UDS_DUPLICATE_NAME) { + result = UDS_SUCCESS; + } + + statusCodeRegistrationResult + = (result == UDS_SUCCESS) ? 
VDO_SUCCESS : result; +} +#endif + +/**********************************************************************/ +int registerStatusCodes(void) +{ +#ifdef __KERNEL__ + return VDO_SUCCESS; +#else + performOnce(&vdoStatusCodesRegistered, doStatusCodeRegistration); + return statusCodeRegistrationResult; +#endif +} diff --git a/vdo/base/statusCodes.h b/vdo/base/statusCodes.h new file mode 100644 index 0000000..dd3a3ff --- /dev/null +++ b/vdo/base/statusCodes.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/statusCodes.h#2 $ + */ + +#ifndef STATUS_CODES_H +#define STATUS_CODES_H + +#include "errors.h" + +enum { + UDS_BLOCK_SIZE = UDS_ERROR_CODE_BLOCK_END - UDS_ERROR_CODE_BASE, + VDO_BLOCK_START = UDS_ERROR_CODE_BLOCK_END, + VDO_BLOCK_END = VDO_BLOCK_START + UDS_BLOCK_SIZE, + PRP_BLOCK_START = VDO_BLOCK_END, + PRP_BLOCK_END = PRP_BLOCK_START + UDS_BLOCK_SIZE, +}; + +/** + * VDO-specific status codes. + **/ +enum vdoStatusCodes { + /** successful result */ + VDO_SUCCESS = 0, + /** base of all VDO errors */ + VDO_STATUS_CODE_BASE = VDO_BLOCK_START, + /** we haven't written this yet */ + VDO_NOT_IMPLEMENTED = VDO_STATUS_CODE_BASE, + /** input out of range */ + VDO_OUT_OF_RANGE, + /** an invalid reference count would result */ + VDO_REF_COUNT_INVALID, + /** a free block could not be allocated */ + VDO_NO_SPACE, + /** unexpected EOF on block read */ + VDO_UNEXPECTED_EOF, + /** improper or missing configuration option */ + VDO_BAD_CONFIGURATION, + /** socket opening or binding problem */ + VDO_SOCKET_ERROR, + /** read or write on non-aligned offset */ + VDO_BAD_ALIGNMENT, + /** prior operation still in progress */ + VDO_COMPONENT_BUSY, + /** page contents incorrect or corrupt data */ + VDO_BAD_PAGE, + /** unsupported version of some component */ + VDO_UNSUPPORTED_VERSION, + /** component id mismatch in decoder */ + VDO_INCORRECT_COMPONENT, + /** parameters have conflicting values */ + VDO_PARAMETER_MISMATCH, + /** the block size is too small */ + VDO_BLOCK_SIZE_TOO_SMALL, + /** no partition exists with a given id */ + VDO_UNKNOWN_PARTITION, + /** a partition already exists with a given id */ + VDO_PARTITION_EXISTS, + /** the VDO is not in read-only mode */ + VDO_NOT_READ_ONLY, + /** physical block growth of too few blocks */ + VDO_INCREMENT_TOO_SMALL, + /** incorrect checksum */ + VDO_CHECKSUM_MISMATCH, + /** the recovery journal is full */ + VDO_RECOVERY_JOURNAL_FULL, + /** a lock is held incorrectly */ + VDO_LOCK_ERROR, + /** the VDO is in read-only mode */ + VDO_READ_ONLY, + /** the VDO is shutting down */ + VDO_SHUTTING_DOWN, + /** the recovery journal has corrupt entries */ + VDO_CORRUPT_JOURNAL, + /** exceeds maximum number of slabs supported */ + VDO_TOO_MANY_SLABS, + /** a compressed block fragment is 
invalid */ + VDO_INVALID_FRAGMENT, + /** action is unsupported while rebuilding */ + VDO_RETRY_AFTER_REBUILD, + /** the extended command is not known */ + VDO_UNKNOWN_COMMAND, + /** bad extended command parameters */ + VDO_COMMAND_ERROR, + /** cannot determine sizes to fit */ + VDO_CANNOT_DETERMINE_SIZE, + /** a block map entry is invalid */ + VDO_BAD_MAPPING, + /** read cache has no free slots */ + VDO_READ_CACHE_BUSY, + /** bio_add_page failed */ + VDO_BIO_CREATION_FAILED, + /** bad magic number */ + VDO_BAD_MAGIC, + /** bad nonce */ + VDO_BAD_NONCE, + /** sequence number overflow */ + VDO_JOURNAL_OVERFLOW, + /** the VDO is not in a state to perform an admin operation */ + VDO_INVALID_ADMIN_STATE, + /** one more than last error code */ + VDO_STATUS_CODE_LAST, + VDO_STATUS_CODE_BLOCK_END = VDO_BLOCK_END +}; + +extern const struct errorInfo vdoStatusList[]; + +/** + * Register the VDO status codes if needed. + * + * @return a success or error code + **/ +int registerStatusCodes(void); + +#endif // STATUS_CODES_H diff --git a/vdo/base/superBlock.c b/vdo/base/superBlock.c new file mode 100644 index 0000000..a7376e9 --- /dev/null +++ b/vdo/base/superBlock.c @@ -0,0 +1,441 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/superBlock.c#5 $ + */ + +#include "superBlock.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "completion.h" +#include "constants.h" +#include "header.h" +#include "releaseVersions.h" +#include "statusCodes.h" +#include "types.h" +#include "vio.h" + +struct superBlock { + /** The parent for asynchronous load and save operations */ + VDOCompletion *parent; + /** The VIO for reading and writing the super block to disk */ + VIO *vio; + /** The buffer for encoding and decoding component data */ + Buffer *componentBuffer; + /** + * A sector-sized buffer wrapping the first sector of encodedSuperBlock, for + * encoding and decoding the entire super block. + **/ + Buffer *blockBuffer; + /** A 1-block buffer holding the encoded on-disk super block */ + byte *encodedSuperBlock; + /** The release version number loaded from the volume */ + ReleaseVersionNumber loadedReleaseVersion; + /** Whether this super block may not be written */ + bool unwriteable; +}; + +enum { + SUPER_BLOCK_FIXED_SIZE + = ENCODED_HEADER_SIZE + sizeof(ReleaseVersionNumber) + CHECKSUM_SIZE, + MAX_COMPONENT_DATA_SIZE = VDO_SECTOR_SIZE - SUPER_BLOCK_FIXED_SIZE, +}; + +static const Header SUPER_BLOCK_HEADER_12_0 = { + .id = SUPER_BLOCK, + .version = { + .majorVersion = 12, + .minorVersion = 0, + }, + + // This is the minimum size, if the super block contains no components. + .size = SUPER_BLOCK_FIXED_SIZE - ENCODED_HEADER_SIZE, +}; + +/** + * Allocate a super block. 
Callers must free the allocated super block even + * on error. + * + * @param layer The physical layer which holds the super block on disk + * @param superBlockPtr A pointer to hold the new super block + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int allocateSuperBlock(PhysicalLayer *layer, SuperBlock **superBlockPtr) +{ + int result = ALLOCATE(1, SuperBlock, __func__, superBlockPtr); + if (result != UDS_SUCCESS) { + return result; + } + + SuperBlock *superBlock = *superBlockPtr; + result = makeBuffer(MAX_COMPONENT_DATA_SIZE, &superBlock->componentBuffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = layer->allocateIOBuffer(layer, VDO_BLOCK_SIZE, + "encoded super block", + (char **) &superBlock->encodedSuperBlock); + if (result != UDS_SUCCESS) { + return result; + } + + // Even though the buffer is a full block, to avoid the potential corruption + // from a torn write, the entire encoding must fit in the first sector. + result = wrapBuffer(superBlock->encodedSuperBlock, VDO_SECTOR_SIZE, 0, + &superBlock->blockBuffer); + if (result != UDS_SUCCESS) { + return result; + } + + if (layer->createMetadataVIO == NULL) { + return VDO_SUCCESS; + } + + return createVIO(layer, VIO_TYPE_SUPER_BLOCK, VIO_PRIORITY_METADATA, + superBlock, (char *) superBlock->encodedSuperBlock, + &superBlock->vio); +} + +/**********************************************************************/ +int makeSuperBlock(PhysicalLayer *layer, SuperBlock **superBlockPtr) +{ + SuperBlock *superBlock; + int result = allocateSuperBlock(layer, &superBlock); + if (result != VDO_SUCCESS) { + freeSuperBlock(&superBlock); + return result; + } + + // For a new super block, use the current release. + superBlock->loadedReleaseVersion = CURRENT_RELEASE_VERSION_NUMBER; + *superBlockPtr = superBlock; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeSuperBlock(SuperBlock **superBlockPtr) +{ + if (*superBlockPtr == NULL) { + return; + } + + SuperBlock *superBlock = *superBlockPtr; + freeBuffer(&superBlock->blockBuffer); + freeBuffer(&superBlock->componentBuffer); + freeVIO(&superBlock->vio); + FREE(superBlock->encodedSuperBlock); + FREE(superBlock); + *superBlockPtr = NULL; +} + +/** + * Encode a super block into its on-disk representation. + * + * @param layer The physical layer which implements the checksum + * @param superBlock The super block to encode + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int encodeSuperBlock(PhysicalLayer *layer, SuperBlock *superBlock) +{ + Buffer *buffer = superBlock->blockBuffer; + int result = resetBufferEnd(buffer, 0); + if (result != VDO_SUCCESS) { + return result; + } + + size_t componentDataSize = contentLength(superBlock->componentBuffer); + + // Encode the header. + Header header = SUPER_BLOCK_HEADER_12_0; + header.size += componentDataSize; + result = encodeHeader(&header, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + // Encode the loaded release version. + result = putUInt32LEIntoBuffer(buffer, superBlock->loadedReleaseVersion); + if (result != UDS_SUCCESS) { + return result; + } + + // Copy the already-encoded component data. + result = putBytes(buffer, componentDataSize, + getBufferContents(superBlock->componentBuffer)); + if (result != UDS_SUCCESS) { + return result; + } + + // Compute and encode the checksum. 
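+  // Layout note (derived from the encoding above, not a separate format
+  // spec): the first sector holds the encoded header, the 32-bit loaded
+  // release version, the component data, and finally a 32-bit CRC over
+  // everything that precedes it; SUPER_BLOCK_FIXED_SIZE accounts for all
+  // of this except the component data.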
+ CRC32Checksum checksum = layer->updateCRC32(INITIAL_CHECKSUM, + superBlock->encodedSuperBlock, + contentLength(buffer)); + result = putUInt32LEIntoBuffer(buffer, checksum); + if (result != UDS_SUCCESS) { + return result; + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int saveSuperBlock(PhysicalLayer *layer, + SuperBlock *superBlock, + PhysicalBlockNumber superBlockOffset) +{ + int result = encodeSuperBlock(layer, superBlock); + if (result != VDO_SUCCESS) { + return result; + } + + return layer->writer(layer, superBlockOffset, 1, + (char *) superBlock->encodedSuperBlock, NULL); +} + +/** + * Finish the parent of a super block load or save operation. This + * callback is registered in saveSuperBlockAsync() and loadSuperBlockAsync. + * + * @param completion The super block VIO + **/ +static void finishSuperBlockParent(VDOCompletion *completion) +{ + SuperBlock *superBlock = completion->parent; + VDOCompletion *parent = superBlock->parent; + superBlock->parent = NULL; + finishCompletion(parent, completion->result); +} + +/** + * Log a super block save error. This error handler is registered in + * saveSuperBlockAsync(). + * + * @param completion The super block VIO + **/ +static void handleSaveError(VDOCompletion *completion) +{ + logErrorWithStringError(completion->result, "super block save failed"); + /* + * Mark the super block as unwritable so that we won't attempt to write it + * again. This avoids the case where a growth attempt fails writing the + * super block with the new size, but the subsequent attempt to write out + * the read-only state succeeds. In this case, writes which happened just + * before the suspend would not be visible if the VDO is restarted without + * rebuilding, but, after a read-only rebuild, the effects of those writes + * would reappear. + */ + ((SuperBlock *) completion->parent)->unwriteable = true; + completion->callback(completion); +} + +/**********************************************************************/ +void saveSuperBlockAsync(SuperBlock *superBlock, + PhysicalBlockNumber superBlockOffset, + VDOCompletion *parent) +{ + if (superBlock->unwriteable) { + finishCompletion(parent, VDO_READ_ONLY); + return; + } + + if (superBlock->parent != NULL) { + finishCompletion(parent, VDO_COMPONENT_BUSY); + return; + } + + PhysicalLayer *layer = parent->layer; + int result = encodeSuperBlock(layer, superBlock); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + superBlock->parent = parent; + superBlock->vio->completion.callbackThreadID = parent->callbackThreadID; + launchWriteMetadataVIOWithFlush(superBlock->vio, superBlockOffset, + finishSuperBlockParent, handleSaveError, + true, true); +} + +/** + * Decode a super block from its on-disk representation. + * + * @param layer The physical layer which implements the checksum + * @param superBlock The super block to decode + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int decodeSuperBlock(PhysicalLayer *layer, SuperBlock *superBlock) +{ + // Reset the block buffer to start decoding the entire first sector. + Buffer *buffer = superBlock->blockBuffer; + clearBuffer(buffer); + + // Decode and validate the header. 
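+  // Decoding mirrors encodeSuperBlock(): the header, then the 32-bit
+  // release version, then the component data, and finally the saved
+  // checksum, which is verified against a CRC of every preceding byte
+  // in the sector.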
+ Header header; + int result = decodeHeader(buffer, &header); + if (result != VDO_SUCCESS) { + return result; + } + + result = validateHeader(&SUPER_BLOCK_HEADER_12_0, &header, false, __func__); + if (result != VDO_SUCCESS) { + return result; + } + + if (header.size > contentLength(buffer)) { + // We can't check release version or checksum until we know the content + // size, so we have to assume a version mismatch on unexpected values. + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "super block contents too large: %zu", + header.size); + } + + // Restrict the buffer to the actual payload bytes that remain. + result = resetBufferEnd(buffer, uncompactedAmount(buffer) + header.size); + if (result != VDO_SUCCESS) { + return result; + } + + // Decode and store the release version number. It will be checked when the + // VDO master version is decoded and validated. + result = getUInt32LEFromBuffer(buffer, &superBlock->loadedReleaseVersion); + if (result != VDO_SUCCESS) { + return result; + } + + // The component data is all the rest, except for the checksum. + size_t componentDataSize = contentLength(buffer) - sizeof(CRC32Checksum); + result = putBuffer(superBlock->componentBuffer, buffer, componentDataSize); + if (result != VDO_SUCCESS) { + return result; + } + + // Checksum everything up to but not including the saved checksum itself. + CRC32Checksum checksum = layer->updateCRC32(INITIAL_CHECKSUM, + superBlock->encodedSuperBlock, + uncompactedAmount(buffer)); + + // Decode and verify the saved checksum. + CRC32Checksum savedChecksum; + result = getUInt32LEFromBuffer(buffer, &savedChecksum); + if (result != VDO_SUCCESS) { + return result; + } + + result = ASSERT(contentLength(buffer) == 0, + "must have decoded entire superblock payload"); + if (result != VDO_SUCCESS) { + return result; + } + + return ((checksum != savedChecksum) ? VDO_CHECKSUM_MISMATCH : VDO_SUCCESS); +} + +/**********************************************************************/ +int loadSuperBlock(PhysicalLayer *layer, + PhysicalBlockNumber superBlockOffset, + SuperBlock **superBlockPtr) +{ + SuperBlock *superBlock = NULL; + int result = allocateSuperBlock(layer, &superBlock); + if (result != VDO_SUCCESS) { + freeSuperBlock(&superBlock); + return result; + } + + result = layer->reader(layer, superBlockOffset, 1, + (char *) superBlock->encodedSuperBlock, NULL); + if (result != VDO_SUCCESS) { + freeSuperBlock(&superBlock); + return result; + } + + result = decodeSuperBlock(layer, superBlock); + if (result != VDO_SUCCESS) { + freeSuperBlock(&superBlock); + return result; + } + + *superBlockPtr = superBlock; + return result; +} + +/** + * Continue after loading the super block. This callback is registered + * in loadSuperBlockAsync(). 
+ * + * @param completion The super block VIO + **/ +static void finishReadingSuperBlock(VDOCompletion *completion) +{ + SuperBlock *superBlock = completion->parent; + VDOCompletion *parent = superBlock->parent; + superBlock->parent = NULL; + finishCompletion(parent, decodeSuperBlock(completion->layer, superBlock)); +} + +/**********************************************************************/ +void loadSuperBlockAsync(VDOCompletion *parent, + PhysicalBlockNumber superBlockOffset, + SuperBlock **superBlockPtr) +{ + PhysicalLayer *layer = parent->layer; + SuperBlock *superBlock = NULL; + int result = allocateSuperBlock(layer, &superBlock); + if (result != VDO_SUCCESS) { + freeSuperBlock(&superBlock); + finishCompletion(parent, result); + return; + } + + *superBlockPtr = superBlock; + + superBlock->parent = parent; + superBlock->vio->completion.callbackThreadID = parent->callbackThreadID; + launchReadMetadataVIO(superBlock->vio, superBlockOffset, + finishReadingSuperBlock, finishSuperBlockParent); +} + +/**********************************************************************/ +Buffer *getComponentBuffer(SuperBlock *superBlock) +{ + return superBlock->componentBuffer; +} + +/**********************************************************************/ +ReleaseVersionNumber getLoadedReleaseVersion(const SuperBlock *superBlock) +{ + return superBlock->loadedReleaseVersion; +} + +/**********************************************************************/ +size_t getFixedSuperBlockSize(void) +{ + return SUPER_BLOCK_FIXED_SIZE; +} diff --git a/vdo/base/superBlock.h b/vdo/base/superBlock.h new file mode 100644 index 0000000..bfed7c6 --- /dev/null +++ b/vdo/base/superBlock.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/superBlock.h#2 $ + */ + +#ifndef SUPER_BLOCK_H +#define SUPER_BLOCK_H + +#include "buffer.h" + +#include "completion.h" +#include "types.h" + +typedef struct superBlock SuperBlock; + +/** + * Make a new super block. + * + * @param [in] layer The layer on which to write this super block + * @param [out] superBlockPtr A pointer to hold the new super block + * + * @return VDO_SUCCESS or an error + **/ +int makeSuperBlock(PhysicalLayer *layer, SuperBlock **superBlockPtr) + __attribute__((warn_unused_result)); + +/** + * Free a super block and null out the reference to it. + * + * @param superBlockPtr the reference to the super block to free + **/ +void freeSuperBlock(SuperBlock **superBlockPtr); + +/** + * Save a super block. 
+ * + * @param layer The physical layer on which to save the super block + * @param superBlock The super block to save + * @param superBlockOffset The location of the super block + * + * @return VDO_SUCCESS or an error + **/ +int saveSuperBlock(PhysicalLayer *layer, + SuperBlock *superBlock, + PhysicalBlockNumber superBlockOffset) + __attribute__((warn_unused_result)); + +/** + * Save a super block asynchronously. + * + * @param superBlock The super block to save + * @param superBlockOffset The location at which to write the super block + * @param parent The object to notify when the save is complete + **/ +void saveSuperBlockAsync(SuperBlock *superBlock, + PhysicalBlockNumber superBlockOffset, + VDOCompletion *parent); + +/** + * Allocate a super block and read its contents from storage. + * + * @param [in] layer The layer from which to load the super block + * @param [in] superBlockOffset The location from which to read the super + * block + * @param [out] superBlockPtr A pointer to hold the loaded super block + * + * @return VDO_SUCCESS or an error + **/ +int loadSuperBlock(PhysicalLayer *layer, + PhysicalBlockNumber superBlockOffset, + SuperBlock **superBlockPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate a super block and read its contents from storage asynchronously. If + * a load error occurs before the super block's own completion can be allocated, + * the parent will be finished with the error. + * + * @param [in] parent The completion to finish after loading the + * super block + * @param [in] superBlockOffset The location from which to read the super + * block + * @param [out] superBlockPtr A pointer to hold the super block + **/ +void loadSuperBlockAsync(VDOCompletion *parent, + PhysicalBlockNumber superBlockOffset, + SuperBlock **superBlockPtr); + +/** + * Get a buffer which contains the component data from a super block. + * + * @param superBlock The super block from which to get the component data + * + * @return the component data in a buffer + **/ +Buffer *getComponentBuffer(SuperBlock *superBlock) + __attribute__((warn_unused_result)); + +/** + * Get the release version number that was loaded from the volume when the + * SuperBlock was decoded. + * + * @param superBlock The super block to query + * + * @return the release version number that was decoded from the volume + **/ +ReleaseVersionNumber getLoadedReleaseVersion(const SuperBlock *superBlock) + __attribute__((warn_unused_result)); + +/** + * Get the encoded size of the fixed (non-component data) portion of a super + * block (this is for unit testing). + * + * @return The encoded size of the fixed portion of the super block + **/ +size_t getFixedSuperBlockSize(void) + __attribute__((warn_unused_result)); + +#endif /* SUPER_BLOCK_H */ diff --git a/vdo/base/threadConfig.c b/vdo/base/threadConfig.c new file mode 100644 index 0000000..b671b73 --- /dev/null +++ b/vdo/base/threadConfig.c @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/threadConfig.c#2 $ + */ + +#include "threadConfig.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "constants.h" +#include "types.h" + +/**********************************************************************/ +static int allocateThreadConfig(ZoneCount logicalZoneCount, + ZoneCount physicalZoneCount, + ZoneCount hashZoneCount, + ZoneCount baseThreadCount, + ThreadConfig **configPtr) +{ + ThreadConfig *config; + int result = ALLOCATE(1, ThreadConfig, "thread config", &config); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(logicalZoneCount, ThreadID, "logical thread array", + &config->logicalThreads); + if (result != VDO_SUCCESS) { + freeThreadConfig(&config); + return result; + } + + result = ALLOCATE(physicalZoneCount, ThreadID, "physical thread array", + &config->physicalThreads); + if (result != VDO_SUCCESS) { + freeThreadConfig(&config); + return result; + } + + result = ALLOCATE(hashZoneCount, ThreadID, "hash thread array", + &config->hashZoneThreads); + if (result != VDO_SUCCESS) { + freeThreadConfig(&config); + return result; + } + + config->logicalZoneCount = logicalZoneCount; + config->physicalZoneCount = physicalZoneCount; + config->hashZoneCount = hashZoneCount; + config->baseThreadCount = baseThreadCount; + + *configPtr = config; + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void assignThreadIDs(ThreadID threadIDs[], + ZoneCount count, + ThreadID *idPtr) +{ + for (ZoneCount zone = 0; zone < count; zone++) { + threadIDs[zone] = (*idPtr)++; + } +} + +/**********************************************************************/ +int makeThreadConfig(ZoneCount logicalZoneCount, + ZoneCount physicalZoneCount, + ZoneCount hashZoneCount, + ThreadConfig **configPtr) +{ + if ((logicalZoneCount == 0) + && (physicalZoneCount == 0) + && (hashZoneCount == 0)) { + return makeOneThreadConfig(configPtr); + } + + if (physicalZoneCount > MAX_PHYSICAL_ZONES) { + return logErrorWithStringError(VDO_BAD_CONFIGURATION, + "Physical zone count %u exceeds maximum " + "(%u)", + physicalZoneCount, MAX_PHYSICAL_ZONES); + } + + if (logicalZoneCount > MAX_LOGICAL_ZONES) { + return logErrorWithStringError(VDO_BAD_CONFIGURATION, + "Logical zone count %u exceeds maximum " + "(%u)", + logicalZoneCount, MAX_LOGICAL_ZONES); + } + + ThreadConfig *config; + ThreadCount total = logicalZoneCount + physicalZoneCount + hashZoneCount + 2; + int result = allocateThreadConfig(logicalZoneCount, physicalZoneCount, + hashZoneCount, total, &config); + if (result != VDO_SUCCESS) { + return result; + } + + ThreadID id = 0; + config->adminThread = id; + config->journalThread = id++; + config->packerThread = id++; + assignThreadIDs(config->logicalThreads, logicalZoneCount, &id); + assignThreadIDs(config->physicalThreads, physicalZoneCount, &id); + assignThreadIDs(config->hashZoneThreads, hashZoneCount, &id); + + ASSERT_LOG_ONLY(id == total, "correct number of thread IDs assigned"); + + *configPtr = config; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeZeroThreadConfig(ThreadConfig **configPtr) +{ + ThreadConfig *config; + int result = ALLOCATE(1, ThreadConfig, __func__, &config); + if (result 
!= VDO_SUCCESS) { + return result; + } + + config->logicalZoneCount = 0; + config->physicalZoneCount = 0; + config->hashZoneCount = 0; + config->baseThreadCount = 0; + *configPtr = config; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeOneThreadConfig(ThreadConfig **configPtr) +{ + ThreadConfig *config; + int result = allocateThreadConfig(1, 1, 1, 1, &config); + if (result != VDO_SUCCESS) { + return result; + } + + config->logicalThreads[0] = 0; + config->physicalThreads[0] = 0; + config->hashZoneThreads[0] = 0; + *configPtr = config; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int copyThreadConfig(const ThreadConfig *oldConfig, ThreadConfig **configPtr) +{ + ThreadConfig *config; + int result = allocateThreadConfig(oldConfig->logicalZoneCount, + oldConfig->physicalZoneCount, + oldConfig->hashZoneCount, + oldConfig->baseThreadCount, + &config); + if (result != VDO_SUCCESS) { + return result; + } + + config->adminThread = oldConfig->adminThread; + config->journalThread = oldConfig->journalThread; + config->packerThread = oldConfig->packerThread; + for (ZoneCount i = 0; i < config->logicalZoneCount; i++) { + config->logicalThreads[i] = oldConfig->logicalThreads[i]; + } + for (ZoneCount i = 0; i < config->physicalZoneCount; i++) { + config->physicalThreads[i] = oldConfig->physicalThreads[i]; + } + for (ZoneCount i = 0; i < config->hashZoneCount; i++) { + config->hashZoneThreads[i] = oldConfig->hashZoneThreads[i]; + } + + *configPtr = config; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeThreadConfig(ThreadConfig **configPtr) +{ + if (*configPtr == NULL) { + return; + } + + ThreadConfig *config = *configPtr; + *configPtr = NULL; + + FREE(config->logicalThreads); + FREE(config->physicalThreads); + FREE(config->hashZoneThreads); + FREE(config); +} + +/**********************************************************************/ +static bool getZoneThreadName(const ThreadID threadIDs[], + ZoneCount count, + ThreadID id, + const char *prefix, + char *buffer, + size_t bufferLength) +{ + if (id >= threadIDs[0]) { + ThreadID index = id - threadIDs[0]; + if (index < count) { + snprintf(buffer, bufferLength, "%s%d", prefix, index); + return true; + } + } + return false; +} + +/**********************************************************************/ +void getVDOThreadName(const ThreadConfig *threadConfig, + ThreadID threadID, + char *buffer, + size_t bufferLength) +{ + if (threadConfig->baseThreadCount == 1) { + // Historically this was the "request queue" thread. + snprintf(buffer, bufferLength, "reqQ"); + return; + } + if (threadID == threadConfig->journalThread) { + snprintf(buffer, bufferLength, "journalQ"); + return; + } else if (threadID == threadConfig->adminThread) { + // Theoretically this could be different from the journal thread. 
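+    // With makeThreadConfig() as written above, adminThread equals
+    // journalThread, so the journalQ case wins when they coincide; this
+    // branch only applies to a configuration with a distinct admin thread.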
+ snprintf(buffer, bufferLength, "adminQ"); + return; + } else if (threadID == threadConfig->packerThread) { + snprintf(buffer, bufferLength, "packerQ"); + return; + } + if (getZoneThreadName(threadConfig->logicalThreads, + threadConfig->logicalZoneCount, + threadID, "logQ", buffer, bufferLength)) { + return; + } + if (getZoneThreadName(threadConfig->physicalThreads, + threadConfig->physicalZoneCount, + threadID, "physQ", buffer, bufferLength)) { + return; + } + if (getZoneThreadName(threadConfig->hashZoneThreads, + threadConfig->hashZoneCount, + threadID, "hashQ", buffer, bufferLength)) { + return; + } + + // Some sort of misconfiguration? + snprintf(buffer, bufferLength, "reqQ%d", threadID); +} diff --git a/vdo/base/threadConfig.h b/vdo/base/threadConfig.h new file mode 100644 index 0000000..6401651 --- /dev/null +++ b/vdo/base/threadConfig.h @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/threadConfig.h#1 $ + */ + +#ifndef THREAD_CONFIG_H +#define THREAD_CONFIG_H + +#include "permassert.h" + +#include "types.h" + +struct threadConfig { + ZoneCount logicalZoneCount; + ZoneCount physicalZoneCount; + ZoneCount hashZoneCount; + ThreadCount baseThreadCount; + ThreadID adminThread; + ThreadID journalThread; + ThreadID packerThread; + ThreadID *logicalThreads; + ThreadID *physicalThreads; + ThreadID *hashZoneThreads; +}; + +/** + * Make a thread configuration. If both the logical zone count and the + * physical zone count are set to 0, a one thread configuration will be + * made. + * + * @param [in] logicalZoneCount The number of logical zones + * @param [in] physicalZoneCount The number of physical zones + * @param [in] hashZoneCount The number of hash zones + * @param [out] configPtr A pointer to hold the new thread + * configuration + * + * @return VDO_SUCCESS or an error + **/ +int makeThreadConfig(ZoneCount logicalZoneCount, + ZoneCount physicalZoneCount, + ZoneCount hashZoneCount, + ThreadConfig **configPtr) + __attribute__((warn_unused_result)); + +/** + * Make a thread configuration that uses no threads. This is the configuration + * for VDOs which are constructed from user mode that have only a synchronous + * layer. + * + * @param [out] configPtr A pointer to hold the new thread configuration + * + * @return VDO_SUCCESS or an error + **/ +int makeZeroThreadConfig(ThreadConfig **configPtr); + +/** + * Make a thread configuration that uses only one thread. + * + * @param [out] configPtr A pointer to hold the new thread configuration + * + * @return VDO_SUCCESS or an error + **/ +int makeOneThreadConfig(ThreadConfig **configPtr) + __attribute__((warn_unused_result)); + +/** + * Make a new thread config which is a copy of an existing one. 
+ * + * @param [in] oldConfig The thread configuration to copy + * @param [out] configPtr A pointer to hold the new thread configuration + * + * @return VDO_SUCCESS or an error + **/ +int copyThreadConfig(const ThreadConfig *oldConfig, ThreadConfig **configPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a thread configuration and null out the reference to it. + * + * @param configPtr The reference to the thread configuration to destroy + **/ +void freeThreadConfig(ThreadConfig **configPtr); + +/** + * Get the thread id for a given logical zone. + * + * @param threadConfig the thread config + * @param logicalZone the number of the logical zone + * + * @return the thread id for the given zone + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getLogicalZoneThread(const ThreadConfig *threadConfig, + ZoneCount logicalZone) +{ + ASSERT_LOG_ONLY((logicalZone <= threadConfig->logicalZoneCount), + "logical zone valid"); + return threadConfig->logicalThreads[logicalZone]; +} + +/** + * Get the thread id for a given physical zone. + * + * @param threadConfig the thread config + * @param physicalZone the number of the physical zone + * + * @return the thread id for the given zone + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getPhysicalZoneThread(const ThreadConfig *threadConfig, + ZoneCount physicalZone) +{ + ASSERT_LOG_ONLY((physicalZone <= threadConfig->physicalZoneCount), + "physical zone valid"); + return threadConfig->physicalThreads[physicalZone]; +} + +/** + * Get the thread id for a given hash zone. + * + * @param threadConfig the thread config + * @param hashZone the number of the hash zone + * + * @return the thread id for the given zone + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getHashZoneThread(const ThreadConfig *threadConfig, + ZoneCount hashZone) +{ + ASSERT_LOG_ONLY((hashZone <= threadConfig->hashZoneCount), + "hash zone valid"); + return threadConfig->hashZoneThreads[hashZone]; +} + +/** + * Get the thread id for the journal zone. + * + * @param threadConfig the thread config + * + * @return the thread id for the journal zone + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getJournalZoneThread(const ThreadConfig *threadConfig) +{ + return threadConfig->journalThread; +} + +/** + * Get the thread id for the packer zone. + * + * @param threadConfig the thread config + * + * @return the thread id for the packer zone + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getPackerZoneThread(const ThreadConfig *threadConfig) +{ + return threadConfig->packerThread; +} + +/** + * Get the thread ID for admin requests. + * + * @param threadConfig The thread config + * + * @return the thread id to use for admin requests + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getAdminThread(const ThreadConfig *threadConfig) +{ + return threadConfig->adminThread; +} + +/** + * Format the name of the worker thread desired to support a given + * work queue. The physical layer may add a prefix identifying the + * product; the output from this function should just identify the + * thread. 
+ * + * @param threadConfig The thread configuration + * @param threadID The thread id + * @param buffer Where to put the formatted name + * @param bufferLength Size of the output buffer + **/ +void getVDOThreadName(const ThreadConfig *threadConfig, + ThreadID threadID, + char *buffer, + size_t bufferLength); + +#endif /* THREAD_CONFIG_H */ diff --git a/vdo/base/trace.c b/vdo/base/trace.c new file mode 100644 index 0000000..7b4e33f --- /dev/null +++ b/vdo/base/trace.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/trace.c#1 $ + */ + +#include "trace.h" + +#include "logger.h" +#include "stringUtils.h" +#include "timeUtils.h" + +TRACE_LOCATION_SECTION TraceLocationRecord baseTraceLocation[] = { + { + .function = "", + .line = 0, + }, +}; + +/**********************************************************************/ +void addTraceRecord(Trace *trace, TraceLocation location) +{ + if (trace->used < NUM_TRACE_RECORDS) { + TraceRecord *record = &trace->records[trace->used]; + trace->used++; + + record->when = nowUsec(); + record->tid = getThreadId(); + record->location = location - baseTraceLocation; + } +} + +/* + * The record display format used is a comma-separated list, each item + * containing: optional function name; "@" + timestamp with seconds + * and microseconds for the first record; if not the first record, "+" + * and offset in microseconds from previous timestamp. + * + * If the buffer's too small, it'll end with an ellipsis. + */ +void formatTrace(Trace *trace, + char *buffer, + size_t bufferLength, + size_t *msgLen) +{ + if (trace == NULL) { + return; + } + memset(buffer, 0, bufferLength); + char *buf = buffer; + char *bufferEnd = buffer + bufferLength - 1; + if (trace->used > 0) { + TraceRecord *record = &trace->records[0]; + TraceLocationRecord *location = baseTraceLocation + record->location; + snprintf(buf, bufferEnd - buf, "Trace[%s@%llu.%06llu", + location->function, record->when / 1000000, + record->when % 1000000); + buf += strlen(buf); + + for (unsigned int i = 1; i < trace->used; i++) { + TraceRecord *prev = record; + record++; + + snprintf(buf, bufferEnd - buf, ","); + buf += strlen(buf); + + location = baseTraceLocation + record->location; + unsigned long timeDiff = record->when - prev->when; + snprintf(buf, bufferEnd - buf, "%s+%lu", + location->function, timeDiff); + buf += strlen(buf); + } + if (bufferLength > 7) { + if (buffer[bufferLength-5] != '\0') { + // too long + strcpy(buffer+bufferLength-5, "...]"); + } else { + strcpy(buf, "]"); + } + } + } + *msgLen = (buf - buffer); +} diff --git a/vdo/base/trace.h b/vdo/base/trace.h new file mode 100644 index 0000000..59dabf9 --- /dev/null +++ b/vdo/base/trace.h @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/trace.h#1 $ + */ + +#ifndef TRACE_H +#define TRACE_H + +#ifndef __KERNEL__ +#include "cpu.h" +#endif + +#include "threads.h" + +/* + * We need these records to be glued together with no intervening + * bytes. That makes it rather sensitive to how the compiler, + * assembler, and linker may add padding. Force extra alignment to + * make it more reliable. + * + * Trace point descriptor language: + * + * The descriptor string provided at a trace point can have one or + * more components, separated by ";". The first (or only) component is + * a string to be formatted and shown in the flowchart graph. The + * remaining components must be of the form "var=string", and assign + * string values to "variables" that last through the processing of + * the remainder of the current trace being read. + * + * The string displayed has variable substitutions done for any + * occurrences of "$var" in the string. + * + * So, the descriptor sequence: + * kvdoWriteVIO;io=writeData;j=normal + * submitBio($io) + * writeJournalBlock($j) + * would cause the graph generator to show the strings: + * kvdoWriteVIO + * submitBio(writeData) + * writeJournalBlock(normal) + * + * Substitutions are done in the variable assignment strings when + * they're processed, so "foo=x($bar)" sets "foo" using the current + * value of "bar"; it doesn't cause "bar" to be looked up when "$foo" + * is seen later. + * + * The variable named "F" is automatically updated with the name of + * the function associated with the descriptor, so you don't have to + * explicitly repeat the name of the function if you just want to + * augment it with more information. This may be desirable if a trace + * point is expected to be reached more than once at different stages + * of processing, or in a static function with a generic-sounding name + * that needs disambiguation for graphing. + * + * If no descriptor string is provided, the + * function:lineNumber:threadName string reported via systemtap will + * be used in the graph. + * + * Current variable names used: + * cb=(various) random info to log when enqueueing VIO callback + * dup=post,update deduplication operation + * io=(various) kind of I/O and data it's being done on + * j=normal,dedupe kind of journal update being done + * js=mapWrite,writeZero,unmap which step of journaling we're doing + */ +typedef const struct __attribute__((aligned(16))) traceLocationRecord { + const char *function; + int line; + const char *description; +} TraceLocationRecord; + +/* + * With well under 100 locations defined at the moment, even with no + * idea where &baseTraceLocation will fall relative to the others, we + * only need to support a range of -100..+100. 
+ */ +typedef int32_t TraceLocationNumber; + +/* The type to pass around */ +typedef TraceLocationRecord *TraceLocation; + +/* + * N.B.: This code uses GCC extensions to create static, initialized + * objects inline, describing the current function and line number. + * The objects are collected into a table we can index with small + * signed integers relative to &baseTraceLocation. + * + * We need baseTraceLocation because there's no standard way to get + * the address of the start of this array we're defining. And because + * we're not playing any (additional) special linker tricks to ensure + * ordering of the object files, the offsets may be signed, and we + * don't know the range beyond the fact that we don't have hundreds of + * these records lying around. + * + * By specifying a name that starts with neither .data nor .rodata, we + * leave it to the toolchain to pick a location for us, based on + * things like whether the section needs write access, which it does + * for a PIC library but not for a kernel module. + */ + +#define TRACE_LOCATION_SECTION \ + __attribute__((section(".kvdo_trace_locations"))) + +extern TRACE_LOCATION_SECTION TraceLocationRecord baseTraceLocation[]; + +#define TRACE_JOIN2(a,b) a##b +#define TRACE_JOIN(a,b) TRACE_JOIN2(a,b) +#define THIS_LOCATION(DESCRIPTION) \ + __extension__ \ + ({ \ + static TRACE_LOCATION_SECTION \ + TraceLocationRecord TRACE_JOIN(loc,__LINE__) = { \ + .function = __func__, \ + .line = __LINE__, \ + .description = DESCRIPTION, \ + }; \ + &TRACE_JOIN(loc,__LINE__); \ + }) + +typedef struct traceRecord { + uint64_t when; // counted in usec + pid_t tid; + TraceLocationNumber location; +} TraceRecord; + +enum { NUM_TRACE_RECORDS = 71 }; + +typedef struct trace { + unsigned int used; + TraceRecord records[NUM_TRACE_RECORDS]; +} Trace; + +/** + * Store a new record in the trace data. + * + * @param trace The trace data to be updated + * @param location The source-location descriptor to be recorded + **/ +void addTraceRecord(Trace *trace, TraceLocation location); + +/** + * Format trace data into a string for logging. + * + * @param [in] trace The trace data to be logged + * @param [in] buffer The buffer in which to store the string + * @param [in] bufferLength Length of the buffer + * @param [out] msgLen Length of the formatted string + **/ +void formatTrace(Trace *trace, + char *buffer, + size_t bufferLength, + size_t *msgLen); + +#endif /* TRACE_H */ diff --git a/vdo/base/types.h b/vdo/base/types.h new file mode 100644 index 0000000..d820da6 --- /dev/null +++ b/vdo/base/types.h @@ -0,0 +1,445 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/types.h#14 $ + */ + +#ifndef TYPES_H +#define TYPES_H + +#include "blockMappingState.h" +#include "common.h" +#include "statusCodes.h" + +/** + * A size type in blocks. + **/ +typedef uint64_t BlockCount; + +/** + * The size of a block. + **/ +typedef uint16_t BlockSize; + +/** + * A count of compressed fragments + **/ +typedef uint8_t CompressedFragmentCount; + +/** + * A CRC-32 checksum + **/ +typedef uint32_t CRC32Checksum; + +/** + * A height within a tree. + **/ +typedef uint8_t Height; + +/** + * The logical block number as used by the consumer. + **/ +typedef uint64_t LogicalBlockNumber; + +/** + * The type of the nonce used to identify instances of VDO. + **/ +typedef uint64_t Nonce; + +/** + * A size in pages. + **/ +typedef uint32_t PageCount; + +/** + * A page number. + **/ +typedef uint32_t PageNumber; + +/** + * The size of a page. Must be evenly divisible by block size. + **/ +typedef uint32_t PageSize; + +/** + * The physical (well, less logical) block number at which the block is found + * on the underlying device. + **/ +typedef uint64_t PhysicalBlockNumber; + +/** + * A release version number. These numbers are used to make the numbering + * space for component versions independent across release branches. + * + * Really an enum, but we have to specify the size for encoding; see + * releaseVersions.h for the enumeration values. + **/ +typedef uint32_t ReleaseVersionNumber; + +/** + * A count of tree roots. + **/ +typedef uint8_t RootCount; + +/** + * A number of sectors. + **/ +typedef uint8_t SectorCount; + +/** + * A sequence number. + **/ +typedef uint64_t SequenceNumber; + +/** + * A size type in slabs. + **/ +typedef uint16_t SlabCount; + +/** + * A slot in a bin or block map page. + **/ +typedef uint16_t SlotNumber; + +/** + * A number of VIOs. + **/ +typedef uint16_t VIOCount; + +/** + * A VDO thread configuration. + **/ +typedef struct threadConfig ThreadConfig; + +/** + * A thread counter + **/ +typedef uint8_t ThreadCount; + +/** + * A thread ID + * + * Base-code threads are numbered sequentially starting from 0. + **/ +typedef uint8_t ThreadID; + +/** + * The thread ID returned when the current base code thread ID cannot be found + * or is otherwise undefined. + **/ +static const ThreadID INVALID_THREAD_ID = (ThreadID) -1; + +/** + * A zone counter + **/ +typedef uint8_t ZoneCount; + +/** + * The type of request a VIO is performing + **/ +typedef enum __attribute__((packed)) vioOperation { + VIO_UNSPECIFIED_OPERATION = 0, + VIO_READ = 1, + VIO_WRITE = 2, + VIO_READ_MODIFY_WRITE = VIO_READ | VIO_WRITE, + VIO_READ_WRITE_MASK = VIO_READ_MODIFY_WRITE, + VIO_FLUSH_BEFORE = 4, + VIO_FLUSH_AFTER = 8, +} VIOOperation; + +/** + * VIO types for statistics and instrumentation. + **/ +typedef enum __attribute__((packed)) { + VIO_TYPE_UNINITIALIZED = 0, + VIO_TYPE_DATA, + VIO_TYPE_BLOCK_ALLOCATOR, + VIO_TYPE_BLOCK_MAP, + VIO_TYPE_BLOCK_MAP_INTERIOR, + VIO_TYPE_COMPRESSED_BLOCK, + VIO_TYPE_PARTITION_COPY, + VIO_TYPE_RECOVERY_JOURNAL, + VIO_TYPE_SLAB_JOURNAL, + VIO_TYPE_SLAB_SUMMARY, + VIO_TYPE_SUPER_BLOCK, + VIO_TYPE_TEST, +} VIOType; + +/** + * The current operation on a physical block (from the point of view of the + * recovery journal, slab journals, and reference counts. + **/ +typedef enum __attribute__((packed)) { + DATA_DECREMENT = 0, + DATA_INCREMENT = 1, + BLOCK_MAP_DECREMENT = 2, + BLOCK_MAP_INCREMENT = 3, +} JournalOperation; + +/** + * Partition IDs are encoded in the volume layout in the super block. 
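+ *
+ * Since these values end up on disk, changing an existing ID would break
+ * existing volumes; new partitions need new, previously unused values.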
+ **/ +typedef enum __attribute__((packed)) { + BLOCK_MAP_PARTITION = 0, + BLOCK_ALLOCATOR_PARTITION = 1, + RECOVERY_JOURNAL_PARTITION = 2, + SLAB_SUMMARY_PARTITION = 3, +} PartitionID; + +/** + * Check whether a VIOType is for servicing an external data request. + * + * @param vioType The VIOType to check + **/ +static inline bool isDataVIOType(VIOType vioType) +{ + return (vioType == VIO_TYPE_DATA); +} + +/** + * Check whether a VIOType is for compressed block writes + * + * @param vioType The VIOType to check + **/ +static inline bool isCompressedWriteVIOType(VIOType vioType) +{ + return (vioType == VIO_TYPE_COMPRESSED_BLOCK); +} + +/** + * Check whether a VIOType is for metadata + * + * @param vioType The VIOType to check + **/ +static inline bool isMetadataVIOType(VIOType vioType) +{ + return ((vioType != VIO_TYPE_UNINITIALIZED) + && !isDataVIOType(vioType) + && !isCompressedWriteVIOType(vioType)); +} + +/** + * Priority levels for asynchronous I/O operations performed on a VIO. + **/ +typedef enum __attribute__((packed)) vioPriority { + VIO_PRIORITY_LOW = 0, + VIO_PRIORITY_DATA = VIO_PRIORITY_LOW, + VIO_PRIORITY_COMPRESSED_DATA = VIO_PRIORITY_DATA, + VIO_PRIORITY_METADATA, + VIO_PRIORITY_HIGH, +} VIOPriority; + +/** + * Metadata types for the VDO. + **/ +typedef enum __attribute__((packed)) { + VDO_METADATA_RECOVERY_JOURNAL = 1, + VDO_METADATA_SLAB_JOURNAL, +} VDOMetadataType; + +/** + * The possible write policy values. + **/ +typedef enum { + WRITE_POLICY_SYNC, ///< All writes are synchronous, i. e., they + ///< are acknowledged only when the data is + ///< written to stable storage. + WRITE_POLICY_ASYNC, ///< Writes are acknowledged when the data is + ///< cached for writing to stable storage, subject + ///< to resiliency guarantees specified elsewhere. + ///< After a crash, the data will be either old or + ///< new value for unflushed writes, never garbage. + WRITE_POLICY_ASYNC_UNSAFE, ///< Writes are acknowledged when the data is + ///< cached for writing to stable storage, subject + ///< to resiliency guarantees specified elsewhere. + WRITE_POLICY_AUTO, ///< The appropriate policy is chosen based on the + ///< underlying device +} WritePolicy; + +typedef enum { + ZONE_TYPE_ADMIN, + ZONE_TYPE_JOURNAL, + ZONE_TYPE_LOGICAL, + ZONE_TYPE_PHYSICAL, +} ZoneType; + +/** + * A position in the block map where a block map entry is stored. + **/ +typedef struct { + PhysicalBlockNumber pbn; + SlotNumber slot; +} BlockMapSlot; + +/** + * A position in the arboreal block map at a specific level. + **/ +typedef struct { + PageNumber pageIndex; + BlockMapSlot blockMapSlot; +} BlockMapTreeSlot; + +/** + * The configuration of a single slab derived from the configured block size + * and slab size. + **/ +typedef struct slabConfig { + BlockCount slabBlocks; ///< total number of blocks in the slab + BlockCount dataBlocks; ///< number of blocks available for data + BlockCount referenceCountBlocks; ///< number of blocks for refCounts + BlockCount slabJournalBlocks; ///< number of blocks for the slab journal + /** + * Number of blocks after which the slab journal starts pushing out a + * ReferenceBlock for each new entry it receives. + **/ + BlockCount slabJournalFlushingThreshold; + /** + * Number of blocks after which the slab journal pushes out all + * ReferenceBlocks and makes all VIOs wait. + **/ + BlockCount slabJournalBlockingThreshold; + /** + * Number of blocks after which the slab must be scrubbed before coming + * online. 
+ **/ + BlockCount slabJournalScrubbingThreshold; +} __attribute__((packed)) SlabConfig; + +/** + * The configuration of the VDO service. + **/ +typedef struct vdoConfig { + BlockCount logicalBlocks; ///< number of logical blocks + BlockCount physicalBlocks; ///< number of physical blocks + BlockCount slabSize; ///< number of blocks in a slab + BlockCount recoveryJournalSize; ///< number of recovery journal blocks + BlockCount slabJournalBlocks; ///< number of slab journal blocks +} __attribute__((packed)) VDOConfig; + +/** + * The configuration parameters of the VDO service specified at load time. + **/ +typedef struct vdoLoadConfig { + /** the offset on the physical layer where the VDO begins */ + PhysicalBlockNumber firstBlockOffset; + /** the expected release version number of the VDO */ + ReleaseVersionNumber releaseVersion; + /** the expected nonce of the VDO */ + Nonce nonce; + /** the thread configuration of the VDO */ + ThreadConfig *threadConfig; + /** the page cache size, in pages */ + PageCount cacheSize; + /** whether writes are synchronous */ + WritePolicy writePolicy; + /** the maximum age of a dirty block map page in recovery journal blocks */ + BlockCount maximumAge; +} VDOLoadConfig; + +/** + * Forward declarations of abstract types + **/ +typedef struct actionManager ActionManager; +typedef struct allocatingVIO AllocatingVIO; +typedef struct allocationSelector AllocationSelector; +typedef struct blockAllocator BlockAllocator; +typedef struct blockMap BlockMap; +typedef struct blockMapTreeZone BlockMapTreeZone; +typedef struct blockMapZone BlockMapZone; +typedef struct dataVIO DataVIO; +typedef struct flusher Flusher; +typedef struct forest Forest; +typedef struct hashLock HashLock; +typedef struct hashZone HashZone; +typedef struct indexConfig IndexConfig; +typedef struct inputBin InputBin; +typedef struct lbnLock LBNLock; +typedef struct lockCounter LockCounter; +typedef struct logicalZone LogicalZone; +typedef struct logicalZones LogicalZones; +typedef struct pbnLock PBNLock; +typedef struct physicalLayer PhysicalLayer; +typedef struct physicalZone PhysicalZone; +typedef struct recoveryJournal RecoveryJournal; +typedef struct readOnlyNotifier ReadOnlyNotifier; +typedef struct refCounts RefCounts; +typedef struct vdoSlab Slab; +typedef struct slabDepot SlabDepot; +typedef struct slabJournal SlabJournal; +typedef struct slabJournalEntry SlabJournalEntry; +typedef struct slabScrubber SlabScrubber; +typedef struct slabSummary SlabSummary; +typedef struct slabSummaryZone SlabSummaryZone; +typedef struct vdo VDO; +typedef struct vdoCompletion VDOCompletion; +typedef struct vdoExtent VDOExtent; +typedef struct vdoFlush VDOFlush; +typedef struct vdoLayout VDOLayout; +typedef struct vdoStatistics VDOStatistics; +typedef struct vio VIO; +typedef struct vioPool VIOPool; + +typedef struct { + PhysicalBlockNumber pbn; + BlockMappingState state; +} DataLocation; + +typedef struct { + PhysicalBlockNumber pbn; + BlockMappingState state; + PhysicalZone *zone; +} ZonedPBN; + +/** + * Callback which will be called by the VDO when all of the VIOs in the + * extent have been processed. + * + * @param extent The extent which is complete + **/ +typedef void VDOExtentCallback(VDOExtent *extent); + +/** + * An asynchronous operation. + * + * @param vio The VIO on which to operate + **/ +typedef void AsyncOperation(VIO *vio); + +/** + * An asynchronous compressed write operation. 
+ * + * @param allocatingVIO The AllocatingVIO to write + **/ +typedef void CompressedWriter(AllocatingVIO *allocatingVIO); + +/** + * An asynchronous data operation. + * + * @param dataVIO The DataVIO on which to operate + **/ +typedef void AsyncDataOperation(DataVIO *dataVIO); + +/** + * A reference to a completion which (the reference) can be enqueued + * for completion on a specified thread. + **/ +typedef struct enqueueable { + VDOCompletion *completion; +} Enqueueable; + +#endif // TYPES_H diff --git a/vdo/base/upgrade.c b/vdo/base/upgrade.c new file mode 100644 index 0000000..4d58d6f --- /dev/null +++ b/vdo/base/upgrade.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/upgrade.c#6 $ + */ + +#include "upgrade.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "blockMap.h" +#include "readOnlyNotifier.h" +#include "recoveryJournal.h" +#include "releaseVersions.h" +#include "slabDepot.h" +#include "statusCodes.h" +#include "superBlock.h" +#include "vdoInternal.h" +#include "volumeGeometry.h" + +/* The latest supported Sodium version */ +/* Commented out because not currently used. + * static const VersionNumber SODIUM_MASTER_VERSION_67_0 = { + * .majorVersion = 67, + * .minorVersion = 0, + * }; + */ + +/* The component data version for current Sodium */ +static const VersionNumber SODIUM_COMPONENT_DATA_41_0 = { + .majorVersion = 41, + .minorVersion = 0, +}; + +/** + * Current Sodium's configuration of the VDO component. + **/ +typedef struct { + VDOState state; + uint64_t completeRecoveries; + uint64_t readOnlyRecoveries; + VDOConfig config; + Nonce nonce; +} __attribute__((packed)) SodiumComponent41_0; + +/** + * Checks whether the release version loaded in the superblock is the + * current VDO version. + * + * @param vdo The VDO to validate + * + * @return true if the release version number is the current version + **/ +static bool isCurrentReleaseVersion(VDO *vdo) +{ + ReleaseVersionNumber loadedVersion + = getLoadedReleaseVersion(vdo->superBlock); + + return (loadedVersion == CURRENT_RELEASE_VERSION_NUMBER); +} + +/** + * Loads the VDO master version into the VDO and checks that the version + * can be understood by VDO. 
+ * + * @param vdo The VDO to validate + * + * @return VDO_SUCCESS or an error if the loaded version is not supported + **/ +static int validateSodiumVersion(VDO *vdo) +{ + int result = decodeVDOVersion(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + if (isCurrentReleaseVersion(vdo)) { + return VDO_SUCCESS; + } + + ReleaseVersionNumber loadedVersion + = getLoadedReleaseVersion(vdo->superBlock); + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "Release version %d, load version %d.%d" + " cannot be upgraded", loadedVersion, + vdo->loadVersion.majorVersion, + vdo->loadVersion.minorVersion); +} + +/** + * Decode a SodiumComponent41_0. + * + * @param buffer The component data buffer + * @param component The component structure to decode into + * + * @return VDO_SUCCESS or an error code + **/ +static int decodeSodium41_0Component(Buffer *buffer, + SodiumComponent41_0 *component) +{ + return getBytesFromBuffer(buffer, sizeof(*component), component); +} + +/** + * Decode the component data for the VDO itself from the component data + * buffer in the super block. + * + * @param vdo The VDO to decode + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int decodeSodiumComponent(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + VersionNumber version; + int result = decodeVersionNumber(buffer, &version); + if (result != VDO_SUCCESS) { + return result; + } + + SodiumComponent41_0 component; + if (areSameVersion(SODIUM_COMPONENT_DATA_41_0, version)) { + result = decodeSodium41_0Component(buffer, &component); + } else { + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "VDO component data version mismatch," + " expected 41.0, got %d.%d", + version.majorVersion, + version.minorVersion); + } + if (result != VDO_SUCCESS) { + return result; + } + + // Copy the decoded component into the VDO structure. 
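+  // Note that decodeSodium41_0Component() above read the packed structure as
+  // raw bytes rather than field by field, so this path relies on the on-disk
+  // encoding matching the packed in-memory layout of SodiumComponent41_0.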
+ vdo->state = component.state; + vdo->loadState = component.state; + vdo->completeRecoveries = component.completeRecoveries; + vdo->readOnlyRecoveries = component.readOnlyRecoveries; + vdo->config = component.config; + vdo->nonce = component.nonce; + + logInfo("Converted VDO component data version %d.%d", + version.majorVersion, version.minorVersion); + return VDO_SUCCESS; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int finishSodiumDecode(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + const ThreadConfig *threadConfig = getThreadConfig(vdo); + int result = makeRecoveryJournal(vdo->nonce, vdo->layer, + getVDOPartition(vdo->layout, + RECOVERY_JOURNAL_PARTITION), + vdo->completeRecoveries, + vdo->config.recoveryJournalSize, + RECOVERY_JOURNAL_TAIL_BUFFER_SIZE, + vdo->readOnlyNotifier, threadConfig, + &vdo->recoveryJournal); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeSodiumRecoveryJournal(vdo->recoveryJournal, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeSodiumSlabDepot(buffer, threadConfig, vdo->nonce, vdo->layer, + getVDOPartition(vdo->layout, + SLAB_SUMMARY_PARTITION), + vdo->readOnlyNotifier, vdo->recoveryJournal, + &vdo->depot); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeSodiumBlockMap(buffer, vdo->config.logicalBlocks, + threadConfig, &vdo->blockMap); + if (result != VDO_SUCCESS) { + return result; + } + + ASSERT_LOG_ONLY((contentLength(buffer) == 0), + "All decoded component data was used"); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int upgradePriorVDO(PhysicalLayer *layer) +{ + VolumeGeometry geometry; + int result = loadVolumeGeometry(layer, &geometry); + if (result != VDO_SUCCESS) { + return result; + } + + VDO *vdo; + result = makeVDO(layer, &vdo); + if (result != VDO_SUCCESS) { + return result; + } + + result = loadSuperBlock(vdo->layer, getDataRegionOffset(geometry), + &vdo->superBlock); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return logErrorWithStringError(result, "Could not load VDO super block"); + } + + // Load the necessary pieces to save again. + result = validateSodiumVersion(vdo); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + if (isCurrentReleaseVersion(vdo)) { + logInfo("VDO already up-to-date"); + freeVDO(&vdo); + return VDO_SUCCESS; + } + + result = decodeSodiumComponent(vdo); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + if (requiresRebuild(vdo)) { + // Do not attempt to upgrade a dirty prior version. + freeVDO(&vdo); + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "Cannot upgrade a dirty VDO."); + } + + result = decodeVDOLayout(getComponentBuffer(vdo->superBlock), &vdo->layout); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + result = makeReadOnlyNotifier(inReadOnlyMode(vdo), threadConfig, vdo->layer, + &vdo->readOnlyNotifier); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + result = finishSodiumDecode(vdo); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + // Saving will automatically change the release version to current. 
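+  // (saveVDOComponents() re-encodes everything via encodeVDO() in vdo.c,
+  // which stamps the current master version, and the super block save
+  // records the current release version.)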
+ result = saveVDOComponents(vdo); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + logInfo("Successfully saved upgraded VDO"); + freeVDO(&vdo); + + return result; +} diff --git a/vdo/base/upgrade.h b/vdo/base/upgrade.h new file mode 100644 index 0000000..be2bd05 --- /dev/null +++ b/vdo/base/upgrade.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/upgrade.h#1 $ + */ + +#ifndef UPGRADE_H +#define UPGRADE_H + +#include "types.h" + +/** + * Reconfigure the superblock of a prior VDO, preparing it for upgrading. + * + * @param layer The layer with a VDO to prepare + * + * @return VDO_SUCCESS or an error + **/ +int upgradePriorVDO(PhysicalLayer *layer) + __attribute__((warn_unused_result)); + +#endif /* UPGRADE_H */ diff --git a/vdo/base/vdo.c b/vdo/base/vdo.c new file mode 100644 index 0000000..b4b9a41 --- /dev/null +++ b/vdo/base/vdo.c @@ -0,0 +1,1154 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdo.c#21 $ + */ + +/* + * This file contains the main entry points for normal operations on a VDO as + * well as functions for constructing and destroying VDO instances (in memory). + */ + +#include "vdoInternal.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminCompletion.h" +#include "blockMap.h" +#include "extent.h" +#include "hashZone.h" +#include "header.h" +#include "logicalZone.h" +#include "numUtils.h" +#include "packer.h" +#include "physicalZone.h" +#include "readOnlyNotifier.h" +#include "recoveryJournal.h" +#include "releaseVersions.h" +#include "slabDepot.h" +#include "slabSummary.h" +#include "statistics.h" +#include "statusCodes.h" +#include "threadConfig.h" +#include "vdoLayout.h" +#include "vioWrite.h" +#include "volumeGeometry.h" + +/** + * The master version of the on-disk format of a VDO. This should be + * incremented any time the on-disk representation of any VDO structure + * changes. 
Changes which require only online upgrade steps should increment + * the minor version. Changes which require an offline upgrade or which can not + * be upgraded to at all should increment the major version and set the minor + * version to 0. + **/ +static const VersionNumber VDO_MASTER_VERSION_67_0 = { + .majorVersion = 67, + .minorVersion = 0, +}; + +/** + * The current version for the data encoded in the super block. This must + * be changed any time there is a change to encoding of the component data + * of any VDO component. + **/ +static const VersionNumber VDO_COMPONENT_DATA_41_0 = { + .majorVersion = 41, + .minorVersion = 0, +}; + +/** + * This is the structure that captures the VDO fields saved as a SuperBlock + * component. + **/ +typedef struct { + VDOState state; + uint64_t completeRecoveries; + uint64_t readOnlyRecoveries; + VDOConfig config; + Nonce nonce; +} __attribute__((packed)) VDOComponent41_0; + +/**********************************************************************/ +int allocateVDO(PhysicalLayer *layer, VDO **vdoPtr) +{ + int result = registerStatusCodes(); + if (result != VDO_SUCCESS) { + return result; + } + + VDO *vdo; + result = ALLOCATE(1, VDO, __func__, &vdo); + if (result != UDS_SUCCESS) { + return result; + } + + vdo->layer = layer; + if (layer->createEnqueueable != NULL) { + result = initializeAdminCompletion(vdo, &vdo->adminCompletion); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + } + + *vdoPtr = vdo; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeVDO(PhysicalLayer *layer, VDO **vdoPtr) +{ + VDO *vdo; + int result = allocateVDO(layer, &vdo); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeZeroThreadConfig(&vdo->loadConfig.threadConfig); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + *vdoPtr = vdo; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void destroyVDO(VDO *vdo) +{ + freeFlusher(&vdo->flusher); + freePacker(&vdo->packer); + freeRecoveryJournal(&vdo->recoveryJournal); + freeSlabDepot(&vdo->depot); + freeVDOLayout(&vdo->layout); + freeSuperBlock(&vdo->superBlock); + freeBlockMap(&vdo->blockMap); + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + if (vdo->hashZones != NULL) { + for (ZoneCount zone = 0; zone < threadConfig->hashZoneCount; zone++) { + freeHashZone(&vdo->hashZones[zone]); + } + } + FREE(vdo->hashZones); + vdo->hashZones = NULL; + + freeLogicalZones(&vdo->logicalZones); + + if (vdo->physicalZones != NULL) { + for (ZoneCount zone = 0; zone < threadConfig->physicalZoneCount; zone++) { + freePhysicalZone(&vdo->physicalZones[zone]); + } + } + FREE(vdo->physicalZones); + vdo->physicalZones = NULL; + + uninitializeAdminCompletion(&vdo->adminCompletion); + freeReadOnlyNotifier(&vdo->readOnlyNotifier); + freeThreadConfig(&vdo->loadConfig.threadConfig); +} + +/**********************************************************************/ +void freeVDO(VDO **vdoPtr) +{ + if (*vdoPtr == NULL) { + return; + } + + destroyVDO(*vdoPtr); + FREE(*vdoPtr); + *vdoPtr = NULL; +} + +/**********************************************************************/ +size_t getComponentDataSize(VDO *vdo) +{ + return (sizeof(VersionNumber) + + sizeof(VersionNumber) + + sizeof(VDOComponent41_0) + + getVDOLayoutEncodedSize(vdo->layout) + + getRecoveryJournalEncodedSize() + + getSlabDepotEncodedSize() + + getBlockMapEncodedSize()); +} + +/** + * Encode the VDO master version. 
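+ *
+ * This always writes the current VDO_MASTER_VERSION_67_0, regardless of the
+ * version that was loaded.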
+ * + * @param buffer The buffer in which to encode the version + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int encodeMasterVersion(Buffer *buffer) +{ + return encodeVersionNumber(VDO_MASTER_VERSION_67_0, buffer); +} + +/** + * Encode a VDOConfig structure into a buffer. + * + * @param config The config structure to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int encodeVDOConfig(const VDOConfig *config, Buffer *buffer) +{ + int result = putUInt64LEIntoBuffer(buffer, config->logicalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->physicalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->slabSize); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->recoveryJournalSize); + if (result != VDO_SUCCESS) { + return result; + } + + return putUInt64LEIntoBuffer(buffer, config->slabJournalBlocks); +} + +/** + * Encode the component data for the VDO itself. + * + * @param vdo The vdo to encode + * @param buffer The buffer in which to encode the VDO + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int encodeVDOComponent(const VDO *vdo, Buffer *buffer) +{ + int result = encodeVersionNumber(VDO_COMPONENT_DATA_41_0, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + size_t initialLength = contentLength(buffer); + + result = putUInt32LEIntoBuffer(buffer, vdo->state); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, vdo->completeRecoveries); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, vdo->readOnlyRecoveries); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeVDOConfig(&vdo->config, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, vdo->nonce); + if (result != VDO_SUCCESS) { + return result; + } + + size_t encodedSize = contentLength(buffer) - initialLength; + return ASSERT(encodedSize == sizeof(VDOComponent41_0), + "encoded VDO component size must match structure size"); +} + +/**********************************************************************/ +static int encodeVDO(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + int result = resetBufferEnd(buffer, 0); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeMasterVersion(buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeVDOComponent(vdo, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeVDOLayout(vdo->layout, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeRecoveryJournal(vdo->recoveryJournal, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeSlabDepot(vdo->depot, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeBlockMap(vdo->blockMap, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + ASSERT_LOG_ONLY((contentLength(buffer) == getComponentDataSize(vdo)), + "All super block component data was encoded"); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int saveVDOComponents(VDO *vdo) +{ + int result = encodeVDO(vdo); + if (result != 
VDO_SUCCESS) { + return result; + } + + return saveSuperBlock(vdo->layer, vdo->superBlock, getFirstBlockOffset(vdo)); +} + +/**********************************************************************/ +void saveVDOComponentsAsync(VDO *vdo, VDOCompletion *parent) +{ + int result = encodeVDO(vdo); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + saveSuperBlockAsync(vdo->superBlock, getFirstBlockOffset(vdo), parent); +} + +/**********************************************************************/ +int saveReconfiguredVDO(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + size_t componentsSize = contentLength(buffer); + + byte *components; + int result = copyBytes(buffer, componentsSize, &components); + if (result != VDO_SUCCESS) { + return result; + } + + result = resetBufferEnd(buffer, 0); + if (result != VDO_SUCCESS) { + FREE(components); + return result; + } + + result = encodeMasterVersion(buffer); + if (result != VDO_SUCCESS) { + FREE(components); + return result; + } + + result = encodeVDOComponent(vdo, buffer); + if (result != VDO_SUCCESS) { + FREE(components); + return result; + } + + result = putBytes(buffer, componentsSize, components); + FREE(components); + if (result != VDO_SUCCESS) { + return result; + } + + return saveSuperBlock(vdo->layer, vdo->superBlock, getFirstBlockOffset(vdo)); +} + +/**********************************************************************/ +int decodeVDOVersion(VDO *vdo) +{ + return decodeVersionNumber(getComponentBuffer(vdo->superBlock), + &vdo->loadVersion); +} + +/**********************************************************************/ +int validateVDOVersion(VDO *vdo) +{ + int result = decodeVDOVersion(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + ReleaseVersionNumber loadedReleaseVersion + = getLoadedReleaseVersion(vdo->superBlock); + if (vdo->loadConfig.releaseVersion != loadedReleaseVersion) { + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "Geometry release version %" PRIu32 " does " + "not match super block release version %" + PRIu32, + vdo->loadConfig.releaseVersion, + loadedReleaseVersion); + } + + return validateVersion(VDO_MASTER_VERSION_67_0, vdo->loadVersion, "master"); +} + +/** + * Decode a VDOConfig structure from a buffer. 
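+ *
+ * The decode order must match the field order written by encodeVDOConfig()
+ * above.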
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param config The config structure to receive the decoded values + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int decodeVDOConfig(Buffer *buffer, VDOConfig *config) +{ + BlockCount logicalBlocks; + int result = getUInt64LEFromBuffer(buffer, &logicalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount physicalBlocks; + result = getUInt64LEFromBuffer(buffer, &physicalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount slabSize; + result = getUInt64LEFromBuffer(buffer, &slabSize); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount recoveryJournalSize; + result = getUInt64LEFromBuffer(buffer, &recoveryJournalSize); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount slabJournalBlocks; + result = getUInt64LEFromBuffer(buffer, &slabJournalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + + *config = (VDOConfig) { + .logicalBlocks = logicalBlocks, + .physicalBlocks = physicalBlocks, + .slabSize = slabSize, + .recoveryJournalSize = recoveryJournalSize, + .slabJournalBlocks = slabJournalBlocks, + }; + return VDO_SUCCESS; +} + +/** + * Decode the version 41.0 component state for the VDO itself from a buffer. + * + * @param buffer A buffer positioned at the start of the encoding + * @param state The state structure to receive the decoded values + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) + static int decodeVDOComponent_41_0(Buffer *buffer, VDOComponent41_0 *state) +{ + size_t initialLength = contentLength(buffer); + + VDOState vdoState; + int result = getUInt32LEFromBuffer(buffer, &vdoState); + if (result != VDO_SUCCESS) { + return result; + } + + uint64_t completeRecoveries; + result = getUInt64LEFromBuffer(buffer, &completeRecoveries); + if (result != VDO_SUCCESS) { + return result; + } + + uint64_t readOnlyRecoveries; + result = getUInt64LEFromBuffer(buffer, &readOnlyRecoveries); + if (result != VDO_SUCCESS) { + return result; + } + + VDOConfig config; + result = decodeVDOConfig(buffer, &config); + if (result != VDO_SUCCESS) { + return result; + } + + Nonce nonce; + result = getUInt64LEFromBuffer(buffer, &nonce); + if (result != VDO_SUCCESS) { + return result; + } + + *state = (VDOComponent41_0) { + .state = vdoState, + .completeRecoveries = completeRecoveries, + .readOnlyRecoveries = readOnlyRecoveries, + .config = config, + .nonce = nonce, + }; + + size_t decodedSize = initialLength - contentLength(buffer); + return ASSERT(decodedSize == sizeof(VDOComponent41_0), + "decoded VDO component size must match structure size"); +} + +/**********************************************************************/ +int decodeVDOComponent(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + + VersionNumber version; + int result = decodeVersionNumber(buffer, &version); + if (result != VDO_SUCCESS) { + return result; + } + + result = validateVersion(version, VDO_COMPONENT_DATA_41_0, + "VDO component data"); + if (result != VDO_SUCCESS) { + return result; + } + + VDOComponent41_0 component; + result = decodeVDOComponent_41_0(buffer, &component); + if (result != VDO_SUCCESS) { + return result; + } + + // Copy the decoded component into the VDO structure. 
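+  // loadState keeps the state as it was read from disk so that later checks
+  // such as wasClean() and requiresReadOnlyRebuild() still see the loaded
+  // value even after vdo->state changes.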
+ vdo->state = component.state; + vdo->loadState = component.state; + vdo->completeRecoveries = component.completeRecoveries; + vdo->readOnlyRecoveries = component.readOnlyRecoveries; + vdo->config = component.config; + vdo->nonce = component.nonce; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int validateVDOConfig(const VDOConfig *config, + BlockCount blockCount, + bool requireLogical) +{ + int result = ASSERT(config->slabSize > 0, "slab size unspecified"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(isPowerOfTwo(config->slabSize), + "slab size must be a power of two"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->slabSize <= (1 << MAX_SLAB_BITS), + "slab size must be less than or equal to 2^%d", + MAX_SLAB_BITS); + if (result != VDO_SUCCESS) { + return result; + } + + result = ASSERT(config->slabJournalBlocks >= MINIMUM_SLAB_JOURNAL_BLOCKS, + "slab journal size meets minimum size"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->slabJournalBlocks <= config->slabSize, + "slab journal size is within expected bound"); + if (result != UDS_SUCCESS) { + return result; + } + + SlabConfig slabConfig; + result = configureSlab(config->slabSize, config->slabJournalBlocks, + &slabConfig); + if (result != VDO_SUCCESS) { + return result; + } + + result = ASSERT((slabConfig.dataBlocks >= 1), + "slab must be able to hold at least one block"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->physicalBlocks > 0, "physical blocks unspecified"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->physicalBlocks <= MAXIMUM_PHYSICAL_BLOCKS, + "physical block count %llu exceeds maximum %llu", + config->physicalBlocks, MAXIMUM_PHYSICAL_BLOCKS); + if (result != UDS_SUCCESS) { + return VDO_OUT_OF_RANGE; + } + + // This can't check equality because FileLayer et al can only known about + // the storage size, which may not match the super block size. + if (blockCount < config->physicalBlocks) { + logError("A physical size of %llu blocks was specified," + " but that is smaller than the %llu blocks" + " configured in the VDO super block", + blockCount, config->physicalBlocks); + return VDO_PARAMETER_MISMATCH; + } + + result = ASSERT(!requireLogical || (config->logicalBlocks > 0), + "logical blocks unspecified"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->logicalBlocks <= MAXIMUM_LOGICAL_BLOCKS, + "logical blocks too large"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->recoveryJournalSize > 0, + "recovery journal size unspecified"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(isPowerOfTwo(config->recoveryJournalSize), + "recovery journal size must be a power of two"); + if (result != UDS_SUCCESS) { + return result; + } + + return result; +} + +/** + * Notify a VDO that it is going read-only. This will save the read-only state + * to the super block. + * + *
Implements ReadOnlyNotification. + * + * @param listener The VDO + * @param parent The completion to notify in order to acknowledge the + * notification + **/ +static void notifyVDOOfReadOnlyMode(void *listener, VDOCompletion *parent) +{ + VDO *vdo = listener; + if (inReadOnlyMode(vdo)) { + completeCompletion(parent); + } + + vdo->state = VDO_READ_ONLY_MODE; + saveVDOComponentsAsync(vdo, parent); +} + +/**********************************************************************/ +int enableReadOnlyEntry(VDO *vdo) +{ + return registerReadOnlyListener(vdo->readOnlyNotifier, vdo, + notifyVDOOfReadOnlyMode, + getAdminThread(getThreadConfig(vdo))); +} + +/**********************************************************************/ +bool inReadOnlyMode(const VDO *vdo) +{ + return (vdo->state == VDO_READ_ONLY_MODE); +} + +/**********************************************************************/ +bool isClean(const VDO *vdo) +{ + return ((vdo->state == VDO_CLEAN) || (vdo->state == VDO_NEW)); +} + +/**********************************************************************/ +bool wasClean(const VDO *vdo) +{ + return ((vdo->loadState == VDO_CLEAN) || (vdo->loadState == VDO_NEW)); +} + +/**********************************************************************/ +bool wasNew(const VDO *vdo) +{ + return (vdo->loadState == VDO_NEW); +} + +/**********************************************************************/ +bool requiresReadOnlyRebuild(const VDO *vdo) +{ + return ((vdo->loadState == VDO_FORCE_REBUILD) + || (vdo->loadState == VDO_REBUILD_FOR_UPGRADE)); +} + +/**********************************************************************/ +bool requiresRebuild(const VDO *vdo) +{ + return ((vdo->state == VDO_DIRTY) + || (vdo->state == VDO_FORCE_REBUILD) + || (vdo->state == VDO_REPLAYING) + || (vdo->state == VDO_REBUILD_FOR_UPGRADE)); +} + +/**********************************************************************/ +bool requiresRecovery(const VDO *vdo) +{ + return ((vdo->loadState == VDO_DIRTY) || (vdo->loadState == VDO_REPLAYING) + || (vdo->loadState == VDO_RECOVERING)); +} + +/**********************************************************************/ +bool isReplaying(const VDO *vdo) +{ + return (vdo->state == VDO_REPLAYING); +} + +/**********************************************************************/ +bool inRecoveryMode(const VDO *vdo) +{ + return (vdo->state == VDO_RECOVERING); +} + +/**********************************************************************/ +void enterRecoveryMode(VDO *vdo) +{ + assertOnAdminThread(vdo, __func__); + + if (inReadOnlyMode(vdo)) { + return; + } + + logInfo("Entering recovery mode"); + vdo->state = VDO_RECOVERING; +} + +/**********************************************************************/ +void leaveRecoveryMode(VDO *vdo) +{ + assertOnAdminThread(vdo, __func__); + + /* + * Since scrubbing can be stopped by vdoClose during recovery mode, + * do not change the VDO state if there are outstanding unrecovered slabs. 
+ */ + if (inReadOnlyMode(vdo)) { + return; + } + + ASSERT_LOG_ONLY(inRecoveryMode(vdo), "VDO is in recovery mode"); + logInfo("Exiting recovery mode"); + vdo->state = VDO_DIRTY; +} + +/**********************************************************************/ +void makeVDOReadOnly(VDO *vdo, int errorCode) +{ + enterReadOnlyMode(vdo->readOnlyNotifier, errorCode); +} + +/**********************************************************************/ +bool setVDOCompressing(VDO *vdo, bool enableCompression) +{ + bool stateChanged = compareAndSwapBool(&vdo->compressing, !enableCompression, + enableCompression); + if (stateChanged && !enableCompression) { + // Flushing the packer is asynchronous, but we don't care when it + // finishes. + flushPacker(vdo->packer); + } + + logInfo("compression is %s", (enableCompression ? "enabled" : "disabled")); + return (stateChanged ? !enableCompression : enableCompression); +} + +/**********************************************************************/ +bool getVDOCompressing(VDO *vdo) +{ + return atomicLoadBool(&vdo->compressing); +} + +/**********************************************************************/ +static size_t getBlockMapCacheSize(const VDO *vdo) +{ + return ((size_t) vdo->loadConfig.cacheSize) * VDO_BLOCK_SIZE; +} + +/** + * Tally the hash lock statistics from all the hash zones. + * + * @param vdo The vdo to query + * + * @return The sum of the hash lock statistics from all hash zones + **/ +static HashLockStatistics getHashLockStatistics(const VDO *vdo) +{ + HashLockStatistics totals; + memset(&totals, 0, sizeof(totals)); + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + for (ZoneCount zone = 0; zone < threadConfig->hashZoneCount; zone++) { + HashLockStatistics stats = getHashZoneStatistics(vdo->hashZones[zone]); + totals.dedupeAdviceValid += stats.dedupeAdviceValid; + totals.dedupeAdviceStale += stats.dedupeAdviceStale; + totals.concurrentDataMatches += stats.concurrentDataMatches; + totals.concurrentHashCollisions += stats.concurrentHashCollisions; + } + + return totals; +} + +/** + * Get the current error statistics from VDO. + * + * @param vdo The vdo to query + * + * @return a copy of the current VDO error counters + **/ +static ErrorStatistics getVDOErrorStatistics(const VDO *vdo) +{ + /* + * The error counts can be incremented from arbitrary threads and so must be + * incremented atomically, but they are just statistics with no semantics + * that could rely on memory order, so unfenced reads are sufficient. + */ + const AtomicErrorStatistics *atoms = &vdo->errorStats; + return (ErrorStatistics) { + .invalidAdvicePBNCount = relaxedLoad64(&atoms->invalidAdvicePBNCount), + .noSpaceErrorCount = relaxedLoad64(&atoms->noSpaceErrorCount), + .readOnlyErrorCount = relaxedLoad64(&atoms->readOnlyErrorCount), + }; +} + +/**********************************************************************/ +static const char *describeWritePolicy(WritePolicy policy) +{ + switch (policy) { + case WRITE_POLICY_ASYNC: + return "async"; + case WRITE_POLICY_ASYNC_UNSAFE: + return "async-unsafe"; + case WRITE_POLICY_SYNC: + return "sync"; + default: + return "unknown"; + } +} + +/**********************************************************************/ +void getVDOStatistics(const VDO *vdo, VDOStatistics *stats) +{ + // These are immutable properties of the VDO object, so it is safe to + // query them from any thread. 
+ RecoveryJournal *journal = vdo->recoveryJournal; + SlabDepot *depot = vdo->depot; + // XXX config.physicalBlocks is actually mutated during resize and is in a + // packed structure, but resize runs on the admin thread so we're usually OK. + stats->version = STATISTICS_VERSION; + stats->releaseVersion = CURRENT_RELEASE_VERSION_NUMBER; + stats->logicalBlocks = vdo->config.logicalBlocks; + stats->physicalBlocks = vdo->config.physicalBlocks; + stats->blockSize = VDO_BLOCK_SIZE; + stats->completeRecoveries = vdo->completeRecoveries; + stats->readOnlyRecoveries = vdo->readOnlyRecoveries; + stats->blockMapCacheSize = getBlockMapCacheSize(vdo); + snprintf(stats->writePolicy, sizeof(stats->writePolicy), "%s", + describeWritePolicy(getWritePolicy(vdo))); + + // The callees are responsible for thread-safety. + stats->dataBlocksUsed = getPhysicalBlocksAllocated(vdo); + stats->overheadBlocksUsed = getPhysicalBlocksOverhead(vdo); + stats->logicalBlocksUsed = getJournalLogicalBlocksUsed(journal); + stats->allocator = getDepotBlockAllocatorStatistics(depot); + stats->journal = getRecoveryJournalStatistics(journal); + stats->packer = getPackerStatistics(vdo->packer); + stats->slabJournal = getDepotSlabJournalStatistics(depot); + stats->slabSummary = getSlabSummaryStatistics(getSlabSummary(depot)); + stats->refCounts = getDepotRefCountsStatistics(depot); + stats->blockMap = getBlockMapStatistics(vdo->blockMap); + stats->hashLock = getHashLockStatistics(vdo); + stats->errors = getVDOErrorStatistics(vdo); + SlabCount slabTotal = getDepotSlabCount(depot); + stats->recoveryPercentage + = (slabTotal - getDepotUnrecoveredSlabCount(depot)) * 100 / slabTotal; + + // The "state" field is mutable, but we just need a unfenced atomic read. + VDOState state = *((const volatile VDOState *) &vdo->state); + stats->inRecoveryMode = (state == VDO_RECOVERING); + snprintf(stats->mode, sizeof(stats->mode), "%s", describeVDOState(state)); +} + +/**********************************************************************/ +BlockCount getPhysicalBlocksAllocated(const VDO *vdo) +{ + return (getDepotAllocatedBlocks(vdo->depot) + - getJournalBlockMapDataBlocksUsed(vdo->recoveryJournal)); +} + +/**********************************************************************/ +BlockCount getPhysicalBlocksFree(const VDO *vdo) +{ + return getDepotFreeBlocks(vdo->depot); +} + +/**********************************************************************/ +BlockCount getPhysicalBlocksOverhead(const VDO *vdo) +{ + // XXX config.physicalBlocks is actually mutated during resize and is in a + // packed structure, but resize runs on admin thread so we're usually OK. 
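+  // Overhead is therefore everything not usable for user data: the metadata
+  // partitions (total physical space minus the depot's data blocks) plus the
+  // block map pages that were allocated out of the data region.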
+ return (vdo->config.physicalBlocks + - getDepotDataBlocks(vdo->depot) + + getJournalBlockMapDataBlocksUsed(vdo->recoveryJournal)); +} + +/**********************************************************************/ +BlockCount getTotalBlockMapBlocks(const VDO *vdo) +{ + return (getNumberOfFixedBlockMapPages(vdo->blockMap) + + getJournalBlockMapDataBlocksUsed(vdo->recoveryJournal)); +} + +/**********************************************************************/ +WritePolicy getWritePolicy(const VDO *vdo) +{ + return vdo->loadConfig.writePolicy; +} + +/**********************************************************************/ +void setWritePolicy(VDO *vdo, WritePolicy new) +{ + vdo->loadConfig.writePolicy = new; +} + +/**********************************************************************/ +const VDOLoadConfig *getVDOLoadConfig(const VDO *vdo) +{ + return &vdo->loadConfig; +} + +/**********************************************************************/ +const ThreadConfig *getThreadConfig(const VDO *vdo) +{ + return vdo->loadConfig.threadConfig; +} + +/**********************************************************************/ +BlockCount getConfiguredBlockMapMaximumAge(const VDO *vdo) +{ + return vdo->loadConfig.maximumAge; +} + +/**********************************************************************/ +PageCount getConfiguredCacheSize(const VDO *vdo) +{ + return vdo->loadConfig.cacheSize; +} + +/**********************************************************************/ +PhysicalBlockNumber getFirstBlockOffset(const VDO *vdo) +{ + return vdo->loadConfig.firstBlockOffset; +} + +/**********************************************************************/ +BlockMap *getBlockMap(const VDO *vdo) +{ + return vdo->blockMap; +} + +/**********************************************************************/ +SlabDepot *getSlabDepot(VDO *vdo) +{ + return vdo->depot; +} + +/**********************************************************************/ +RecoveryJournal *getRecoveryJournal(VDO *vdo) +{ + return vdo->recoveryJournal; +} + +/**********************************************************************/ +void dumpVDOStatus(const VDO *vdo) +{ + dumpFlusher(vdo->flusher); + dumpRecoveryJournalStatistics(vdo->recoveryJournal); + dumpPacker(vdo->packer); + dumpSlabDepot(vdo->depot); + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + for (ZoneCount zone = 0; zone < threadConfig->logicalZoneCount; zone++) { + dumpLogicalZone(getLogicalZone(vdo->logicalZones, zone)); + } + + for (ZoneCount zone = 0; zone < threadConfig->physicalZoneCount; zone++) { + dumpPhysicalZone(vdo->physicalZones[zone]); + } + + for (ZoneCount zone = 0; zone < threadConfig->hashZoneCount; zone++) { + dumpHashZone(vdo->hashZones[zone]); + } +} + +/**********************************************************************/ +void setVDOTracingFlags(VDO *vdo, bool vioTracing) +{ + vdo->vioTraceRecording = vioTracing; +} + +/**********************************************************************/ +bool vdoVIOTracingEnabled(const VDO *vdo) +{ + return ((vdo != NULL) && vdo->vioTraceRecording); +} + +/**********************************************************************/ +void assertOnAdminThread(VDO *vdo, const char *name) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + == getAdminThread(getThreadConfig(vdo))), + "%s called on admin thread", name); +} + +/**********************************************************************/ +void assertOnLogicalZoneThread(const VDO *vdo, + ZoneCount logicalZone, + const char *name) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + 
== getLogicalZoneThread(getThreadConfig(vdo), logicalZone)), + "%s called on logical thread", name); +} + +/**********************************************************************/ +void assertOnPhysicalZoneThread(const VDO *vdo, + ZoneCount physicalZone, + const char *name) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + == getPhysicalZoneThread(getThreadConfig(vdo), + physicalZone)), + "%s called on physical thread", name); +} + +/**********************************************************************/ +HashZone *selectHashZone(const VDO *vdo, const UdsChunkName *name) +{ + /* + * Use a fragment of the chunk name as a hash code. To ensure uniform + * distributions, it must not overlap with fragments used elsewhere. Eight + * bits of hash should suffice since the number of hash zones is small. + */ + // XXX Make a central repository for these offsets ala hashUtils. + // XXX Verify that the first byte is independent enough. + uint32_t hash = name->name[0]; + + /* + * Scale the 8-bit hash fragment to a zone index by treating it as a binary + * fraction and multiplying that by the zone count. If the hash is uniformly + * distributed over [0 .. 2^8-1], then (hash * count / 2^8) should be + * uniformly distributed over [0 .. count-1]. The multiply and shift is much + * faster than a divide (modulus) on X86 CPUs. + */ + return vdo->hashZones[(hash * getThreadConfig(vdo)->hashZoneCount) >> 8]; +} + +/**********************************************************************/ +int getPhysicalZone(const VDO *vdo, + PhysicalBlockNumber pbn, + PhysicalZone **zonePtr) +{ + if (pbn == ZERO_BLOCK) { + *zonePtr = NULL; + return VDO_SUCCESS; + } + + // Used because it does a more restrictive bounds check than getSlab(), and + // done first because it won't trigger read-only mode on an invalid PBN. + if (!isPhysicalDataBlock(vdo->depot, pbn)) { + return VDO_OUT_OF_RANGE; + } + + // With the PBN already checked, we should always succeed in finding a slab. + Slab *slab = getSlab(vdo->depot, pbn); + int result = ASSERT(slab != NULL, "getSlab must succeed on all valid PBNs"); + if (result != VDO_SUCCESS) { + return result; + } + + *zonePtr = vdo->physicalZones[getSlabZoneNumber(slab)]; + return VDO_SUCCESS; +} + +/**********************************************************************/ +ZonedPBN validateDedupeAdvice(VDO *vdo, + const DataLocation *advice, + LogicalBlockNumber lbn) +{ + ZonedPBN noAdvice = { .pbn = ZERO_BLOCK }; + if (advice == NULL) { + return noAdvice; + } + + // Don't use advice that's clearly meaningless. + if ((advice->state == MAPPING_STATE_UNMAPPED) + || (advice->pbn == ZERO_BLOCK)) { + logDebug("Invalid advice from deduplication server: pbn %llu, " + "state %u. Giving up on deduplication of logical block %llu", + advice->pbn, advice->state, lbn); + atomicAdd64(&vdo->errorStats.invalidAdvicePBNCount, 1); + return noAdvice; + } + + PhysicalZone *zone; + int result = getPhysicalZone(vdo, advice->pbn, &zone); + if ((result != VDO_SUCCESS) || (zone == NULL)) { + logDebug("Invalid physical block number from deduplication server: %" + PRIu64 ", giving up on deduplication of logical block %llu", + advice->pbn, lbn); + atomicAdd64(&vdo->errorStats.invalidAdvicePBNCount, 1); + return noAdvice; + } + + return (ZonedPBN) { + .pbn = advice->pbn, + .state = advice->state, + .zone = zone, + }; +} diff --git a/vdo/base/vdo.h b/vdo/base/vdo.h new file mode 100644 index 0000000..5741112 --- /dev/null +++ b/vdo/base/vdo.h @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdo.h#3 $ + */ + +#ifndef VDO_H +#define VDO_H + +#include "types.h" + +/** + * Allocate a VDO and associate it with its physical layer. + * + * @param [in] layer The physical layer the VDO sits on + * @param [out] vdoPtr A pointer to hold the allocated VDO + * + * @return VDO_SUCCESS or an error + **/ +int allocateVDO(PhysicalLayer *layer, VDO **vdoPtr) + __attribute__((warn_unused_result)); + +/** + * Construct a VDO for use in user space with a synchronous layer. + * + * @param [in] layer The physical layer the VDO sits on + * @param [out] vdoPtr A pointer to hold the allocated VDO + * + * @return VDO_SUCCESS or an error + **/ +int makeVDO(PhysicalLayer *layer, VDO **vdoPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a VDO instance. + * + * @param vdo The VDO to destroy + **/ +void destroyVDO(VDO *vdo); + +/** + * Destroy a VDO instance, free it, and null out the reference to it. + * + * @param vdoPtr A reference to the VDO to free + **/ +void freeVDO(VDO **vdoPtr); + +/** + * Put a VDO into read-only mode and save the read-only state in the super + * block. + * + * @param vdo The VDO to put into read-only mode + * @param errorCode The error which caused the VDO to enter read-only + * mode + **/ +void makeVDOReadOnly(VDO *vdo, int errorCode); + +/** + * Set whether compression is enabled in VDO. + * + * @param vdo The VDO + * @param enableCompression Whether to enable compression in VDO + * + * @return State of compression before new value is set + **/ +bool setVDOCompressing(VDO *vdo, bool enableCompression); + +/** + * Get whether compression is enabled in VDO. + * + * @param vdo The VDO + * + * @return State of compression + **/ +bool getVDOCompressing(VDO *vdo); + +/** + * Get the VDO statistics. + * + * @param [in] vdo The VDO + * @param [out] stats The VDO statistics are returned here + **/ +void getVDOStatistics(const VDO *vdo, VDOStatistics *stats); + +/** + * Get the number of physical blocks in use by user data. + * + * @param vdo The VDO + * + * @return The number of blocks allocated for user data + **/ +BlockCount getPhysicalBlocksAllocated(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the number of unallocated physical blocks. + * + * @param vdo The VDO + * + * @return The number of free blocks + **/ +BlockCount getPhysicalBlocksFree(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the number of physical blocks used by VDO metadata. + * + * @param vdo The VDO + * + * @return The number of overhead blocks + **/ +BlockCount getPhysicalBlocksOverhead(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the total number of blocks used for the block map. 
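+ *
+ * This includes both the fixed block map pages and the tree pages that were
+ * allocated from the data region (as tracked by the recovery journal).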
+ * + * @param vdo The VDO + * + * @return The number of block map blocks + **/ +BlockCount getTotalBlockMapBlocks(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the VDO write policy. + * + * @param vdo The VDO + * + * @return The write policy + **/ +WritePolicy getWritePolicy(const VDO *vdo); + +/** + * Set the VDO write policy. + * + * @param vdo The VDO + * @param new The new write policy + **/ +void setWritePolicy(VDO *vdo, WritePolicy new); + +/** + * Get a copy of the load-time configuration of the VDO. + * + * @param vdo The VDO + * + * @return The load-time configuration of the VDO + **/ +const VDOLoadConfig *getVDOLoadConfig(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the thread config of the VDO. + * + * @param vdo The VDO + * + * @return The thread config + **/ +const ThreadConfig *getThreadConfig(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the configured maximum age of a dirty block map page. + * + * @param vdo The VDO + * + * @return The block map era length + **/ +BlockCount getConfiguredBlockMapMaximumAge(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the configured page cache size of the VDO. + * + * @param vdo The VDO + * + * @return The number of pages for the page cache + **/ +PageCount getConfiguredCacheSize(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the location of the first block of the VDO. + * + * @param vdo The VDO + * + * @return The location of the first block managed by the VDO + **/ +PhysicalBlockNumber getFirstBlockOffset(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO was new when it was loaded. + * + * @param vdo The VDO to query + * + * @return true if the VDO was new + **/ +bool wasNew(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether a DataLocation containing potential dedupe advice is + * well-formed and addresses a data block in one of the configured physical + * zones of the VDO. If it is, return the location and zone as a ZonedPBN; + * otherwise increment statistics tracking invalid advice and return an + * unmapped ZonedPBN. + * + * @param vdo The VDO + * @param advice The advice to validate (NULL indicates no advice) + * @param lbn The logical block number of the write that requested advice, + * which is only used for debug-level logging of invalid advice + * + * @return The ZonedPBN representing the advice, if valid, otherwise an + * unmapped ZonedPBN if the advice was invalid or NULL + **/ +ZonedPBN validateDedupeAdvice(VDO *vdo, + const DataLocation *advice, + LogicalBlockNumber lbn) + __attribute__((warn_unused_result)); + +// TEST SUPPORT ONLY BEYOND THIS POINT + +/** + * Dump status information about VDO to the log for debugging. + * + * @param vdo The vdo to dump + **/ +void dumpVDOStatus(const VDO *vdo); + +/** + * Set the VIO tracing flag. + * + * @param vdo The VDO + * @param vioTracing Whether VIO tracing is enabled for this device + **/ +void setVDOTracingFlags(VDO *vdo, bool vioTracing); + +/** + * Indicate whether VIO tracing is enabled. + * + * @param vdo The VDO + * + * @return Whether VIO tracing is enabled + **/ +bool vdoVIOTracingEnabled(const VDO *vdo); + +/** + * Indicate whether extent tracing is enabled. 
+ * + * @param vdo The VDO + * + * @return Whether extent tracing is enabled + **/ +bool vdoExtentTracingEnabled(const VDO *vdo); + +#endif /* VDO_H */ diff --git a/vdo/base/vdoDebug.c b/vdo/base/vdoDebug.c new file mode 100644 index 0000000..6c03ece --- /dev/null +++ b/vdo/base/vdoDebug.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoDebug.c#1 $ + */ + +#include "vdoDebug.h" + +#include "logger.h" +#include "stringUtils.h" +#include "vdoInternal.h" + +static const char xLogDebugMessage[] = "x-log-debug-message"; + +/**********************************************************************/ +int initializeVDOCommandCompletion(VDOCommandCompletion *command, + VDO *vdo, + int argc, + char **argv) +{ + *command = (VDOCommandCompletion) { + .vdo = vdo, + .argc = argc, + .argv = argv, + }; + initializeCompletion(&command->completion, VDO_COMMAND_COMPLETION, + vdo->layer); + return initializeEnqueueableCompletion(&command->subCompletion, + VDO_COMMAND_SUB_COMPLETION, + vdo->layer); +} + +/**********************************************************************/ +int destroyVDOCommandCompletion(VDOCommandCompletion *command) +{ + if (command == NULL) { + return VDO_SUCCESS; + } + + destroyEnqueueable(&command->subCompletion); + return command->completion.result; +} + +/**********************************************************************/ +static inline VDOCommandCompletion * +asVDOCommandCompletion(VDOCompletion *completion) +{ + if (completion->type == VDO_COMMAND_COMPLETION) { + return (VDOCommandCompletion *) + ((uintptr_t) completion - offsetof(VDOCommandCompletion, completion)); + } else if (completion->type == VDO_COMMAND_SUB_COMPLETION) { + return (VDOCommandCompletion *) + ((uintptr_t) completion - offsetof(VDOCommandCompletion, subCompletion)); + } else { + ASSERT_LOG_ONLY(((completion->type == VDO_COMMAND_COMPLETION) || + (completion->type == VDO_COMMAND_SUB_COMPLETION)), + "completion type is %s instead of " + "VDO_COMMAND_COMPLETION or VDO_COMMAND_SUB_COMPLETION", + getCompletionTypeName(completion->type)); + return NULL; + } +} + +/**********************************************************************/ +static void logDebugMessage(VDOCommandCompletion *cmd) +{ + static char buffer[256]; + + char *buf = buffer; + char *end = buffer + sizeof(buffer); + + for (int i = 1; i < cmd->argc; ++i) { + buf = appendToBuffer(buf, end, " %s", cmd->argv[i]); + } + if (buf == end) { + strcpy(buf - 4, "..."); + } + logInfo("debug message:%s", buffer); + finishCompletion(&cmd->completion, VDO_SUCCESS); +} + +/**********************************************************************/ +void executeVDOExtendedCommand(VDOCompletion *completion) +{ + VDOCommandCompletion *cmd = asVDOCommandCompletion(completion); + 
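+  // Reject malformed commands (no VDO or an empty argument vector), then
+  // dispatch on argv[0]; only "x-log-debug-message" is currently recognized,
+  // and it logs the remaining arguments at info level.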
+ if ((cmd->vdo == NULL) || (cmd->argc == 0)) { + finishCompletion(&cmd->completion, VDO_COMMAND_ERROR); + return; + } + if (strcmp(cmd->argv[0], xLogDebugMessage) == 0) { + logDebugMessage(cmd); + } else { + finishCompletion(&cmd->completion, VDO_UNKNOWN_COMMAND); + } +} diff --git a/vdo/base/vdoDebug.h b/vdo/base/vdoDebug.h new file mode 100644 index 0000000..c626533 --- /dev/null +++ b/vdo/base/vdoDebug.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoDebug.h#1 $ + */ + +#ifndef VDO_DEBUG_H +#define VDO_DEBUG_H + +#include "completion.h" +#include "vdo.h" + +/** + * A completion used to pass information to a potentially asynchronous + * (because it must run in a different zone) extended command. + * + * These commands are dispatched according to argv[0], which is of the form + * "x-some-command-name", and intentionally open ended for debugging. + * + * The command "x-log-debug-message" is currently defined to echo the + * remainder of the arguments into the kernel log via the vdo logger at + * info level. + **/ +typedef struct vdoCommandCompletion { + VDOCompletion completion; + VDOCompletion subCompletion; + VDO *vdo; + int argc; + char **argv; +} VDOCommandCompletion; + +/** + * Initialize a VDO command completion. + * + * @param command The command completion to initialize. + * @param vdo The VDO. + * @param argc An argument count. + * @param argv An argument vector of length argc. + * + * @return VDO_SUCCESS or an error code + **/ +int initializeVDOCommandCompletion(VDOCommandCompletion *command, + VDO *vdo, + int argc, + char **argv); + +/** + * Destroy a VDO command completion. + * + * @param command The command completion. + * + * @return the completion result + **/ +int destroyVDOCommandCompletion(VDOCommandCompletion *command); + +/** + * Perform an asynchronous extended command (usually debugging related). + * + * @param completion The completion embedded in VDOCommandCompletion. + **/ +void executeVDOExtendedCommand(VDOCompletion *completion); + +#endif // VDO_DEBUG_H diff --git a/vdo/base/vdoInternal.h b/vdo/base/vdoInternal.h new file mode 100644 index 0000000..1337e73 --- /dev/null +++ b/vdo/base/vdoInternal.h @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoInternal.h#11 $ + */ + +#ifndef VDO_INTERNAL_H +#define VDO_INTERNAL_H + +#include "vdo.h" + +#include "adminCompletion.h" +#include "adminState.h" +#include "atomic.h" +#include "header.h" +#include "packer.h" +#include "statistics.h" +#include "superBlock.h" +#include "readOnlyNotifier.h" +#include "types.h" +#include "uds.h" +#include "vdoLayout.h" +#include "vdoState.h" + +/** + * Error counters are atomic since updates can arrive concurrently from + * arbitrary threads. + **/ +typedef struct atomicErrorStatistics { + // Dedupe path error stats + Atomic64 invalidAdvicePBNCount; + Atomic64 noSpaceErrorCount; + Atomic64 readOnlyErrorCount; +} AtomicErrorStatistics; + +struct vdo { + /* The state of this VDO */ + VDOState state; + /* The read-only notifier */ + ReadOnlyNotifier *readOnlyNotifier; + /* The number of times this VDO has recovered from a dirty state */ + uint64_t completeRecoveries; + /* The number of times this VDO has recovered from a read-only state */ + uint64_t readOnlyRecoveries; + /* The format-time configuration of this VDO */ + VDOConfig config; + /* The load-time configuration of this VDO */ + VDOLoadConfig loadConfig; + /* The nonce for this VDO */ + Nonce nonce; + + /* The super block */ + SuperBlock *superBlock; + + /* The physical storage below us */ + PhysicalLayer *layer; + + /* Our partitioning of the physical layer's storage */ + VDOLayout *layout; + + /* The block map */ + BlockMap *blockMap; + + /* The journal for block map recovery */ + RecoveryJournal *recoveryJournal; + + /* The slab depot */ + SlabDepot *depot; + + /* The compressed-block packer */ + Packer *packer; + /* Whether incoming data should be compressed */ + AtomicBool compressing; + + /* The handler for flush requests */ + Flusher *flusher; + + /* The master version of the VDO when loaded (for upgrading) */ + VersionNumber loadVersion; + /* The state the VDO was in when loaded (primarily for unit tests) */ + VDOState loadState; + /* Whether VIO tracing is enabled */ + bool vioTraceRecording; + + /* The logical zones of this VDO */ + LogicalZones *logicalZones; + + /* The physical zones of this VDO */ + PhysicalZone **physicalZones; + + /* The hash lock zones of this VDO */ + HashZone **hashZones; + + /* The completion for administrative operations */ + AdminCompletion adminCompletion; + + /* The administrative state of the VDO */ + AdminState adminState; + + /* Whether a close is required */ + bool closeRequired; + + /* Atomic global counts of error events */ + AtomicErrorStatistics errorStats; +}; + +/** + * Get the component data size of a VDO. + * + * @param vdo The VDO whose component data size is desired + * + * @return the component data size of the VDO + **/ +size_t getComponentDataSize(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Encode the VDO and save the super block synchronously. + * + * @param vdo The VDO whose state is being saved + * + * @return VDO_SUCCESS or an error + **/ +int saveVDOComponents(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Encode the VDO and save the super block asynchronously. All non-user mode + * super block savers should use this bottle neck instead of calling + * saveSuperBlockAsync() directly. 
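+ *
+ * For example, the load path saves the dirty super block this way (see
+ * makeDirty() in vdoLoad.c): the caller prepares the next admin sub-task and
+ * then hands the sub-task completion to this function:
+ *
+ *   prepareAdminSubTask(vdo, prepareToComeOnline, continueLoadReadOnly);
+ *   saveVDOComponentsAsync(vdo, completion);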
+ * + * @param vdo The VDO whose state is being saved + * @param parent The completion to notify when the save is complete + **/ +void saveVDOComponentsAsync(VDO *vdo, VDOCompletion *parent); + +/** + * Re-encode the VDO component after a reconfiguration and save the super + * block synchronously. This function avoids the need to decode and re-encode + * the other components by simply copying their previous encoding. + * + * @param vdo The VDO which was reconfigured + * + * @return VDO_SUCCESS or an error code + **/ +int saveReconfiguredVDO(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Decode the VDO master version from the component data buffer in the super + * block and store it in the VDO's loadVersion field. + **/ +int decodeVDOVersion(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Loads the VDO master version into the VDO and checks that the version + * can be understood by VDO. + * + * @param vdo The VDO to validate + * + * @return VDO_SUCCESS or an error if the loaded version is not supported + **/ +int validateVDOVersion(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Decode the component data for the VDO itself from the component data buffer + * in the super block. + * + * @param vdo The VDO to decode + * + * @return VDO_SUCCESS or an error + **/ +int decodeVDOComponent(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Validate constraints on VDO config. + * + * @param config The VDO config + * @param blockCount The block count of the VDO + * @param requireLogical Set to true if the number logical blocks + * must be configured (otherwise, it may be zero) + * + * @return a success or error code + **/ +int validateVDOConfig(const VDOConfig *config, + BlockCount blockCount, + bool requireLogical) + __attribute__((warn_unused_result)); + +/** + * Enable a VDO to enter read-only mode on errors. + * + * @param vdo The VDO to enable + * + * @return VDO_SUCCESS or an error + **/ +int enableReadOnlyEntry(VDO *vdo); + +/** + * Get the block map. + * + * @param vdo The VDO whose block map is desired + * + * @return the block map from the VDO + **/ +BlockMap *getBlockMap(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the slab depot from a VDO. + * + * @param vdo The VDO whose slab depot is desired + * + * @return the slab depot from the VDO + **/ +SlabDepot *getSlabDepot(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the recovery journal from a VDO. + * + * @param vdo The VDO whose recovery journal is desired + * + * @return the recovery journal from the VDO + **/ +RecoveryJournal *getRecoveryJournal(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether a VDO is in read-only mode. + * + * @param vdo The VDO to query + * + * @return true if the VDO is in read-only mode + **/ +bool inReadOnlyMode(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO is in a clean state. + * + * @param vdo The VDO to query + * + * @return true if the VDO is clean + **/ +bool isClean(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO was in a clean state when it was loaded. + * + * @param vdo The VDO to query + * + * @return true if the VDO was clean + **/ +bool wasClean(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO requires a read-only mode rebuild. 
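+ *
+ * The load path consults this check first when deciding how to bring the
+ * VDO online; see loadCallback() and prepareToComeOnline() in vdoLoad.c,
+ * which otherwise fall back to requiresRebuild() and requiresRecovery().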
+ * + * @param vdo The VDO to query + * + * @return true if the VDO requires a read-only rebuild + **/ +bool requiresReadOnlyRebuild(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether a VDO requires rebuilding. + * + * @param vdo The VDO to query + * + * @return true if the VDO must be rebuilt + **/ +bool requiresRebuild(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether a VDO should enter recovery mode. + * + * @param vdo The VDO to query + * + * @return true if the VDO requires recovery + **/ +bool requiresRecovery(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether a VDO was replaying the recovery journal into the block map + * when it crashed. + * + * @param vdo The VDO to query + * + * @return true if the VDO crashed while reconstructing the + * block map + **/ +bool isReplaying(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO is in recovery mode. + * + * @param vdo The VDO to query + * + * @return true if the VDO is in recovery mode + **/ +bool inRecoveryMode(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Put the VDO into recovery mode + * + * @param vdo The VDO + **/ +void enterRecoveryMode(VDO *vdo); + +/** + * Leave recovery mode if slab scrubbing has actually finished. + * + * @param vdo The VDO + **/ +void leaveRecoveryMode(VDO *vdo); + +/** + * Assert that we are running on the admin thread. + * + * @param vdo The VDO + * @param name The name of the function which should be running on the admin + * thread (for logging). + **/ +void assertOnAdminThread(VDO *vdo, const char *name); + +/** + * Assert that this function was called on the specified logical zone thread. + * + * @param vdo The VDO + * @param logicalZone The number of the logical zone + * @param name The name of the calling function + **/ +void assertOnLogicalZoneThread(const VDO *vdo, + ZoneCount logicalZone, + const char *name); + +/** + * Assert that this function was called on the specified physical zone thread. + * + * @param vdo The VDO + * @param physicalZone The number of the physical zone + * @param name The name of the calling function + **/ +void assertOnPhysicalZoneThread(const VDO *vdo, + ZoneCount physicalZone, + const char *name); + +/** + * Select the hash zone responsible for locking a given chunk name. + * + * @param vdo The VDO containing the hash zones + * @param name The chunk name + * + * @return The hash zone responsible for the chunk name + **/ +HashZone *selectHashZone(const VDO *vdo, const UdsChunkName *name) + __attribute__((warn_unused_result)); + +/** + * Get the physical zone responsible for a given physical block number of a + * data block in this VDO instance, or of the zero block (for which a NULL + * zone is returned). For any other block number that is not in the range of + * valid data block numbers in any slab, an error will be returned. This + * function is safe to call on invalid block numbers; it will not put the VDO + * into read-only mode. 
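+ *
+ * Callers must therefore handle an error return, a NULL zone (for the zero
+ * block), and a valid zone; validateDedupeAdvice() in vdo.c, for instance,
+ * treats the first two cases alike (logging and statistics omitted here):
+ *
+ *   PhysicalZone *zone;
+ *   int result = getPhysicalZone(vdo, advice->pbn, &zone);
+ *   if ((result != VDO_SUCCESS) || (zone == NULL)) {
+ *     return noAdvice;
+ *   }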
+ * + * @param [in] vdo The VDO containing the physical zones + * @param [in] pbn The PBN of the data block + * @param [out] zonePtr A pointer to return the physical zone + * + * @return VDO_SUCCESS or VDO_OUT_OF_RANGE if the block number is invalid + * or an error code for any other failure + **/ +int getPhysicalZone(const VDO *vdo, + PhysicalBlockNumber pbn, + PhysicalZone **zonePtr) + __attribute__((warn_unused_result)); + +/**********************************************************************/ +// Asynchronous callback to share a duplicate block. This is only public so +// test code may compare it against the current callback in the completion. +void shareBlock(VDOCompletion *completion); + +#endif /* VDO_INTERNAL_H */ diff --git a/vdo/base/vdoLayout.c b/vdo/base/vdoLayout.c new file mode 100644 index 0000000..3dfce96 --- /dev/null +++ b/vdo/base/vdoLayout.c @@ -0,0 +1,423 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLayout.c#2 $ + */ + +#include "vdoLayout.h" +#include "vdoLayoutInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMap.h" +#include "partitionCopy.h" +#include "slab.h" +#include "slabSummary.h" +#include "types.h" +#include "vdoInternal.h" + +#include "statusCodes.h" + +static const PartitionID REQUIRED_PARTITIONS[] = { + BLOCK_MAP_PARTITION, + BLOCK_ALLOCATOR_PARTITION, + RECOVERY_JOURNAL_PARTITION, + SLAB_SUMMARY_PARTITION, +}; + +static const uint8_t REQUIRED_PARTITION_COUNT = 4; + +/** + * Make a fixed layout for a VDO. 
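+ *
+ * The code below places the block map at the front of the layout, the slab
+ * summary at the very end, the recovery journal just before the summary,
+ * and gives the block allocator all of the remaining space; roughly:
+ *
+ *   [ block map | block allocator | recovery journal | slab summary ]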
+ * + * @param [in] physicalBlocks The number of physical blocks in the VDO + * @param [in] startingOffset The starting offset of the layout + * @param [in] blockMapBlocks The size of the block map partition + * @param [in] journalBlocks The size of the journal partition + * @param [in] summaryBlocks The size of the slab summary partition + * @param [out] layoutPtr A pointer to hold the new FixedLayout + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int makeVDOFixedLayout(BlockCount physicalBlocks, + PhysicalBlockNumber startingOffset, + BlockCount blockMapBlocks, + BlockCount journalBlocks, + BlockCount summaryBlocks, + FixedLayout **layoutPtr) +{ + BlockCount necessarySize + = (startingOffset + blockMapBlocks + journalBlocks + summaryBlocks); + if (necessarySize > physicalBlocks) { + return logErrorWithStringError(VDO_NO_SPACE, "Not enough space to" + " make a VDO"); + } + + FixedLayout *layout; + int result = makeFixedLayout(physicalBlocks - startingOffset, + startingOffset, &layout); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeFixedLayoutPartition(layout, BLOCK_MAP_PARTITION, + blockMapBlocks, FROM_BEGINNING, 0); + if (result != VDO_SUCCESS) { + freeFixedLayout(&layout); + return result; + } + + result = makeFixedLayoutPartition(layout, SLAB_SUMMARY_PARTITION, + summaryBlocks, FROM_END, 0); + if (result != VDO_SUCCESS) { + freeFixedLayout(&layout); + return result; + } + + result = makeFixedLayoutPartition(layout, RECOVERY_JOURNAL_PARTITION, + journalBlocks, FROM_END, 0); + if (result != VDO_SUCCESS) { + freeFixedLayout(&layout); + return result; + } + + /* + * The block allocator no longer traffics in relative PBNs so the offset + * doesn't matter. We need to keep this partition around both for upgraded + * systems, and because we decided that all of the usable space in the + * volume, other than the super block, should be part of some partition. + */ + result = makeFixedLayoutPartition(layout, BLOCK_ALLOCATOR_PARTITION, + ALL_FREE_BLOCKS, FROM_BEGINNING, + blockMapBlocks); + if (result != VDO_SUCCESS) { + freeFixedLayout(&layout); + return result; + } + + *layoutPtr = layout; + return VDO_SUCCESS; +} + +/** + * Get the offset of a given partition. 
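+ *
+ * Note that decodeVDOLayout() below uses this helper to recover the layout's
+ * starting offset from the offset of the block map partition, which this
+ * layout always places first.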
+ * + * @param layout The layout containing the partition + * @param partitionID The ID of the partition whose offset is desired + * + * @return The offset of the partition (in blocks) + **/ +__attribute__((warn_unused_result)) +static BlockCount getPartitionOffset(VDOLayout *layout, + PartitionID partitionID) +{ + return getFixedLayoutPartitionOffset(getVDOPartition(layout, partitionID)); +} + +/**********************************************************************/ +int makeVDOLayout(BlockCount physicalBlocks, + PhysicalBlockNumber startingOffset, + BlockCount blockMapBlocks, + BlockCount journalBlocks, + BlockCount summaryBlocks, + VDOLayout **vdoLayoutPtr) +{ + VDOLayout *vdoLayout; + int result = ALLOCATE(1, VDOLayout, __func__, &vdoLayout); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeVDOFixedLayout(physicalBlocks, startingOffset, blockMapBlocks, + journalBlocks, summaryBlocks, &vdoLayout->layout); + if (result != VDO_SUCCESS) { + freeVDOLayout(&vdoLayout); + return result; + } + + vdoLayout->startingOffset = startingOffset; + + *vdoLayoutPtr = vdoLayout; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int decodeVDOLayout(Buffer *buffer, VDOLayout **vdoLayoutPtr) +{ + VDOLayout *vdoLayout; + int result = ALLOCATE(1, VDOLayout, __func__, &vdoLayout); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeFixedLayout(buffer, &vdoLayout->layout); + if (result != VDO_SUCCESS) { + freeVDOLayout(&vdoLayout); + return result; + } + + // Check that all the expected partitions exist + Partition *partition; + for (uint8_t i = 0; i < REQUIRED_PARTITION_COUNT; i++) { + result = getPartition(vdoLayout->layout, REQUIRED_PARTITIONS[i], + &partition); + if (result != VDO_SUCCESS) { + freeVDOLayout(&vdoLayout); + return logErrorWithStringError(result, + "VDO layout is missing required partition" + " %u", REQUIRED_PARTITIONS[i]); + } + } + + // XXX Assert this is the same as where we loaded the super block. + vdoLayout->startingOffset + = getPartitionOffset(vdoLayout, BLOCK_MAP_PARTITION); + + *vdoLayoutPtr = vdoLayout; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeVDOLayout(VDOLayout **vdoLayoutPtr) +{ + VDOLayout *vdoLayout = *vdoLayoutPtr; + if (vdoLayout == NULL) { + return; + } + + freeCopyCompletion(&vdoLayout->copyCompletion); + freeFixedLayout(&vdoLayout->nextLayout); + freeFixedLayout(&vdoLayout->layout); + freeFixedLayout(&vdoLayout->previousLayout); + FREE(vdoLayout); + *vdoLayoutPtr = NULL; +} + +/** + * Get a partition from a FixedLayout in conditions where we expect that it can + * not fail. + * + * @param layout The FixedLayout from which to get the partition + * @param id The ID of the partition to retrieve + * + * @return The desired partition + **/ +__attribute__((warn_unused_result)) +static Partition *retrievePartition(FixedLayout *layout, PartitionID id) +{ + Partition *partition; + int result = getPartition(layout, id, &partition); + ASSERT_LOG_ONLY(result == VDO_SUCCESS, "VDOLayout has expected partition"); + return partition; +} + +/**********************************************************************/ +Partition *getVDOPartition(VDOLayout *vdoLayout, PartitionID id) +{ + return retrievePartition(vdoLayout->layout, id); +} + +/** + * Get a partition from a VDOLayout's next FixedLayout. This method should + * only be called when the VDOLayout is prepared to grow. 
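+ *
+ * "Prepared to grow" means a prior successful call to
+ * prepareToGrowVDOLayout() has populated nextLayout; the assertion below
+ * enforces this.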
+ * + * @param vdoLayout The VDOLayout from which to get the partition + * @param id The ID of the desired partition + * + * @return The requested partition + **/ +__attribute__((warn_unused_result)) +static Partition *getPartitionFromNextLayout(VDOLayout *vdoLayout, + PartitionID id) +{ + ASSERT_LOG_ONLY(vdoLayout->nextLayout != NULL, + "VDOLayout is prepared to grow"); + return retrievePartition(vdoLayout->nextLayout, id); +} + +/** + * Get the size of a given partition. + * + * @param layout The layout containing the partition + * @param partitionID The partition ID whose size to find + * + * @return The size of the partition (in blocks) + **/ +__attribute__((warn_unused_result)) +static BlockCount getPartitionSize(VDOLayout *layout, PartitionID partitionID) +{ + return getFixedLayoutPartitionSize(getVDOPartition(layout, partitionID)); +} + +/**********************************************************************/ +int prepareToGrowVDOLayout(VDOLayout *vdoLayout, + BlockCount oldPhysicalBlocks, + BlockCount newPhysicalBlocks, + PhysicalLayer *layer) +{ + if (getNextVDOLayoutSize(vdoLayout) == newPhysicalBlocks) { + // We are already prepared to grow to the new size, so we're done. + return VDO_SUCCESS; + } + + // Make a copy completion if there isn't one + if (vdoLayout->copyCompletion == NULL) { + int result = makeCopyCompletion(layer, &vdoLayout->copyCompletion); + if (result != VDO_SUCCESS) { + return result; + } + } + + // Free any unused preparation. + freeFixedLayout(&vdoLayout->nextLayout); + + // Make a new layout with the existing partition sizes for everything but the + // block allocator partition. + int result = makeVDOFixedLayout(newPhysicalBlocks, + vdoLayout->startingOffset, + getPartitionSize(vdoLayout, + BLOCK_MAP_PARTITION), + getPartitionSize(vdoLayout, + RECOVERY_JOURNAL_PARTITION), + getPartitionSize(vdoLayout, + SLAB_SUMMARY_PARTITION), + &vdoLayout->nextLayout); + if (result != VDO_SUCCESS) { + freeCopyCompletion(&vdoLayout->copyCompletion); + return result; + } + + // Ensure the new journal and summary are entirely within the added blocks. + Partition *slabSummaryPartition + = getPartitionFromNextLayout(vdoLayout, SLAB_SUMMARY_PARTITION); + Partition *recoveryJournalPartition + = getPartitionFromNextLayout(vdoLayout, RECOVERY_JOURNAL_PARTITION); + BlockCount minNewSize + = (oldPhysicalBlocks + + getFixedLayoutPartitionSize(slabSummaryPartition) + + getFixedLayoutPartitionSize(recoveryJournalPartition)); + if (minNewSize > newPhysicalBlocks) { + // Copying the journal and summary would destroy some old metadata. + freeFixedLayout(&vdoLayout->nextLayout); + freeCopyCompletion(&vdoLayout->copyCompletion); + return VDO_INCREMENT_TOO_SMALL; + } + + return VDO_SUCCESS; +} + +/** + * Get the size of a VDO from the specified FixedLayout and the + * starting offset thereof. + * + * @param layout The fixed layout whose size to use + * @param startingOffset The starting offset of the layout + * + * @return The total size of a VDO (in blocks) with the given layout + **/ +__attribute__((warn_unused_result)) +static BlockCount getVDOSize(FixedLayout *layout, BlockCount startingOffset) +{ + // The FixedLayout does not include the super block or any earlier + // metadata; all that is captured in the VDOLayout's starting offset + return getTotalFixedLayoutSize(layout) + startingOffset; +} + +/**********************************************************************/ +BlockCount getNextVDOLayoutSize(VDOLayout *vdoLayout) +{ + return ((vdoLayout->nextLayout == NULL) + ? 
0 : getVDOSize(vdoLayout->nextLayout, vdoLayout->startingOffset)); +} + +/**********************************************************************/ +BlockCount getNextBlockAllocatorPartitionSize(VDOLayout *vdoLayout) +{ + if (vdoLayout->nextLayout == NULL) { + return 0; + } + + Partition *partition = getPartitionFromNextLayout(vdoLayout, + BLOCK_ALLOCATOR_PARTITION); + return getFixedLayoutPartitionSize(partition); +} + +/**********************************************************************/ +BlockCount growVDOLayout(VDOLayout *vdoLayout) +{ + ASSERT_LOG_ONLY(vdoLayout->nextLayout != NULL, + "VDO prepared to grow physical"); + vdoLayout->previousLayout = vdoLayout->layout; + vdoLayout->layout = vdoLayout->nextLayout; + vdoLayout->nextLayout = NULL; + + return getVDOSize(vdoLayout->layout, vdoLayout->startingOffset); +} + +/**********************************************************************/ +BlockCount revertVDOLayout(VDOLayout *vdoLayout) +{ + if ((vdoLayout->previousLayout != NULL) + && (vdoLayout->previousLayout != vdoLayout->layout)) { + // Only revert if there's something to revert to. + freeFixedLayout(&vdoLayout->layout); + vdoLayout->layout = vdoLayout->previousLayout; + vdoLayout->previousLayout = NULL; + } + + return getVDOSize(vdoLayout->layout, vdoLayout->startingOffset); +} + +/**********************************************************************/ +void finishVDOLayoutGrowth(VDOLayout *vdoLayout) +{ + if (vdoLayout->layout != vdoLayout->previousLayout) { + freeFixedLayout(&vdoLayout->previousLayout); + } + + if (vdoLayout->layout != vdoLayout->nextLayout) { + freeFixedLayout(&vdoLayout->nextLayout); + } + + freeCopyCompletion(&vdoLayout->copyCompletion); +} + +/**********************************************************************/ +void copyPartition(VDOLayout *layout, + PartitionID partitionID, + VDOCompletion *parent) +{ + copyPartitionAsync(layout->copyCompletion, + getVDOPartition(layout, partitionID), + getPartitionFromNextLayout(layout, partitionID), parent); +} + +/**********************************************************************/ +size_t getVDOLayoutEncodedSize(const VDOLayout *vdoLayout) +{ + return getFixedLayoutEncodedSize(vdoLayout->layout); +} + +/**********************************************************************/ +int encodeVDOLayout(const VDOLayout *vdoLayout, Buffer *buffer) +{ + return encodeFixedLayout(vdoLayout->layout, buffer); +} + diff --git a/vdo/base/vdoLayout.h b/vdo/base/vdoLayout.h new file mode 100644 index 0000000..3de24ae --- /dev/null +++ b/vdo/base/vdoLayout.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLayout.h#2 $ + */ + +/** + * VDOLayout is an object which manages the layout of a VDO. 
It wraps + * FixedLayout, but includes the knowledge of exactly which partitions a VDO is + * expected to have. Because of this knowledge, the VDOLayout validates the + * FixedLayout encoded in the super block at load time, obviating the need for + * subsequent error checking when other modules need to get partitions from the + * layout. + * + * The VDOLayout also manages the preparation and growth of the layout for grow + * physical operations. + **/ + +#ifndef VDO_LAYOUT_H +#define VDO_LAYOUT_H + +#include "fixedLayout.h" +#include "types.h" + +/** + * Make a VDO layout with the specified parameters. + * + * @param [in] physicalBlocks The number of physical blocks in the VDO + * @param [in] startingOffset The starting offset of the layout + * @param [in] blockMapBlocks The size of the block map partition + * @param [in] journalBlocks The size of the journal partition + * @param [in] summaryBlocks The size of the slab summary partition + * @param [out] vdoLayoutPtr A pointer to hold the new VDOLayout + * + * @return VDO_SUCCESS or an error + **/ +int makeVDOLayout(BlockCount physicalBlocks, + PhysicalBlockNumber startingOffset, + BlockCount blockMapBlocks, + BlockCount journalBlocks, + BlockCount summaryBlocks, + VDOLayout **vdoLayoutPtr) + __attribute__((warn_unused_result)); + +/** + * Decode a VDOLayout from a buffer. + * + * @param [in] buffer The buffer from which to decode + * @param [out] vdoLayoutPtr A pointer to hold the VDOLayout + * + * @return VDO_SUCCESS or an error + **/ +int decodeVDOLayout(Buffer *buffer, VDOLayout **vdoLayoutPtr) + __attribute__((warn_unused_result)); + +/** + * Free a VDOLayout and NULL out the reference to it. + * + * @param vdoLayoutPtr The pointer to a VDOLayout to free + **/ +void freeVDOLayout(VDOLayout **vdoLayoutPtr); + +/** + * Get a partition from a VDOLayout. Because the layout's FixedLayout has + * already been validated, this can not fail. + * + * @param vdoLayout The VDOLayout from which to get the partition + * @param id The ID of the desired partition + * + * @return The requested partition + **/ +Partition *getVDOPartition(VDOLayout *vdoLayout, PartitionID id) + __attribute__((warn_unused_result)); + +/** + * Prepare the layout to be grown. + * + * @param vdoLayout The layout to grow + * @param oldPhysicalBlocks The current size of the VDO + * @param newPhysicalBlocks The size to which the VDO will be grown + * @param layer The layer being grown + * + * @return VDO_SUCCESS or an error code + **/ +int prepareToGrowVDOLayout(VDOLayout *vdoLayout, + BlockCount oldPhysicalBlocks, + BlockCount newPhysicalBlocks, + PhysicalLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Get the size of the next layout. + * + * @param vdoLayout The layout to check + * + * @return The size which was specified when the layout was prepared for growth + * or 0 if the layout is not prepared to grow + **/ +BlockCount getNextVDOLayoutSize(VDOLayout *vdoLayout) + __attribute__((warn_unused_result)); + +/** + * Get the size of the next block allocator partition. + * + * @param vdoLayout The VDOLayout which has been prepared to grow + * + * @return The size of the block allocator partition in the next layout or 0 + * if the layout is not prepared to grow + **/ +BlockCount getNextBlockAllocatorPartitionSize(VDOLayout *vdoLayout) + __attribute__((warn_unused_result)); + +/** + * Grow the layout by swapping in the prepared layout. 
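+ *
+ * A grow-physical operation is expected to drive these functions roughly in
+ * the order sketched below (illustrative only; oldBlocks, newBlocks, layer,
+ * and parent stand in for the caller's values, and each copyPartition() step
+ * completes asynchronously through its parent completion before the next
+ * step runs):
+ *
+ *   prepareToGrowVDOLayout(vdoLayout, oldBlocks, newBlocks, layer);
+ *   copyPartition(vdoLayout, RECOVERY_JOURNAL_PARTITION, parent);
+ *   copyPartition(vdoLayout, SLAB_SUMMARY_PARTITION, parent);
+ *   growVDOLayout(vdoLayout);          // swap in the prepared layout
+ *   finishVDOLayoutGrowth(vdoLayout);  // or revertVDOLayout() on failure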
+ * + * @param vdoLayout The layout to grow + * + * @return The new size of the VDO + **/ +BlockCount growVDOLayout(VDOLayout *vdoLayout) + __attribute__((warn_unused_result)); + +/** + * Revert the last growth attempt. + * + * @param vdoLayout The layout to revert + * + * @return The reverted size (in blocks) of the VDO + **/ +BlockCount revertVDOLayout(VDOLayout *vdoLayout) + __attribute__((warn_unused_result)); + +/** + * Clean up any unused resources once an attempt to grow has completed. + * + * @param vdoLayout The layout + **/ +void finishVDOLayoutGrowth(VDOLayout *vdoLayout); + +/** + * Copy a partition from the location specified in the current layout to that in + * the next layout. + * + * @param layout The VDOLayout which is prepared to grow + * @param partitionID The ID of the partition to copy + * @param parent The completion to notify when the copy is complete + **/ +void copyPartition(VDOLayout *layout, + PartitionID partitionID, + VDOCompletion *parent); + +/** + * Get the size of an encoded VDOLayout. + * + * @param vdoLayout The VDOLayout + * + * @return The encoded size of the VDOLayout + **/ +size_t getVDOLayoutEncodedSize(const VDOLayout *vdoLayout) + __attribute__((warn_unused_result)); + +/** + * Encode a VDOLayout into a buffer. + * + * @param vdoLayout The VDOLayout to encode + * @param buffer The buffer to encode into + * + * @return UDS_SUCCESS or an error + **/ +int encodeVDOLayout(const VDOLayout *vdoLayout, Buffer *buffer) + __attribute__((warn_unused_result)); + +#endif // VDO_LAYOUT_H diff --git a/vdo/base/vdoLayoutInternals.h b/vdo/base/vdoLayoutInternals.h new file mode 100644 index 0000000..5f038fe --- /dev/null +++ b/vdo/base/vdoLayoutInternals.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLayoutInternals.h#2 $ + */ + +#ifndef VDO_LAYOUT_INTERNALS_H +#define VDO_LAYOUT_INTERNALS_H + +#include "fixedLayout.h" +#include "types.h" + +struct vdoLayout { + // The current layout of the VDO + FixedLayout *layout; + // The next layout of the VDO + FixedLayout *nextLayout; + // The previous layout of the VDO + FixedLayout *previousLayout; + // The first block in the layouts + PhysicalBlockNumber startingOffset; + // A pointer to the copy completion (if there is one) + VDOCompletion *copyCompletion; +}; + +#endif // VDO_LAYOUT_INTERNALS_H diff --git a/vdo/base/vdoLoad.c b/vdo/base/vdoLoad.c new file mode 100644 index 0000000..c72f39e --- /dev/null +++ b/vdo/base/vdoLoad.c @@ -0,0 +1,560 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLoad.c#17 $ + */ + +#include "vdoLoad.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminCompletion.h" +#include "blockMap.h" +#include "completion.h" +#include "constants.h" +#include "hashZone.h" +#include "header.h" +#include "logicalZone.h" +#include "physicalZone.h" +#include "readOnlyRebuild.h" +#include "recoveryJournal.h" +#include "releaseVersions.h" +#include "slabDepot.h" +#include "slabSummary.h" +#include "threadConfig.h" +#include "types.h" +#include "vdoInternal.h" +#include "vdoRecovery.h" +#include "volumeGeometry.h" + +/** + * Extract the VDO from an AdminCompletion, checking that the current operation + * is a load. + * + * @param completion The AdminCompletion's sub-task completion + * + * @return The VDO + **/ +static inline VDO *vdoFromLoadSubTask(VDOCompletion *completion) +{ + return vdoFromAdminSubTask(completion, ADMIN_OPERATION_LOAD); +} + +/** + * Finish aborting a load now that any entry to read-only mode is complete. + * This callback is registered in abortLoad(). + * + * @param completion The sub-task completion + **/ +static void finishAborting(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + vdo->closeRequired = false; + finishParentCallback(completion); +} + +/** + * Make sure the recovery journal is closed when aborting a load. + * + * @param completion The sub-task completion + **/ +static void closeRecoveryJournalForAbort(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + prepareAdminSubTask(vdo, finishAborting, finishAborting); + drainRecoveryJournal(vdo->recoveryJournal, ADMIN_STATE_SAVING, completion); +} + +/** + * Clean up after an error loading a VDO. This error handler is set in + * loadCallback() and loadVDOComponents(). + * + * @param completion The sub-task completion + **/ +static void abortLoad(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + logErrorWithStringError(completion->result, "aborting load"); + if (vdo->readOnlyNotifier == NULL) { + // There are no threads, so we're done + finishParentCallback(completion); + return; + } + + // Preserve the error. + setCompletionResult(completion->parent, completion->result); + if (vdo->recoveryJournal == NULL) { + prepareAdminSubTask(vdo, finishAborting, finishAborting); + } else { + prepareAdminSubTaskOnThread(vdo, closeRecoveryJournalForAbort, + closeRecoveryJournalForAbort, + getJournalZoneThread(getThreadConfig(vdo))); + } + + waitUntilNotEnteringReadOnlyMode(vdo->readOnlyNotifier, completion); +} + +/** + * Wait for the VDO to be in read-only mode. 
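+ *
+ * This runs after enterReadOnlyMode() so that the load does not complete
+ * (with VDO_READ_ONLY) until the notifier has finished entering read-only
+ * mode; see continueLoadReadOnly() and handleScrubbingError() below.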
+ * + * @param completion The sub-task completion + **/ +static void waitForReadOnlyMode(VDOCompletion *completion) +{ + prepareToFinishParent(completion, completion->parent); + setCompletionResult(completion, VDO_READ_ONLY); + VDO *vdo = vdoFromLoadSubTask(completion); + waitUntilNotEnteringReadOnlyMode(vdo->readOnlyNotifier, completion); +} + +/** + * Finish loading the VDO after an error, but leave it in read-only + * mode. This error handler is set in makeDirty(), scrubSlabs(), and + * loadVDOComponents(). + * + * @param completion The sub-task completion + **/ +static void continueLoadReadOnly(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + logErrorWithStringError(completion->result, + "Entering read-only mode due to load error"); + enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); + waitForReadOnlyMode(completion); +} + +/** + * Exit recovery mode if necessary now that online slab scrubbing or loading + * is complete. This callback is registrered in scrubSlabs(). + * + * @param completion The slab scrubber completion + **/ +static void finishScrubbingSlabs(VDOCompletion *completion) +{ + VDO *vdo = completion->parent; + assertOnAdminThread(vdo, __func__); + if (inRecoveryMode(vdo)) { + leaveRecoveryMode(vdo); + } else { + logInfo("VDO commencing normal operation"); + } +} + +/** + * Handle an error scrubbing or loading all slabs after the VDO has come + * online. This error handler is registered in scrubSlabs(). + * + * @param completion The slab scrubber completion + **/ +static void handleScrubAllError(VDOCompletion *completion) +{ + VDO *vdo = completion->parent; + enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); +} + +/** + * Initiate slab scrubbing if necessary. This callback is registered in + * prepareToComeOnline(). + * + * @param completion The sub-task completion + **/ +static void scrubSlabs(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + if (!hasUnrecoveredSlabs(vdo->depot)) { + finishParentCallback(completion); + return; + } + + if (requiresRecovery(vdo)) { + enterRecoveryMode(vdo); + } + + prepareAdminSubTask(vdo, finishParentCallback, continueLoadReadOnly); + scrubAllUnrecoveredSlabs(vdo->depot, vdo, finishScrubbingSlabs, + handleScrubAllError, 0, completion); +} + +/** + * This is the error handler for slab scrubbing. It is registered in + * prepareToComeOnline(). + * + * @param completion The sub-task completion + **/ +static void handleScrubbingError(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); + waitForReadOnlyMode(completion); +} + +/** + * This is the callback after the super block is written. It prepares the block + * allocator to come online and start allocating. It is registered in + * makeDirty(). + * + * @param completion The sub-task completion + **/ +static void prepareToComeOnline(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + SlabDepotLoadType loadType = NORMAL_LOAD; + if (requiresReadOnlyRebuild(vdo)) { + loadType = REBUILD_LOAD; + } else if (requiresRecovery(vdo)) { + loadType = RECOVERY_LOAD; + } + + initializeBlockMapFromJournal(vdo->blockMap, vdo->recoveryJournal); + + prepareAdminSubTask(vdo, scrubSlabs, handleScrubbingError); + prepareToAllocate(vdo->depot, loadType, completion); +} + +/** + * Mark the super block as dirty now that everything has been loaded or + * rebuilt. 
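+ *
+ * This is the pivot of the load sequence; roughly:
+ *
+ *   loadCallback() -> [rebuild | recovery | loadSlabDepot()] -> makeDirty()
+ *     -> saveVDOComponentsAsync() -> prepareToComeOnline() -> scrubSlabs()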
+ * + * @param completion The sub-task completion + **/ +static void makeDirty(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + if (isReadOnly(vdo->readOnlyNotifier)) { + finishCompletion(completion->parent, VDO_READ_ONLY); + return; + } + + vdo->state = VDO_DIRTY; + prepareAdminSubTask(vdo, prepareToComeOnline, continueLoadReadOnly); + saveVDOComponentsAsync(vdo, completion); +} + +/** + * Callback to do the destructive parts of a load now that the new VDO device + * is being resumed. + * + * @param completion The sub-task completion + **/ +static void loadCallback(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + assertOnAdminThread(vdo, __func__); + + // Prepare the recovery journal for new entries. + openRecoveryJournal(vdo->recoveryJournal, vdo->depot, vdo->blockMap); + vdo->closeRequired = true; + if (isReadOnly(vdo->readOnlyNotifier)) { + // In read-only mode we don't use the allocator and it may not + // even be readable, so use the default structure. + finishCompletion(completion->parent, VDO_READ_ONLY); + return; + } + + if (requiresReadOnlyRebuild(vdo)) { + prepareAdminSubTask(vdo, makeDirty, abortLoad); + launchRebuild(vdo, completion); + return; + } + + if (requiresRebuild(vdo)) { + prepareAdminSubTask(vdo, makeDirty, continueLoadReadOnly); + launchRecovery(vdo, completion); + return; + } + + prepareAdminSubTask(vdo, makeDirty, continueLoadReadOnly); + loadSlabDepot(vdo->depot, + (wasNew(vdo) ? ADMIN_STATE_FORMATTING : ADMIN_STATE_LOADING), + completion, NULL); +} + +/**********************************************************************/ +int performVDOLoad(VDO *vdo) +{ + return performAdminOperation(vdo, ADMIN_OPERATION_LOAD, NULL, loadCallback, + loadCallback); +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int startVDODecode(VDO *vdo, bool validateConfig) +{ + int result = validateVDOVersion(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeVDOComponent(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + if (!validateConfig) { + return VDO_SUCCESS; + } + + if (vdo->loadConfig.nonce != vdo->nonce) { + return logErrorWithStringError(VDO_BAD_NONCE, "Geometry nonce %" PRIu64 + " does not match superblock nonce %llu", + vdo->loadConfig.nonce, vdo->nonce); + } + + BlockCount blockCount = vdo->layer->getBlockCount(vdo->layer); + return validateVDOConfig(&vdo->config, blockCount, true); +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int finishVDODecode(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + const ThreadConfig *threadConfig = getThreadConfig(vdo); + int result = makeRecoveryJournal(vdo->nonce, vdo->layer, + getVDOPartition(vdo->layout, + RECOVERY_JOURNAL_PARTITION), + vdo->completeRecoveries, + vdo->config.recoveryJournalSize, + RECOVERY_JOURNAL_TAIL_BUFFER_SIZE, + vdo->readOnlyNotifier, threadConfig, + &vdo->recoveryJournal); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeRecoveryJournal(vdo->recoveryJournal, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeSlabDepot(buffer, threadConfig, vdo->nonce, vdo->layer, + getVDOPartition(vdo->layout, + SLAB_SUMMARY_PARTITION), + vdo->readOnlyNotifier, vdo->recoveryJournal, + &vdo->depot); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeBlockMap(buffer, vdo->config.logicalBlocks, 
threadConfig, + &vdo->blockMap); + if (result != VDO_SUCCESS) { + return result; + } + + ASSERT_LOG_ONLY((contentLength(buffer) == 0), + "All decoded component data was used"); + return VDO_SUCCESS; +} + +/** + * Decode the component data portion of a super block and fill in the + * corresponding portions of the VDO being loaded. This will also allocate the + * recovery journal and slab depot. If this method is called with an + * asynchronous layer (i.e. a thread config which specifies at least one base + * thread), the block map and packer will be constructed as well. + * + * @param vdo The VDO being loaded + * @param validateConfig Whether to validate the config + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int decodeVDO(VDO *vdo, bool validateConfig) +{ + int result = startVDODecode(vdo, validateConfig); + if (result != VDO_SUCCESS) { + return result; + } + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + result = makeReadOnlyNotifier(inReadOnlyMode(vdo), threadConfig, vdo->layer, + &vdo->readOnlyNotifier); + if (result != VDO_SUCCESS) { + return result; + } + + result = enableReadOnlyEntry(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeVDOLayout(getComponentBuffer(vdo->superBlock), &vdo->layout); + if (result != VDO_SUCCESS) { + return result; + } + + result = finishVDODecode(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeFlusher(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount maximumAge = getConfiguredBlockMapMaximumAge(vdo); + BlockCount journalLength + = getRecoveryJournalLength(vdo->config.recoveryJournalSize); + if ((maximumAge > (journalLength / 2)) || (maximumAge < 1)) { + return VDO_BAD_CONFIGURATION; + } + result = makeBlockMapCaches(vdo->blockMap, vdo->layer, + vdo->readOnlyNotifier, vdo->recoveryJournal, + vdo->nonce, getConfiguredCacheSize(vdo), + maximumAge); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(threadConfig->hashZoneCount, HashZone *, __func__, + &vdo->hashZones); + if (result != VDO_SUCCESS) { + return result; + } + + for (ZoneCount zone = 0; zone < threadConfig->hashZoneCount; zone++) { + result = makeHashZone(vdo, zone, &vdo->hashZones[zone]); + if (result != VDO_SUCCESS) { + return result; + } + } + + result = makeLogicalZones(vdo, &vdo->logicalZones); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(threadConfig->physicalZoneCount, PhysicalZone *, __func__, + &vdo->physicalZones); + if (result != VDO_SUCCESS) { + return result; + } + + for (ZoneCount zone = 0; zone < threadConfig->physicalZoneCount; zone++) { + result = makePhysicalZone(vdo, zone, &vdo->physicalZones[zone]); + if (result != VDO_SUCCESS) { + return result; + } + } + + return makePacker(vdo->layer, DEFAULT_PACKER_INPUT_BINS, + DEFAULT_PACKER_OUTPUT_BINS, threadConfig, &vdo->packer); +} + +/** + * Load the components of a VDO. This is the super block load callback + * set by loadCallback(). + * + * @param completion The sub-task completion + **/ +static void loadVDOComponents(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + + prepareCompletion(completion, finishParentCallback, abortLoad, + completion->callbackThreadID, completion->parent); + finishCompletion(completion, decodeVDO(vdo, true)); +} + +/** + * Callback to initiate a pre-load, registered in prepareToLoadVDO(). 
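+ *
+ * The pre-load reads the super block off disk and then decodes it via
+ * loadVDOComponents(); any failure is routed through abortLoad(). Nothing
+ * on disk is modified at this stage.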
+ * + * @param completion The sub-task completion + **/ +static void preLoadCallback(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + assertOnAdminThread(vdo, __func__); + prepareAdminSubTask(vdo, loadVDOComponents, abortLoad); + loadSuperBlockAsync(completion, getFirstBlockOffset(vdo), &vdo->superBlock); +} + +/**********************************************************************/ +int prepareToLoadVDO(VDO *vdo, const VDOLoadConfig *loadConfig) +{ + vdo->loadConfig = *loadConfig; + return performAdminOperation(vdo, ADMIN_OPERATION_LOAD, NULL, + preLoadCallback, preLoadCallback); +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeSynchronousVDO(VDO *vdo, bool validateConfig) +{ + int result = startVDODecode(vdo, validateConfig); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeVDOLayout(getComponentBuffer(vdo->superBlock), &vdo->layout); + if (result != VDO_SUCCESS) { + return result; + } + + return finishVDODecode(vdo); +} + +/**********************************************************************/ +int loadVDOSuperblock(PhysicalLayer *layer, + VolumeGeometry *geometry, + bool validateConfig, + VDODecoder *decoder, + VDO **vdoPtr) +{ + VDO *vdo; + int result = makeVDO(layer, &vdo); + if (result != VDO_SUCCESS) { + return result; + } + + setLoadConfigFromGeometry(geometry, &vdo->loadConfig); + result = loadSuperBlock(layer, getFirstBlockOffset(vdo), &vdo->superBlock); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + result = ((decoder == NULL) + ? decodeSynchronousVDO(vdo, validateConfig) + : decoder(vdo, validateConfig)); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + *vdoPtr = vdo; + return VDO_SUCCESS; + +} +/**********************************************************************/ +int loadVDO(PhysicalLayer *layer, + bool validateConfig, + VDODecoder *decoder, + VDO **vdoPtr) +{ + VolumeGeometry geometry; + int result = loadVolumeGeometry(layer, &geometry); + if (result != VDO_SUCCESS) { + return result; + } + + return loadVDOSuperblock(layer, &geometry, validateConfig, decoder, vdoPtr); +} diff --git a/vdo/base/vdoLoad.h b/vdo/base/vdoLoad.h new file mode 100644 index 0000000..893d6e4 --- /dev/null +++ b/vdo/base/vdoLoad.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLoad.h#3 $ + */ + +#ifndef VDO_LOAD_H +#define VDO_LOAD_H + +#include "volumeGeometry.h" +#include "types.h" + +/** + * A function which decodes a VDO from a super block. 
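+ *
+ * User-space tools normally pass NULL for the decoder, which selects the
+ * default synchronous decoder; a minimal (illustrative) call, with layer
+ * being the tool's PhysicalLayer, looks like:
+ *
+ *   VDO *vdo;
+ *   int result = loadVDO(layer, true, NULL, &vdo);
+ *   if (result != VDO_SUCCESS) {
+ *     return result;
+ *   }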
+ * + * @param vdo The VDO to be decoded (its super block must already + * be loaded) + * @param validateConfig If true, the VDO's configuration will + * be validated before the decode is attempted + * + * @return VDO_SUCCESS or an error + **/ +typedef int VDODecoder(VDO *vdo, bool validateConfig); + +/** + * Load a VDO for normal operation. This method must not be called from a base + * thread. + * + * @param vdo The VDO to load + * + * @return VDO_SUCCESS or an error + **/ +int performVDOLoad(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Perpare a VDO for loading by reading structures off disk. This method does + * not alter the on-disk state. It should be called from the VDO constructor, + * whereas performVDOLoad() will be called during pre-resume if the VDO has + * not been resumed before. + **/ +int prepareToLoadVDO(VDO *vdo, const VDOLoadConfig *loadConfig) + __attribute__((warn_unused_result)); + +/** + * Synchronously load a VDO from a specified super block location for use by + * user-space tools. + * + * @param [in] layer The physical layer the VDO sits on + * @param [in] geometry A pointer to the geometry for the volume + * @param [in] validateConfig Whether to validate the VDO against the layer + * @param [in] decoder The VDO decoder to use, if NULL, the default + * decoder will be used + * @param [out] vdoPtr A pointer to hold the decoded VDO + * + * @return VDO_SUCCESS or an error + **/ +int loadVDOSuperblock(PhysicalLayer *layer, + VolumeGeometry *geometry, + bool validateConfig, + VDODecoder *decoder, + VDO **vdoPtr) + __attribute__((warn_unused_result)); + +/** + * Synchronously load a VDO volume for use by user-space tools. + * + * @param [in] layer The physical layer the VDO sits on + * @param [in] validateConfig Whether to validate the VDO against the layer + * @param [in] decoder The VDO decoder to use, if NULL, the default + * decoder will be used + * @param [out] vdoPtr A pointer to hold the decoded VDO + * + * @return VDO_SUCCESS or an error + **/ +int loadVDO(PhysicalLayer *layer, + bool validateConfig, + VDODecoder *decoder, + VDO **vdoPtr) + __attribute__((warn_unused_result)); + +#endif /* VDO_LOAD_H */ diff --git a/vdo/base/vdoPageCache.c b/vdo/base/vdoPageCache.c new file mode 100644 index 0000000..c8f4585 --- /dev/null +++ b/vdo/base/vdoPageCache.c @@ -0,0 +1,1369 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoPageCache.c#11 $ + */ + +#include "vdoPageCacheInternals.h" + +#if __KERNEL__ +#include +#endif + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "adminState.h" +#include "constants.h" +#include "numUtils.h" +#include "readOnlyNotifier.h" +#include "statusCodes.h" +#include "types.h" +#include "vio.h" + +enum { + LOG_INTERVAL = 4000, + DISPLAY_INTERVAL = 100000, +}; + +/**********************************************************************/ +static char *getPageBuffer(PageInfo *info) +{ + VDOPageCache *cache = info->cache; + return &cache->pages[(info - cache->infos) * VDO_BLOCK_SIZE]; +} + +/** + * Allocate components of the cache which require their own allocation. The + * caller is responsible for all clean up on errors. + * + * @param cache The cache being constructed + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int allocateCacheComponents(VDOPageCache *cache) +{ + int result = ALLOCATE(cache->pageCount, PageInfo, "page infos", + &cache->infos); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t size = cache->pageCount * (uint64_t) VDO_BLOCK_SIZE; + result = allocateMemory(size, VDO_BLOCK_SIZE, "cache pages", &cache->pages); + if (result != UDS_SUCCESS) { + return result; + } + + return makeIntMap(cache->pageCount, 0, &cache->pageMap); +} + +/** + * Initialize all page info structures and put them on the free list. + * + * @param cache The cache to initialize + * + * @return VDO_SUCCESS or an error + **/ +static int initializeInfo(VDOPageCache *cache) +{ + initializeRing(&cache->freeList); + PageInfo *info; + for (info = cache->infos; info < cache->infos + cache->pageCount; ++info) { + info->cache = cache; + info->state = PS_FREE; + info->pbn = NO_PAGE; + + if (cache->layer->createMetadataVIO != NULL) { + int result = createVIO(cache->layer, VIO_TYPE_BLOCK_MAP, + VIO_PRIORITY_METADATA, info, getPageBuffer(info), + &info->vio); + if (result != VDO_SUCCESS) { + return result; + } + + // The thread ID should never change. 
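+      // All cache operations are required to run on the owning block map
+      // zone's thread (see assertOnCacheThread()), so each metadata VIO's
+      // callback thread can be pinned once here.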
+ info->vio->completion.callbackThreadID = cache->zone->threadID; + } + + initializeRing(&info->listNode); + pushRingNode(&cache->freeList, &info->listNode); + initializeRing(&info->lruNode); + } + + relaxedStore64(&cache->stats.counts.freePages, cache->pageCount); + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void writeDirtyPagesCallback(RingNode *node, void *context); + +/**********************************************************************/ +int makeVDOPageCache(PhysicalLayer *layer, + PageCount pageCount, + VDOPageReadFunction *readHook, + VDOPageWriteFunction *writeHook, + size_t pageContextSize, + BlockCount maximumAge, + BlockMapZone *zone, + VDOPageCache **cachePtr) +{ + int result = ASSERT(pageContextSize <= MAX_PAGE_CONTEXT_SIZE, + "page context size %zu cannot exceed %u bytes", + pageContextSize, MAX_PAGE_CONTEXT_SIZE); + if (result != VDO_SUCCESS) { + return result; + } + + VDOPageCache *cache; + result = ALLOCATE(1, VDOPageCache, "page cache", &cache); + if (result != UDS_SUCCESS) { + return result; + } + + cache->layer = layer; + cache->pageCount = pageCount; + cache->readHook = readHook; + cache->writeHook = writeHook; + cache->zone = zone; + + result = allocateCacheComponents(cache); + if (result != VDO_SUCCESS) { + freeVDOPageCache(&cache); + return result; + } + + result = initializeInfo(cache); + if (result != VDO_SUCCESS) { + freeVDOPageCache(&cache); + return result; + } + + result = makeDirtyLists(maximumAge, writeDirtyPagesCallback, cache, + &cache->dirtyLists); + if (result != VDO_SUCCESS) { + freeVDOPageCache(&cache); + return result; + } + + // initialize empty circular queues + initializeRing(&cache->lruList); + initializeRing(&cache->outgoingList); + + *cachePtr = cache; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeVDOPageCache(VDOPageCache **cachePtr) +{ + VDOPageCache *cache = *cachePtr; + if (cache == NULL) { + return; + } + + if (cache->infos != NULL) { + PageInfo *info; + for (info = cache->infos; info < cache->infos + cache->pageCount; ++info) { + freeVIO(&info->vio); + } + } + + freeDirtyLists(&cache->dirtyLists); + freeIntMap(&cache->pageMap); + FREE(cache->infos); + FREE(cache->pages); + FREE(cache); + *cachePtr = NULL; +} + +/**********************************************************************/ +void setVDOPageCacheInitialPeriod(VDOPageCache *cache, SequenceNumber period) +{ + setCurrentPeriod(cache->dirtyLists, period); +} + +/**********************************************************************/ +void setVDOPageCacheRebuildMode(VDOPageCache *cache, bool rebuilding) +{ + cache->rebuilding = rebuilding; +} + +/** + * Assert that a function has been called on the VDO page cache's thread. + * + * @param cache the page cache + * @param functionName the name of the function + **/ +static inline void assertOnCacheThread(VDOPageCache *cache, + const char *functionName) +{ + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((threadID == cache->zone->threadID), + "%s() must only be called on cache thread %d, not thread %d", + functionName, cache->zone->threadID, threadID); +} + +/** + * Assert that a page cache may issue I/O. + * + * @param cache the page cache + **/ +static inline void assertIOAllowed(VDOPageCache *cache) +{ + ASSERT_LOG_ONLY(!isQuiescent(&cache->zone->state), + "VDO page cache may issue I/O"); +} + +/** + * Log and, if enabled, report cache pressure. 
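+ *
+ * The pressure counter is always incremented, but a message is only logged
+ * while there are more waiters than cache pages, and then only once per
+ * LOG_INTERVAL reports, so an undersized cache cannot flood the log.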
+ * + * @param cache the page cache + **/ +static void reportCachePressure(VDOPageCache *cache) +{ + relaxedAdd64(&cache->stats.cachePressure, 1); + if (cache->waiterCount > cache->pageCount) { + if ((cache->pressureReport % LOG_INTERVAL) == 0) { + logInfo("page cache pressure %llu", + relaxedLoad64(&cache->stats.cachePressure)); + } + + if (++cache->pressureReport >= DISPLAY_INTERVAL) { + cache->pressureReport = 0; + } + } +} + +/**********************************************************************/ +const char *vpcPageStateName(PageState state) +{ + static const char *stateNames[] = { + "FREE", + "INCOMING", + "FAILED", + "RESIDENT", + "DIRTY", + "OUTGOING" + }; + STATIC_ASSERT(COUNT_OF(stateNames) == PAGE_STATE_COUNT); + + int result = ASSERT(state < COUNT_OF(stateNames), + "Unknown PageState value %d", state); + if (result != UDS_SUCCESS) { + return "[UNKNOWN PAGE STATE]"; + } + + return stateNames[state]; +} + +/** + * Update the counter associated with a given state. + * + * @param info the page info to count + * @param delta the delta to apply to the counter + **/ +static void updateCounter(PageInfo *info, int32_t delta) +{ + VDOPageCache *cache = info->cache; + switch (info->state) { + case PS_FREE: + relaxedAdd64(&cache->stats.counts.freePages, delta); + return; + + case PS_INCOMING: + relaxedAdd64(&cache->stats.counts.incomingPages, delta); + return; + + case PS_OUTGOING: + relaxedAdd64(&cache->stats.counts.outgoingPages, delta); + return; + + case PS_FAILED: + relaxedAdd64(&cache->stats.counts.failedPages, delta); + return; + + case PS_RESIDENT: + relaxedAdd64(&cache->stats.counts.cleanPages, delta); + return; + + case PS_DIRTY: + relaxedAdd64(&cache->stats.counts.dirtyPages, delta); + return; + + default: + return; + } +} + +/** + * Update the lru information for an active page. + **/ +static void updateLru(PageInfo *info) +{ + VDOPageCache *cache = info->cache; + + if (cache->lruList.prev != &info->lruNode) { + pushRingNode(&cache->lruList, &info->lruNode); + } +} + +/** + * Set the state of a PageInfo and put it on the right list, adjusting + * counters. + * + * @param info the PageInfo to modify + * @param newState the new state for the PageInfo + **/ +static void setInfoState(PageInfo *info, PageState newState) +{ + if (newState == info->state) { + return; + } + + updateCounter(info, -1); + info->state = newState; + updateCounter(info, 1); + + switch (info->state) { + case PS_FREE: + case PS_FAILED: + pushRingNode(&info->cache->freeList, &info->listNode); + return; + + case PS_OUTGOING: + pushRingNode(&info->cache->outgoingList, &info->listNode); + return; + + case PS_DIRTY: + return; + + default: + unspliceRingNode(&info->listNode); + } +} + +/** + * Set the pbn for an info, updating the map as needed. + * + * @param info The page info + * @param pbn The physical block number to set + **/ +__attribute__((warn_unused_result)) +static int setInfoPBN(PageInfo *info, PhysicalBlockNumber pbn) +{ + VDOPageCache *cache = info->cache; + + // Either the new or the old page number must be NO_PAGE. + int result = ASSERT((pbn == NO_PAGE) || (info->pbn == NO_PAGE), + "Must free a page before reusing it."); + if (result != VDO_SUCCESS) { + return result; + } + + if (info->pbn != NO_PAGE) { + intMapRemove(cache->pageMap, info->pbn); + } + + info->pbn = pbn; + + if (pbn != NO_PAGE) { + result = intMapPut(cache->pageMap, pbn, info, true, NULL); + if (result != UDS_SUCCESS) { + return result; + } + } + return VDO_SUCCESS; +} + +/** + * Reset page info to represent an unallocated page. 
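+ *
+ * The reset fails (via ASSERT) if the page is still busy or still has
+ * waiters; allocateFreePage() treats such a failure as a persistent cache
+ * error.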
+ **/ +static int resetPageInfo(PageInfo *info) +{ + int result = ASSERT(info->busy == 0, "VDO Page must not be busy"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(!hasWaiters(&info->waiting), + "VDO Page must not have waiters"); + if (result != UDS_SUCCESS) { + return result; + } + + result = setInfoPBN(info, NO_PAGE); + setInfoState(info, PS_FREE); + unspliceRingNode(&info->lruNode); + return result; +} + +/** + * Find a free page. + * + * @param cache the page cache + * + * @return a pointer to the page info structure (if found), NULL otherwise + **/ +__attribute__((warn_unused_result)) +static PageInfo *findFreePage(VDOPageCache *cache) +{ + if (cache->freeList.next == &cache->freeList) { + return NULL; + } + PageInfo *info = pageInfoFromListNode(cache->freeList.next); + unspliceRingNode(&info->listNode); + return info; +} + +/**********************************************************************/ +PageInfo *vpcFindPage(VDOPageCache *cache, PhysicalBlockNumber pbn) +{ + if ((cache->lastFound != NULL) + && (cache->lastFound->pbn == pbn)) { + return cache->lastFound; + } + cache->lastFound = intMapGet(cache->pageMap, pbn); + return cache->lastFound; +} + +/** + * Determine which page is least recently used. + * + * @param cache the page cache structure + * + * @return a pointer to the info structure for a relevant page, + * or NULL if no such page can be found. The page can be + * dirty or resident. + * + * @note Picks the least recently used from among the non-busy entries + * at the front of each of the lru ring. + * Since whenever we mark a page busy we also put it to the end + * of the ring it is unlikely that the entries at the front + * are busy unless the queue is very short, but not impossible. + **/ +__attribute__((warn_unused_result)) +static PageInfo *selectLRUPage(VDOPageCache *cache) +{ + PageInfoNode *lru; + for (lru = cache->lruList.next; + lru != &cache->lruList; + lru = lru->next) { + PageInfo *info = pageInfoFromLRUNode(lru); + if ((info->busy == 0) && !isInFlight(info)) { + return info; + } + } + + return NULL; +} + +/**********************************************************************/ +AtomicPageCacheStatistics *getVDOPageCacheStatistics(VDOPageCache *cache) +{ + return &cache->stats; +} + +// ASYNCHRONOUS INTERFACE BEYOND THIS POINT + +/** + * Helper to complete the VDO Page Completion request successfully. + * + * @param info the page info representing the result page + * @param vdoPageComp the VDO page completion to complete + **/ +static void completeWithPage(PageInfo *info, VDOPageCompletion *vdoPageComp) +{ + bool available = vdoPageComp->writable ? isPresent(info) : isValid(info); + if (!available) { + logErrorWithStringError(VDO_BAD_PAGE, + "Requested cache page %llu in state %s is" + " not %s", + info->pbn, vpcPageStateName(info->state), + vdoPageComp->writable ? "present" : "valid"); + finishCompletion(&vdoPageComp->completion, VDO_BAD_PAGE); + return; + } + + vdoPageComp->info = info; + vdoPageComp->ready = true; + finishCompletion(&vdoPageComp->completion, VDO_SUCCESS); +} + +/** + * Complete a page completion with an error code. Implements WaiterCallback. + * + * @param waiter The page completion, as a waiter + * @param resultPtr A pointer to the error code. 
+ **/ +static void completeWaiterWithError(Waiter *waiter, void *resultPtr) +{ + int *result = resultPtr; + VDOPageCompletion *completion = pageCompletionFromWaiter(waiter); + finishCompletion(&completion->completion, *result); +} + +/** + * Complete a queue of VDOPageCompletions with an error code. + * + * @param [in] result the error result + * @param [in, out] queue a pointer to the queue + * + * @note upon completion the queue will be empty + **/ +static void distributeErrorOverQueue(int result, WaitQueue *queue) +{ + notifyAllWaiters(queue, completeWaiterWithError, &result); +} + +/** + * Complete a page completion with a page. Implements WaiterCallback. + * + * @param waiter The page completion, as a waiter + * @param pageInfo The page info to complete with + **/ +static void completeWaiterWithPage(Waiter *waiter, void *pageInfo) +{ + PageInfo *info = pageInfo; + VDOPageCompletion *completion = pageCompletionFromWaiter(waiter); + completeWithPage(info, completion); +} + +/** + * Complete a queue of VDOPageCompletions with a page result. + * + * @param [in] info the page info describing the page + * @param [in, out] queue a pointer to a queue of waiters + * + * @return the number of pages distributed + * + * @note upon completion the queue will be empty + * + **/ +static unsigned int distributePageOverQueue(PageInfo *info, WaitQueue *queue) +{ + updateLru(info); + + size_t pages = countWaiters(queue); + + /* + * Increment the busy count once for each pending completion so that + * this page does not stop being busy until all completions have + * been processed (VDO-83). + */ + info->busy += pages; + + notifyAllWaiters(queue, completeWaiterWithPage, info); + return pages; +} + +/** + * Set a persistent error which all requests will receive in the future. + * + * @param cache the page cache + * @param context a string describing what triggered the error + * @param result the error result + * + * Once triggered, all enqueued completions will get this error. + * Any future requests will result in this error as well. + **/ +static void setPersistentError(VDOPageCache *cache, + const char *context, + int result) +{ + // If we're already read-only, there's no need to log. 
+ ReadOnlyNotifier *notifier = cache->zone->readOnlyNotifier; + if ((result != VDO_READ_ONLY) && !isReadOnly(notifier)) { + logErrorWithStringError(result, "VDO Page Cache persistent error: %s", + context); + enterReadOnlyMode(notifier, result); + } + + assertOnCacheThread(cache, __func__); + + distributeErrorOverQueue(result, &cache->freeWaiters); + cache->waiterCount = 0; + + PageInfo *info; + for (info = cache->infos; info < cache->infos + cache->pageCount; ++info) { + distributeErrorOverQueue(result, &info->waiting); + } +} + +/**********************************************************************/ +void initVDOPageCompletion(VDOPageCompletion *pageCompletion, + VDOPageCache *cache, + PhysicalBlockNumber pbn, + bool writable, + void *parent, + VDOAction *callback, + VDOAction *errorHandler) +{ + ASSERT_LOG_ONLY((pageCompletion->waiter.nextWaiter == NULL), + "New page completion was not already on a wait queue"); + + *pageCompletion = (VDOPageCompletion) { + .pbn = pbn, + .writable = writable, + .cache = cache, + }; + + VDOCompletion *completion = &pageCompletion->completion; + initializeCompletion(completion, VDO_PAGE_COMPLETION, cache->layer); + prepareCompletion(completion, callback, errorHandler, cache->zone->threadID, + parent); +} + +/** + * Helper function to check that a completion represents a successfully + * completed VDO Page Completion referring to a valid page. + * + * @param completion a VDO completion + * @param writable whether a writable page is required + * + * @return the embedding completion if valid, NULL if not + **/ +__attribute__((warn_unused_result)) +static VDOPageCompletion *validateCompletedPage(VDOCompletion *completion, + bool writable) +{ + VDOPageCompletion *vpc = asVDOPageCompletion(completion); + + int result = ASSERT(vpc->ready, "VDO Page completion not ready"); + if (result != UDS_SUCCESS) { + return NULL; + } + + result = ASSERT(vpc->info != NULL, "VDO Page Completion must be complete"); + if (result != UDS_SUCCESS) { + return NULL; + } + + result = ASSERT(vpc->info->pbn == vpc->pbn, + "VDO Page Completion pbn must be consistent"); + if (result != UDS_SUCCESS) { + return NULL; + } + + result = ASSERT(isValid(vpc->info), + "VDO Page Completion page must be valid"); + if (result != UDS_SUCCESS) { + return NULL; + } + + if (writable) { + result = ASSERT(vpc->writable, "VDO Page Completion is writable"); + if (result != UDS_SUCCESS) { + return NULL; + } + } + + return vpc; +} + +/**********************************************************************/ +bool isPageCacheActive(VDOPageCache *cache) +{ + return ((cache->outstandingReads != 0) || (cache->outstandingWrites != 0)); +} + +/** + * VIO callback used when a page has been loaded. + * + * @param completion A completion for the VIO, the parent of which is a + * PageInfo. + **/ +static void pageIsLoaded(VDOCompletion *completion) +{ + PageInfo *info = completion->parent; + VDOPageCache *cache = info->cache; + assertOnCacheThread(cache, __func__); + + setInfoState(info, PS_RESIDENT); + distributePageOverQueue(info, &info->waiting); + + /* + * Don't decrement until right before calling checkForDrainComplete() to + * ensure that the above work can't cause the page cache to be freed out from + * under us. + */ + cache->outstandingReads--; + checkForDrainComplete(cache->zone); +} + +/** + * Handle page load errors. 
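+ *
+ * A failed load puts the VDO into read-only mode, fails every completion
+ * waiting on this page, and returns the page buffer to the free list. During
+ * a read-only rebuild, handleRebuildReadError() is used instead and treats
+ * the failure as a successful read of a zeroed page.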
+ * + * @param completion The page read VIO + **/ +static void handleLoadError(VDOCompletion *completion) +{ + int result = completion->result; + PageInfo *info = completion->parent; + VDOPageCache *cache = info->cache; + assertOnCacheThread(cache, __func__); + + enterReadOnlyMode(cache->zone->readOnlyNotifier, result); + relaxedAdd64(&cache->stats.failedReads, 1); + setInfoState(info, PS_FAILED); + distributeErrorOverQueue(result, &info->waiting); + resetPageInfo(info); + + /* + * Don't decrement until right before calling checkForDrainComplete() to + * ensure that the above work can't cause the page cache to be freed out from + * under us. + */ + cache->outstandingReads--; + checkForDrainComplete(cache->zone); +} + +/** + * Run the read hook after a page is loaded. This callback is registered in + * launchPageLoad() when there is a read hook. + * + * @param completion The page load completion + **/ +static void runReadHook(VDOCompletion *completion) +{ + PageInfo *info = completion->parent; + completion->callback = pageIsLoaded; + resetCompletion(completion); + int result = info->cache->readHook(getPageBuffer(info), info->pbn, + info->cache->zone, info->context); + continueCompletion(completion, result); +} + +/** + * Handle a read error during a read-only rebuild. + * + * @param completion The page load completion + **/ +static void handleRebuildReadError(VDOCompletion *completion) +{ + PageInfo *info = completion->parent; + VDOPageCache *cache = info->cache; + assertOnCacheThread(cache, __func__); + + // We are doing a read-only rebuild, so treat this as a successful read + // of an uninitialized page. + relaxedAdd64(&cache->stats.failedReads, 1); + memset(getPageBuffer(info), 0, VDO_BLOCK_SIZE); + resetCompletion(completion); + if (cache->readHook != NULL) { + runReadHook(completion); + } else { + pageIsLoaded(completion); + } +} + +/** + * Begin the process of loading a page. + * + * @param info the page info representing where to load the page + * @param pbn the absolute pbn of the desired page + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int launchPageLoad(PageInfo *info, PhysicalBlockNumber pbn) +{ + VDOPageCache *cache = info->cache; + assertIOAllowed(cache); + + int result = setInfoPBN(info, pbn); + if (result != VDO_SUCCESS) { + return result; + } + + result = ASSERT((info->busy == 0), "Page is not busy before loading."); + if (result != VDO_SUCCESS) { + return result; + } + + setInfoState(info, PS_INCOMING); + cache->outstandingReads++; + relaxedAdd64(&cache->stats.pagesLoaded, 1); + launchReadMetadataVIO(info->vio, pbn, + (cache->readHook != NULL) ? runReadHook : pageIsLoaded, + (cache->rebuilding + ? handleRebuildReadError : handleLoadError)); + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void writePages(VDOCompletion *completion); + +/** + * Handle errors flushing the layer. + * + * @param completion The flush VIO + **/ +static void handleFlushError(VDOCompletion *completion) +{ + VDOPageCache *cache = ((PageInfo *) completion->parent)->cache; + setPersistentError(cache, "flush failed", completion->result); + writePages(completion); +} + +/** + * Attempt to save the outgoing pages by first flushing the layer. 
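+ *
+ * The flush ensures that the recovery journal entries which dirtied these
+ * pages reach stable storage before the pages themselves; in sync mode the
+ * journal blocks are already written with FUA, so the flush is skipped and
+ * the batch is written directly.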
+ * + * @param cache The cache + **/ +static void savePages(VDOPageCache *cache) +{ + if ((cache->pagesInFlush > 0) || (cache->pagesToFlush == 0)) { + return; + } + + assertIOAllowed(cache); + + PageInfo *info = pageInfoFromListNode(cache->outgoingList.next); + cache->pagesInFlush = cache->pagesToFlush; + cache->pagesToFlush = 0; + relaxedAdd64(&cache->stats.flushCount, 1); + + VIO *vio = info->vio; + PhysicalLayer *layer = vio->completion.layer; + + /* + * We must make sure that the recovery journal entries that changed these + * pages were successfully persisted, and thus must issue a flush before + * each batch of pages is written to ensure this. However, in sync mode, + * every journal block is written with FUA, thus guaranteeing the journal + * persisted already. + */ + if (layer->getWritePolicy(layer) != WRITE_POLICY_SYNC) { + launchFlush(vio, writePages, handleFlushError); + return; + } + + writePages(&vio->completion); +} + +/** + * Add a page to the outgoing list of pages waiting to be saved. Once in the + * list, a page may not be used until it has been written out. + * + * @param info The page to save + **/ +static void schedulePageSave(PageInfo *info) +{ + if (info->busy > 0) { + info->writeStatus = WRITE_STATUS_DEFERRED; + return; + } + + info->cache->pagesToFlush++; + info->cache->outstandingWrites++; + setInfoState(info, PS_OUTGOING); +} + +/**********************************************************************/ +static void writeDirtyPagesCallback(RingNode *expired, void *context) +{ + while (!isRingEmpty(expired)) { + schedulePageSave(pageInfoFromListNode(chopRingNode(expired))); + } + + savePages((VDOPageCache *) context); +} + +/** + * Add a page to outgoing pages waiting to be saved, and then start saving + * pages if another save is not in progress. + * + * @param info The page to save + **/ +static void launchPageSave(PageInfo *info) +{ + schedulePageSave(info); + savePages(info->cache); +} + +/** + * Determine whether a given VDOPageCompletion (as a waiter) is requesting a + * given page number. Implements WaiterMatch. + * + * @param waiter The page completion in question + * @param context A pointer to the pbn of the desired page + * + * @return true if the page completion is for the desired page number + **/ +static bool completionNeedsPage(Waiter *waiter, void *context) +{ + PhysicalBlockNumber *pbn = context; + return (pageCompletionFromWaiter(waiter)->pbn == *pbn); +} + +/** + * Allocate a free page to the first completion in the waiting queue, + * and any other completions that match it in page number. + **/ +static void allocateFreePage(PageInfo *info) +{ + VDOPageCache *cache = info->cache; + assertOnCacheThread(cache, __func__); + + if (!hasWaiters(&cache->freeWaiters)) { + if (relaxedLoad64(&cache->stats.cachePressure) > 0) { + logInfo("page cache pressure relieved"); + relaxedStore64(&cache->stats.cachePressure, 0); + } + return; + } + + int result = resetPageInfo(info); + if (result != VDO_SUCCESS) { + setPersistentError(cache, "cannot reset page info", result); + return; + } + + Waiter *oldestWaiter = getFirstWaiter(&cache->freeWaiters); + PhysicalBlockNumber pbn = pageCompletionFromWaiter(oldestWaiter)->pbn; + + // Remove all entries which match the page number in question + // and push them onto the page info's wait queue. 
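+  // Every queued completion that wants this same PBN then shares the single
+  // page load launched below instead of each consuming its own free page.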
+ dequeueMatchingWaiters(&cache->freeWaiters, completionNeedsPage, + &pbn, &info->waiting); + cache->waiterCount -= countWaiters(&info->waiting); + + result = launchPageLoad(info, pbn); + if (result != VDO_SUCCESS) { + distributeErrorOverQueue(result, &info->waiting); + } +} + +/** + * Begin the process of discarding a page. + * + * @param cache the page cache + * + * @note If no page is discardable, increments a count of deferred frees so + * that the next release of a page which is no longer busy will kick + * off another discard cycle. This is an indication that the cache is + * not big enough. + * + * @note If the selected page is not dirty, immediately allocates the page + * to the oldest completion waiting for a free page. + **/ +static void discardAPage(VDOPageCache *cache) +{ + PageInfo *info = selectLRUPage(cache); + if (info == NULL) { + reportCachePressure(cache); + return; + } + + if (!isDirty(info)) { + allocateFreePage(info); + return; + } + + ASSERT_LOG_ONLY(!isInFlight(info), + "page selected for discard is not in flight"); + + ++cache->discardCount; + info->writeStatus = WRITE_STATUS_DISCARD; + launchPageSave(info); +} + +/** + * Helper used to trigger a discard so that the completion can get a different + * page. + * + * @param vdoPageComp the VDO Page completion + **/ +static void discardPageForCompletion(VDOPageCompletion *vdoPageComp) +{ + VDOPageCache *cache = vdoPageComp->cache; + + ++cache->waiterCount; + + int result = enqueueWaiter(&cache->freeWaiters, &vdoPageComp->waiter); + if (result != VDO_SUCCESS) { + setPersistentError(cache, "cannot enqueue waiter", result); + } + + discardAPage(cache); +} + +/** + * Helper used to trigger a discard if the cache needs another free page. + * + * @param cache the page cache + **/ +static void discardPageIfNeeded(VDOPageCache *cache) +{ + if (cache->waiterCount > cache->discardCount) { + discardAPage(cache); + } +} + +/**********************************************************************/ +void advanceVDOPageCachePeriod(VDOPageCache *cache, SequenceNumber period) +{ + assertOnCacheThread(cache, __func__); + advancePeriod(cache->dirtyLists, period); +} + +/** + * Inform the cache that a write has finished (possibly with an error). + * + * @param info The info structure for the page whose write just completed + * + * @return true if the page write was a discard + **/ +static bool writeHasFinished(PageInfo *info) +{ + assertOnCacheThread(info->cache, __func__); + info->cache->outstandingWrites--; + + bool wasDiscard = (info->writeStatus == WRITE_STATUS_DISCARD); + info->writeStatus = WRITE_STATUS_NORMAL; + return wasDiscard; +} + +/** + * Handler for page write errors. + * + * @param completion The page write VIO + **/ +static void handlePageWriteError(VDOCompletion *completion) +{ + int result = completion->result; + PageInfo *info = completion->parent; + VDOPageCache *cache = info->cache; + + // If we're already read-only, write failures are to be expected. 
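+  // In kernel builds the log message below is rate-limited so that a failing
+  // device cannot flood the log with one error per page write.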
+ if (result != VDO_READ_ONLY) { +#if __KERNEL__ + static DEFINE_RATELIMIT_STATE(errorLimiter, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if (__ratelimit(&errorLimiter)) { + logError("failed to write block map page %llu", info->pbn); + } +#else + logError("failed to write block map page %llu", info->pbn); +#endif + } + + setInfoState(info, PS_DIRTY); + relaxedAdd64(&cache->stats.failedWrites, 1); + setPersistentError(cache, "cannot write page", result); + + if (!writeHasFinished(info)) { + discardPageIfNeeded(cache); + } + + checkForDrainComplete(cache->zone); +} + +/** + * VIO callback used when a page has been written out. + * + * @param completion A completion for the VIO, the parent of which + * is embedded in PageInfo. + **/ +static void pageIsWrittenOut(VDOCompletion *completion) +{ + PageInfo *info = completion->parent; + VDOPageCache *cache = info->cache; + + if (cache->writeHook != NULL) { + bool rewrite = cache->writeHook(getPageBuffer(info), cache->zone, + info->context); + if (rewrite) { + launchWriteMetadataVIOWithFlush(info->vio, info->pbn, pageIsWrittenOut, + handlePageWriteError, true, false); + return; + } + } + + bool wasDiscard = writeHasFinished(info); + bool reclaimed = (!wasDiscard || (info->busy > 0) + || hasWaiters(&info->waiting)); + + setInfoState(info, PS_RESIDENT); + + uint32_t reclamations = distributePageOverQueue(info, &info->waiting); + relaxedAdd64(&cache->stats.reclaimed, reclamations); + + if (wasDiscard) { + cache->discardCount--; + } + + if (reclaimed) { + discardPageIfNeeded(cache); + } else { + allocateFreePage(info); + } + + checkForDrainComplete(cache->zone); +} + +/** + * Write the batch of pages which were covered by the layer flush which just + * completed. This callback is registered in savePages(). + * + * @param flushCompletion The flush VIO + **/ +static void writePages(VDOCompletion *flushCompletion) +{ + VDOPageCache *cache = ((PageInfo *) flushCompletion->parent)->cache; + + /* + * We need to cache these two values on the stack since in the error case + * below, it is possible for the last page info to cause the page cache to + * get freed. Hence once we launch the last page, it may be unsafe to + * dereference the cache [VDO-4724]. + */ + bool hasUnflushedPages = (cache->pagesToFlush > 0); + PageCount pagesInFlush = cache->pagesInFlush; + cache->pagesInFlush = 0; + while (pagesInFlush-- > 0) { + PageInfo *info = pageInfoFromListNode(chopRingNode(&cache->outgoingList)); + if (isReadOnly(info->cache->zone->readOnlyNotifier)) { + VDOCompletion *completion = &info->vio->completion; + resetCompletion(completion); + completion->callback = pageIsWrittenOut; + completion->errorHandler = handlePageWriteError; + finishCompletion(completion, VDO_READ_ONLY); + continue; + } + relaxedAdd64(&info->cache->stats.pagesSaved, 1); + launchWriteMetadataVIO(info->vio, info->pbn, pageIsWrittenOut, + handlePageWriteError); + } + + if (hasUnflushedPages) { + // If there are unflushed pages, the cache can't have been freed, so this + // call is safe. 
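+    // savePages() starts another flush for the pages that accumulated in
+    // pagesToFlush while this batch was in flight, so batches continue until
+    // nothing remains scheduled.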
+ savePages(cache); + } +} + +/**********************************************************************/ +void releaseVDOPageCompletion(VDOCompletion *completion) +{ + if (completion == NULL) { + return; + } + + PageInfo *discardInfo = NULL; + VDOPageCompletion *pageCompletion; + if (completion->result == VDO_SUCCESS) { + pageCompletion = validateCompletedPage(completion, false); + if (--pageCompletion->info->busy == 0) { + discardInfo = pageCompletion->info; + } + } else { + // Do not check for errors if the completion was not successful. + pageCompletion = asVDOPageCompletion(completion); + } + ASSERT_LOG_ONLY((pageCompletion->waiter.nextWaiter == NULL), + "Page being released after leaving all queues"); + + VDOPageCache *cache = pageCompletion->cache; + assertOnCacheThread(cache, __func__); + memset(pageCompletion, 0, sizeof(VDOPageCompletion)); + + if (discardInfo != NULL) { + if (discardInfo->writeStatus == WRITE_STATUS_DEFERRED) { + discardInfo->writeStatus = WRITE_STATUS_NORMAL; + launchPageSave(discardInfo); + } + // if there are excess requests for pages (that have not already started + // discards) we need to discard some page (which may be this one) + discardPageIfNeeded(cache); + } +} + +/** + * Helper function to load a page as described by a VDO Page Completion. + * + * @param info the page info representing where to load the page + * @param vdoPageComp the VDO Page Completion describing the page + **/ +static void loadPageForCompletion(PageInfo *info, + VDOPageCompletion *vdoPageComp) +{ + int result = enqueueWaiter(&info->waiting, &vdoPageComp->waiter); + if (result != VDO_SUCCESS) { + finishCompletion(&vdoPageComp->completion, result); + return; + } + + result = launchPageLoad(info, vdoPageComp->pbn); + if (result != VDO_SUCCESS) { + distributeErrorOverQueue(result, &info->waiting); + } +} + +/**********************************************************************/ +void getVDOPageAsync(VDOCompletion *completion) +{ + VDOPageCompletion *vdoPageComp = asVDOPageCompletion(completion); + VDOPageCache *cache = vdoPageComp->cache; + assertOnCacheThread(cache, __func__); + + if (vdoPageComp->writable && isReadOnly(cache->zone->readOnlyNotifier)) { + finishCompletion(completion, VDO_READ_ONLY); + return; + } + + if (vdoPageComp->writable) { + relaxedAdd64(&cache->stats.writeCount, 1); + } else { + relaxedAdd64(&cache->stats.readCount, 1); + } + + PageInfo *info = vpcFindPage(cache, vdoPageComp->pbn); + if (info != NULL) { + // The page is in the cache already. + if ((info->writeStatus == WRITE_STATUS_DEFERRED) || isIncoming(info) + || (isOutgoing(info) && vdoPageComp->writable)) { + // The page is unusable until it has finished I/O. + relaxedAdd64(&cache->stats.waitForPage, 1); + int result = enqueueWaiter(&info->waiting, &vdoPageComp->waiter); + if (result != VDO_SUCCESS) { + finishCompletion(&vdoPageComp->completion, result); + } + + return; + } + + if (isValid(info)) { + // The page is usable. + relaxedAdd64(&cache->stats.foundInCache, 1); + if (!isPresent(info)) { + relaxedAdd64(&cache->stats.readOutgoing, 1); + } + updateLru(info); + ++info->busy; + completeWithPage(info, vdoPageComp); + return; + } + // Something horrible has gone wrong. + ASSERT_LOG_ONLY(false, "Info found in a usable state."); + } + + // The page must be fetched. + info = findFreePage(cache); + if (info != NULL) { + relaxedAdd64(&cache->stats.fetchRequired, 1); + loadPageForCompletion(info, vdoPageComp); + return; + } + + // The page must wait for a page to be discarded. 
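+  // discardPageForCompletion() queues this completion on the cache's
+  // freeWaiters and nominates the least-recently-used non-busy page for
+  // discard; allocateFreePage() later hands the reclaimed slot to the oldest
+  // waiter.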
+ relaxedAdd64(&cache->stats.discardRequired, 1); + discardPageForCompletion(vdoPageComp); +} + +/**********************************************************************/ +void markCompletedVDOPageDirty(VDOCompletion *completion, + SequenceNumber oldDirtyPeriod, + SequenceNumber newDirtyPeriod) +{ + VDOPageCompletion *vdoPageComp = validateCompletedPage(completion, true); + if (vdoPageComp == NULL) { + return; + } + + PageInfo *info = vdoPageComp->info; + setInfoState(info, PS_DIRTY); + addToDirtyLists(info->cache->dirtyLists, &info->listNode, oldDirtyPeriod, + newDirtyPeriod); +} + +/**********************************************************************/ +void requestVDOPageWrite(VDOCompletion *completion) +{ + VDOPageCompletion *vdoPageComp = validateCompletedPage(completion, true); + if (vdoPageComp == NULL) { + return; + } + + PageInfo *info = vdoPageComp->info; + setInfoState(info, PS_DIRTY); + launchPageSave(info); +} + +/**********************************************************************/ +static void *dereferencePageCompletion(VDOPageCompletion *completion) +{ + return ((completion != NULL) ? getPageBuffer(completion->info) : NULL); +} + +/**********************************************************************/ +const void *dereferenceReadableVDOPage(VDOCompletion *completion) +{ + return dereferencePageCompletion(validateCompletedPage(completion, false)); +} + +/**********************************************************************/ +void *dereferenceWritableVDOPage(VDOCompletion *completion) +{ + return dereferencePageCompletion(validateCompletedPage(completion, true)); +} + +/**********************************************************************/ +void *getVDOPageCompletionContext(VDOCompletion *completion) +{ + VDOPageCompletion *pageCompletion = asVDOPageCompletion(completion); + PageInfo *info = ((pageCompletion != NULL) ? pageCompletion->info : NULL); + return (((info != NULL) && isValid(info)) ? info->context : NULL); +} + +/**********************************************************************/ +void drainVDOPageCache(VDOPageCache *cache) +{ + assertOnCacheThread(cache, __func__); + ASSERT_LOG_ONLY(isDraining(&cache->zone->state), + "drainVDOPageCache() called during block map drain"); + + if (!isSuspending(&cache->zone->state)) { + flushDirtyLists(cache->dirtyLists); + savePages(cache); + } +} + +/**********************************************************************/ +int invalidateVDOPageCache(VDOPageCache *cache) +{ + assertOnCacheThread(cache, __func__); + + // Make sure we don't throw away any dirty pages. + PageInfo *info; + for (info = cache->infos; info < cache->infos + cache->pageCount; info++) { + int result = ASSERT(!isDirty(info), "cache must have no dirty pages"); + if (result != VDO_SUCCESS) { + return result; + } + } + + // Reset the pageMap by re-allocating it. + freeIntMap(&cache->pageMap); + return makeIntMap(cache->pageCount, 0, &cache->pageMap); +} diff --git a/vdo/base/vdoPageCache.h b/vdo/base/vdoPageCache.h new file mode 100644 index 0000000..e6a944d --- /dev/null +++ b/vdo/base/vdoPageCache.h @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoPageCache.h#7 $ + */ + +#ifndef VDO_PAGE_CACHE_H +#define VDO_PAGE_CACHE_H + +#include "adminState.h" +#include "atomic.h" +#include "completion.h" +#include "types.h" +#include "waitQueue.h" + +/** + * Structure describing page meta data (defined internally). + **/ +typedef struct pageInfo PageInfo; + +/** + * Structure describing entire page cache. + * (Unfortunately the name "PageCache" is already taken by Albireo.) + **/ +typedef struct vdoPageCache VDOPageCache; + +/** + * Generation counter for page references. + **/ +typedef uint32_t VDOPageGeneration; + +/** + * Page-state count statistics sub-structure. + **/ +typedef struct { + /* free pages */ + Atomic64 freePages; + /* clean (resident) pages */ + Atomic64 cleanPages; + /* dirty pages per era */ + Atomic64 dirtyPages; + /* pages incoming */ + Atomic64 incomingPages; + /* pages outgoing */ + Atomic64 outgoingPages; + /* pages in failed state */ + Atomic64 failedPages; +} AtomicPageStateCounts; + +/** + * Statistics and debugging fields for the page cache. + */ +typedef struct { + /* counts of how many pages are in each state */ + AtomicPageStateCounts counts; + /* how many times free page not available */ + Atomic64 cachePressure; + /* number of getVDOPageAsync() for read */ + Atomic64 readCount; + /* number or getVDOPageAsync() for write */ + Atomic64 writeCount; + /* number of times pages failed to read */ + Atomic64 failedReads; + /* number of times pages failed to write */ + Atomic64 failedWrites; + /* number of gets that are reclaimed */ + Atomic64 reclaimed; + /* number of gets for outgoing pages */ + Atomic64 readOutgoing; + /* number of gets that were already there */ + Atomic64 foundInCache; + /* number of gets requiring discard */ + Atomic64 discardRequired; + /* number of gets enqueued for their page */ + Atomic64 waitForPage; + /* number of gets that have to fetch */ + Atomic64 fetchRequired; + /* number of page fetches */ + Atomic64 pagesLoaded; + /* number of page saves */ + Atomic64 pagesSaved; + /* number of flushes initiated */ + Atomic64 flushCount; +} AtomicPageCacheStatistics; + +/** + * Signature for a function to call when a page is read into the cache. + * + *
+ * If specified, this function is called when a page is fetched from disk.
+ *
+ * @param rawPage      The raw memory of the freshly-fetched page
+ * @param pbn          The absolute physical block number of the page
+ * @param zone         The block map zone to which the cache belongs
+ * @param pageContext  A pointer to client-specific data for the new page
+ *
+ * @return VDO_SUCCESS on success or VDO_BAD_PAGE if the page is incorrectly
+ *         formatted
+ **/
+typedef int VDOPageReadFunction(void *rawPage,
+                                PhysicalBlockNumber pbn,
+                                BlockMapZone *zone,
+                                void *pageContext);
+
+/**
+ * Signature for a function to call when a page is written from the cache.
+ *
If specified, this function is called when a page is written to disk. + * + * @param rawPage The raw memory of the freshly-written page + * @param zone The block map zone to which the cache belongs + * @param pageContext A pointer to client-specific data for the new page + * + * @return whether the page needs to be rewritten + **/ +typedef bool VDOPageWriteFunction(void *rawPage, + BlockMapZone *zone, + void *pageContext); + +/** + * Construct a PageCache. + * + * @param [in] layer The physical layer to read and write + * @param [in] pageCount The number of cache pages to hold + * @param [in] readHook The function to be called when a page is read + * into the cache + * @param [in] writeHook The function to be called after a page is + * written from the cache + * @param [in] pageContextSize The size of the per-page context that will be + * passed to the read and write hooks + * @param [in] maximumAge The number of journal blocks before a dirtied + * page is considered old and must be written + * out + * @param [in] zone The block map zone which owns this cache + * @param [out] cachePtr A pointer to hold the cache + * + * @return a success or error code + **/ +int makeVDOPageCache(PhysicalLayer *layer, + PageCount pageCount, + VDOPageReadFunction *readHook, + VDOPageWriteFunction *writeHook, + size_t pageContextSize, + BlockCount maximumAge, + BlockMapZone *zone, + VDOPageCache **cachePtr) + __attribute__((warn_unused_result)); + +/** + * Free the page cache structure and null out the reference to it. + * + * @param cachePtr a pointer to the cache to free + **/ +void freeVDOPageCache(VDOPageCache **cachePtr); + +/** + * Set the initial dirty period for a page cache. + * + * @param cache The cache + * @param period The initial dirty period to set + **/ +void setVDOPageCacheInitialPeriod(VDOPageCache *cache, SequenceNumber period); + +/** + * Switch the page cache into or out of read-only rebuild mode. + * + * @param cache The cache + * @param rebuilding true if the cache should be put into + * read-only rebuild mode, false otherwise + **/ +void setVDOPageCacheRebuildMode(VDOPageCache *cache, bool rebuilding); + +/** + * Check whether a page cache is active (i.e. has any active lookups, + * outstanding I/O, or pending I/O). + * + * @param cache The cache to check + * + * @return true if the cache is active + **/ +bool isPageCacheActive(VDOPageCache *cache) + __attribute__((warn_unused_result)); + +/** + * Advance the dirty period for a page cache. + * + * @param cache The cache to advance + * @param period The new dirty period + **/ +void advanceVDOPageCachePeriod(VDOPageCache *cache, SequenceNumber period); + +/** + * Write one or more batches of dirty pages. + * + * All writable pages in the ancient era and some number in the old era + * are scheduled for writing. + * + * @param cache the VDO page cache + * @param batches how many batches to write now + * @param total how many batches (including those being written now) remain + * in this era + **/ +void writeVDOPageCachePages(VDOPageCache *cache, + size_t batches, + size_t total); + +/** + * Rotate the dirty page eras. + * + * Move all pages in the old era to the ancient era and then move + * the current era bin into the old era. + * + * @param cache the VDO page cache + **/ +void rotateVDOPageCacheEras(VDOPageCache *cache); + +// ASYNC + +/** + * A completion awaiting a specific page. Also a live reference into the + * page once completed, until freed. 
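+ *
+ * A minimal usage sketch (not from the original header); cache, pbn, and
+ * parent are assumed to exist, the two callbacks are placeholders, and error
+ * handling is omitted:
+ *
+ *   VDOPageCompletion pageCompletion;
+ *   initVDOPageCompletion(&pageCompletion, cache, pbn, true, parent,
+ *                         pageReadyCallback, pageErrorHandler);
+ *   getVDOPageAsync(&pageCompletion.completion);
+ *
+ * Once pageReadyCallback runs, dereferenceWritableVDOPage() returns the page
+ * buffer; the page stays busy until releaseVDOPageCompletion() is called.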
+ **/ +typedef struct { + /** The generic completion */ + VDOCompletion completion; + /** The cache involved */ + VDOPageCache *cache; + /** The waiter for the pending list */ + Waiter waiter; + /** The absolute physical block number of the page on disk */ + PhysicalBlockNumber pbn; + /** Whether the page may be modified */ + bool writable; + /** Whether the page is available */ + bool ready; + /** The info structure for the page, only valid when ready */ + PageInfo *info; +} VDOPageCompletion; + +/** + * Initialize a VDO Page Completion, requesting a particular page from the + * cache. + * + * @param pageCompletion The VDOPageCompletion to initialize + * @param cache The VDO page cache + * @param pbn The absolute physical block of the desired page + * @param writable Whether the page can be modified + * @param parent The parent object + * @param callback The completion callback + * @param errorHandler The handler for page errors + * + * @note Once a completion has occurred for the getVDOPageAsync operation, + * the underlying page shall be busy (stuck in memory) until the + * VDOCompletion returned by this operation has been released. + **/ +void initVDOPageCompletion(VDOPageCompletion *pageCompletion, + VDOPageCache *cache, + PhysicalBlockNumber pbn, + bool writable, + void *parent, + VDOAction *callback, + VDOAction *errorHandler); + +/** + * Release a VDO Page Completion. + * + * The page referenced by this completion (if any) will no longer be + * held busy by this completion. If a page becomes discardable and + * there are completions awaiting free pages then a new round of + * page discarding is started. + * + * @param completion The completion to release + **/ +void releaseVDOPageCompletion(VDOCompletion *completion); + +/** + * Asynchronous operation to get a VDO page. + * + * May cause another page to be discarded (potentially writing a dirty page) + * and the one nominated by the completion to be loaded from disk. + * + * When the page becomes available the callback registered in the completion + * provided is triggered. Once triggered the page is marked busy until + * the completion is destroyed. + * + * @param completion the completion initialized my initVDOPageCompletion(). + **/ +void getVDOPageAsync(VDOCompletion *completion); + +/** + * Mark a VDO page referenced by a completed VDOPageCompletion as dirty. + * + * @param completion a VDO Page Completion whose callback has been called + * @param oldDirtyPeriod the period in which the page was already dirty (0 if + * it wasn't) + * @param newDirtyPeriod the period in which the page is now dirty + **/ +void markCompletedVDOPageDirty(VDOCompletion *completion, + SequenceNumber oldDirtyPeriod, + SequenceNumber newDirtyPeriod); + +/** + * Request that a VDO page be written out as soon as it is not busy. + * + * @param completion the VDOPageCompletion containing the page + **/ +void requestVDOPageWrite(VDOCompletion *completion); + +/** + * Access the raw memory for a read-only page of a completed VDOPageCompletion. + * + * @param completion a vdo page completion whose callback has been called + * + * @return a pointer to the raw memory at the beginning of the page, or + * NULL if the page is not available. + **/ +const void *dereferenceReadableVDOPage(VDOCompletion *completion); + +/** + * Access the raw memory for a writable page of a completed VDOPageCompletion. 
+ * + * @param completion a vdo page completion whose callback has been called + * + * @return a pointer to the raw memory at the beginning of the page, or + * NULL if the page is not available, or if the page is read-only + **/ +void *dereferenceWritableVDOPage(VDOCompletion *completion); + +/** + * Get the per-page client context for the page in a page completion whose + * callback has been invoked. Should only be called after dereferencing the + * page completion to validate the page. + * + * @param completion a vdo page completion whose callback has been invoked + * + * @return a pointer to the per-page client context, or NULL if + * the page is not available + **/ +void *getVDOPageCompletionContext(VDOCompletion *completion); + +/** + * Drain I/O for a page cache. + * + * @param cache The cache to drain + **/ +void drainVDOPageCache(VDOPageCache *cache); + +/** + * Invalidate all entries in the VDO page cache. There must not be any + * dirty pages in the cache. + * + * @param cache the cache to invalidate + * + * @return a success or error code + **/ +int invalidateVDOPageCache(VDOPageCache *cache) + __attribute__((warn_unused_result)); + +// STATISTICS & TESTING + +/** + * Get current cache statistics. + * + * @param cache the page cache + * + * @return the statistics + **/ +AtomicPageCacheStatistics *getVDOPageCacheStatistics(VDOPageCache *cache) + __attribute__((warn_unused_result)); + +#endif // VDO_PAGE_CACHE_H diff --git a/vdo/base/vdoPageCacheInternals.h b/vdo/base/vdoPageCacheInternals.h new file mode 100644 index 0000000..4e2c67f --- /dev/null +++ b/vdo/base/vdoPageCacheInternals.h @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoPageCacheInternals.h#8 $ + */ + +#ifndef VDO_PAGE_CACHE_INTERNALS_H +#define VDO_PAGE_CACHE_INTERNALS_H + +#include "vdoPageCache.h" + +#ifndef __KERNEL__ +# include +#endif + +#include "blockMapInternals.h" +#include "completion.h" +#include "dirtyLists.h" +#include "intMap.h" +#include "physicalLayer.h" +#include "ringNode.h" + +enum { + MAX_PAGE_CONTEXT_SIZE = 8, +}; + +static const PhysicalBlockNumber NO_PAGE = 0xFFFFFFFFFFFFFFFF; + +/** + * A PageInfoNode is a ring node. + **/ +typedef RingNode PageInfoNode; + +/** + * The VDO Page Cache abstraction. 
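+ *
+ * Each page buffer cycles through the PageState values defined below: free,
+ * incoming, resident, dirty, outgoing, or failed; only resident and dirty
+ * pages hold stable data that may be accessed.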
+ **/ +struct vdoPageCache { + /** the physical layer to page to */ + PhysicalLayer *layer; + /** number of pages in cache */ + PageCount pageCount; + /** function to call on page read */ + VDOPageReadFunction *readHook; + /** function to call on page write */ + VDOPageWriteFunction *writeHook; + /** number of pages to write in the current batch */ + PageCount pagesInBatch; + /** Whether the VDO is doing a read-only rebuild */ + bool rebuilding; + + /** array of page information entries */ + PageInfo *infos; + /** raw memory for pages */ + char *pages; + /** cache last found page info */ + PageInfo *lastFound; + /** map of page number to info */ + IntMap *pageMap; + /** master LRU list (all infos) */ + PageInfoNode lruList; + /** dirty pages by period */ + DirtyLists *dirtyLists; + /** free page list (oldest first) */ + PageInfoNode freeList; + /** outgoing page list */ + PageInfoNode outgoingList; + /** number of read I/O operations pending */ + PageCount outstandingReads; + /** number of write I/O operations pending */ + PageCount outstandingWrites; + /** number of pages covered by the current flush */ + PageCount pagesInFlush; + /** number of pages waiting to be included in the next flush */ + PageCount pagesToFlush; + /** number of discards in progress */ + unsigned int discardCount; + /** how many VPCs waiting for free page */ + unsigned int waiterCount; + /** queue of waiters who want a free page */ + WaitQueue freeWaiters; + /** statistics */ + AtomicPageCacheStatistics stats; + /** counter for pressure reports */ + uint32_t pressureReport; + /** the block map zone to which this cache belongs */ + BlockMapZone *zone; +}; + +/** + * The state of a page buffer. If the page buffer is free no particular page is + * bound to it, otherwise the page buffer is bound to particular page whose + * absolute pbn is in the pbn field. If the page is resident or dirty the page + * data is stable and may be accessed. Otherwise the page is in flight + * (incoming or outgoing) and its data should not be accessed. + * + * @note Update the static data in vpcPageStateName() and vpcPageStateFlag() + * if you change this enumeration. + **/ +typedef enum __attribute__((packed)) pageState { + /* this page buffer is not being used */ + PS_FREE, + /* this page is being read from store */ + PS_INCOMING, + /* attempt to load this page failed */ + PS_FAILED, + /* this page is valid and un-modified */ + PS_RESIDENT, + /* this page is valid and modified */ + PS_DIRTY, + /* this page is being written and should not be used */ + PS_OUTGOING, + /* not a state */ + PAGE_STATE_COUNT, +} PageState; + +/** + * The write status of page + **/ +typedef enum __attribute__((packed)) { + WRITE_STATUS_NORMAL, + WRITE_STATUS_DISCARD, + WRITE_STATUS_DEFERRED, +} WriteStatus; + +/** + * Per-page-slot information. 
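+ *
+ * The busy count is raised once for each completion currently holding the
+ * page and is dropped by releaseVDOPageCompletion(); while it is non-zero
+ * the page is neither discarded nor written out (saves are deferred).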
+ **/ +struct pageInfo { + /** Preallocated page VIO */ + VIO *vio; + /** back-link for references */ + VDOPageCache *cache; + /** the pbn of the page */ + PhysicalBlockNumber pbn; + /** page is busy (temporarily locked) */ + uint16_t busy; + /** the write status the page */ + WriteStatus writeStatus; + /** page state */ + PageState state; + /** queue of completions awaiting this item */ + WaitQueue waiting; + /** state linked list node */ + PageInfoNode listNode; + /** LRU node */ + PageInfoNode lruNode; + /** Space for per-page client data */ + byte context[MAX_PAGE_CONTEXT_SIZE]; +}; + +// PAGE INFO LIST OPERATIONS + +/**********************************************************************/ +static inline PageInfo *pageInfoFromListNode(PageInfoNode *node) +{ + if (node == NULL) { + return NULL; + } + return (PageInfo *) ((uintptr_t) node - offsetof(PageInfo, listNode)); +} + +/**********************************************************************/ +static inline PageInfo *pageInfoFromLRUNode(PageInfoNode *node) +{ + if (node == NULL) { + return NULL; + } + return (PageInfo *) ((uintptr_t) node - offsetof(PageInfo, lruNode)); +} + +// PAGE INFO STATE ACCESSOR FUNCTIONS + +/**********************************************************************/ +static inline bool isFree(const PageInfo *info) +{ + return info->state == PS_FREE; +} + +/**********************************************************************/ +static inline bool isAvailable(const PageInfo *info) +{ + return (info->state == PS_FREE) || (info->state == PS_FAILED); +} + +/**********************************************************************/ +static inline bool isPresent(const PageInfo *info) +{ + return (info->state == PS_RESIDENT) || (info->state == PS_DIRTY); +} + +/**********************************************************************/ +static inline bool isDirty(const PageInfo *info) +{ + return info->state == PS_DIRTY; +} + +/**********************************************************************/ +static inline bool isResident(const PageInfo *info) +{ + return info->state == PS_RESIDENT; +} + +/**********************************************************************/ +static inline bool isInFlight(const PageInfo *info) +{ + return (info->state == PS_INCOMING) || (info->state == PS_OUTGOING); +} + +/**********************************************************************/ +static inline bool isIncoming(const PageInfo *info) +{ + return info->state == PS_INCOMING; +} + +/**********************************************************************/ +static inline bool isOutgoing(const PageInfo *info) +{ + return info->state == PS_OUTGOING; +} + +/**********************************************************************/ +static inline bool isValid(const PageInfo *info) +{ + return isPresent(info) || isOutgoing(info); +} + +// COMPLETION CONVERSIONS + +/**********************************************************************/ +static inline VDOPageCompletion *asVDOPageCompletion(VDOCompletion *completion) +{ + assertCompletionType(completion->type, VDO_PAGE_COMPLETION); + return (VDOPageCompletion *) ((uintptr_t) completion + - offsetof(VDOPageCompletion, completion)); +} + +/**********************************************************************/ +static inline +VDOPageCompletion *pageCompletionFromWaiter(Waiter *waiter) +{ + if (waiter == NULL) { + return NULL; + } + + VDOPageCompletion *completion = (VDOPageCompletion *) + ((uintptr_t) waiter - offsetof(VDOPageCompletion, waiter)); + assertCompletionType(completion->completion.type, 
VDO_PAGE_COMPLETION); + return completion; +} + +// COMMONLY USED FUNCTIONS + +// All of these functions are prefixed "vpc" in order to prevent namespace +// issues (ordinarily they would be static). + +/** + * Find the page info (if any) associated with a given pbn. + * + * @param cache the page cache + * @param pbn the absolute physical block number of the page + * + * @return the page info for the page if available, or NULL if not + **/ +PageInfo *vpcFindPage(VDOPageCache *cache, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Return the name of a page state. + * + * @param state a page state + * + * @return a pointer to a static page state name + * + * @note If the page state is invalid a static string is returned and the + * invalid state is logged. + **/ +const char *vpcPageStateName(PageState state) + __attribute__((warn_unused_result)); + +#endif // VDO_PAGE_CACHE_INTERNALS_H diff --git a/vdo/base/vdoRecovery.c b/vdo/base/vdoRecovery.c new file mode 100644 index 0000000..97e72eb --- /dev/null +++ b/vdo/base/vdoRecovery.c @@ -0,0 +1,1257 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoRecovery.c#16 $ + */ + +#include "vdoRecoveryInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockAllocator.h" +#include "blockAllocatorInternals.h" +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "blockMapRecovery.h" +#include "completion.h" +#include "numUtils.h" +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournal.h" +#include "recoveryUtils.h" +#include "slab.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "slabJournalInternals.h" +#include "vdoInternal.h" +#include "waitQueue.h" + +enum { + // The int map needs capacity of twice the number of VIOs in the system. + INT_MAP_CAPACITY = MAXIMUM_USER_VIOS * 2, + // There can be as many missing decrefs as there are VIOs in the system. + MAXIMUM_SYNTHESIZED_DECREFS = MAXIMUM_USER_VIOS, +}; + +typedef struct missingDecref { + /** A waiter for queueing this object */ + Waiter waiter; + /** The parent of this object */ + RecoveryCompletion *recovery; + /** Whether this decref is complete */ + bool complete; + /** The slot for which the last decref was lost */ + BlockMapSlot slot; + /** The penultimate block map entry for this LBN */ + DataLocation penultimateMapping; + /** The page completion used to fetch the block map page for this LBN */ + VDOPageCompletion pageCompletion; + /** The journal point which will be used for this entry */ + JournalPoint journalPoint; + /** The slab journal to which this entry will be applied */ + SlabJournal *slabJournal; +} MissingDecref; + +/** + * Convert a Waiter to the missing decref of which it is a part. 
+ * + * @param waiter The Waiter to convert + * + * @return The MissingDecref wrapping the Waiter + **/ +__attribute__((warn_unused_result)) +static inline MissingDecref *asMissingDecref(Waiter *waiter) +{ + STATIC_ASSERT(offsetof(MissingDecref, waiter) == 0); + return (MissingDecref *) waiter; +} + +/** + * Enqueue a MissingDecref. If the enqueue fails, enter read-only mode. + * + * @param queue The queue on which to enqueue the decref + * @param decref The MissingDecref to enqueue + * + * @return VDO_SUCCESS or an error + **/ +static int enqueueMissingDecref(WaitQueue *queue, MissingDecref *decref) +{ + int result = enqueueWaiter(queue, &decref->waiter); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(decref->recovery->vdo->readOnlyNotifier, result); + setCompletionResult(&decref->recovery->completion, result); + FREE(decref); + } + + return result; +} + +/** + * Convert a BlockMapSlot into a unique uint64_t. + * + * @param slot The block map slot to convert. + * + * @return a one-to-one mappable uint64_t. + **/ +static uint64_t slotAsNumber(BlockMapSlot slot) +{ + return (((uint64_t) slot.pbn << 10) + slot.slot); +} + +/** + * Create a MissingDecref and enqueue it to wait for a determination of its + * penultimate mapping. + * + * @param [in] recovery The parent recovery completion + * @param [in] entry The recovery journal entry for the increment which is + * missing a decref + * @param [out] decrefPtr A pointer to hold the new MissingDecref + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int makeMissingDecref(RecoveryCompletion *recovery, + RecoveryJournalEntry entry, + MissingDecref **decrefPtr) +{ + MissingDecref *decref; + int result = ALLOCATE(1, MissingDecref, __func__, &decref); + if (result != VDO_SUCCESS) { + return result; + } + + decref->recovery = recovery; + result = enqueueMissingDecref(&recovery->missingDecrefs[0], decref); + if (result != VDO_SUCCESS) { + return result; + } + + /* + * Each synthsized decref needs a unique journal point. Otherwise, in the + * event of a crash, we would be unable to tell which synthesized decrefs had + * already been committed in the slab journals. Instead of using real + * recovery journal space for this, we can use fake journal points between + * the last currently valid entry in the tail block and the first journal + * entry in the next block. We can't overflow the entry count since the + * number of synthesized decrefs is bounded by the DataVIO limit. + * + * It is vital that any given missing decref always have the same fake + * journal point since a failed recovery may be retried with a different + * number of zones after having written out some slab journal blocks. Since + * the missing decrefs are always read out of the journal in the same order, + * we can assign them a journal point when they are read. Their subsequent + * use will ensure that, for any given slab journal, they are applied in + * the order dictated by these assigned journal points. + */ + decref->slot = entry.slot; + decref->journalPoint = recovery->nextSynthesizedJournalPoint; + recovery->nextSynthesizedJournalPoint.entryCount++; + recovery->missingDecrefCount++; + recovery->incompleteDecrefCount++; + + *decrefPtr = decref; + return VDO_SUCCESS; +} + +/** + * Move the given recovery point forward by one entry. 
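+ *
+ * The entry count advances within the current sector; entry sectors are
+ * numbered starting at 1, and the last sector of a block holds
+ * RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR entries rather than
+ * RECOVERY_JOURNAL_ENTRIES_PER_SECTOR. Filling a sector advances the sector
+ * count, and filling the last sector advances the sequence number and resets
+ * the point to sector 1, entry 0 of the next block.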
+ * + * @param point The recovery point to alter + **/ +static void incrementRecoveryPoint(RecoveryPoint *point) +{ + point->entryCount++; + if ((point->sectorCount == (SECTORS_PER_BLOCK - 1)) + && (point->entryCount == RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR)) { + point->sequenceNumber++; + point->sectorCount = 1; + point->entryCount = 0; + } + + if (point->entryCount == RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) { + point->sectorCount++; + point->entryCount = 0; + return; + } +} + +/** + * Move the given recovery point backwards by one entry. + * + * @param point The recovery point to alter + **/ +static void decrementRecoveryPoint(RecoveryPoint *point) +{ + STATIC_ASSERT(RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR > 0); + + if ((point->sectorCount <= 1) && (point->entryCount == 0)) { + point->sequenceNumber--; + point->sectorCount = SECTORS_PER_BLOCK - 1; + point->entryCount = RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR - 1; + return; + } + + if (point->entryCount == 0) { + point->sectorCount--; + point->entryCount = RECOVERY_JOURNAL_ENTRIES_PER_SECTOR - 1; + return; + } + + point->entryCount--; +} + +/** + * Check whether the first point precedes the second point. + * + * @param first The first recovery point + * @param second The second recovery point + * + * @return true if the first point precedes the second point + **/ +__attribute__((warn_unused_result)) +static bool beforeRecoveryPoint(const RecoveryPoint *first, + const RecoveryPoint *second) +{ + if (first->sequenceNumber < second->sequenceNumber) { + return true; + } + + if (first->sequenceNumber > second->sequenceNumber) { + return false; + } + + if (first->sectorCount < second->sectorCount) { + return true; + } + + return ((first->sectorCount == second->sectorCount) + && (first->entryCount < second->entryCount)); +} + +/** + * Prepare the sub-task completion. + * + * @param recovery The RecoveryCompletion whose sub-task completion is to + * be prepared + * @param callback The callback to register for the next sub-task + * @param errorHandler The error handler for the next sub-task + * @param zoneType The type of zone on which the callback or errorHandler + * should run + **/ +static void prepareSubTask(RecoveryCompletion *recovery, + VDOAction callback, + VDOAction errorHandler, + ZoneType zoneType) +{ + const ThreadConfig *threadConfig = getThreadConfig(recovery->vdo); + ThreadID threadID; + switch (zoneType) { + case ZONE_TYPE_LOGICAL: + // All blockmap access is done on single thread, so use logical zone 0. 
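+    // During recovery the block map is read through logical zone 0's page
+    // cache (see findSlabJournalEntries() and processFetchedPage()), so any
+    // sub-task touching the block map must run on that zone's thread.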
+ threadID = getLogicalZoneThread(threadConfig, 0); + break; + + case ZONE_TYPE_PHYSICAL: + threadID = recovery->allocator->threadID; + break; + + case ZONE_TYPE_ADMIN: + default: + threadID = getAdminThread(threadConfig); + } + + prepareCompletion(&recovery->subTaskCompletion, callback, errorHandler, + threadID, recovery); +} + +/**********************************************************************/ +int makeRecoveryCompletion(VDO *vdo, RecoveryCompletion **recoveryPtr) +{ + const ThreadConfig *threadConfig = getThreadConfig(vdo); + RecoveryCompletion *recovery; + int result = ALLOCATE_EXTENDED(RecoveryCompletion, + threadConfig->physicalZoneCount, RingNode, + __func__, &recovery); + if (result != VDO_SUCCESS) { + return result; + } + + recovery->vdo = vdo; + for (ZoneCount z = 0; z < threadConfig->physicalZoneCount; z++) { + initializeWaitQueue(&recovery->missingDecrefs[z]); + } + + result = initializeEnqueueableCompletion(&recovery->completion, + RECOVERY_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + freeRecoveryCompletion(&recovery); + return result; + } + + result = initializeEnqueueableCompletion(&recovery->subTaskCompletion, + SUB_TASK_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + freeRecoveryCompletion(&recovery); + return result; + } + + result = makeIntMap(INT_MAP_CAPACITY, 0, &recovery->slotEntryMap); + if (result != VDO_SUCCESS) { + freeRecoveryCompletion(&recovery); + return result; + } + + *recoveryPtr = recovery; + return VDO_SUCCESS; +} + +/** + * A waiter callback to free MissingDecrefs. + * + * Implements WaiterCallback. + **/ +static void freeMissingDecref(Waiter *waiter, + void *context __attribute__((unused))) +{ + FREE(asMissingDecref(waiter)); +} + +/**********************************************************************/ +void freeRecoveryCompletion(RecoveryCompletion **recoveryPtr) +{ + RecoveryCompletion *recovery = *recoveryPtr; + if (recovery == NULL) { + return; + } + + freeIntMap(&recovery->slotEntryMap); + const ThreadConfig *threadConfig = getThreadConfig(recovery->vdo); + for (ZoneCount z = 0; z < threadConfig->physicalZoneCount; z++) { + notifyAllWaiters(&recovery->missingDecrefs[z], freeMissingDecref, NULL); + } + + FREE(recovery->journalData); + FREE(recovery->entries); + destroyEnqueueable(&recovery->subTaskCompletion); + destroyEnqueueable(&recovery->completion); + FREE(recovery); + *recoveryPtr = NULL; +} + +/** + * Finish recovering, free the recovery completion and notify the parent. + * + * @param completion The recovery completion + **/ +static void finishRecovery(VDOCompletion *completion) +{ + VDOCompletion *parent = completion->parent; + RecoveryCompletion *recovery = asRecoveryCompletion(completion); + VDO *vdo = recovery->vdo; + uint64_t recoveryCount = ++vdo->completeRecoveries; + initializeRecoveryJournalPostRecovery(vdo->recoveryJournal, + recoveryCount, recovery->highestTail); + freeRecoveryCompletion(&recovery); + logInfo("Rebuild complete."); + + // Now that we've freed the recovery completion and its vast array of + // journal entries, we can allocate refcounts. + int result = allocateSlabRefCounts(vdo->depot); + finishCompletion(parent, result); +} + +/** + * Handle a recovery error. 
+ *
+ * @param completion  The recovery completion
+ **/
+static void abortRecovery(VDOCompletion *completion)
+{
+  VDOCompletion      *parent   = completion->parent;
+  int                 result   = completion->result;
+  RecoveryCompletion *recovery = asRecoveryCompletion(completion);
+  freeRecoveryCompletion(&recovery);
+  logWarning("Recovery aborted");
+  finishCompletion(parent, result);
+}
+
+/**
+ * Abort a recovery if there is an error.
+ *
+ * @param result    The result to check
+ * @param recovery  The recovery completion
+ *
+ * @return true if the result was an error
+ **/
+__attribute__((warn_unused_result))
+static bool abortRecoveryOnError(int result, RecoveryCompletion *recovery)
+{
+  if (result == VDO_SUCCESS) {
+    return false;
+  }
+
+  finishCompletion(&recovery->completion, result);
+  return true;
+}
+
+/**
+ * Unpack the recovery journal entry associated with the given recovery point.
+ *
+ * @param recovery  The recovery completion
+ * @param point     The recovery point
+ *
+ * @return The unpacked contents of the matching recovery journal entry
+ **/
+static RecoveryJournalEntry getEntry(const RecoveryCompletion *recovery,
+                                     const RecoveryPoint      *point)
+{
+  RecoveryJournal *journal = recovery->vdo->recoveryJournal;
+  PhysicalBlockNumber blockNumber
+    = getRecoveryJournalBlockNumber(journal, point->sequenceNumber);
+  off_t sectorOffset
+    = (blockNumber * VDO_BLOCK_SIZE) + (point->sectorCount * VDO_SECTOR_SIZE);
+  PackedJournalSector *sector
+    = (PackedJournalSector *) &recovery->journalData[sectorOffset];
+  return unpackRecoveryJournalEntry(&sector->entries[point->entryCount]);
+}
+
+/**
+ * Create an array of all valid journal entries, in order, and store it in the
+ * recovery completion.
+ *
+ * @param recovery  The recovery completion
+ *
+ * @return VDO_SUCCESS or an error code
+ **/
+static int extractJournalEntries(RecoveryCompletion *recovery)
+{
+  // Allocate a NumberedBlockMapping array just large enough to transcribe
+  // every increment PackedRecoveryJournalEntry from every valid journal block.
+  int result = ALLOCATE(recovery->increfCount, NumberedBlockMapping, __func__,
+                        &recovery->entries);
+  if (result != VDO_SUCCESS) {
+    return result;
+  }
+
+  RecoveryPoint recoveryPoint = {
+    .sequenceNumber = recovery->blockMapHead,
+    .sectorCount    = 1,
+    .entryCount     = 0,
+  };
+  while (beforeRecoveryPoint(&recoveryPoint, &recovery->tailRecoveryPoint)) {
+    RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint);
+    result = validateRecoveryJournalEntry(recovery->vdo, &entry);
+    if (result != VDO_SUCCESS) {
+      enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result);
+      return result;
+    }
+
+    if (isIncrementOperation(entry.operation)) {
+      recovery->entries[recovery->entryCount] = (NumberedBlockMapping) {
+        .blockMapSlot  = entry.slot,
+        .blockMapEntry = packPBN(entry.mapping.pbn, entry.mapping.state),
+        .number        = recovery->entryCount,
+      };
+      recovery->entryCount++;
+    }
+
+    incrementRecoveryPoint(&recoveryPoint);
+  }
+
+  result = ASSERT((recovery->entryCount <= recovery->increfCount),
+                  "approximate incref count is an upper bound");
+  if (result != VDO_SUCCESS) {
+    enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result);
+  }
+
+  return result;
+}
+
+/**
+ * Extract journal entries and recover the block map. This callback is
+ * registered in startSuperBlockSave().
+ * + * @param completion The sub-task completion + **/ +static void launchBlockMapRecovery(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + assertOnLogicalZoneThread(vdo, 0, __func__); + + // Extract the journal entries for the block map recovery. + int result = extractJournalEntries(recovery); + if (abortRecoveryOnError(result, recovery)) { + return; + } + + prepareToFinishParent(completion, &recovery->completion); + recoverBlockMap(vdo, recovery->entryCount, recovery->entries, completion); +} + +/** + * Finish flushing all slab journals and start a write of the super block. + * This callback is registered in addSynthesizedEntries(). + * + * @param completion The sub-task completion + **/ +static void startSuperBlockSave(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + assertOnAdminThread(vdo, __func__); + + logInfo("Saving recovery progress"); + vdo->state = VDO_REPLAYING; + + // The block map access which follows the super block save must be done + // on a logical thread. + prepareSubTask(recovery, launchBlockMapRecovery, finishParentCallback, + ZONE_TYPE_LOGICAL); + saveVDOComponentsAsync(vdo, completion); +} + +/** + * The callback from loading the slab depot. It will update the logical blocks + * and block map data blocks counts in the recovery journal and then drain the + * slab depot in order to commit the recovered slab journals. It is registered + * in applyToDepot(). + * + * @param completion The sub-task completion + **/ +static void finishRecoveringDepot(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + assertOnAdminThread(vdo, __func__); + + logInfo("Replayed %zu journal entries into slab journals", + recovery->entriesAddedToSlabJournals); + logInfo("Synthesized %zu missing journal entries", + recovery->missingDecrefCount); + vdo->recoveryJournal->logicalBlocksUsed = recovery->logicalBlocksUsed; + vdo->recoveryJournal->blockMapDataBlocks = recovery->blockMapDataBlocks; + + prepareSubTask(recovery, startSuperBlockSave, finishParentCallback, + ZONE_TYPE_ADMIN); + drainSlabDepot(vdo->depot, ADMIN_STATE_RECOVERING, completion); +} + +/** + * The error handler for recovering slab journals. It will skip any remaining + * recovery on the current zone and propagate the error. It is registered in + * addSlabJournalEntries() and addSynthesizedEntries(). + * + * @param completion The completion of the block allocator being recovered + **/ +static void handleAddSlabJournalEntryError(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + notifySlabJournalsAreRecovered(recovery->allocator, completion->result); +} + +/** + * Add synthesized entries into slab journals, waiting when necessary. 
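+ *
+ * This drains the current zone's queue of MissingDecrefs, replaying each one
+ * as a DATA_DECREMENT at its pre-assigned synthesized journal point. If a
+ * slab journal has no room in its tail block, the completion waits and this
+ * function is re-run as its own callback once space is available.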
+ * + * @param completion The allocator completion + **/ +static void addSynthesizedEntries(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + + // Get ready in case we need to enqueue again + prepareCompletion(completion, addSynthesizedEntries, + handleAddSlabJournalEntryError, + completion->callbackThreadID, recovery); + WaitQueue *missingDecrefs + = &recovery->missingDecrefs[recovery->allocator->zoneNumber]; + while (hasWaiters(missingDecrefs)) { + MissingDecref *decref = asMissingDecref(getFirstWaiter(missingDecrefs)); + if (!attemptReplayIntoSlabJournal(decref->slabJournal, + decref->penultimateMapping.pbn, + DATA_DECREMENT, &decref->journalPoint, + completion)) { + return; + } + + dequeueNextWaiter(missingDecrefs); + FREE(decref); + } + + notifySlabJournalsAreRecovered(recovery->allocator, VDO_SUCCESS); +} + +/** + * Determine the LBNs used count as of the end of the journal (but + * not including any changes to that count from entries that will be + * synthesized later). + * + * @param recovery The recovery completion + * + * @return VDO_SUCCESS or an error + **/ +static int computeUsages(RecoveryCompletion *recovery) +{ + RecoveryJournal *journal = recovery->vdo->recoveryJournal; + PackedJournalHeader *tailHeader + = getJournalBlockHeader(journal, recovery->journalData, recovery->tail); + + RecoveryBlockHeader unpacked; + unpackRecoveryBlockHeader(tailHeader, &unpacked); + recovery->logicalBlocksUsed = unpacked.logicalBlocksUsed; + recovery->blockMapDataBlocks = unpacked.blockMapDataBlocks; + + RecoveryPoint recoveryPoint = { + .sequenceNumber = recovery->tail, + .sectorCount = 1, + .entryCount = 0, + }; + while (beforeRecoveryPoint(&recoveryPoint, &recovery->tailRecoveryPoint)) { + RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint); + if (isMappedLocation(&entry.mapping)) { + switch (entry.operation) { + case DATA_INCREMENT: + recovery->logicalBlocksUsed++; + break; + + case DATA_DECREMENT: + recovery->logicalBlocksUsed--; + break; + + case BLOCK_MAP_INCREMENT: + recovery->blockMapDataBlocks++; + break; + + default: + return logErrorWithStringError(VDO_CORRUPT_JOURNAL, + "Recovery journal entry at " + "sequence number %" PRIu64 + ", sector %u, entry %u had invalid " + "operation %u", + recoveryPoint.sequenceNumber, + recoveryPoint.sectorCount, + recoveryPoint.entryCount, + entry.operation); + } + } + + incrementRecoveryPoint(&recoveryPoint); + } + + return VDO_SUCCESS; +} + +/** + * Advance the current recovery and journal points. + * + * @param recovery The RecoveryCompletion whose points are to be + * advanced + * @param entriesPerBlock The number of entries in a recovery journal block + **/ +static void advancePoints(RecoveryCompletion *recovery, + JournalEntryCount entriesPerBlock) +{ + incrementRecoveryPoint(&recovery->nextRecoveryPoint); + advanceJournalPoint(&recovery->nextJournalPoint, entriesPerBlock); +} + +/** + * Replay recovery journal entries into the slab journals of the allocator + * currently being recovered, waiting for slab journal tailblock space when + * necessary. This method is its own callback. + * + * @param completion The allocator completion + **/ +static void addSlabJournalEntries(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + RecoveryJournal *journal = vdo->recoveryJournal; + + // Get ready in case we need to enqueue again. 
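+  // If attemptReplayIntoSlabJournal() cannot add an entry, it holds this
+  // completion until tail block space is available; the retry resumes at
+  // nextRecoveryPoint, which only advances after a successful replay.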
+ prepareCompletion(completion, addSlabJournalEntries, + handleAddSlabJournalEntryError, + completion->callbackThreadID, recovery); + for (RecoveryPoint *recoveryPoint = &recovery->nextRecoveryPoint; + beforeRecoveryPoint(recoveryPoint, &recovery->tailRecoveryPoint); + advancePoints(recovery, journal->entriesPerBlock)) { + RecoveryJournalEntry entry = getEntry(recovery, recoveryPoint); + int result = validateRecoveryJournalEntry(vdo, &entry); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(journal->readOnlyNotifier, result); + finishCompletion(completion, result); + return; + } + + if (entry.mapping.pbn == ZERO_BLOCK) { + continue; + } + + Slab *slab = getSlab(vdo->depot, entry.mapping.pbn); + if (slab->allocator != recovery->allocator) { + continue; + } + + if (!attemptReplayIntoSlabJournal(slab->journal, entry.mapping.pbn, + entry.operation, + &recovery->nextJournalPoint, + completion)) { + return; + } + + recovery->entriesAddedToSlabJournals++; + } + + logInfo("Recreating missing journal entries for zone %u", + recovery->allocator->zoneNumber); + addSynthesizedEntries(completion); +} + +/**********************************************************************/ +void replayIntoSlabJournals(BlockAllocator *allocator, + VDOCompletion *completion, + void *context) +{ + RecoveryCompletion *recovery = context; + assertOnPhysicalZoneThread(recovery->vdo, allocator->zoneNumber, __func__); + if ((recovery->journalData == NULL) || isReplaying(recovery->vdo)) { + // there's nothing to replay + notifySlabJournalsAreRecovered(allocator, VDO_SUCCESS); + return; + } + + recovery->allocator = allocator; + recovery->nextRecoveryPoint = (RecoveryPoint) { + .sequenceNumber = recovery->slabJournalHead, + .sectorCount = 1, + .entryCount = 0, + }; + + recovery->nextJournalPoint = (JournalPoint) { + .sequenceNumber = recovery->slabJournalHead, + .entryCount = 0, + }; + + logInfo("Replaying entries into slab journals for zone %u", + allocator->zoneNumber); + completion->parent = recovery; + addSlabJournalEntries(completion); +} + +/** + * A waiter callback to enqueue a MissingDecref on the queue for the physical + * zone in which it will be applied. + * + * Implements WaiterCallback. + **/ +static void queueOnPhysicalZone(Waiter *waiter, void *context) +{ + MissingDecref *decref = asMissingDecref(waiter); + DataLocation mapping = decref->penultimateMapping; + if (isMappedLocation(&mapping)) { + decref->recovery->logicalBlocksUsed--; + } + + if (mapping.pbn == ZERO_BLOCK) { + // Decrefs of zero are not applied to slab journals. + FREE(decref); + return; + } + + decref->slabJournal = getSlabJournal((SlabDepot *) context, mapping.pbn); + ZoneCount zoneNumber = decref->slabJournal->slab->allocator->zoneNumber; + enqueueMissingDecref(&decref->recovery->missingDecrefs[zoneNumber], decref); +} + +/** + * Queue each missing decref on the slab journal to which it is to be applied + * then load the slab depot. This callback is registered in + * findSlabJournalEntries(). 
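+ *
+ * Up to this point every MissingDecref has lived on zone 0's queue;
+ * queueOnPhysicalZone() redistributes each one to the queue of the physical
+ * zone owning its slab, discarding decrefs of the zero block, which are never
+ * applied to slab journals.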
+ * + * @param completion The sub-task completion + **/ +static void applyToDepot(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + assertOnAdminThread(recovery->vdo, __func__); + prepareSubTask(recovery, finishRecoveringDepot, finishParentCallback, + ZONE_TYPE_ADMIN); + + SlabDepot *depot = getSlabDepot(recovery->vdo); + notifyAllWaiters(&recovery->missingDecrefs[0], queueOnPhysicalZone, depot); + if (abortRecoveryOnError(recovery->completion.result, recovery)) { + return; + } + + loadSlabDepot(depot, ADMIN_STATE_LOADING_FOR_RECOVERY, completion, recovery); +} + +/** + * Validate the location of the penultimate mapping for a MissingDecref. If it + * is valid, enqueue it for the appropriate physical zone or account for it. + * Otherwise, dispose of it and signal an error. + * + * @param decref The decref whose penultimate mapping has just been found + * @param location The penultimate mapping + * @param errorCode The error code to use if the location is invalid + **/ +static int recordMissingDecref(MissingDecref *decref, + DataLocation location, + int errorCode) +{ + RecoveryCompletion *recovery = decref->recovery; + recovery->incompleteDecrefCount--; + if (isValidLocation(&location) + && isPhysicalDataBlock(recovery->vdo->depot, location.pbn)) { + decref->penultimateMapping = location; + decref->complete = true; + return VDO_SUCCESS; + } + + // The location was invalid + enterReadOnlyMode(recovery->vdo->readOnlyNotifier, errorCode); + setCompletionResult(&recovery->completion, errorCode); + logErrorWithStringError(errorCode, + "Invalid mapping for pbn %llu with state %u", + location.pbn, location.state); + return errorCode; +} + +/** + * Find the block map slots with missing decrefs. + * + * To find the slots missing decrefs, we iterate through the journal in reverse + * so we see decrefs before increfs; if we see an incref before its paired + * decref, we instantly know this incref is missing its decref. + * + * Simultaneously, we attempt to determine the missing decref. If there is a + * missing decref, and at least two increfs for that slot, we know we should + * decref the PBN from the penultimate incref. Otherwise, there is only one + * incref for that slot: we must synthesize the decref out of the block map + * instead of the recovery journal. + * + * @param recovery The recovery completion + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int findMissingDecrefs(RecoveryCompletion *recovery) +{ + IntMap *slotEntryMap = recovery->slotEntryMap; + // This placeholder decref is used to mark lbns for which we have observed a + // decref but not the paired incref (going backwards through the journal). + MissingDecref foundDecref; + + // A buffer is allocated based on the number of incRef entries found, so use + // the earliest head. + SequenceNumber head = minSequenceNumber(recovery->blockMapHead, + recovery->slabJournalHead); + RecoveryPoint headPoint = { + .sequenceNumber = head, + .sectorCount = 1, + .entryCount = 0, + }; + + // Set up for the first fake journal point that will be used for a + // synthesized entry. 
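+  // Starting at entry entriesPerBlock of the tail block places every
+  // synthesized point after the last real entry in that block but before the
+  // first entry of the next block; makeMissingDecref() then hands out
+  // consecutive entry counts (for example, with a hypothetical entriesPerBlock
+  // of 100, the points would be (tail, 100), (tail, 101), and so on).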
+ recovery->nextSynthesizedJournalPoint = (JournalPoint) { + .sequenceNumber = recovery->tail, + .entryCount = recovery->vdo->recoveryJournal->entriesPerBlock, + }; + + RecoveryPoint recoveryPoint = recovery->tailRecoveryPoint; + while (beforeRecoveryPoint(&headPoint, &recoveryPoint)) { + decrementRecoveryPoint(&recoveryPoint); + RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint); + + if (!isIncrementOperation(entry.operation)) { + // Observe that we've seen a decref before its incref, but only if + // the IntMap does not contain an unpaired incref for this lbn. + int result = intMapPut(slotEntryMap, slotAsNumber(entry.slot), + &foundDecref, false, NULL); + if (result != VDO_SUCCESS) { + return result; + } + + continue; + } + + recovery->increfCount++; + + MissingDecref *decref + = intMapRemove(slotEntryMap, slotAsNumber(entry.slot)); + if (entry.operation == BLOCK_MAP_INCREMENT) { + if (decref != NULL) { + return logErrorWithStringError(VDO_CORRUPT_JOURNAL, + "decref found for block map block %" + PRIu64 " with state %u", + entry.mapping.pbn, entry.mapping.state); + } + + // There are no decrefs for block map pages, so they can't be missing. + continue; + } + + if (decref == &foundDecref) { + // This incref already had a decref in the intmap, so we know it is + // not missing its decref. + continue; + } + + if (decref == NULL) { + // This incref is missing a decref. Add a missing decref object. + int result = makeMissingDecref(recovery, entry, &decref); + if (result != VDO_SUCCESS) { + return result; + } + + result = intMapPut(slotEntryMap, slotAsNumber(entry.slot), decref, + false, NULL); + if (result != VDO_SUCCESS) { + return result; + } + + continue; + } + + /* + * This MissingDecref was left here by an incref without a decref. + * We now know what its penultimate mapping is, and all entries + * before here in the journal are paired, decref before incref, so + * we needn't remember it in the intmap any longer. + */ + int result = recordMissingDecref(decref, entry.mapping, + VDO_CORRUPT_JOURNAL); + if (result != VDO_SUCCESS) { + return result; + } + } + + return VDO_SUCCESS; +} + +/** + * Process a fetched block map page for a missing decref. This callback is + * registered in findSlabJournalEntries(). + * + * @param completion The page completion which has just finished loading + **/ +static void processFetchedPage(VDOCompletion *completion) +{ + MissingDecref *currentDecref = completion->parent; + RecoveryCompletion *recovery = currentDecref->recovery; + assertOnLogicalZoneThread(recovery->vdo, 0, __func__); + + const BlockMapPage *page = dereferenceReadableVDOPage(completion); + DataLocation location + = unpackBlockMapEntry(&page->entries[currentDecref->slot.slot]); + releaseVDOPageCompletion(completion); + recordMissingDecref(currentDecref, location, VDO_BAD_MAPPING); + if (recovery->incompleteDecrefCount == 0) { + completeCompletion(&recovery->subTaskCompletion); + } +} + +/** + * Handle an error fetching a block map page for a missing decref. + * This error handler is registered in findSlabJournalEntries(). 
+ * + * @param completion The page completion which has just finished loading + **/ +static void handleFetchError(VDOCompletion *completion) +{ + MissingDecref *decref = completion->parent; + RecoveryCompletion *recovery = decref->recovery; + assertOnLogicalZoneThread(recovery->vdo, 0, __func__); + + // If we got a VDO_OUT_OF_RANGE error, it is because the pbn we read from + // the journal was bad, so convert the error code + setCompletionResult(&recovery->subTaskCompletion, + ((completion->result == VDO_OUT_OF_RANGE) + ? VDO_CORRUPT_JOURNAL : completion->result)); + releaseVDOPageCompletion(completion); + if (--recovery->incompleteDecrefCount == 0) { + completeCompletion(&recovery->subTaskCompletion); + } +} + +/** + * The waiter callback to requeue a missing decref and launch its page fetch. + * + * Implements WaiterCallback. + **/ +static void launchFetch(Waiter *waiter, void *context) +{ + MissingDecref *decref = asMissingDecref(waiter); + RecoveryCompletion *recovery = decref->recovery; + if (enqueueMissingDecref(&recovery->missingDecrefs[0], decref) + != VDO_SUCCESS) { + return; + } + + if (decref->complete) { + // We've already found the mapping for this decref, no fetch needed. + return; + } + + BlockMapZone *zone = context; + initVDOPageCompletion(&decref->pageCompletion, zone->pageCache, + decref->slot.pbn, false, decref, processFetchedPage, + handleFetchError); + getVDOPageAsync(&decref->pageCompletion.completion); +} + +/** + * Find all entries which need to be replayed into the slab journals. + * + * @param completion The sub-task completion + **/ +static void findSlabJournalEntries(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + + // We need to be on logical zone 0's thread since we are going to use its + // page cache. + assertOnLogicalZoneThread(vdo, 0, __func__); + int result = findMissingDecrefs(recovery); + if (abortRecoveryOnError(result, recovery)) { + return; + } + + prepareSubTask(recovery, applyToDepot, finishParentCallback, + ZONE_TYPE_ADMIN); + + /* + * Increment the incompleteDecrefCount so that the fetch callback can't + * complete the sub-task while we are still processing the queue of missing + * decrefs. + */ + if (recovery->incompleteDecrefCount++ > 0) { + // Fetch block map pages to fill in the incomplete missing decrefs. + notifyAllWaiters(&recovery->missingDecrefs[0], launchFetch, + getBlockMapZone(getBlockMap(vdo), 0)); + } + + if (--recovery->incompleteDecrefCount == 0) { + completeCompletion(completion); + } +} + +/** + * Find the contiguous range of journal blocks. 
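+ *
+ * Blocks are scanned forward from the earlier of the two reap heads. The scan
+ * stops at the first block whose header does not match its expected position
+ * in the journal or which was torn (only partially written); everything
+ * before that point is usable for replay.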
+ * + * @param recovery The recovery completion + * + * @return true if there were valid journal blocks + **/ +static bool findContiguousRange(RecoveryCompletion *recovery) +{ + RecoveryJournal *journal = recovery->vdo->recoveryJournal; + SequenceNumber head + = minSequenceNumber(recovery->blockMapHead, recovery->slabJournalHead); + + bool foundEntries = false; + for (SequenceNumber i = head; i <= recovery->highestTail; i++) { + recovery->tail = i; + recovery->tailRecoveryPoint = (RecoveryPoint) { + .sequenceNumber = i, + .sectorCount = 0, + .entryCount = 0, + }; + + PackedJournalHeader *packedHeader + = getJournalBlockHeader(journal, recovery->journalData, i); + RecoveryBlockHeader header; + unpackRecoveryBlockHeader(packedHeader, &header); + + if (!isExactRecoveryJournalBlock(journal, &header, i) + || (header.entryCount > journal->entriesPerBlock)) { + // A bad block header was found so this must be the end of the journal. + break; + } + + JournalEntryCount blockEntries = header.entryCount; + // Examine each sector in turn to determine the last valid sector. + for (uint8_t j = 1; j < SECTORS_PER_BLOCK; j++) { + PackedJournalSector *sector = getJournalBlockSector(packedHeader, j); + + // A bad sector means that this block was torn. + if (!isValidRecoveryJournalSector(&header, sector)) { + break; + } + + JournalEntryCount sectorEntries = minBlock(sector->entryCount, + blockEntries); + if (sectorEntries > 0) { + foundEntries = true; + recovery->tailRecoveryPoint.sectorCount++; + recovery->tailRecoveryPoint.entryCount = sectorEntries; + blockEntries -= sectorEntries; + } + + // If this sector is short, the later sectors can't matter. + if ((sectorEntries < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) + || (blockEntries == 0)) { + break; + } + } + + // If this block was not filled, or if it tore, no later block can matter. + if ((header.entryCount != journal->entriesPerBlock) + || (blockEntries > 0)) { + break; + } + } + + // Set the tail to the last valid tail block, if there is one. + if (foundEntries && (recovery->tailRecoveryPoint.sectorCount == 0)) { + recovery->tail--; + } + + return foundEntries; +} + +/** + * Count the number of increment entries in the journal. + * + * @param recovery The recovery completion + **/ +static int countIncrementEntries(RecoveryCompletion *recovery) +{ + RecoveryPoint recoveryPoint = { + .sequenceNumber = recovery->blockMapHead, + .sectorCount = 1, + .entryCount = 0, + }; + while (beforeRecoveryPoint(&recoveryPoint, &recovery->tailRecoveryPoint)) { + RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint); + int result = validateRecoveryJournalEntry(recovery->vdo, &entry); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result); + return result; + } + if (isIncrementOperation(entry.operation)) { + recovery->increfCount++; + } + incrementRecoveryPoint(&recoveryPoint); + } + + return VDO_SUCCESS; +} + +/** + * Determine the limits of the valid recovery journal and prepare to replay + * into the slab journals and block map. 
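+ *
+ * findHeadAndTail() supplies the block map and slab journal reap heads and
+ * the highest tail found on disk; findContiguousRange() then trims the tail
+ * back to the last block that can actually be replayed.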
+ * + * @param completion The sub-task completion + **/ +static void prepareToApplyJournalEntries(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + RecoveryJournal *journal = vdo->recoveryJournal; + logInfo("Finished reading recovery journal"); + bool foundEntries = findHeadAndTail(journal, recovery->journalData, + &recovery->highestTail, + &recovery->blockMapHead, + &recovery->slabJournalHead); + if (foundEntries) { + foundEntries = findContiguousRange(recovery); + } + + // Both reap heads must be behind the tail. + if ((recovery->blockMapHead > recovery->tail) + || (recovery->slabJournalHead > recovery->tail)) { + int result = logErrorWithStringError(VDO_CORRUPT_JOURNAL, + "Journal tail too early. " + "block map head: %" PRIu64 + ", slab journal head: %" PRIu64 + ", tail: %llu", + recovery->blockMapHead, + recovery->slabJournalHead, + recovery->tail); + finishCompletion(&recovery->completion, result); + return; + } + + if (!foundEntries) { + // This message must be recognizable by VDOTest::RebuildBase. + logInfo("Replaying 0 recovery entries into block map"); + // We still need to load the SlabDepot. + FREE(recovery->journalData); + recovery->journalData = NULL; + prepareSubTask(recovery, finishParentCallback, finishParentCallback, + ZONE_TYPE_ADMIN); + loadSlabDepot(getSlabDepot(vdo), ADMIN_STATE_LOADING_FOR_RECOVERY, + completion, recovery); + return; + } + + logInfo("Highest-numbered recovery journal block has sequence number" + " %llu, and the highest-numbered usable block is %" + PRIu64, recovery->highestTail, recovery->tail); + + if (isReplaying(vdo)) { + // We need to know how many entries the block map rebuild completion will + // need to hold. + int result = countIncrementEntries(recovery); + if (result != VDO_SUCCESS) { + finishCompletion(&recovery->completion, result); + return; + } + + // We need to access the block map from a logical zone. + prepareSubTask(recovery, launchBlockMapRecovery, finishParentCallback, + ZONE_TYPE_LOGICAL); + loadSlabDepot(vdo->depot, ADMIN_STATE_LOADING_FOR_RECOVERY, completion, + recovery); + return; + } + + int result = computeUsages(recovery); + if (abortRecoveryOnError(result, recovery)) { + return; + } + + prepareSubTask(recovery, findSlabJournalEntries, finishParentCallback, + ZONE_TYPE_LOGICAL); + invokeCallback(completion); +} + +/**********************************************************************/ +void launchRecovery(VDO *vdo, VDOCompletion *parent) +{ + // Note: This message must be recognizable by Permabit::VDODeviceBase. + logWarning("Device was dirty, rebuilding reference counts"); + + RecoveryCompletion *recovery; + int result = makeRecoveryCompletion(vdo, &recovery); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + VDOCompletion *completion = &recovery->completion; + prepareCompletion(completion, finishRecovery, abortRecovery, + parent->callbackThreadID, parent); + prepareSubTask(recovery, prepareToApplyJournalEntries, finishParentCallback, + ZONE_TYPE_ADMIN); + loadJournalAsync(vdo->recoveryJournal, &recovery->subTaskCompletion, + &recovery->journalData); +} diff --git a/vdo/base/vdoRecovery.h b/vdo/base/vdoRecovery.h new file mode 100644 index 0000000..f817a05 --- /dev/null +++ b/vdo/base/vdoRecovery.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoRecovery.h#2 $ + */ + +#ifndef VDO_RECOVERY_H +#define VDO_RECOVERY_H + +#include "completion.h" +#include "vdo.h" + +/** + * Replay recovery journal entries in the the slab journals of slabs owned by a + * given BlockAllocator. + * + * @param allocator The allocator whose slab journals are to be recovered + * @param completion The completion to use for waiting on slab journal space + * @param context The slab depot load context supplied by a recovery when + * it loads the depot + **/ +void replayIntoSlabJournals(BlockAllocator *allocator, + VDOCompletion *completion, + void *context); + +/** + * Construct a recovery completion and launch it. Apply all valid journal block + * entries to all VDO structures. This function performs the offline portion of + * recovering a VDO from a crash. + * + * @param vdo The vdo to recover + * @param parent The completion to notify when the offline portion of the + * recovery is complete + **/ +void launchRecovery(VDO *vdo, VDOCompletion *parent); + +#endif // VDO_RECOVERY_H diff --git a/vdo/base/vdoRecoveryInternals.h b/vdo/base/vdoRecoveryInternals.h new file mode 100644 index 0000000..b0414c1 --- /dev/null +++ b/vdo/base/vdoRecoveryInternals.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoRecoveryInternals.h#2 $ + */ + +#ifndef VDO_RECOVERY_INTERNALS_H +#define VDO_RECOVERY_INTERNALS_H + +#include "vdoRecovery.h" + +#include "blockMapRecovery.h" +#include "intMap.h" +#include "journalPoint.h" +#include "ringNode.h" +#include "types.h" +#include "waitQueue.h" + +/** + * The absolute position of an entry in the recovery journal, including + * the sector number and the entry number within the sector. 
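+ *
+ * Points are ordered by sequence number, then sector count, then entry count,
+ * which is how beforeRecoveryPoint() compares them.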
+ **/ +typedef struct { + SequenceNumber sequenceNumber; // Block sequence number + uint8_t sectorCount; // Sector number + JournalEntryCount entryCount; // Entry number +} RecoveryPoint; + +typedef struct { + /** The completion header */ + VDOCompletion completion; + /** The sub-task completion */ + VDOCompletion subTaskCompletion; + /** The VDO in question */ + VDO *vdo; + /** The BlockAllocator whose journals are being recovered */ + BlockAllocator *allocator; + /** A buffer to hold the data read off disk */ + char *journalData; + /** The number of increfs */ + size_t increfCount; + + /** The entry data for the block map recovery */ + NumberedBlockMapping *entries; + /** The number of entries in the entry array */ + size_t entryCount; + /** The sequence number of the first valid block for block map recovery */ + SequenceNumber blockMapHead; + /** The sequence number of the first valid block for slab journal replay */ + SequenceNumber slabJournalHead; + /** The sequence number of the last valid block of the journal (if known) */ + SequenceNumber tail; + /** + * The highest sequence number of the journal, not the same as the tail, + * since the tail ignores blocks after the first hole. + */ + SequenceNumber highestTail; + + /** A location just beyond the last valid entry of the journal */ + RecoveryPoint tailRecoveryPoint; + /** The location of the next recovery journal entry to apply */ + RecoveryPoint nextRecoveryPoint; + /** The number of logical blocks currently known to be in use */ + BlockCount logicalBlocksUsed; + /** The number of block map data blocks known to be allocated */ + BlockCount blockMapDataBlocks; + /** The journal point to give to the next synthesized decref */ + JournalPoint nextJournalPoint; + /** The number of entries played into slab journals */ + size_t entriesAddedToSlabJournals; + + // Decref synthesis fields + + /** An intMap for use in finding which slots are missing decrefs */ + IntMap *slotEntryMap; + /** The number of synthesized decrefs */ + size_t missingDecrefCount; + /** The number of incomplete decrefs */ + size_t incompleteDecrefCount; + /** The fake journal point of the next missing decref */ + JournalPoint nextSynthesizedJournalPoint; + /** The queue of missing decrefs */ + WaitQueue missingDecrefs[]; +} RecoveryCompletion; + +/** + * Convert a generic completion to a RecoveryCompletion. + * + * @param completion The completion to convert + * + * @return The RecoveryCompletion + **/ +__attribute__((warn_unused_result)) +static inline RecoveryCompletion * +asRecoveryCompletion(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(RecoveryCompletion, completion) == 0); + assertCompletionType(completion->type, RECOVERY_COMPLETION); + return (RecoveryCompletion *) completion; +} + +/** + * Allocate and initialize a RecoveryCompletion. + * + * @param vdo The VDO in question + * @param recoveryPtr A pointer to hold the new RecoveryCompletion + * + * @return VDO_SUCCESS or a status code + **/ +int makeRecoveryCompletion(VDO *vdo, RecoveryCompletion **recoveryPtr) + __attribute__((warn_unused_result)); + +/** + * Free a RecoveryCompletion and all underlying structures. + * + * @param recoveryPtr A pointer to the recovery completion to free + **/ +void freeRecoveryCompletion(RecoveryCompletion **recoveryPtr); + +#endif // VDO_RECOVERY_INTERNALS_H diff --git a/vdo/base/vdoResize.c b/vdo/base/vdoResize.c new file mode 100644 index 0000000..ee3271d --- /dev/null +++ b/vdo/base/vdoResize.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResize.c#15 $ + */ + +#include "vdoResize.h" + +#include "logger.h" + +#include "adminCompletion.h" +#include "completion.h" +#include "recoveryJournal.h" +#include "slabDepot.h" +#include "slabSummary.h" +#include "vdoInternal.h" +#include "vdoLayout.h" + +typedef enum { + GROW_PHYSICAL_PHASE_START = 0, + GROW_PHYSICAL_PHASE_COPY_SUMMARY, + GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS, + GROW_PHYSICAL_PHASE_USE_NEW_SLABS, + GROW_PHYSICAL_PHASE_END, + GROW_PHYSICAL_PHASE_ERROR, +} GrowPhysicalPhase; + +static const char *GROW_PHYSICAL_PHASE_NAMES[] = { + "GROW_PHYSICAL_PHASE_START", + "GROW_PHYSICAL_PHASE_COPY_SUMMARY", + "GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS", + "GROW_PHYSICAL_PHASE_USE_NEW_SLABS", + "GROW_PHYSICAL_PHASE_END", + "GROW_PHYSICAL_PHASE_ERROR", +}; + +/** + * Implements ThreadIDGetterForPhase. + **/ +__attribute__((warn_unused_result)) +static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion) +{ + return getAdminThread(getThreadConfig(adminCompletion->completion.parent)); +} + +/** + * Callback to initiate a grow physical, registered in performGrowPhysical(). + * + * @param completion The sub-task completion + **/ +static void growPhysicalCallback(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + assertAdminOperationType(adminCompletion, ADMIN_OPERATION_GROW_PHYSICAL); + assertAdminPhaseThread(adminCompletion, __func__, GROW_PHYSICAL_PHASE_NAMES); + + VDO *vdo = adminCompletion->completion.parent; + switch (adminCompletion->phase++) { + case GROW_PHYSICAL_PHASE_START: + if (isReadOnly(vdo->readOnlyNotifier)) { + logErrorWithStringError(VDO_READ_ONLY, + "Can't grow physical size of a read-only VDO"); + setCompletionResult(resetAdminSubTask(completion), VDO_READ_ONLY); + break; + } + + if (startOperationWithWaiter(&vdo->adminState, + ADMIN_STATE_SUSPENDED_OPERATION, + &adminCompletion->completion, NULL)) { + // Copy the journal into the new layout. 
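+      // The recovery journal and slab summary partitions move in the grown
+      // layout, so each must be copied to its new location before the new
+      // layout takes effect; the slab summary is copied in the next phase.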
+ copyPartition(vdo->layout, RECOVERY_JOURNAL_PARTITION, + resetAdminSubTask(completion)); + } + return; + + case GROW_PHYSICAL_PHASE_COPY_SUMMARY: + copyPartition(vdo->layout, SLAB_SUMMARY_PARTITION, + resetAdminSubTask(completion)); + return; + + case GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS: + vdo->config.physicalBlocks = growVDOLayout(vdo->layout); + updateSlabDepotSize(vdo->depot); + saveVDOComponentsAsync(vdo, resetAdminSubTask(completion)); + return; + + case GROW_PHYSICAL_PHASE_USE_NEW_SLABS: + useNewSlabs(vdo->depot, resetAdminSubTask(completion)); + return; + + case GROW_PHYSICAL_PHASE_END: + setSlabSummaryOrigin(getSlabSummary(vdo->depot), + getVDOPartition(vdo->layout, SLAB_SUMMARY_PARTITION)); + setRecoveryJournalPartition(vdo->recoveryJournal, + getVDOPartition(vdo->layout, + RECOVERY_JOURNAL_PARTITION)); + break; + + case GROW_PHYSICAL_PHASE_ERROR: + enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); + break; + + default: + setCompletionResult(resetAdminSubTask(completion), UDS_BAD_STATE); + } + + finishVDOLayoutGrowth(vdo->layout); + finishOperationWithResult(&vdo->adminState, completion->result); +} + +/** + * Handle an error during the grow physical process. + * + * @param completion The sub-task completion + **/ +static void handleGrowthError(VDOCompletion *completion) +{ + adminCompletionFromSubTask(completion)->phase = GROW_PHYSICAL_PHASE_ERROR; + growPhysicalCallback(completion); +} + +/**********************************************************************/ +int performGrowPhysical(VDO *vdo, BlockCount newPhysicalBlocks) +{ + BlockCount oldPhysicalBlocks = vdo->config.physicalBlocks; + + // Skip any noop grows. + if (oldPhysicalBlocks == newPhysicalBlocks) { + return VDO_SUCCESS; + } + + if (newPhysicalBlocks != getNextVDOLayoutSize(vdo->layout)) { + /* + * Either the VDO isn't prepared to grow, or it was prepared to grow + * to a different size. Doing this check here relies on the fact that + * the call to this method is done under the dmsetup message lock. + */ + finishVDOLayoutGrowth(vdo->layout); + abandonNewSlabs(vdo->depot); + return VDO_PARAMETER_MISMATCH; + } + + // Validate that we are prepared to grow appropriately. + BlockCount newDepotSize = getNextBlockAllocatorPartitionSize(vdo->layout); + BlockCount preparedDepotSize = getNewDepotSize(vdo->depot); + if (preparedDepotSize != newDepotSize) { + return VDO_PARAMETER_MISMATCH; + } + + int result = performAdminOperation(vdo, ADMIN_OPERATION_GROW_PHYSICAL, + getThreadIDForPhase, growPhysicalCallback, + handleGrowthError); + if (result != VDO_SUCCESS) { + return result; + } + + logInfo("Physical block count was %llu, now %llu", + oldPhysicalBlocks, newPhysicalBlocks); + return VDO_SUCCESS; +} + +/** + * Callback to check that we're not in recovery mode, used in + * prepareToGrowPhysical(). + * + * @param completion The sub-task completion + **/ +static void checkMayGrowPhysical(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + assertAdminOperationType(adminCompletion, + ADMIN_OPERATION_PREPARE_GROW_PHYSICAL); + + VDO *vdo = adminCompletion->completion.parent; + assertOnAdminThread(vdo, __func__); + + resetAdminSubTask(completion); + + // This check can only be done from a base code thread. + if (isReadOnly(vdo->readOnlyNotifier)) { + finishCompletion(completion->parent, VDO_READ_ONLY); + return; + } + + // This check should only be done from a base code thread. 
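+  // Growing while the VDO is still in recovery mode is refused with
+  // VDO_RETRY_AFTER_REBUILD so that the caller can retry once the rebuild
+  // has finished.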
+ if (inRecoveryMode(vdo)) { + finishCompletion(completion->parent, VDO_RETRY_AFTER_REBUILD); + return; + } + + completeCompletion(completion->parent); +} + +/**********************************************************************/ +int prepareToGrowPhysical(VDO *vdo, BlockCount newPhysicalBlocks) +{ + BlockCount currentPhysicalBlocks = vdo->config.physicalBlocks; + if (newPhysicalBlocks < currentPhysicalBlocks) { + return logErrorWithStringError(VDO_NOT_IMPLEMENTED, + "Removing physical storage from a VDO is " + "not supported"); + } + + if (newPhysicalBlocks == currentPhysicalBlocks) { + logWarning("Requested physical block count %" PRIu64 + " not greater than %llu", + newPhysicalBlocks, currentPhysicalBlocks); + finishVDOLayoutGrowth(vdo->layout); + abandonNewSlabs(vdo->depot); + return VDO_PARAMETER_MISMATCH; + } + + int result = performAdminOperation(vdo, + ADMIN_OPERATION_PREPARE_GROW_PHYSICAL, + getThreadIDForPhase, checkMayGrowPhysical, + finishParentCallback); + if (result != VDO_SUCCESS) { + return result; + } + + result = prepareToGrowVDOLayout(vdo->layout, currentPhysicalBlocks, + newPhysicalBlocks, vdo->layer); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount newDepotSize = getNextBlockAllocatorPartitionSize(vdo->layout); + result = prepareToGrowSlabDepot(vdo->depot, newDepotSize); + if (result != VDO_SUCCESS) { + finishVDOLayoutGrowth(vdo->layout); + return result; + } + + return VDO_SUCCESS; +} diff --git a/vdo/base/vdoResize.h b/vdo/base/vdoResize.h new file mode 100644 index 0000000..76bfc1f --- /dev/null +++ b/vdo/base/vdoResize.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResize.h#1 $ + */ + +#ifndef VDO_RESIZE_H +#define VDO_RESIZE_H + +#include "types.h" + +/** + * Make the completion for an asynchronous resize. + * + * @param vdo The VDO + * @param newPhysicalBlocks The new physical size in blocks + * @param completionPtr A pointer to hold the completion + * + * @return VDO_SUCCESS or an error + **/ +int makeResizeVDOCompletion(VDO *vdo, + BlockCount newPhysicalBlocks, + VDOCompletion **completionPtr) + __attribute__((warn_unused_result)); + +/** + * Free the completion for an asynchronous resize, and NULL out the + * reference to it. + * + * @param completionPtr A reference to the completion to free + **/ +void freeResizeVDOCompletion(VDOCompletion **completionPtr); + +/** + * Grow the physical size of the VDO. This method may only be called when the + * VDO has been suspended and must not be called from a base thread. 
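+ *
+ * The VDO must already have been prepared, via prepareToGrowPhysical(), for
+ * exactly this physical size; otherwise VDO_PARAMETER_MISMATCH is returned.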
+ * + * @param vdo The VDO to resize + * @param newPhysicalBlocks The new physical size in blocks + * + * @return VDO_SUCCESS or an error + **/ +int performGrowPhysical(VDO *vdo, BlockCount newPhysicalBlocks); + +/** + * Prepare to resize the VDO, allocating memory as needed. + * + * @param vdo The VDO + * @param newPhysicalBlocks The new physical size in blocks + **/ +int prepareToGrowPhysical(VDO *vdo, BlockCount newPhysicalBlocks) + __attribute__((warn_unused_result)); + +#endif /* VDO_RESIZE_H */ diff --git a/vdo/base/vdoResizeLogical.c b/vdo/base/vdoResizeLogical.c new file mode 100644 index 0000000..97a06d1 --- /dev/null +++ b/vdo/base/vdoResizeLogical.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResizeLogical.c#6 $ + */ + +#include "vdoResizeLogical.h" + +#include "logger.h" + +#include "adminCompletion.h" +#include "blockMap.h" +#include "completion.h" +#include "vdoInternal.h" + +typedef enum { + GROW_LOGICAL_PHASE_START = 0, + GROW_LOGICAL_PHASE_GROW_BLOCK_MAP, + GROW_LOGICAL_PHASE_END, + GROW_LOGICAL_PHASE_ERROR, +} GrowLogicalPhase; + +static const char *GROW_LOGICAL_PHASE_NAMES[] = { + "GROW_LOGICAL_PHASE_START", + "GROW_LOGICAL_PHASE_GROW_BLOCK_MAP", + "GROW_LOGICAL_PHASE_END", + "GROW_LOGICAL_PHASE_ERROR", +}; + +/** + * Implements ThreadIDGetterForPhase. + **/ +__attribute__((warn_unused_result)) +static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion) +{ + return getAdminThread(getThreadConfig(adminCompletion->completion.parent)); +} + +/** + * Callback to initiate a grow logical, registered in performGrowLogical(). 
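+ *
+ * The post-increment in the switch below advances the phase as each case is
+ * dispatched, and each asynchronous step completes back into this callback
+ * through the reset sub-task, so the phases run in order:
+ *
+ *   GROW_LOGICAL_PHASE_START            record the new logical size in the
+ *                                       config and save the VDO components
+ *   GROW_LOGICAL_PHASE_GROW_BLOCK_MAP   grow the block map itself
+ *   GROW_LOGICAL_PHASE_END              finish the suspended operation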
+ * + * @param completion The sub-task completion + **/ +static void growLogicalCallback(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + assertAdminOperationType(adminCompletion, ADMIN_OPERATION_GROW_LOGICAL); + assertAdminPhaseThread(adminCompletion, __func__, GROW_LOGICAL_PHASE_NAMES); + + VDO *vdo = adminCompletion->completion.parent; + switch (adminCompletion->phase++) { + case GROW_LOGICAL_PHASE_START: + if (isReadOnly(vdo->readOnlyNotifier)) { + logErrorWithStringError(VDO_READ_ONLY, + "Can't grow logical size of a read-only VDO"); + finishCompletion(resetAdminSubTask(completion), VDO_READ_ONLY); + return; + } + + if (startOperationWithWaiter(&vdo->adminState, + ADMIN_STATE_SUSPENDED_OPERATION, + &adminCompletion->completion, NULL)) { + + vdo->config.logicalBlocks = getNewEntryCount(getBlockMap(vdo)); + saveVDOComponentsAsync(vdo, resetAdminSubTask(completion)); + } + + return; + + case GROW_LOGICAL_PHASE_GROW_BLOCK_MAP: + growBlockMap(getBlockMap(vdo), resetAdminSubTask(completion)); + return; + + case GROW_LOGICAL_PHASE_END: + break; + + case GROW_LOGICAL_PHASE_ERROR: + enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); + break; + + default: + setCompletionResult(resetAdminSubTask(completion), UDS_BAD_STATE); + } + + finishOperationWithResult(&vdo->adminState, completion->result); +} + +/** + * Handle an error during the grow physical process. + * + * @param completion The sub-task completion + **/ +static void handleGrowthError(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + if (adminCompletion->phase == GROW_LOGICAL_PHASE_GROW_BLOCK_MAP) { + // We've failed to write the new size in the super block, so set our + // in memory config back to the old size. + VDO *vdo = adminCompletion->completion.parent; + BlockMap *map = getBlockMap(vdo); + vdo->config.logicalBlocks = getNumberOfBlockMapEntries(map); + abandonBlockMapGrowth(map); + } + + adminCompletion->phase = GROW_LOGICAL_PHASE_ERROR; + growLogicalCallback(completion); +} + +/**********************************************************************/ +int performGrowLogical(VDO *vdo, BlockCount newLogicalBlocks) +{ + if (getNewEntryCount(getBlockMap(vdo)) != newLogicalBlocks) { + return VDO_PARAMETER_MISMATCH; + } + + return performAdminOperation(vdo, ADMIN_OPERATION_GROW_LOGICAL, + getThreadIDForPhase, growLogicalCallback, + handleGrowthError); +} + +/**********************************************************************/ +int prepareToGrowLogical(VDO *vdo, BlockCount newLogicalBlocks) +{ + if (newLogicalBlocks < vdo->config.logicalBlocks) { + return logErrorWithStringError(VDO_PARAMETER_MISMATCH, + "Can't shrink VDO logical size from its " + "current value of %llu", + vdo->config.logicalBlocks); + } + + if (newLogicalBlocks == vdo->config.logicalBlocks) { + return logErrorWithStringError(VDO_PARAMETER_MISMATCH, + "Can't grow VDO logical size to its " + "current value of %llu", + vdo->config.logicalBlocks); + } + + return prepareToGrowBlockMap(getBlockMap(vdo), newLogicalBlocks); +} diff --git a/vdo/base/vdoResizeLogical.h b/vdo/base/vdoResizeLogical.h new file mode 100644 index 0000000..fbea60d --- /dev/null +++ b/vdo/base/vdoResizeLogical.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResizeLogical.h#1 $ + */ + +#ifndef VDO_RESIZE_LOGICAL_H +#define VDO_RESIZE_LOGICAL_H + +#include "types.h" + +/** + * Grow the logical size of the VDO. This method may only be called when the + * VDO has been suspended and must not be called from a base thread. + * + * @param vdo The VDO to grow + * @param newLogicalBlocks The size to which the VDO should be grown + * + * @return VDO_SUCCESS or an error + **/ +int performGrowLogical(VDO *vdo, BlockCount newLogicalBlocks); + +/** + * Prepare to grow the logical size of the VDO. This method may only be called + * while the VDO is running. + * + * @param vdo The VDO to prepare for growth + * @param newLogicalBlocks The size to which the VDO should be grown + * + * @return VDO_SUCCESS or an error + **/ +int prepareToGrowLogical(VDO *vdo, BlockCount newLogicalBlocks); + +#endif /* VDO_RESIZE_LOGICAL_H */ diff --git a/vdo/base/vdoResume.c b/vdo/base/vdoResume.c new file mode 100644 index 0000000..a10c2ef --- /dev/null +++ b/vdo/base/vdoResume.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResume.c#3 $ + */ + +#include "vdoResume.h" + +#include "logger.h" + +#include "adminCompletion.h" +#include "blockMap.h" +#include "completion.h" +#include "logicalZone.h" +#include "recoveryJournal.h" +#include "slabDepot.h" +#include "slabSummary.h" +#include "threadConfig.h" +#include "vdoInternal.h" + +typedef enum { + RESUME_PHASE_START = 0, + RESUME_PHASE_ALLOW_READ_ONLY_MODE, + RESUME_PHASE_DEPOT, + RESUME_PHASE_JOURNAL, + RESUME_PHASE_BLOCK_MAP, + RESUME_PHASE_LOGICAL_ZONES, + RESUME_PHASE_PACKER, + RESUME_PHASE_END, +} ResumePhase; + +static const char *RESUME_PHASE_NAMES[] = { + "RESUME_PHASE_START", + "RESUME_PHASE_ALLOW_READ_ONLY_MODE", + "RESUME_PHASE_DEPOT", + "RESUME_PHASE_JOURNAL", + "RESUME_PHASE_BLOCK_MAP", + "RESUME_PHASE_LOGICAL_ZONES", + "RESUME_PHASE_PACKER", + "RESUME_PHASE_END", +}; + +/** + * Implements ThreadIDGetterForPhase. 
+ **/ +__attribute__((warn_unused_result)) +static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion) +{ + const ThreadConfig *threadConfig + = getThreadConfig(adminCompletion->completion.parent); + switch (adminCompletion->phase) { + case RESUME_PHASE_JOURNAL: + return getJournalZoneThread(threadConfig); + + case RESUME_PHASE_PACKER: + return getPackerZoneThread(threadConfig); + + default: + return getAdminThread(threadConfig); + } +} + +/** + * Update the VDO state and save the super block. + * + * @param vdo The VDO being resumed + * @param completion The AdminCompletion's sub-task completion + **/ +static void writeSuperBlock(VDO *vdo, VDOCompletion *completion) +{ + switch (vdo->state) { + case VDO_CLEAN: + case VDO_NEW: + vdo->state = VDO_DIRTY; + saveVDOComponentsAsync(vdo, completion); + return; + + case VDO_DIRTY: + case VDO_READ_ONLY_MODE: + case VDO_FORCE_REBUILD: + case VDO_RECOVERING: + case VDO_REBUILD_FOR_UPGRADE: + // No need to write the super block in these cases + completeCompletion(completion); + return; + + case VDO_REPLAYING: + default: + finishCompletion(completion, UDS_BAD_STATE); + } +} + +/** + * Callback to resume a VDO. + * + * @param completion The sub-task completion + **/ +static void resumeCallback(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + assertAdminOperationType(adminCompletion, ADMIN_OPERATION_RESUME); + assertAdminPhaseThread(adminCompletion, __func__, RESUME_PHASE_NAMES); + + VDO *vdo = adminCompletion->completion.parent; + switch (adminCompletion->phase++) { + case RESUME_PHASE_START: + if (startResuming(&vdo->adminState, ADMIN_STATE_RESUMING, + &adminCompletion->completion, NULL)) { + writeSuperBlock(vdo, completion); + } + return; + + case RESUME_PHASE_ALLOW_READ_ONLY_MODE: + allowReadOnlyModeEntry(vdo->readOnlyNotifier, + resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_DEPOT: + resumeSlabDepot(vdo->depot, resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_JOURNAL: + resumeRecoveryJournal(vdo->recoveryJournal, resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_BLOCK_MAP: + resumeBlockMap(vdo->blockMap, resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_LOGICAL_ZONES: + resumeLogicalZones(vdo->logicalZones,resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_PACKER: + resumePacker(vdo->packer, resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_END: + break; + + default: + setCompletionResult(resetAdminSubTask(completion), UDS_BAD_STATE); + } + + finishResumingWithResult(&vdo->adminState, completion->result); +} + +/**********************************************************************/ +int performVDOResume(VDO *vdo) +{ + return performAdminOperation(vdo, ADMIN_OPERATION_RESUME, + getThreadIDForPhase, resumeCallback, + preserveErrorAndContinue); +} diff --git a/vdo/base/vdoResume.h b/vdo/base/vdoResume.h new file mode 100644 index 0000000..1ef25b2 --- /dev/null +++ b/vdo/base/vdoResume.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResume.h#1 $ + */ + +#ifndef VDO_RESUME_H +#define VDO_RESUME_H + +#include "types.h" + +/** + * Resume a suspended VDO. + * + * @param vdo The VDO to resume + * + * @return VDO_SUCCESS or an error + **/ +int performVDOResume(VDO *vdo); + +#endif /* VDO_RESUME_H */ diff --git a/vdo/base/vdoState.c b/vdo/base/vdoState.c new file mode 100644 index 0000000..00d3986 --- /dev/null +++ b/vdo/base/vdoState.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoState.c#1 $ + */ + +#include "vdoState.h" + +#include "permassert.h" + +static const char *VDO_STATE_NAMES[] = { + [VDO_CLEAN] = "CLEAN", + [VDO_DIRTY] = "DIRTY", + [VDO_FORCE_REBUILD] = "FORCE_REBUILD", + [VDO_NEW] = "NEW", + [VDO_READ_ONLY_MODE] = "READ_ONLY_MODE", + [VDO_REBUILD_FOR_UPGRADE] = "REBUILD_FOR_UPGRADE", + [VDO_RECOVERING] = "RECOVERING", + [VDO_REPLAYING] = "REPLAYING", +}; + +/**********************************************************************/ +const char *getVDOStateName(VDOState state) +{ + // Catch if a state has been added without updating the name array. + STATIC_ASSERT(COUNT_OF(VDO_STATE_NAMES) == VDO_STATE_COUNT); + + int result = ASSERT(state < COUNT_OF(VDO_STATE_NAMES), + "VDOState value %u must have a registered name", state); + if (result != UDS_SUCCESS) { + return "INVALID VDO STATE CODE"; + } + + return VDO_STATE_NAMES[state]; +} + +/**********************************************************************/ +const char *describeVDOState(VDOState state) +{ + // These strings should all fit in the 15 chars of VDOStatistics.mode. + switch (state) { + case VDO_RECOVERING: + return "recovering"; + + case VDO_READ_ONLY_MODE: + return "read-only"; + + default: + return "normal"; + } +} diff --git a/vdo/base/vdoState.h b/vdo/base/vdoState.h new file mode 100644 index 0000000..5843565 --- /dev/null +++ b/vdo/base/vdoState.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoState.h#2 $ + */ + +#ifndef VDO_STATE_H +#define VDO_STATE_H + +/** + * The current operating mode of the VDO. These are persistent on disk + * so the values must not change. + **/ +typedef enum { + VDO_DIRTY = 0, + VDO_NEW = 1, + VDO_CLEAN = 2, + VDO_READ_ONLY_MODE = 3, + VDO_FORCE_REBUILD = 4, + VDO_RECOVERING = 5, + VDO_REPLAYING = 6, + VDO_REBUILD_FOR_UPGRADE = 7, + + // Keep VDO_STATE_COUNT at the bottom. + VDO_STATE_COUNT +} VDOState; + +/** + * Get the name of a VDO state code for logging purposes. + * + * @param state The state code + * + * @return The name of the state code + **/ +const char *getVDOStateName(VDOState state) + __attribute__((warn_unused_result)); + +/** + * Return a user-visible string describing the current VDO state. + * + * @param state The VDO state to describe + * + * @return A string constant describing the state + **/ +const char *describeVDOState(VDOState state) + __attribute__((warn_unused_result)); + +#endif // VDO_STATE_H diff --git a/vdo/base/vdoSuspend.c b/vdo/base/vdoSuspend.c new file mode 100644 index 0000000..e919f19 --- /dev/null +++ b/vdo/base/vdoSuspend.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoSuspend.c#4 $ + */ + +#include "vdoSuspend.h" + +#include "logger.h" + +#include "adminCompletion.h" +#include "blockMap.h" +#include "completion.h" +#include "logicalZone.h" +#include "recoveryJournal.h" +#include "slabDepot.h" +#include "slabSummary.h" +#include "threadConfig.h" +#include "vdoInternal.h" + +typedef enum { + SUSPEND_PHASE_START = 0, + SUSPEND_PHASE_PACKER, + SUSPEND_PHASE_LOGICAL_ZONES, + SUSPEND_PHASE_BLOCK_MAP, + SUSPEND_PHASE_JOURNAL, + SUSPEND_PHASE_DEPOT, + SUSPEND_PHASE_WRITE_SUPER_BLOCK, + SUSPEND_PHASE_END, +} SuspendPhase; + +static const char *SUSPEND_PHASE_NAMES[] = { + "SUSPEND_PHASE_START", + "SUSPEND_PHASE_PACKER", + "SUSPEND_PHASE_LOGICAL_ZONES", + "SUSPEND_PHASE_BLOCK_MAP", + "SUSPEND_PHASE_JOURNAL", + "SUSPEND_PHASE_DEPOT", + "SUSPEND_PHASE_WRITE_SUPER_BLOCK", + "SUSPEND_PHASE_END", +}; + +/** + * Implements ThreadIDGetterForPhase. 
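+ *
+ * Phases which drain a structure owned by a particular zone are dispatched to
+ * that zone's thread (the packer and journal phases below); all other phases
+ * run on the admin thread.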
+ **/ +__attribute__((warn_unused_result)) +static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion) +{ + const ThreadConfig *threadConfig + = getThreadConfig(adminCompletion->completion.parent); + switch (adminCompletion->phase) { + case SUSPEND_PHASE_PACKER: + return getPackerZoneThread(threadConfig); + + case SUSPEND_PHASE_JOURNAL: + return getJournalZoneThread(threadConfig); + + default: + return getAdminThread(threadConfig); + } +} + +/** + * Update the VDO state and save the super block. + * + * @param vdo The VDO being suspended + * @param completion The AdminCompletion's sub-task completion + **/ +static void writeSuperBlock(VDO *vdo, VDOCompletion *completion) +{ + switch (vdo->state) { + case VDO_DIRTY: + case VDO_NEW: + vdo->state = VDO_CLEAN; + break; + + case VDO_CLEAN: + case VDO_READ_ONLY_MODE: + case VDO_FORCE_REBUILD: + case VDO_RECOVERING: + case VDO_REBUILD_FOR_UPGRADE: + break; + + case VDO_REPLAYING: + default: + finishCompletion(completion, UDS_BAD_STATE); + return; + } + + saveVDOComponentsAsync(vdo, completion); +} + +/** + * Callback to initiate a suspend, registered in performVDOSuspend(). + * + * @param completion The sub-task completion + **/ +static void suspendCallback(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + ASSERT_LOG_ONLY(((adminCompletion->type == ADMIN_OPERATION_SUSPEND) + || (adminCompletion->type == ADMIN_OPERATION_SAVE)), + "unexpected admin operation type %u is neither " + "suspend nor save", adminCompletion->type); + assertAdminPhaseThread(adminCompletion, __func__, SUSPEND_PHASE_NAMES); + + VDO *vdo = adminCompletion->completion.parent; + switch (adminCompletion->phase++) { + case SUSPEND_PHASE_START: + if (!startDraining(&vdo->adminState, + ((adminCompletion->type == ADMIN_OPERATION_SUSPEND) + ? ADMIN_STATE_SUSPENDING : ADMIN_STATE_SAVING), + &adminCompletion->completion, NULL)) { + return; + } + + if (!vdo->closeRequired) { + // There's nothing to do. + break; + } + + waitUntilNotEnteringReadOnlyMode(vdo->readOnlyNotifier, + resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_PACKER: + /* + * If the VDO was already resumed from a prior suspend while read-only, + * some of the components may not have been resumed. By setting a read-only + * error here, we guarantee that the result of this suspend will be + * VDO_READ_ONLY and not VDO_INVALID_ADMIN_STATE in that case. + */ + if (inReadOnlyMode(vdo)) { + setCompletionResult(&adminCompletion->completion, VDO_READ_ONLY); + } + + drainPacker(vdo->packer, resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_LOGICAL_ZONES: + drainLogicalZones(vdo->logicalZones, vdo->adminState.state, + resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_BLOCK_MAP: + drainBlockMap(vdo->blockMap, vdo->adminState.state, + resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_JOURNAL: + drainRecoveryJournal(vdo->recoveryJournal, vdo->adminState.state, + resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_DEPOT: + drainSlabDepot(vdo->depot, vdo->adminState.state, + resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_WRITE_SUPER_BLOCK: + if (isSuspending(&vdo->adminState) + || (adminCompletion->completion.result != VDO_SUCCESS)) { + // If we didn't save the VDO or there was an error, we're done. 
+ break; + } + + writeSuperBlock(vdo, resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_END: + break; + + default: + setCompletionResult(completion, UDS_BAD_STATE); + } + + finishDrainingWithResult(&vdo->adminState, completion->result); +} + +/**********************************************************************/ +int performVDOSuspend(VDO *vdo, bool save) +{ + return performAdminOperation(vdo, (save + ? ADMIN_OPERATION_SAVE + : ADMIN_OPERATION_SUSPEND), + getThreadIDForPhase, suspendCallback, + preserveErrorAndContinue); +} diff --git a/vdo/base/vdoSuspend.h b/vdo/base/vdoSuspend.h new file mode 100644 index 0000000..39172dc --- /dev/null +++ b/vdo/base/vdoSuspend.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoSuspend.h#1 $ + */ + +#ifndef VDO_SUSPEND_H +#define VDO_SUSPEND_H + +#include "types.h" + +/** + * Ensure that the VDO has no outstanding I/O and will issue none until it is + * resumed. + * + * @param vdo The VDO to suspend + * @param save If true, all dirty metadata will be flushed as + * well + * + * @return VDO_SUCCESS or an error + **/ +int performVDOSuspend(VDO *vdo, bool save); + +#endif /* VDO_SUSPEND_H */ diff --git a/vdo/base/vio.c b/vdo/base/vio.c new file mode 100644 index 0000000..9bd678d --- /dev/null +++ b/vdo/base/vio.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vio.c#5 $ + */ + +#include "vio.h" + +#include "logger.h" + +#include "dataVIO.h" +#include "vdoInternal.h" + +#ifdef __KERNEL__ +#include +#endif + +/**********************************************************************/ +void freeVIO(VIO **vioPtr) +{ + VIO *vio = *vioPtr; + if (vio == NULL) { + return; + } + + vio->completion.layer->freeVIO(vioPtr); +} + +/**********************************************************************/ +void initializeVIO(VIO *vio, + VIOType type, + VIOPriority priority, + VDOCompletion *parent, + VDO *vdo, + PhysicalLayer *layer) +{ + vio->vdo = vdo; + vio->type = type; + vio->priority = priority; + + VDOCompletion *completion = vioAsCompletion(vio); + initializeCompletion(completion, VIO_COMPLETION, layer); + completion->parent = parent; +} + +/**********************************************************************/ +void vioDoneCallback(VDOCompletion *completion) +{ + VIO *vio = asVIO(completion); + completion->callback = vio->callback; + completion->errorHandler = vio->errorHandler; + completeCompletion(completion); +} + +/**********************************************************************/ +const char *getVIOReadWriteFlavor(const VIO *vio) +{ + if (isReadVIO(vio)) { + return "read"; + } + return (isWriteVIO(vio) ? "write" : "read-modify-write"); +} + +/**********************************************************************/ +void updateVIOErrorStats(VIO *vio, const char *format, ...) +{ + int priority; + int result = vioAsCompletion(vio)->result; + switch (result) { + case VDO_READ_ONLY: + atomicAdd64(&vio->vdo->errorStats.readOnlyErrorCount, 1); + return; + + case VDO_NO_SPACE: + atomicAdd64(&vio->vdo->errorStats.noSpaceErrorCount, 1); + priority = LOG_DEBUG; + break; + + default: + priority = LOG_ERR; + } + +#ifdef __KERNEL__ + static DEFINE_RATELIMIT_STATE(errorLimiter, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if (!__ratelimit(&errorLimiter)) { + return; + } +#endif + + va_list args; + va_start(args, format); + vLogWithStringError(priority, result, format, args); + va_end(args); +} + +/** + * Handle an error from a metadata I/O. + * + * @param completion The VIO + **/ +static void handleMetadataIOError(VDOCompletion *completion) +{ + VIO *vio = asVIO(completion); + updateVIOErrorStats(vio, + "Completing %s VIO of type %u for physical block %" + PRIu64 " with error", + getVIOReadWriteFlavor(vio), vio->type, vio->physical); + vioDoneCallback(completion); +} + +/**********************************************************************/ +void launchMetadataVIO(VIO *vio, + PhysicalBlockNumber physical, + VDOAction *callback, + VDOAction *errorHandler, + VIOOperation operation) +{ + vio->operation = operation; + vio->physical = physical; + vio->callback = callback; + vio->errorHandler = errorHandler; + + VDOCompletion *completion = vioAsCompletion(vio); + resetCompletion(completion); + completion->callback = vioDoneCallback; + completion->errorHandler = handleMetadataIOError; + + if (isReadVIO(vio)) { + completion->layer->readMetadata(vio); + } else { + completion->layer->writeMetadata(vio); + } +} + +/** + * Handle a flush error. 
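+ *
+ * After logging the failure, this restores the error handler which
+ * launchFlush() stashed in the VIO itself, so completing the completion hands
+ * the error on to the caller's original handler.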
+ * + * @param completion The flush VIO + **/ +static void handleFlushError(VDOCompletion *completion) +{ + logErrorWithStringError(completion->result, "Error flushing layer"); + completion->errorHandler = asVIO(completion)->errorHandler; + completeCompletion(completion); +} + +/**********************************************************************/ +void launchFlush(VIO *vio, VDOAction *callback, VDOAction *errorHandler) +{ + VDOCompletion *completion = vioAsCompletion(vio); + resetCompletion(completion); + completion->callback = callback; + completion->errorHandler = handleFlushError; + vio->errorHandler = errorHandler; + vio->operation = VIO_FLUSH_BEFORE; + vio->physical = ZERO_BLOCK; + + PhysicalLayer *layer = completion->layer; + if (layer->getWritePolicy(layer) == WRITE_POLICY_SYNC) { + // XXX It is dangerous to be subtly dropping flushes possibly + // needed for correctness in sync mode. + finishCompletion(completion, VDO_SUCCESS); + return; + } + + layer->flush(vio); +} diff --git a/vdo/base/vio.h b/vdo/base/vio.h new file mode 100644 index 0000000..8129cc6 --- /dev/null +++ b/vdo/base/vio.h @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vio.h#3 $ + */ + +#ifndef VIO_H +#define VIO_H + +#include + +#include "completion.h" +#include "trace.h" +#include "types.h" +#include "vdo.h" + +/** + * A representation of a single block which may be passed between the VDO base + * and the physical layer. + **/ +struct vio { + /* The completion for this VIO */ + VDOCompletion completion; + + /* The functions to call when this VIO's operation is complete */ + VDOAction *callback; + VDOAction *errorHandler; + + /* The VDO handling this VIO */ + VDO *vdo; + + /* The address on the underlying device of the block to be read/written */ + PhysicalBlockNumber physical; + + /* The type of request this VIO is servicing */ + VIOOperation operation; + + /* The queueing priority of the VIO operation */ + VIOPriority priority; + + /* The VIO type is used for statistics and instrumentation. */ + VIOType type; + + /* Used for logging and debugging */ + Trace *trace; +}; + +/** + * Convert a generic VDOCompletion to a VIO. + * + * @param completion The completion to convert + * + * @return The completion as a VIO + **/ +static inline VIO *asVIO(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(VIO, completion) == 0); + assertCompletionType(completion->type, VIO_COMPLETION); + return (VIO *) completion; +} + +/** + * Convert a VIO to a generic completion. + * + * @param vio The VIO to convert + * + * @return The VIO as a completion + **/ +static inline VDOCompletion *vioAsCompletion(VIO *vio) +{ + return &vio->completion; +} + +/** + * Create a VIO. 
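+ *
+ * A hedged usage sketch (the buffer, the physical block number, the VIO type
+ * and priority, and the callbacks are illustrative placeholders, and error
+ * handling is abbreviated):
+ *
+ *   VIO *vio;
+ *   int result = createVIO(layer, vioType, priority, parent, buffer, &vio);
+ *   if (result != VDO_SUCCESS) {
+ *     return result;
+ *   }
+ *   launchReadMetadataVIO(vio, pbn, readDoneCallback, readErrorHandler);
+ *   // ... and eventually freeVIO(&vio) once the VIO is no longer needed.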
+ * + * @param [in] layer The physical layer + * @param [in] vioType The type of VIO to create + * @param [in] priority The relative priority to assign to the VIO + * @param [in] parent The parent of the VIO + * @param [in] data The buffer + * @param [out] vioPtr A pointer to hold the new VIO + * + * @return VDO_SUCCESS or an error + **/ +static inline int createVIO(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + char *data, + VIO **vioPtr) +{ + return layer->createMetadataVIO(layer, vioType, priority, parent, data, + vioPtr); +} + +/** + * Destroy a vio. The pointer to the VIO will be nulled out. + * + * @param vioPtr A pointer to the VIO to destroy + **/ +void freeVIO(VIO **vioPtr); + +/** + * Initialize a VIO. + * + * @param vio The VIO to initialize + * @param type The VIO type + * @param priority The relative priority of the VIO + * @param parent The parent (the extent completion) to assign to the VIO + * completion + * @param vdo The VDO for this VIO + * @param layer The layer for this VIO + **/ +void initializeVIO(VIO *vio, + VIOType type, + VIOPriority priority, + VDOCompletion *parent, + VDO *vdo, + PhysicalLayer *layer); + +/** + * The very last step in processing a VIO. Set the VIO's completion's callback + * and error handler from the fields set in the VIO itself on launch and then + * actually complete the VIO's completion. + * + * @param completion The VIO + **/ +void vioDoneCallback(VDOCompletion *completion); + +/** + * Get the name of a VIO's operation. + * + * @param vio The VIO + * + * @return The name of the VIO's operation (read, write, or read-modify-write) + **/ +const char *getVIOReadWriteFlavor(const VIO *vio); + +/** + * Update per-VIO error stats and log the error. + * + * @param vio The VIO which got an error + * @param format The format of the message to log (a printf style format) + **/ +void updateVIOErrorStats(VIO *vio, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/** + * Add a trace record for the current source location. + * + * @param vio The VIO structure to be updated + * @param location The source-location descriptor to be recorded + **/ +static inline void vioAddTraceRecord(VIO *vio, TraceLocation location) +{ + if (unlikely(vio->trace != NULL)) { + addTraceRecord(vio->trace, location); + } +} + +/** + * Check whether a VIO is servicing an external data request. + * + * @param vio The VIO to check + **/ +static inline bool isDataVIO(VIO *vio) +{ + return isDataVIOType(vio->type); +} + +/** + * Check whether a VIO is for compressed block writes + * + * @param vio The VIO to check + **/ +static inline bool isCompressedWriteVIO(VIO *vio) +{ + return isCompressedWriteVIOType(vio->type); +} + +/** + * Check whether a VIO is for metadata + * + * @param vio The VIO to check + **/ +static inline bool isMetadataVIO(VIO *vio) +{ + return isMetadataVIOType(vio->type); +} + +/** + * Check whether a VIO is a read. + * + * @param vio The VIO + * + * @return true if the VIO is a read + **/ +static inline bool isReadVIO(const VIO *vio) +{ + return ((vio->operation & VIO_READ_WRITE_MASK) == VIO_READ); +} + +/** + * Check whether a VIO is a read-modify-write. + * + * @param vio The VIO + * + * @return true if the VIO is a read-modify-write + **/ +static inline bool isReadModifyWriteVIO(const VIO *vio) +{ + return ((vio->operation & VIO_READ_WRITE_MASK) == VIO_READ_MODIFY_WRITE); +} + +/** + * Check whether a VIO is a write. 
+ * + * @param vio The VIO + * + * @return true if the VIO is a write + **/ +static inline bool isWriteVIO(const VIO *vio) +{ + return ((vio->operation & VIO_READ_WRITE_MASK) == VIO_WRITE); +} + +/** + * Check whether a VIO requires a flush before doing its I/O. + * + * @param vio The VIO + * + * @return true if the VIO requires a flush before + **/ +static inline bool vioRequiresFlushBefore(const VIO *vio) +{ + return ((vio->operation & VIO_FLUSH_BEFORE) == VIO_FLUSH_BEFORE); +} + +/** + * Check whether a VIO requires a flush after doing its I/O. + * + * @param vio The VIO + * + * @return true if the VIO requires a flush after + **/ +static inline bool vioRequiresFlushAfter(const VIO *vio) +{ + return ((vio->operation & VIO_FLUSH_AFTER) == VIO_FLUSH_AFTER); +} + +/** + * Launch a metadata VIO. + * + * @param vio The VIO to launch + * @param physical The physical block number to read or write + * @param callback The function to call when the VIO completes its I/O + * @param errorHandler The handler for write errors + * @param operation The operation to perform (read or write) + **/ +void launchMetadataVIO(VIO *vio, + PhysicalBlockNumber physical, + VDOAction *callback, + VDOAction *errorHandler, + VIOOperation operation); + +/** + * Launch a metadata read VIO. + * + * @param vio The VIO to launch + * @param physical The physical block number to read + * @param callback The function to call when the VIO completes its read + * @param errorHandler The handler for write errors + **/ +static inline void launchReadMetadataVIO(VIO *vio, + PhysicalBlockNumber physical, + VDOAction *callback, + VDOAction *errorHandler) +{ + launchMetadataVIO(vio, physical, callback, errorHandler, VIO_READ); +} + +/** + * Launch a metadata write VIO. + * + * @param vio The VIO to launch + * @param physical The physical block number to write + * @param callback The function to call when the VIO completes its write + * @param errorHandler The handler for write errors + **/ +static inline void launchWriteMetadataVIO(VIO *vio, + PhysicalBlockNumber physical, + VDOAction *callback, + VDOAction *errorHandler) +{ + launchMetadataVIO(vio, physical, callback, errorHandler, VIO_WRITE); +} + +/** + * Launch a metadata write VIO optionally flushing the layer before and/or + * after the write operation. + * + * @param vio The VIO to launch + * @param physical The physical block number to write + * @param callback The function to call when the VIO completes its + * operation + * @param errorHandler The handler for flush or write errors + * @param flushBefore Whether or not to flush before writing + * @param flushAfter Whether or not to flush after writing + **/ +static inline +void launchWriteMetadataVIOWithFlush(VIO *vio, + PhysicalBlockNumber physical, + VDOAction *callback, + VDOAction *errorHandler, + bool flushBefore, + bool flushAfter) +{ + launchMetadataVIO(vio, physical, callback, errorHandler, + (VIO_WRITE + | (flushBefore ? VIO_FLUSH_BEFORE : 0) + | (flushAfter ? VIO_FLUSH_AFTER : 0))); +} + +/** + * Issue a flush to the layer. If the layer does not require flushing, this + * method will immediately finish the VIO with which it was called. Care must + * be taken to avoid introducing a stack overflow in that case. 
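+ *
+ * (The hazard exists because the no-flush-needed path finishes the completion
+ * synchronously on the caller's stack, so a callback which immediately
+ * launches another flush from that context could recurse instead of being
+ * re-enqueued.)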
+ * + * @param vio The VIO to notify when the flush is complete + * @param callback The function to call when the flush is complete + * @param errorHandler The handler for flush errors + **/ +void launchFlush(VIO *vio, VDOAction *callback, VDOAction *errorHandler); + +#endif // VIO_H diff --git a/vdo/base/vioPool.c b/vdo/base/vioPool.c new file mode 100644 index 0000000..3d5ce07 --- /dev/null +++ b/vdo/base/vioPool.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioPool.c#5 $ + */ + +#include "vioPool.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "constants.h" +#include "vio.h" +#include "types.h" + +/** + * An VIOPool is a collection of preallocated VIOs. + **/ +struct vioPool { + /** The number of objects managed by the pool */ + size_t size; + /** The list of objects which are available */ + RingNode available; + /** The queue of requestors waiting for objects from the pool */ + WaitQueue waiting; + /** The number of objects currently in use */ + size_t busyCount; + /** The list of objects which are in use */ + RingNode busy; + /** The number of requests when no object was available */ + uint64_t outageCount; + /** The ID of the thread on which this pool may be used */ + ThreadID threadID; + /** The buffer backing the pool's VIOs */ + char *buffer; + /** The pool entries */ + VIOPoolEntry entries[]; +}; + +/**********************************************************************/ +int makeVIOPool(PhysicalLayer *layer, + size_t poolSize, + ThreadID threadID, + VIOConstructor *vioConstructor, + void *context, + VIOPool **poolPtr) +{ + VIOPool *pool; + int result = ALLOCATE_EXTENDED(VIOPool, poolSize, VIOPoolEntry, __func__, + &pool); + if (result != VDO_SUCCESS) { + return result; + } + + pool->threadID = threadID; + initializeRing(&pool->available); + initializeRing(&pool->busy); + + result = ALLOCATE(poolSize * VDO_BLOCK_SIZE, char, "VIO pool buffer", + &pool->buffer); + if (result != VDO_SUCCESS) { + freeVIOPool(&pool); + return result; + } + + char *ptr = pool->buffer; + for (size_t i = 0; i < poolSize; i++) { + VIOPoolEntry *entry = &pool->entries[i]; + entry->buffer = ptr; + entry->context = context; + result = vioConstructor(layer, entry, ptr, &entry->vio); + if (result != VDO_SUCCESS) { + freeVIOPool(&pool); + return result; + } + + ptr += VDO_BLOCK_SIZE; + initializeRing(&entry->node); + pushRingNode(&pool->available, &entry->node); + pool->size++; + } + + *poolPtr = pool; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeVIOPool(VIOPool **poolPtr) +{ + if (*poolPtr == NULL) { + return; + } + + // Remove all available entries from the object pool. 
+ VIOPool *pool = *poolPtr; + ASSERT_LOG_ONLY(!hasWaiters(&pool->waiting), + "VIO pool must not have any waiters when being freed"); + ASSERT_LOG_ONLY((pool->busyCount == 0), + "VIO pool must not have %zu busy entries when being freed", + pool->busyCount); + ASSERT_LOG_ONLY(isRingEmpty(&pool->busy), + "VIO pool must not have busy entries when being freed"); + + VIOPoolEntry *entry; + while ((entry = asVIOPoolEntry(chopRingNode(&pool->available))) != NULL) { + freeVIO(&entry->vio); + } + + // Make sure every VIOPoolEntry has been removed. + for (size_t i = 0; i < pool->size; i++) { + VIOPoolEntry *entry = &pool->entries[i]; + ASSERT_LOG_ONLY(isRingEmpty(&entry->node), "VIO Pool entry still in use:" + " VIO is in use for physical block %" PRIu64 + " for operation %u", + entry->vio->physical, + entry->vio->operation); + } + + FREE(pool->buffer); + FREE(pool); + *poolPtr = NULL; +} + +/**********************************************************************/ +bool isVIOPoolBusy(VIOPool *pool) +{ + return (pool->busyCount != 0); +} + +/**********************************************************************/ +int acquireVIOFromPool(VIOPool *pool, Waiter *waiter) +{ + ASSERT_LOG_ONLY((pool->threadID == getCallbackThreadID()), + "acquire from active VIOPool called from correct thread"); + + if (isRingEmpty(&pool->available)) { + pool->outageCount++; + return enqueueWaiter(&pool->waiting, waiter); + } + + pool->busyCount++; + RingNode *entry = chopRingNode(&pool->available); + pushRingNode(&pool->busy, entry); + (*waiter->callback)(waiter, entry); + return VDO_SUCCESS; +} + +/**********************************************************************/ +void returnVIOToPool(VIOPool *pool, VIOPoolEntry *entry) +{ + ASSERT_LOG_ONLY((pool->threadID == getCallbackThreadID()), + "vio pool entry returned on same thread as it was acquired"); + entry->vio->completion.errorHandler = NULL; + if (hasWaiters(&pool->waiting)) { + notifyNextWaiter(&pool->waiting, NULL, entry); + return; + } + + pushRingNode(&pool->available, &entry->node); + --pool->busyCount; +} + +/**********************************************************************/ +uint64_t getVIOPoolOutageCount(VIOPool *pool) +{ + return pool->outageCount; +} diff --git a/vdo/base/vioPool.h b/vdo/base/vioPool.h new file mode 100644 index 0000000..bab3dbe --- /dev/null +++ b/vdo/base/vioPool.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioPool.h#4 $ + */ + +#ifndef VIO_POOL_H +#define VIO_POOL_H + +#include "permassert.h" + +#include "completion.h" +#include "types.h" +#include "waitQueue.h" + +/** + * A VIOPool is a collection of preallocated VIOs used to write arbitrary + * metadata blocks. 
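+ *
+ * A rough sketch of the acquire/return cycle (the waiter wiring is
+ * illustrative, and how the pool itself is reached from the callback is left
+ * to the caller; the waiter's callback receives the VIOPoolEntry as its
+ * context argument):
+ *
+ *   static void entryAvailable(Waiter *waiter, void *context)
+ *   {
+ *     VIOPoolEntry *entry = context;
+ *     // ... use entry->vio and entry->buffer for a metadata write,
+ *     // then, once the write completes, give the entry back:
+ *     returnVIOToPool(pool, entry);
+ *   }
+ *
+ *   waiter->callback = entryAvailable;
+ *   acquireVIOFromPool(pool, waiter);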
+ **/ + +/** + * An VIOPoolEntry is the pair of VIO and buffer whether in use or not. + **/ +typedef struct { + RingNode node; + VIO *vio; + void *buffer; + void *parent; + void *context; +} VIOPoolEntry; + +/** + * A function which constructs a VIO for a pool. + * + * @param [in] layer The physical layer in which the VIO will operate + * @param [in] parent The parent of the VIO + * @param [in] buffer The data buffer for the VIO + * @param [out] vioPtr A pointer to hold the new VIO + **/ +typedef int VIOConstructor(PhysicalLayer *layer, + void *parent, + void *buffer, + VIO **vioPtr); + +/** + * Create a new VIO pool. + * + * @param [in] layer the physical layer to write to and read from + * @param [in] poolSize the number of VIOs in the pool + * @param [in] threadID the ID of the thread using this pool + * @param [in] vioConstructor the constructor for VIOs in the pool + * @param [in] context the context that each entry will have + * @param [out] poolPtr the resulting pool + * + * @return a success or error code + **/ +int makeVIOPool(PhysicalLayer *layer, + size_t poolSize, + ThreadID threadID, + VIOConstructor *vioConstructor, + void *context, + VIOPool **poolPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a VIO pool + * + * @param poolPtr the pointer holding the pool, which will be nulled out + **/ +void freeVIOPool(VIOPool **poolPtr); + +/** + * Check whether an VIO pool has outstanding entries. + * + * @return true if the pool is busy + **/ +bool isVIOPoolBusy(VIOPool *pool) + __attribute__((warn_unused_result)); + +/** + * Acquire a VIO and buffer from the pool (asynchronous). + * + * @param pool the VIO pool + * @param waiter object that is requesting a VIO + * + * @return VDO_SUCCESS or an error + **/ +int acquireVIOFromPool(VIOPool *pool, Waiter *waiter); + +/** + * Return a VIO and its buffer to the pool. + * + * @param pool the VIO pool + * @param entry a VIO pool entry + **/ +void returnVIOToPool(VIOPool *pool, VIOPoolEntry *entry); + +/** + * Convert a RingNode to the VIOPoolEntry that contains it. + * + * @param node The RingNode to convert + * + * @return The VIOPoolEntry wrapping the RingNode + **/ +static inline VIOPoolEntry *asVIOPoolEntry(RingNode *node) +{ + STATIC_ASSERT(offsetof(VIOPoolEntry, node) == 0); + return (VIOPoolEntry *) node; +} + +/** + * Return the outage count of an VIO pool. + * + * @param pool The pool + * + * @return the number of times an acquisition request had to wait + **/ +uint64_t getVIOPoolOutageCount(VIOPool *pool) + __attribute__((warn_unused_result)); + +#endif // VIO_POOL_H diff --git a/vdo/base/vioRead.c b/vdo/base/vioRead.c new file mode 100644 index 0000000..ab73727 --- /dev/null +++ b/vdo/base/vioRead.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioRead.c#1 $ + */ + +#include "vioRead.h" + +#include "logger.h" + +#include "blockMap.h" +#include "dataVIO.h" +#include "vdoInternal.h" +#include "vioWrite.h" + +/** + * Do the modify-write part of a read-modify-write cycle. This callback is + * registered in readBlock(). + * + * @param completion The DataVIO which has just finished its read + **/ +static void modifyForPartialWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + + if (completion->result != VDO_SUCCESS) { + completeDataVIO(completion); + return; + } + + completion->layer->applyPartialWrite(dataVIO); + VIO *vio = dataVIOAsVIO(dataVIO); + vio->operation = VIO_WRITE | (vio->operation & ~VIO_READ_WRITE_MASK); + dataVIO->isPartialWrite = true; + launchWriteDataVIO(dataVIO); +} + +/** + * Read a block asynchronously. This is the callback registered in + * readBlockMapping(). + * + * @param completion The DataVIO to read + **/ +static void readBlock(VDOCompletion *completion) +{ + if (completion->result != VDO_SUCCESS) { + completeDataVIO(completion); + return; + } + + DataVIO *dataVIO = asDataVIO(completion); + VIO *vio = asVIO(completion); + completion->callback + = (isReadVIO(vio) ? completeDataVIO : modifyForPartialWrite); + + if (dataVIO->mapped.pbn == ZERO_BLOCK) { + completion->layer->zeroDataVIO(dataVIO); + invokeCallback(completion); + return; + } + + vio->physical = dataVIO->mapped.pbn; + dataVIO->lastAsyncOperation = READ_DATA; + completion->layer->readData(dataVIO); +} + +/** + * Read the DataVIO's mapping from the block map. This callback is registered + * in launchReadDataVIO(). + * + * @param completion The DataVIO to be read + **/ +static void readBlockMapping(VDOCompletion *completion) +{ + if (completion->result != VDO_SUCCESS) { + completeDataVIO(completion); + return; + } + + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + setLogicalCallback(dataVIO, readBlock, THIS_LOCATION("$F;cb=readBlock")); + dataVIO->lastAsyncOperation = GET_MAPPED_BLOCK; + getMappedBlockAsync(dataVIO); +} + +/**********************************************************************/ +void launchReadDataVIO(DataVIO *dataVIO) +{ + assertInLogicalZone(dataVIO); + dataVIO->lastAsyncOperation = FIND_BLOCK_MAP_SLOT; + // Go find the block map slot for the LBN mapping. + findBlockMapSlotAsync(dataVIO, readBlockMapping, + getLogicalZoneThreadID(dataVIO->logical.zone)); +} + +/** + * Release the logical block lock which a read DataVIO obtained now that it + * is done. + * + * @param completion The DataVIO + **/ +static void releaseLogicalLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + releaseLogicalBlockLock(dataVIO); + vioDoneCallback(completion); +} + +/** + * Clean up a DataVIO which has finished processing a read. + * + * @param dataVIO The DataVIO to clean up + **/ +void cleanupReadDataVIO(DataVIO *dataVIO) +{ + launchLogicalCallback(dataVIO, releaseLogicalLock, + THIS_LOCATION("$F;cb=releaseLL")); +} diff --git a/vdo/base/vioRead.h b/vdo/base/vioRead.h new file mode 100644 index 0000000..ae2fa37 --- /dev/null +++ b/vdo/base/vioRead.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioRead.h#1 $ + */ + +#ifndef VIO_READ_H +#define VIO_READ_H + +#include "types.h" + +/** + * Start the asynchronous processing of the DataVIO for a read or + * read-modify-write request which has acquired a lock on its logical block. + * The first step is to perform a block map lookup. + * + * @param dataVIO The DataVIO doing the read + **/ +void launchReadDataVIO(DataVIO *dataVIO); + +/** + * Clean up a DataVIO which has finished processing a read. + * + * @param dataVIO The DataVIO to clean up + **/ +void cleanupReadDataVIO(DataVIO *dataVIO); + +#endif /* VIO_READ_H */ diff --git a/vdo/base/vioWrite.c b/vdo/base/vioWrite.c new file mode 100644 index 0000000..ac2bb53 --- /dev/null +++ b/vdo/base/vioWrite.c @@ -0,0 +1,1201 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioWrite.c#9 $ + */ + +/* + * This file contains almost all of the VDO write path, which begins with + * writeExtent(). The progression through the callbacks which make up the + * write path depends upon whether or not the write policy is synchronous or + * asynchronous. The paths would proceed as outlined in the pseudo-code here + * if this were normal, synchronous code without callbacks. Complications + * involved in waiting on locks are not included. 
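+ *
+ * Two walks follow, writeExtentSynchronous() and then
+ * writeExtentAsynchronous(). The essential difference is ordering: in the
+ * synchronous flavor the data block is written and journaled before the write
+ * is acknowledged and before deduplication or compression are attempted,
+ * while in the asynchronous flavor the write is acknowledged early,
+ * deduplication and compression are attempted first, and the data block is
+ * written only if neither of them succeeds.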
+ * + * ###################################################################### + * writeExtentSynchronous(extent) + * { + * foreach (vio in extent) { + * launchWriteVIO() + * # allocateBlockForWrite() + * if (!trim and !zero-block) { + * allocate block + * if (vio is compressed) { + * completeCompressedBlockWrite() + * finishVIO() + * return + * } + * writeBlock() + * } + * finishBlockWrite() + * addJournalEntry() # Increment + * if (vio->newMapped is not ZERO_BLOCK) { + * journalIncrementForWrite() + * } + * acknowledgeWriteCallback() + * readOldBlockMapping() + * journalUnmappingForWrite() + * if (vio->mapped is not ZERO_BLOCK) { + * journalDecrementForWrite() + * } + * updateBlockMapForWrite() + * if (trim || zero-block) { + * finishVIO() + * return + * } + * + * prepareForDedupe() + * hashData() + * resolveHashZone() + * acquireHashLock() + * attemptDedupe() (query albireo) + * if (isDuplicate) { + * verifyAdvice() (read verify) + * if (isDuplicate and canAddReference) { + * shareBlock() + * addJournalEntryForDedupe() + * incrementForDedupe() + * journalUnmappingForDedupe() + * if (vio->mapped is not ZERO_BLOCK) { + * decrementForDedupe() + * } + * updateBlockMapForDedupe() + * finishVIO() + * return + * } + * } + * + * if (not canAddReference) { + * layer->updateAlbireo() + * } + * # compressData() + * if (compressing and not mooted and has no waiters) { + * layer->compressVIO() + * packCompressedData() + * if (compressed) { + * journalCompressedBlocks() + * incrementForDedupe() + * readOldBlockMappingForDedupe() + * journalUnmappingForDedupe() + * if (vio->mapped is not ZERO_BLOCK) { + * decrementForDedupe() + * } + * updateBlockMapForDedupe() + * } + * } + * + * finishVIO() + * } + * } + * + * ###################################################################### + * writeExtentAsynchronous(extent) + * { + * foreach (vio in extent) { + * launchWriteVIO() + * # allocateBlockForWrite() + * if (trim || zero-block) { + * acknowledgeWrite() + * } else { + * allocateAndLockBlock() + * if (vio is compressed) { + * writeBlock() + * completeCompressedBlockWrite() + * finishVIO() + * return + * } + * + * acknowledgeWrite() + * prepareForDedupe() + * hashData() + * resolveHashZone() + * acquireHashLock() + * attemptDedupe() (query albireo) + * if (isDuplicate) { + * verifyAdvice() (read verify) + * if (isDuplicate and canAddReference) { + * shareBlock() + * addJournalEntryForDedupe() + * incrementForDedupe() + * readOldBlockMappingForDedupe() + * journalUnmappingForDedupe() + * if (vio->mapped is not ZERO_BLOCK) { + * decrementForDedupe() + * } + * updateBlockMapForDedupe() + * finishVIO() + * return + * } + * } + * + * if (not canAddReference) { + * layer->updateAlbireo() + * } + * # compressData() + * if (compressing and not mooted and has no waiters) { + * layer->compressVIO() + * packCompressedData() + * if (compressed) { + * journalCompressedBlocks() + * journalIncrementForDedupe() + * readOldBlockMappingForDedupe() + * journalUnmappingForDedupe() + * if (vio->mapped is not ZERO_BLOCK) { + * decrementForDedupe() + * } + * updateBlockMapForDedupe() + * finishVIO() + * return + * } + * } + * + * writeBlock() + * } + * + * finishBlockWrite() + * addJournalEntry() # Increment + * if (vio->newMapped is not ZERO_BLOCK) { + * journalIncrementForWrite() + * } + * readOldBlockMappingForWrite() + * journalUnmappingForWrite() + * if (vio->mapped is not ZERO_BLOCK) { + * journalDecrementForWrite() + * } + * updateBlockMapForWrite() + * finishVIO() + * } + * } + */ + +#include "vioWrite.h" + +#include 
"logger.h" + +#include "allocatingVIO.h" +#include "atomic.h" +#include "blockMap.h" +#include "compressionState.h" +#include "dataVIO.h" +#include "hashLock.h" +#include "recoveryJournal.h" +#include "referenceOperation.h" +#include "slab.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "vdoInternal.h" +#include "vioRead.h" + +/** + * The steps taken cleaning up a VIO, in the order they are performed. + **/ +typedef enum dataVIOCleanupStage { + VIO_CLEANUP_START = 0, + VIO_RELEASE_ALLOCATED = VIO_CLEANUP_START, + VIO_RELEASE_RECOVERY_LOCKS, + VIO_RELEASE_HASH_LOCK, + VIO_RELEASE_LOGICAL, + VIO_CLEANUP_DONE +} DataVIOCleanupStage; + +/** + * Actions to take on error used by abortOnError(). + **/ +typedef enum { + NOT_READ_ONLY, + READ_ONLY_IF_ASYNC, + READ_ONLY, +} ReadOnlyAction; + +// Forward declarations required because of circular function references. +static void performCleanupStage(DataVIO *dataVIO, DataVIOCleanupStage stage); +static void writeBlock(DataVIO *dataVIO); + +/** + * Check whether we are in async mode. + * + * @param dataVIO A DataVIO containing a pointer to the VDO whose write + * policy we want to check + * + * @return true if we are in async mode + **/ +static inline bool isAsync(DataVIO *dataVIO) +{ + return (getWritePolicy(getVDOFromDataVIO(dataVIO)) != WRITE_POLICY_SYNC); +} + +/** + * Release the PBN lock and/or the reference on the allocated block at the + * end of processing a DataVIO. + * + * @param completion The DataVIO + **/ +static void releaseAllocatedLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInAllocatedZone(dataVIO); + releaseAllocationLock(dataVIOAsAllocatingVIO(dataVIO)); + performCleanupStage(dataVIO, VIO_RELEASE_RECOVERY_LOCKS); +} + +/** + * Release the logical block lock and flush generation lock at the end of + * processing a DataVIO. + * + * @param completion The DataVIO + **/ +static void releaseLogicalLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + releaseLogicalBlockLock(dataVIO); + releaseFlushGenerationLock(dataVIO); + performCleanupStage(dataVIO, VIO_CLEANUP_DONE); +} + +/** + * Release the hash lock at the end of processing a DataVIO. + * + * @param completion The DataVIO + **/ +static void cleanHashLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInHashZone(dataVIO); + releaseHashLock(dataVIO); + performCleanupStage(dataVIO, VIO_RELEASE_LOGICAL); +} + +/** + * Make some assertions about a DataVIO which has finished cleaning up + * and do its final callback. + * + * @param dataVIO The DataVIO which has finished cleaning up + **/ +static void finishCleanup(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(dataVIOAsAllocatingVIO(dataVIO)->allocationLock == NULL, + "complete DataVIO has no allocation lock"); + ASSERT_LOG_ONLY(dataVIO->hashLock == NULL, + "complete DataVIO has no hash lock"); + vioDoneCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Perform the next step in the process of cleaning up a DataVIO. 
+ * + * @param dataVIO The DataVIO to clean up + * @param stage The cleanup stage to perform + **/ +static void performCleanupStage(DataVIO *dataVIO, DataVIOCleanupStage stage) +{ + switch (stage) { + case VIO_RELEASE_ALLOCATED: + if (hasAllocation(dataVIO)) { + launchAllocatedZoneCallback(dataVIO, releaseAllocatedLock, + THIS_LOCATION("$F;cb=releaseAllocLock")); + return; + } + // fall through + + case VIO_RELEASE_RECOVERY_LOCKS: + if ((dataVIO->recoverySequenceNumber > 0) + && !isOrWillBeReadOnly(dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier) + && (dataVIOAsCompletion(dataVIO)->result != VDO_READ_ONLY)) { + logWarning("VDO not read-only when cleaning DataVIO with RJ lock"); + } + // fall through + + case VIO_RELEASE_HASH_LOCK: + if (dataVIO->hashLock != NULL) { + launchHashZoneCallback(dataVIO, cleanHashLock, + THIS_LOCATION("$F;cb=cleanHashLock")); + return; + } + // fall through + + case VIO_RELEASE_LOGICAL: + if (!isCompressedWriteDataVIO(dataVIO)) { + launchLogicalCallback(dataVIO, releaseLogicalLock, + THIS_LOCATION("$F;cb=releaseLL")); + return; + } + // fall through + + default: + finishCleanup(dataVIO); + } +} + +/** + * Return a DataVIO that encountered an error to its hash lock so it can + * update the hash lock state accordingly. This continuation is registered in + * abortOnError(), and must be called in the hash zone of the DataVIO. + * + * @param completion The completion of the DataVIO to return to its hash lock + **/ +static void finishWriteDataVIOWithError(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInHashZone(dataVIO); + continueHashLockOnError(dataVIO); +} + +/** + * Check whether a result is an error, and if so abort the DataVIO associated + * with the error. + * + * @param result The result to check + * @param dataVIO The DataVIO + * @param readOnlyAction The conditions under which the VDO should be put + * into read-only mode if the result is an error + * + * @return true if the result is an error + **/ +static bool abortOnError(int result, + DataVIO *dataVIO, + ReadOnlyAction readOnlyAction) +{ + if (result == VDO_SUCCESS) { + return false; + } + + if ((result == VDO_READ_ONLY) + || (readOnlyAction == READ_ONLY) + || ((readOnlyAction == READ_ONLY_IF_ASYNC) && isAsync(dataVIO))) { + ReadOnlyNotifier *notifier = dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier; + if (!isReadOnly(notifier)) { + if (result != VDO_READ_ONLY) { + logErrorWithStringError(result, "Preparing to enter read-only mode:" + " DataVIO for LBN %llu (becoming mapped" + " to %llu, previously mapped" + " to %llu, allocated %llu) is" + " completing with a fatal error after" + " operation %s", dataVIO->logical.lbn, + dataVIO->newMapped.pbn, dataVIO->mapped.pbn, + getDataVIOAllocation(dataVIO), + getOperationName(dataVIO)); + } + + enterReadOnlyMode(notifier, result); + } + } + + if (dataVIO->hashLock != NULL) { + launchHashZoneCallback(dataVIO, finishWriteDataVIOWithError, + THIS_LOCATION(NULL)); + } else { + finishDataVIO(dataVIO, result); + } + return true; +} + +/** + * Return a DataVIO that finished writing, compressing, or deduplicating to + * its hash lock so it can share the result with any DataVIOs waiting in the + * hash lock, or update albireo, or simply release its share of the lock. This + * continuation is registered in updateBlockMapForWrite(), + * updateBlockMapForDedupe(), and abortDeduplication(), and must be called in + * the hash zone of the DataVIO. 
+ * + * @param completion The completion of the DataVIO to return to its hash lock + **/ +static void finishWriteDataVIO(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInHashZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + continueHashLock(dataVIO); +} + +/** + * Abort the data optimization process. + * + * @param dataVIO The DataVIO which does not deduplicate or compress + **/ +static void abortDeduplication(DataVIO *dataVIO) +{ + if (!hasAllocation(dataVIO)) { + // There was no space to write this block and we failed to deduplicate + // or compress it. + finishDataVIO(dataVIO, VDO_NO_SPACE); + return; + } + + if (isAsync(dataVIO)) { + // We failed to deduplicate or compress an async DataVIO, so now we need + // to actually write the data. + writeBlock(dataVIO); + return; + } + + if (dataVIO->hashLock == NULL) { + // We failed to compress a synchronous DataVIO that is a hash collision, + // which means it can't dedpe or be used for dedupe, so it's done now. + finishDataVIO(dataVIO, VDO_SUCCESS); + return; + } + + /* + * This synchronous DataVIO failed to compress and so is finished, but must + * now return to its hash lock so other DataVIOs with the same data can + * deduplicate against the uncompressed block it wrote. + */ + launchHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL)); +} + +/** + * Update the block map now that we've added an entry in the recovery journal + * for a block we have just shared. This is the callback registered in + * decrementForDedupe(). + * + * @param completion The completion of the write in progress + **/ +static void updateBlockMapForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + if (dataVIO->hashLock != NULL) { + setHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL)); + } else { + completion->callback = completeDataVIO; + } + dataVIO->lastAsyncOperation = PUT_MAPPED_BLOCK_FOR_DEDUPE; + putMappedBlockAsync(dataVIO); +} + +/** + * Make a recovery journal increment. + * + * @param dataVIO The DataVIO + * @param lock The PBNLock on the block being incremented + **/ +static void journalIncrement(DataVIO *dataVIO, PBNLock *lock) +{ + setUpReferenceOperationWithLock(DATA_INCREMENT, dataVIO->newMapped.pbn, + dataVIO->newMapped.state, lock, + &dataVIO->operation); + addRecoveryJournalEntry(getVDOFromDataVIO(dataVIO)->recoveryJournal, + dataVIO); +} + +/** + * Make a recovery journal decrement entry. + * + * @param dataVIO The DataVIO + **/ +static void journalDecrement(DataVIO *dataVIO) +{ + setUpReferenceOperationWithZone(DATA_DECREMENT, dataVIO->mapped.pbn, + dataVIO->mapped.state, dataVIO->mapped.zone, + &dataVIO->operation); + addRecoveryJournalEntry(getVDOFromDataVIO(dataVIO)->recoveryJournal, + dataVIO); +} + +/** + * Make a reference count change. + * + * @param dataVIO The DataVIO + **/ +static void updateReferenceCount(DataVIO *dataVIO) +{ + SlabDepot *depot = getVDOFromDataVIO(dataVIO)->depot; + PhysicalBlockNumber pbn = dataVIO->operation.pbn; + int result = ASSERT(isPhysicalDataBlock(depot, pbn), + "Adding slab journal entry for impossible PBN %" PRIu64 + "for LBN %llu", pbn, dataVIO->logical.lbn); + if (abortOnError(result, dataVIO, READ_ONLY)) { + return; + } + + addSlabJournalEntry(getSlabJournal(depot, pbn), dataVIO); +} + +/** + * Do the decref after a successful dedupe or compression. 
This is the callback + * registered by journalUnmappingForDedupe(). + * + * @param completion The completion of the write in progress + **/ +static void decrementForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInMappedZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + AllocatingVIO *allocatingVIO = dataVIOAsAllocatingVIO(dataVIO); + if (allocatingVIO->allocation == dataVIO->mapped.pbn) { + /* + * If we are about to release the reference on the allocated block, + * we must release the PBN lock on it first so that the allocator will + * not allocate a write-locked block. + */ + releaseAllocationLock(allocatingVIO); + } + + setLogicalCallback(dataVIO, updateBlockMapForDedupe, + THIS_LOCATION("$F;js=dec")); + dataVIO->lastAsyncOperation = JOURNAL_DECREMENT_FOR_DEDUPE; + updateReferenceCount(dataVIO); +} + +/** + * Write the appropriate journal entry for removing the mapping of logical to + * mapped, for dedupe or compression. This is the callback registered in + * readOldBlockMappingForDedupe(). + * + * @param completion The completion of the write in progress + **/ +static void journalUnmappingForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + if (dataVIO->mapped.pbn == ZERO_BLOCK) { + setLogicalCallback(dataVIO, updateBlockMapForDedupe, + THIS_LOCATION("$F;j=dedupe;js=unmap;cb=updateBM")); + } else { + setMappedZoneCallback(dataVIO, decrementForDedupe, + THIS_LOCATION("$F;j=dedupe;js=unmap;cb=decDedupe")); + } + dataVIO->lastAsyncOperation = JOURNAL_UNMAPPING_FOR_DEDUPE; + journalDecrement(dataVIO); +} + +/** + * Get the previous PBN mapped to this LBN from the block map, so as to make + * an appropriate journal entry referencing the removal of this LBN->PBN + * mapping, for dedupe or compression. This callback is registered in + * incrementForDedupe(). + * + * @param completion The completion of the write in progress + **/ +static void readOldBlockMappingForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + dataVIO->lastAsyncOperation = GET_MAPPED_BLOCK_FOR_DEDUPE; + setJournalCallback(dataVIO, journalUnmappingForDedupe, + THIS_LOCATION("$F;cb=journalUnmapDedupe")); + getMappedBlockAsync(dataVIO); +} + +/** + * Do the incref after compression. This is the callback registered by + * addRecoveryJournalEntryForCompression(). + * + * @param completion The completion of the write in progress + **/ +static void incrementForCompression(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInNewMappedZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + ASSERT_LOG_ONLY(isCompressed(dataVIO->newMapped.state), + "Impossible attempt to update reference counts for a block " + "which was not compressed (logical block %llu)", + dataVIO->logical.lbn); + + /* + * If we are synchronous and allocated a block, we know the one we + * allocated is the block we need to decrement, so there is no need + * to look in the block map. 
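+ * (In that case dataVIO->mapped was already set to the newly allocated
+ * block by prepareForDedupe(), so journalUnmappingForDedupe() can use it
+ * directly.)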
+ */ + if (isAsync(dataVIO) || !hasAllocation(dataVIO)) { + setLogicalCallback(dataVIO, readOldBlockMappingForDedupe, + THIS_LOCATION("$F;cb=readOldBlockMappingForDedupe")); + } else { + setJournalCallback(dataVIO, journalUnmappingForDedupe, + THIS_LOCATION("$F;cb=journalUnmappingForDedupe")); + } + dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_COMPRESSION; + updateReferenceCount(dataVIO); +} + +/** + * Add a recovery journal entry for the increment resulting from compression. + * + * @param completion The DataVIO which has been compressed + **/ +static void addRecoveryJournalEntryForCompression(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + + if (!isCompressed(dataVIO->newMapped.state)) { + abortDeduplication(dataVIO); + return; + } + + setNewMappedZoneCallback(dataVIO, incrementForCompression, + THIS_LOCATION("$F($dup);js=map/$dup;" + "cb=incCompress($dup)")); + dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_COMPRESSION; + journalIncrement(dataVIO, getDuplicateLock(dataVIO)); +} + +/** + * Attempt to pack the compressed DataVIO into a block. This is the callback + * registered in compressData(). + * + * @param completion The completion of a compressed DataVIO + **/ +static void packCompressedData(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInPackerZone(dataVIO); + + // XXX this is a callback, so there should probably be an error check here + // even if we think compression can't currently return one. + + if (!mayPackDataVIO(dataVIO)) { + abortDeduplication(dataVIO); + return; + } + + setJournalCallback(dataVIO, addRecoveryJournalEntryForCompression, + THIS_LOCATION("$F;cb=update(compress)")); + dataVIO->lastAsyncOperation = PACK_COMPRESSED_BLOCK; + attemptPacking(dataVIO); +} + +/**********************************************************************/ +void compressData(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(!dataVIO->isDuplicate, + "compressing a non-duplicate block"); + if (!mayCompressDataVIO(dataVIO)) { + abortDeduplication(dataVIO); + return; + } + + dataVIO->lastAsyncOperation = COMPRESS_DATA; + setPackerCallback(dataVIO, packCompressedData, THIS_LOCATION("$F;cb=pack")); + dataVIOAsCompletion(dataVIO)->layer->compressDataVIO(dataVIO); +} + +/** + * Do the incref after deduplication. This is the callback registered by + * addRecoveryJournalEntryForDedupe(). + * + * @param completion The completion of the write in progress + **/ +static void incrementForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInNewMappedZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + ASSERT_LOG_ONLY(dataVIO->isDuplicate, + "Impossible attempt to update reference counts for a block " + "which was not a duplicate (logical block %llu)", + dataVIO->logical.lbn); + + /* + * If we are synchronous and allocated a block, we know the one we + * allocated is the block we need to decrement, so there is no need + * to look in the block map. 
+ */ + if (isAsync(dataVIO) || !hasAllocation(dataVIO)) { + setLogicalCallback(dataVIO, readOldBlockMappingForDedupe, + THIS_LOCATION("$F;cb=readOldBlockMappingForDedupe")); + } else { + setJournalCallback(dataVIO, journalUnmappingForDedupe, + THIS_LOCATION("$F;cb=journalUnmappingForDedupe")); + } + dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_DEDUPE; + updateReferenceCount(dataVIO); +} + +/** + * Add a recovery journal entry for the increment resulting from deduplication. + * This callback is registered in shareBlock(). + * + * @param completion The DataVIO which has been deduplicated + **/ +static void addRecoveryJournalEntryForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + + setNewMappedZoneCallback(dataVIO, incrementForDedupe, + THIS_LOCATION("$F($dup);js=map/$dup;" + "cb=incDedupe($dup)")); + dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_DEDUPE; + journalIncrement(dataVIO, getDuplicateLock(dataVIO)); +} + +/** + * Share a block in the block map if it is a duplicate. This is the lock + * callback registered in acquirePBNReadLock(). This is only public so + * test code can compare the function to the current callback in a completion. + * + * @param completion The completion of the write in progress + **/ +void shareBlock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInDuplicateZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + + if (!dataVIO->isDuplicate) { + compressData(dataVIO); + return; + } + + dataVIO->newMapped = dataVIO->duplicate; + launchJournalCallback(dataVIO, addRecoveryJournalEntryForDedupe, + THIS_LOCATION("$F;cb=addJournalEntryDup")); +} + +/** + * Route the DataVIO to the HashZone responsible for the chunk name to acquire + * a hash lock on that name, or join with a existing hash lock managing + * concurrent dedupe for that name. This is the callback registered in + * resolveHashZone(). + * + * @param completion The DataVIO to lock + **/ +static void lockHashInZone(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInHashZone(dataVIO); + // Shouldn't have had any errors since all we did was switch threads. + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + int result = acquireHashLock(dataVIO); + if (abortOnError(result, dataVIO, READ_ONLY)) { + return; + } + + if (dataVIO->hashLock == NULL) { + // It's extremely unlikely, but in the case of a hash collision, the + // DataVIO will not obtain a reference to the lock and cannot deduplicate. + compressData(dataVIO); + return; + } + + enterHashLock(dataVIO); +} + +/** + * Set the hash zone (and flag the chunk name as set) while still on the + * thread that just hashed the data to set the chunk name. This is the + * callback registered by prepareForDedupe(). + * + * @param completion The DataVIO whose chunk name was just generated, as a + * completion + **/ +static void resolveHashZone(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + // We don't care what thread we are on. 
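+ // Selecting the hash zone depends only on the chunk name, so it can be
+ // done on this thread before hopping to the chosen zone's thread to
+ // acquire the hash lock.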
+ if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, "zero blocks should not be hashed"); + + dataVIO->hashZone + = selectHashZone(getVDOFromDataVIO(dataVIO), &dataVIO->chunkName); + dataVIO->lastAsyncOperation = ACQUIRE_HASH_LOCK; + launchHashZoneCallback(dataVIO, lockHashInZone, THIS_LOCATION(NULL)); +} + +/** + * Prepare for the dedupe path after a synchronous write or an asynchronous + * allocation. This callback is registered in updateBlockMapForWrite() for + * sync, and continueWriteAfterAllocation() (via acknowledgeWrite()) for + * async. It is also called directly from the latter when allocation fails. + * + * @param completion The completion of the write in progress + **/ +static void prepareForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + // We don't care what thread we are on + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + if (!isAsync(dataVIO)) { + // Remember which block we wrote so we will decrement the reference to it + // if we deduplicate. This avoids having to look it up in the block map. + dataVIO->mapped = dataVIO->newMapped; + } + + ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, + "must not prepare to dedupe zero blocks"); + + // Before we can dedupe, we need to know the chunk name, so the first step + // is to hash the block data. + dataVIO->lastAsyncOperation = HASH_DATA; + // XXX this is the wrong thread to run this callback, but we don't yet have + // a mechanism for running it on the CPU thread immediately after hashing. + setAllocatedZoneCallback(dataVIO, resolveHashZone, THIS_LOCATION(NULL)); + completion->layer->hashData(dataVIO); +} + +/** + * Update the block map after a data write (or directly for a ZERO_BLOCK write + * or trim). This callback is registered in decrementForWrite() and + * journalUnmappingForWrite(). + * + * @param completion The completion of the write in progress + **/ +static void updateBlockMapForWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + if (dataVIO->isZeroBlock || isTrimDataVIO(dataVIO)) { + completion->callback = completeDataVIO; + } else if (!isAsync(dataVIO)) { + // Synchronous DataVIOs branch off to the hash/dedupe path after finishing + // the uncompressed write of their data. + completion->callback = prepareForDedupe; + } else if (dataVIO->hashLock != NULL) { + // Async writes will be finished, but must return to the hash lock to + // allow other DataVIOs with the same data to dedupe against the write. + setHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL)); + } else { + // Async writes without a hash lock (hash collisions) will be finished. + completion->callback = completeDataVIO; + } + + dataVIO->lastAsyncOperation = PUT_MAPPED_BLOCK; + putMappedBlockAsync(dataVIO); +} + +/** + * Do the decref after a successful block write. This is the callback + * by journalUnmappingForWrite() if the old mapping was not the zero block. 
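+ * (It is set as the mapped-zone callback there when dataVIO->mapped.pbn is
+ * not ZERO_BLOCK.)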
+ * + * @param completion The completion of the write in progress + **/ +static void decrementForWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInMappedZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + dataVIO->lastAsyncOperation = JOURNAL_DECREMENT_FOR_WRITE; + setLogicalCallback(dataVIO, updateBlockMapForWrite, THIS_LOCATION(NULL)); + updateReferenceCount(dataVIO); +} + +/** + * Write the appropriate journal entry for unmapping logical to mapped for a + * write. This is the callback registered in readOldBlockMappingForWrite(). + * + * @param completion The completion of the write in progress + **/ +static void journalUnmappingForWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + if (dataVIO->mapped.pbn == ZERO_BLOCK) { + setLogicalCallback(dataVIO, updateBlockMapForWrite, + THIS_LOCATION("$F;js=unmap;cb=updateBMwrite")); + } else { + setMappedZoneCallback(dataVIO, decrementForWrite, + THIS_LOCATION("$F;js=unmap;cb=decWrite")); + } + dataVIO->lastAsyncOperation = JOURNAL_UNMAPPING_FOR_WRITE; + journalDecrement(dataVIO); +} + +/** + * Get the previous PBN mapped to this LBN from the block map for a write, so + * as to make an appropriate journal entry referencing the removal of this + * LBN->PBN mapping. This callback is registered in finishBlockWrite() in the + * async path, and is registered in acknowledgeWrite() in the sync path. + * + * @param completion The completion of the write in progress + **/ +static void readOldBlockMappingForWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + setJournalCallback(dataVIO, journalUnmappingForWrite, + THIS_LOCATION("$F;cb=journalUnmapWrite")); + dataVIO->lastAsyncOperation = GET_MAPPED_BLOCK_FOR_WRITE; + getMappedBlockAsync(dataVIO); +} + +/** + * Acknowledge a write to the requestor. + * + * @param dataVIO The DataVIO being acknowledged + **/ +static void acknowledgeWrite(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(dataVIO->hasFlushGenerationLock, + "write VIO to be acknowledged has a flush generation lock"); + dataVIO->lastAsyncOperation = ACKNOWLEDGE_WRITE; + dataVIOAsCompletion(dataVIO)->layer->acknowledgeDataVIO(dataVIO); +} + +/** + * Acknowledge a write now that we have made an entry in the recovery + * journal. This is the callback registered in finishBlockWrite() in + * synchronous mode. + * + * @param completion The completion of the write in progress + **/ +static void acknowledgeWriteCallback(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + setLogicalCallback(dataVIO, readOldBlockMappingForWrite, + THIS_LOCATION(NULL)); + acknowledgeWrite(dataVIO); +} + +/**********************************************************************/ +static VDOAction *getWriteIncrementCallback(DataVIO *dataVIO) +{ + return (isAsync(dataVIO) + ? readOldBlockMappingForWrite : acknowledgeWriteCallback); +} + +/** + * Do the incref after a successful block write. This is the callback + * registered by finishBlockWrite(). 
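+ * Before the reference count is updated, the PBN write lock on the newly
+ * written block is downgraded to a read lock so that other DataVIOs may
+ * begin deduplicating against it.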
+ * + * @param completion The completion of the write in progress + **/ +static void incrementForWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInAllocatedZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + + /* + * Now that the data has been written, it's safe to deduplicate against the + * block. Downgrade the allocation lock to a read lock so it can be used + * later by the hash lock (which we don't have yet in sync mode). + */ + downgradePBNWriteLock(dataVIOAsAllocatingVIO(dataVIO)->allocationLock); + + dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_WRITE; + setLogicalCallback(dataVIO, getWriteIncrementCallback(dataVIO), + THIS_LOCATION(NULL)); + updateReferenceCount(dataVIO); +} + +/** + * Add an entry in the recovery journal after a successful block write. This is + * the callback registered by writeBlock(). It is also registered in + * allocateBlockForWrite(). + * + * @param completion The completion of the write in progress + **/ +static void finishBlockWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + + if (dataVIO->newMapped.pbn == ZERO_BLOCK) { + setLogicalCallback(dataVIO, getWriteIncrementCallback(dataVIO), + THIS_LOCATION("$F;js=writeZero")); + } else { + setAllocatedZoneCallback(dataVIO, incrementForWrite, + THIS_LOCATION("$F;js=mapWrite")); + } + dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_WRITE; + journalIncrement(dataVIO, dataVIOAsAllocatingVIO(dataVIO)->allocationLock); +} + +/** + * Write data to the underlying storage. + * + * @param dataVIO The DataVIO to write + **/ +static void writeBlock(DataVIO *dataVIO) +{ + dataVIO->lastAsyncOperation = WRITE_DATA; + setJournalCallback(dataVIO, finishBlockWrite, + THIS_LOCATION("$F(data);cb=finishWrite")); + dataVIOAsCompletion(dataVIO)->layer->writeData(dataVIO); +} + +/** + * Continue the write path for a DataVIO now that block allocation is complete + * (the DataVIO may or may not have actually received an allocation). This + * callback is registered in continueWriteWithBlockMapSlot(). + * + * @param allocatingVIO The DataVIO which has finished the allocation process + * (as an AllocatingVIO) + **/ +static void continueWriteAfterAllocation(AllocatingVIO *allocatingVIO) +{ + DataVIO *dataVIO = allocatingVIOAsDataVIO(allocatingVIO); + if (abortOnError(dataVIOAsCompletion(dataVIO)->result, dataVIO, + NOT_READ_ONLY)) { + return; + } + + if (!hasAllocation(dataVIO)) { + prepareForDedupe(dataVIOAsCompletion(dataVIO)); + return; + } + + atomicStoreBool(&dataVIO->hasAllocation, true); + dataVIO->newMapped = (ZonedPBN) { + .zone = allocatingVIO->zone, + .pbn = allocatingVIO->allocation, + .state = MAPPING_STATE_UNCOMPRESSED, + }; + + if (!isAsync(dataVIO)) { + writeBlock(dataVIO); + return; + } + + // XXX prepareForDedupe can run from any thread, so this is a place where + // running the callback on the kernel thread would save a thread switch. + setAllocatedZoneCallback(dataVIO, prepareForDedupe, THIS_LOCATION(NULL)); + if (vioRequiresFlushAfter(allocatingVIOAsVIO(allocatingVIO))) { + invokeCallback(dataVIOAsCompletion(dataVIO)); + return; + } + + acknowledgeWrite(dataVIO); +} + +/** + * Continue the write path for a VIO now that block map slot resolution is + * complete. This callback is registered in launchWriteDataVIO(). 
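+ * Roughly, the three cases handled here are:
+ *
+ *   no block map page allocated -> must be a trim; finish immediately
+ *   zero block or trim          -> journal a mapping to ZERO_BLOCK
+ *   anything else               -> allocate a data block and keep writing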
+ * + * @param completion The DataVIO to write + **/ +static void continueWriteWithBlockMapSlot(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + // We don't care what thread we're on. + if (abortOnError(completion->result, dataVIO, NOT_READ_ONLY)) { + return; + } + + if (dataVIO->treeLock.treeSlots[0].blockMapSlot.pbn == ZERO_BLOCK) { + int result = ASSERT(isTrimDataVIO(dataVIO), + "dataVIO with no block map page is a trim"); + if (abortOnError(result, dataVIO, READ_ONLY)) { + return; + } + + // This is a trim for a block on a block map page which has not been + // allocated, so there's nothing more we need to do. + finishDataVIO(dataVIO, VDO_SUCCESS); + return; + } + + if (dataVIO->isZeroBlock || isTrimDataVIO(dataVIO)) { + // We don't need to write any data, so skip allocation and just update + // the block map and reference counts (via the journal). + dataVIO->newMapped.pbn = ZERO_BLOCK; + launchJournalCallback(dataVIO, finishBlockWrite, + THIS_LOCATION("$F;cb=finishWrite")); + return; + } + + allocateDataBlock(dataVIOAsAllocatingVIO(dataVIO), + getAllocationSelector(dataVIO->logical.zone), + VIO_WRITE_LOCK, continueWriteAfterAllocation); +} + +/**********************************************************************/ +void launchWriteDataVIO(DataVIO *dataVIO) +{ + if (isReadOnly(dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier)) { + finishDataVIO(dataVIO, VDO_READ_ONLY); + return; + } + + // Write requests join the current flush generation. + int result = acquireFlushGenerationLock(dataVIO); + if (abortOnError(result, dataVIO, NOT_READ_ONLY)) { + return; + } + + // Go find the block map slot for the LBN mapping. + dataVIO->lastAsyncOperation = FIND_BLOCK_MAP_SLOT; + findBlockMapSlotAsync(dataVIO, continueWriteWithBlockMapSlot, + getLogicalZoneThreadID(dataVIO->logical.zone)); +} + +/**********************************************************************/ +void cleanupWriteDataVIO(DataVIO *dataVIO) +{ + performCleanupStage(dataVIO, VIO_CLEANUP_START); +} diff --git a/vdo/base/vioWrite.h b/vdo/base/vioWrite.h new file mode 100644 index 0000000..6effc91 --- /dev/null +++ b/vdo/base/vioWrite.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioWrite.h#1 $ + */ + +#ifndef VIO_WRITE_H +#define VIO_WRITE_H + +#include "types.h" + +/** + * Release the PBN read lock if it is held. + * + * @param dataVIO The possible lock holder + **/ +void releasePBNReadLock(DataVIO *dataVIO); + +/** + * Start the asynchronous processing of a DataVIO for a write request which has + * acquired a lock on its logical block by joining the current flush generation + * and then attempting to allocate a physical block. 
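+ * If the VDO is already in read-only mode, the DataVIO is finished
+ * immediately with VDO_READ_ONLY instead.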
+ * + * @param dataVIO The DataVIO doing the write + **/ +void launchWriteDataVIO(DataVIO *dataVIO); + +/** + * Clean up a DataVIO which has finished processing a write. + * + * @param dataVIO The DataVIO to clean up + **/ +void cleanupWriteDataVIO(DataVIO *dataVIO); + +/** + * Continue a write by attempting to compress the data. This is a re-entry + * point to vioWrite used by hash locks. + * + * @param dataVIO The DataVIO to be compressed + **/ +void compressData(DataVIO *dataVIO); + +#endif /* VIO_WRITE_H */ diff --git a/vdo/base/volumeGeometry.c b/vdo/base/volumeGeometry.c new file mode 100644 index 0000000..32b2e5f --- /dev/null +++ b/vdo/base/volumeGeometry.c @@ -0,0 +1,564 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/volumeGeometry.c#10 $ + */ + +#include "volumeGeometry.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "permassert.h" + +#include "constants.h" +#include "header.h" +#include "physicalLayer.h" +#include "releaseVersions.h" +#include "statusCodes.h" +#include "types.h" + +enum { + GEOMETRY_BLOCK_LOCATION = 0, + MAGIC_NUMBER_SIZE = 8, +}; + +typedef struct { + char magicNumber[MAGIC_NUMBER_SIZE]; + Header header; + VolumeGeometry geometry; + CRC32Checksum checksum; +} __attribute__((packed)) GeometryBlock; + +static const Header GEOMETRY_BLOCK_HEADER_4_0 = { + .id = GEOMETRY_BLOCK, + .version = { + .majorVersion = 4, + .minorVersion = 0, + }, + // Note: this size isn't just the payload size following the header, like it + // is everywhere else in VDO. + .size = sizeof(GeometryBlock), +}; + +static const byte MAGIC_NUMBER[MAGIC_NUMBER_SIZE + 1] = "dmvdo001"; + +static const ReleaseVersionNumber COMPATIBLE_RELEASE_VERSIONS[] = { + MAGNESIUM_RELEASE_VERSION_NUMBER, +}; + +/** + * Determine whether the supplied release version can be understood by + * the VDO code. + * + * @param version The release version number to check + * + * @return True if the given version can be loaded. + **/ +static inline bool isLoadableReleaseVersion(ReleaseVersionNumber version) +{ + if (version == CURRENT_RELEASE_VERSION_NUMBER) { + return true; + } + + for (unsigned int i = 0; i < COUNT_OF(COMPATIBLE_RELEASE_VERSIONS); i++) { + if (version == COMPATIBLE_RELEASE_VERSIONS[i]) { + return true; + } + } + + return false; +} + +/** + * Decode the on-disk representation of an index configuration from a buffer. 
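+ * The encoded layout, as implied by the decode calls below, is:
+ *
+ *   mem                  uint32, little-endian
+ *   checkpointFrequency  uint32, little-endian
+ *   sparse               boolean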
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param config The structure to receive the decoded fields + * + * @return UDS_SUCCESS or an error + **/ +static int decodeIndexConfig(Buffer *buffer, IndexConfig *config) +{ + uint32_t mem; + int result = getUInt32LEFromBuffer(buffer, &mem); + if (result != VDO_SUCCESS) { + return result; + } + + uint32_t checkpointFrequency; + result = getUInt32LEFromBuffer(buffer, &checkpointFrequency); + if (result != VDO_SUCCESS) { + return result; + } + + bool sparse; + result = getBoolean(buffer, &sparse); + if (result != VDO_SUCCESS) { + return result; + } + + *config = (IndexConfig) { + .mem = mem, + .checkpointFrequency = checkpointFrequency, + .sparse = sparse, + }; + return VDO_SUCCESS; +} + +/** + * Encode the on-disk representation of an index configuration into a buffer. + * + * @param config The index configuration to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error + **/ +static int encodeIndexConfig(const IndexConfig *config, Buffer *buffer) +{ + int result = putUInt32LEIntoBuffer(buffer, config->mem); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt32LEIntoBuffer(buffer, config->checkpointFrequency); + if (result != VDO_SUCCESS) { + return result; + } + + return putBoolean(buffer, config->sparse); +} + +/** + * Decode the on-disk representation of a volume region from a buffer. + * + * @param buffer A buffer positioned at the start of the encoding + * @param region The structure to receive the decoded fields + * + * @return UDS_SUCCESS or an error + **/ +static int decodeVolumeRegion(Buffer *buffer, VolumeRegion *region) +{ + VolumeRegionID id; + int result = getUInt32LEFromBuffer(buffer, &id); + if (result != VDO_SUCCESS) { + return result; + } + + PhysicalBlockNumber startBlock; + result = getUInt64LEFromBuffer(buffer, &startBlock); + if (result != VDO_SUCCESS) { + return result; + } + + *region = (VolumeRegion) { + .id = id, + .startBlock = startBlock, + }; + return VDO_SUCCESS; +} + +/** + * Encode the on-disk representation of a volume region into a buffer. + * + * @param region The region to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error + **/ +static int encodeVolumeRegion(const VolumeRegion *region, Buffer *buffer) +{ + int result = putUInt32LEIntoBuffer(buffer, region->id); + if (result != VDO_SUCCESS) { + return result; + } + + return putUInt64LEIntoBuffer(buffer, region->startBlock); +} + +/** + * Decode the on-disk representation of a volume geometry from a buffer. 
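+ * The fields are decoded in order: release version (uint32 LE), nonce
+ * (uint64 LE), UUID (16 bytes), one VolumeRegion per region ID, and
+ * finally the index configuration.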
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param geometry The structure to receive the decoded fields + * + * @return UDS_SUCCESS or an error + **/ +static int decodeVolumeGeometry(Buffer *buffer, VolumeGeometry *geometry) +{ + ReleaseVersionNumber releaseVersion; + int result = getUInt32LEFromBuffer(buffer, &releaseVersion); + if (result != VDO_SUCCESS) { + return result; + } + + Nonce nonce; + result = getUInt64LEFromBuffer(buffer, &nonce); + if (result != VDO_SUCCESS) { + return result; + } + + geometry->releaseVersion = releaseVersion; + geometry->nonce = nonce; + + result = getBytesFromBuffer(buffer, sizeof(UUID), geometry->uuid); + if (result != VDO_SUCCESS) { + return result; + } + + for (VolumeRegionID id = 0; id < VOLUME_REGION_COUNT; id++) { + result = decodeVolumeRegion(buffer, &geometry->regions[id]); + if (result != VDO_SUCCESS) { + return result; + } + } + + return decodeIndexConfig(buffer, &geometry->indexConfig); +} + +/** + * Encode the on-disk representation of a volume geometry into a buffer. + * + * @param geometry The geometry to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error + **/ +static int encodeVolumeGeometry(const VolumeGeometry *geometry, Buffer *buffer) +{ + int result = putUInt32LEIntoBuffer(buffer, geometry->releaseVersion); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, geometry->nonce); + if (result != VDO_SUCCESS) { + return result; + } + + result = putBytes(buffer, sizeof(UUID), geometry->uuid); + if (result != VDO_SUCCESS) { + return result; + } + + for (VolumeRegionID id = 0; id < VOLUME_REGION_COUNT; id++) { + result = encodeVolumeRegion(&geometry->regions[id], buffer); + if (result != VDO_SUCCESS) { + return result; + } + } + + return encodeIndexConfig(&geometry->indexConfig, buffer); +} + +/** + * Decode the on-disk representation of a geometry block, up to but not + * including the checksum, from a buffer. + * + * @param buffer A buffer positioned at the start of the block + * @param geometry The structure to receive the decoded volume geometry fields + * + * @return UDS_SUCCESS or an error + **/ +static int decodeGeometryBlock(Buffer *buffer, VolumeGeometry *geometry) +{ + if (!hasSameBytes(buffer, MAGIC_NUMBER, MAGIC_NUMBER_SIZE)) { + return VDO_BAD_MAGIC; + } + + int result = skipForward(buffer, MAGIC_NUMBER_SIZE); + if (result != VDO_SUCCESS) { + return result; + } + + Header header; + result = decodeHeader(buffer, &header); + if (result != VDO_SUCCESS) { + return result; + } + + result = validateHeader(&GEOMETRY_BLOCK_HEADER_4_0, &header, true, __func__); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeVolumeGeometry(buffer, geometry); + if (result != VDO_SUCCESS) { + return result; + } + + // Leave the CRC for the caller to decode and verify. + return ASSERT(header.size + == (uncompactedAmount(buffer) + sizeof(CRC32Checksum)), + "should have decoded up to the geometry checksum"); +} + +/** + * Encode the on-disk representation of a geometry block, up to but not + * including the checksum, into a buffer. 
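+ * The resulting layout is the magic number ("dmvdo001"), the 4.0 geometry
+ * block header, the encoded volume geometry, and then a CRC-32 checksum
+ * which the caller computes over everything preceding it.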
+ * + * @param geometry The volume geometry to encode into the block + * @param buffer A buffer positioned at the start of the block + * + * @return UDS_SUCCESS or an error + **/ +static int encodeGeometryBlock(const VolumeGeometry *geometry, Buffer *buffer) +{ + int result = putBytes(buffer, MAGIC_NUMBER_SIZE, MAGIC_NUMBER); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeHeader(&GEOMETRY_BLOCK_HEADER_4_0, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeVolumeGeometry(geometry, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + // Leave the CRC for the caller to compute and encode. + return ASSERT(GEOMETRY_BLOCK_HEADER_4_0.size + == (contentLength(buffer) + sizeof(CRC32Checksum)), + "should have decoded up to the geometry checksum"); +} + +/** + * Allocate a block-size buffer to read the geometry from the physical layer, + * read the block, and return the buffer. + * + * @param [in] layer The physical layer containing the block to read + * @param [out] blockPtr A pointer to receive the allocated buffer + * + * @return VDO_SUCCESS or an error code + **/ +static int readGeometryBlock(PhysicalLayer *layer, byte **blockPtr) +{ + int result = ASSERT(layer->reader != NULL, "Layer must have a sync reader"); + if (result != VDO_SUCCESS) { + return result; + } + + char *block; + result = layer->allocateIOBuffer(layer, VDO_BLOCK_SIZE, "geometry block", + &block); + if (result != VDO_SUCCESS) { + return result; + } + + result = layer->reader(layer, GEOMETRY_BLOCK_LOCATION, 1, block, NULL); + if (result != VDO_SUCCESS) { + FREE(block); + return result; + } + + *blockPtr = (byte *) block; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int loadVolumeGeometry(PhysicalLayer *layer, VolumeGeometry *geometry) +{ + byte *block; + int result = readGeometryBlock(layer, &block); + if (result != VDO_SUCCESS) { + return result; + } + + Buffer *buffer; + result = wrapBuffer(block, VDO_BLOCK_SIZE, VDO_BLOCK_SIZE, &buffer); + if (result != VDO_SUCCESS) { + FREE(block); + return result; + } + + result = decodeGeometryBlock(buffer, geometry); + if (result != VDO_SUCCESS) { + freeBuffer(&buffer); + FREE(block); + return result; + } + + // Checksum everything decoded so far. + CRC32Checksum checksum = layer->updateCRC32(INITIAL_CHECKSUM, block, + uncompactedAmount(buffer)); + CRC32Checksum savedChecksum; + result = getUInt32LEFromBuffer(buffer, &savedChecksum); + if (result != VDO_SUCCESS) { + freeBuffer(&buffer); + FREE(block); + return result; + } + + // Finished all decoding. Everything that follows is validation code. + freeBuffer(&buffer); + FREE(block); + + if (!isLoadableReleaseVersion(geometry->releaseVersion)) { + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "release version %d cannot be loaded", + geometry->releaseVersion); + } + + return ((checksum == savedChecksum) ? 
VDO_SUCCESS : VDO_CHECKSUM_MISMATCH); +} + +/************************************************************************/ +int computeIndexBlocks(IndexConfig *indexConfig, BlockCount *indexBlocksPtr) +{ + UdsConfiguration udsConfiguration = NULL; + int result = indexConfigToUdsConfiguration(indexConfig, &udsConfiguration); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "error creating index config"); + } + + uint64_t indexBytes; + result = udsComputeIndexSize(udsConfiguration, 0, &indexBytes); + udsFreeConfiguration(udsConfiguration); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "error computing index size"); + } + + BlockCount indexBlocks = indexBytes / VDO_BLOCK_SIZE; + if ((((uint64_t) indexBlocks) * VDO_BLOCK_SIZE) != indexBytes) { + return logErrorWithStringError(VDO_PARAMETER_MISMATCH, "index size must be" + " a multiple of block size %d", + VDO_BLOCK_SIZE); + } + + *indexBlocksPtr = indexBlocks; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int initializeVolumeGeometry(Nonce nonce, + UUID uuid, + IndexConfig *indexConfig, + VolumeGeometry *geometry) +{ + BlockCount indexSize = 0; + if (indexConfig != NULL) { + int result = computeIndexBlocks(indexConfig, &indexSize); + if (result != VDO_SUCCESS) { + return result; + } + } + + *geometry = (VolumeGeometry) { + .releaseVersion = CURRENT_RELEASE_VERSION_NUMBER, + .nonce = nonce, + .regions = { + [INDEX_REGION] = { + .id = INDEX_REGION, + .startBlock = 1, + }, + [DATA_REGION] = { + .id = DATA_REGION, + .startBlock = 1 + indexSize, + } + } + }; + memcpy(geometry->uuid, uuid, sizeof(UUID)); + if (indexSize > 0) { + memcpy(&geometry->indexConfig, indexConfig, sizeof(IndexConfig)); + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int clearVolumeGeometry(PhysicalLayer *layer) +{ + char *block; + int result = layer->allocateIOBuffer(layer, VDO_BLOCK_SIZE, "geometry block", + &block); + if (result != VDO_SUCCESS) { + return result; + } + + result = layer->writer(layer, GEOMETRY_BLOCK_LOCATION, 1, block, NULL); + FREE(block); + return result; +} + +/**********************************************************************/ +int writeVolumeGeometry(PhysicalLayer *layer, VolumeGeometry *geometry) +{ + char *block; + int result = layer->allocateIOBuffer(layer, VDO_BLOCK_SIZE, "geometry block", + &block); + if (result != VDO_SUCCESS) { + return result; + } + + Buffer *buffer; + result = wrapBuffer((byte *) block, VDO_BLOCK_SIZE, 0, &buffer); + if (result != VDO_SUCCESS) { + FREE(block); + return result; + } + + result = encodeGeometryBlock(geometry, buffer); + if (result != VDO_SUCCESS) { + freeBuffer(&buffer); + FREE(block); + return result; + } + + // Checksum everything encoded so far and then encode the checksum. + CRC32Checksum checksum = layer->updateCRC32(INITIAL_CHECKSUM, (byte *) block, + contentLength(buffer)); + result = putUInt32LEIntoBuffer(buffer, checksum); + if (result != VDO_SUCCESS) { + freeBuffer(&buffer); + FREE(block); + return result; + } + + // Write it. 
+ result = layer->writer(layer, GEOMETRY_BLOCK_LOCATION, 1, block, NULL); + freeBuffer(&buffer); + FREE(block); + return result; +} + +/************************************************************************/ +int indexConfigToUdsConfiguration(IndexConfig *indexConfig, + UdsConfiguration *udsConfigPtr) +{ + UdsConfiguration udsConfiguration; + int result = udsInitializeConfiguration(&udsConfiguration, indexConfig->mem); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "error initializing configuration"); + } + + udsConfigurationSetSparse(udsConfiguration, indexConfig->sparse); + + *udsConfigPtr = udsConfiguration; + return VDO_SUCCESS; +} + +/************************************************************************/ +void indexConfigToUdsParameters(IndexConfig *indexConfig, + struct uds_parameters *userParams) +{ + userParams->checkpoint_frequency = indexConfig->checkpointFrequency; +} diff --git a/vdo/base/volumeGeometry.h b/vdo/base/volumeGeometry.h new file mode 100644 index 0000000..c06cdde --- /dev/null +++ b/vdo/base/volumeGeometry.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/volumeGeometry.h#5 $ + */ + +#ifndef VOLUME_GEOMETRY_H +#define VOLUME_GEOMETRY_H + +#include "uds.h" + +#include "types.h" + +struct indexConfig { + uint32_t mem; + uint32_t checkpointFrequency; + bool sparse; +} __attribute__((packed)); + +typedef enum { + INDEX_REGION = 0, + DATA_REGION = 1, + VOLUME_REGION_COUNT, +} VolumeRegionID; + +typedef struct { + /** The ID of the region */ + VolumeRegionID id; + /** + * The absolute starting offset on the device. The region continues until + * the next region begins. + */ + PhysicalBlockNumber startBlock; +} __attribute__((packed)) VolumeRegion; + +/** A binary UUID is 16 bytes. */ +typedef unsigned char UUID[16]; + +typedef struct { + /** The release version number of this volume */ + ReleaseVersionNumber releaseVersion; + /** The nonce of this volume */ + Nonce nonce; + /** The UUID of this volume */ + UUID uuid; + /** The regions in ID order */ + VolumeRegion regions[VOLUME_REGION_COUNT]; + /** The index config */ + IndexConfig indexConfig; +} __attribute__((packed)) VolumeGeometry; + +/** + * Get the start of the index region from a geometry. + * + * @param geometry The geometry + * + * @return The start of the index region + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber getIndexRegionOffset(VolumeGeometry geometry) +{ + return geometry.regions[INDEX_REGION].startBlock; +} + +/** + * Get the start of the data region from a geometry. 
+ * + * @param geometry The geometry + * + * @return The start of the data region + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber getDataRegionOffset(VolumeGeometry geometry) +{ + return geometry.regions[DATA_REGION].startBlock; +} + +/** + * Get the size of the index region from a geometry. + * + * @param geometry The geometry + * + * @return the size of the index region + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber getIndexRegionSize(VolumeGeometry geometry) +{ + return getDataRegionOffset(geometry) - getIndexRegionOffset(geometry); +} + +/** + * Read the volume geometry from a layer. + * + * @param layer The layer to read and parse the geometry from + * @param geometry The geometry to be loaded + **/ +int loadVolumeGeometry(PhysicalLayer *layer, VolumeGeometry *geometry) +__attribute__((warn_unused_result)); + +/** + * Initialize a VolumeGeometry for a VDO. + * + * @param nonce The nonce for the VDO + * @param uuid The uuid for the VDO + * @param indexConfig The index config of the VDO. + * @param geometry The geometry being initialized + * + * @return VDO_SUCCESS or an error + **/ +int initializeVolumeGeometry(Nonce nonce, + UUID uuid, + IndexConfig *indexConfig, + VolumeGeometry *geometry) + __attribute__((warn_unused_result)); + +/** + * Zero out the geometry on a layer. + * + * @param layer The layer to clear + * + * @return VDO_SUCCESS or an error + **/ +int clearVolumeGeometry(PhysicalLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Write a geometry block for a VDO. + * + * @param layer The layer on which to write. + * @param geometry The VolumeGeometry to be written + * + * @return VDO_SUCCESS or an error + **/ +int writeVolumeGeometry(PhysicalLayer *layer, VolumeGeometry *geometry) +__attribute__((warn_unused_result)); + +/** + * Convert a index config to a UDS configuration, which can be used by UDS. + * + * @param [in] indexConfig The index config to convert + * @param [out] udsConfigPtr A pointer to return the UDS configuration + * + * @return VDO_SUCCESS or an error + **/ +int indexConfigToUdsConfiguration(IndexConfig *indexConfig, + UdsConfiguration *udsConfigPtr) +__attribute__((warn_unused_result)); + +/** + * Modify the uds_parameters to match the requested index config. + * + * @param indexConfig The index config to convert + * @param userParams The uds_parameters to modify + **/ +void indexConfigToUdsParameters(IndexConfig *indexConfig, + struct uds_parameters *userParams); + +/** + * Compute the index size in blocks from the IndexConfig. + * + * @param [in] indexConfig The index config + * @param [out] indexBlocksPtr A pointer to return the index size in blocks + * + * @return VDO_SUCCESS or an error + **/ +int computeIndexBlocks(IndexConfig *indexConfig, BlockCount *indexBlocksPtr) +__attribute__((warn_unused_result)); + +/** + * Set load config fields from a volume geometry. + * + * @param [in] geometry The geometry to use + * @param [out] loadConfig The load config to set + **/ +static inline void setLoadConfigFromGeometry(VolumeGeometry *geometry, + VDOLoadConfig *loadConfig) +{ + loadConfig->firstBlockOffset = getDataRegionOffset(*geometry); + loadConfig->releaseVersion = geometry->releaseVersion; + loadConfig->nonce = geometry->nonce; +} + +#endif // VOLUME_GEOMETRY_H diff --git a/vdo/base/waitQueue.c b/vdo/base/waitQueue.c new file mode 100644 index 0000000..3d7f175 --- /dev/null +++ b/vdo/base/waitQueue.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/waitQueue.c#1 $ + */ + +#include "waitQueue.h" + +#include "permassert.h" + +#include "statusCodes.h" + +/**********************************************************************/ +int enqueueWaiter(WaitQueue *queue, Waiter *waiter) +{ + int result = ASSERT((waiter->nextWaiter == NULL), + "new waiter must not already be in a waiter queue"); + if (result != VDO_SUCCESS) { + return result; + } + + if (queue->lastWaiter == NULL) { + // The queue is empty, so form the initial circular list by self-linking + // the initial waiter. + waiter->nextWaiter = waiter; + } else { + // Splice the new waiter in at the end of the queue. + waiter->nextWaiter = queue->lastWaiter->nextWaiter; + queue->lastWaiter->nextWaiter = waiter; + } + // In both cases, the waiter we added to the ring becomes the last waiter. + queue->lastWaiter = waiter; + queue->queueLength += 1; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void transferAllWaiters(WaitQueue *fromQueue, WaitQueue *toQueue) +{ + // If the source queue is empty, there's nothing to do. + if (!hasWaiters(fromQueue)) { + return; + } + + if (hasWaiters(toQueue)) { + // Both queues are non-empty. Splice the two circular lists together by + // swapping the next (head) pointers in the list tails. + Waiter *fromHead = fromQueue->lastWaiter->nextWaiter; + Waiter *toHead = toQueue->lastWaiter->nextWaiter; + toQueue->lastWaiter->nextWaiter = fromHead; + fromQueue->lastWaiter->nextWaiter = toHead; + } + + toQueue->lastWaiter = fromQueue->lastWaiter; + toQueue->queueLength += fromQueue->queueLength; + initializeWaitQueue(fromQueue); +} + +/**********************************************************************/ +void notifyAllWaiters(WaitQueue *queue, + WaiterCallback *callback, + void *context) +{ + // Copy and empty the queue first, avoiding the possibility of an infinite + // loop if entries are returned to the queue by the callback function. + WaitQueue waiters; + initializeWaitQueue(&waiters); + transferAllWaiters(queue, &waiters); + + // Drain the copied queue, invoking the callback on every entry. + while (notifyNextWaiter(&waiters, callback, context)) { + // All the work is done by the loop condition. + } +} + +/**********************************************************************/ +Waiter *getFirstWaiter(const WaitQueue *queue) +{ + Waiter *lastWaiter = queue->lastWaiter; + if (lastWaiter == NULL) { + // There are no waiters, so we're done. + return NULL; + } + + // The queue is circular, so the last entry links to the head of the queue. 
+ return lastWaiter->nextWaiter; +} + +/**********************************************************************/ +int dequeueMatchingWaiters(WaitQueue *queue, + WaiterMatch *matchMethod, + void *matchContext, + WaitQueue *matchedQueue) +{ + WaitQueue matchedWaiters; + initializeWaitQueue(&matchedWaiters); + + WaitQueue iterationQueue; + initializeWaitQueue(&iterationQueue); + transferAllWaiters(queue, &iterationQueue); + while (hasWaiters(&iterationQueue)) { + Waiter *waiter = dequeueNextWaiter(&iterationQueue); + int result = VDO_SUCCESS; + if (!matchMethod(waiter, matchContext)) { + result = enqueueWaiter(queue, waiter); + } else { + result = enqueueWaiter(&matchedWaiters, waiter); + } + if (result != VDO_SUCCESS) { + transferAllWaiters(&matchedWaiters, queue); + transferAllWaiters(&iterationQueue, queue); + return result; + } + } + + transferAllWaiters(&matchedWaiters, matchedQueue); + return VDO_SUCCESS; +} + +/**********************************************************************/ +Waiter *dequeueNextWaiter(WaitQueue *queue) +{ + Waiter *firstWaiter = getFirstWaiter(queue); + if (firstWaiter == NULL) { + return NULL; + } + + Waiter *lastWaiter = queue->lastWaiter; + if (firstWaiter == lastWaiter) { + // The queue has a single entry, so just empty it out by nulling the tail. + queue->lastWaiter = NULL; + } else { + // The queue has more than one entry, so splice the first waiter out of + // the circular queue. + lastWaiter->nextWaiter = firstWaiter->nextWaiter; + } + + // The waiter is no longer in a wait queue. + firstWaiter->nextWaiter = NULL; + queue->queueLength -= 1; + return firstWaiter; +} + +/**********************************************************************/ +bool notifyNextWaiter(WaitQueue *queue, + WaiterCallback *callback, + void *context) +{ + Waiter *waiter = dequeueNextWaiter(queue); + if (waiter == NULL) { + return false; + } + + if (callback == NULL) { + callback = waiter->callback; + } + (*callback)(waiter, context); + return true; +} + +/**********************************************************************/ +const Waiter *getNextWaiter(const WaitQueue *queue, const Waiter *waiter) +{ + Waiter *firstWaiter = getFirstWaiter(queue); + if (waiter == NULL) { + return firstWaiter; + } + return ((waiter->nextWaiter != firstWaiter) ? waiter->nextWaiter : NULL); +} diff --git a/vdo/base/waitQueue.h b/vdo/base/waitQueue.h new file mode 100644 index 0000000..5eb754e --- /dev/null +++ b/vdo/base/waitQueue.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/waitQueue.h#1 $ + */ + +#ifndef WAIT_QUEUE_H +#define WAIT_QUEUE_H + +#include "common.h" + +/** + * A wait queue is a circular list of entries waiting to be notified of a + * change in a condition. 
Keeping a circular list allows the queue structure + * to simply be a pointer to the tail (newest) entry in the queue, supporting + * constant-time enqueue and dequeue operations. A null pointer is an empty + * queue. + * + * An empty queue: + * queue0.lastWaiter -> NULL + * + * A singleton queue: + * queue1.lastWaiter -> entry1 -> entry1 -> [...] + * + * A three-element queue: + * queue2.lastWaiter -> entry3 -> entry1 -> entry2 -> entry3 -> [...] + **/ + +typedef struct waiter Waiter; + +typedef struct { + /** The tail of the queue, the last (most recently added) entry */ + Waiter *lastWaiter; + /** The number of waiters currently in the queue */ + size_t queueLength; +} WaitQueue; + +/** + * Callback type for functions which will be called to resume processing of a + * waiter after it has been removed from its wait queue. + **/ +typedef void WaiterCallback(Waiter *waiter, void *context); + +/** + * Method type for Waiter matching methods. + * + * A WaiterMatch method returns false if the waiter does not match. + **/ +typedef bool WaiterMatch(Waiter *waiter, void *context); + +/** + * The queue entry structure for entries in a WaitQueue. + **/ +struct waiter { + /** + * The next waiter in the queue. If this entry is the last waiter, then this + * is actually a pointer back to the head of the queue. + **/ + struct waiter *nextWaiter; + + /** Optional waiter-specific callback to invoke when waking this waiter. */ + WaiterCallback *callback; +}; + +/** + * Check whether a Waiter is waiting. + * + * @param waiter The waiter to check + * + * @return true if the waiter is on some WaitQueue + **/ +static inline bool isWaiting(Waiter *waiter) +{ + return (waiter->nextWaiter != NULL); +} + +/** + * Initialize a wait queue. + * + * @param queue The queue to initialize + **/ +static inline void initializeWaitQueue(WaitQueue *queue) +{ + *queue = (WaitQueue) { + .lastWaiter = NULL, + .queueLength = 0, + }; +} + +/** + * Check whether a wait queue has any entries waiting in it. + * + * @param queue The queue to query + * + * @return true if there are any waiters in the queue + **/ +__attribute__((warn_unused_result)) +static inline bool hasWaiters(const WaitQueue *queue) +{ + return (queue->lastWaiter != NULL); +} + +/** + * Add a waiter to the tail end of a wait queue. The waiter must not already + * be waiting in a queue. + * + * @param queue The queue to which to add the waiter + * @param waiter The waiter to add to the queue + * + * @return VDO_SUCCESS or an error code + **/ +int enqueueWaiter(WaitQueue *queue, Waiter *waiter) + __attribute__((warn_unused_result)); + +/** + * Notify all the entries waiting in a queue to continue execution by invoking + * a callback function on each of them in turn. The queue is copied and + * emptied before invoking any callbacks, and only the waiters that were in + * the queue at the start of the call will be notified. + * + * @param queue The wait queue containing the waiters to notify + * @param callback The function to call to notify each waiter, or NULL + * to invoke the callback field registered in each waiter + * @param context The context to pass to the callback function + **/ +void notifyAllWaiters(WaitQueue *queue, + WaiterCallback *callback, + void *context); + +/** + * Notify the next entry waiting in a queue to continue execution by invoking + * a callback function on it after removing it from the queue. 
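+ *
+ * As an illustration only (the Item type, the wakeItem callback, and the
+ * queue and context variables below are hypothetical, not part of this
+ * interface), a caller typically embeds a Waiter as the first member of its
+ * own structure, registers a callback, and then drains the queue:
+ *
+ *   typedef struct {
+ *     Waiter waiter;   // first member, so the cast in wakeItem is valid
+ *     int    payload;
+ *   } Item;
+ *
+ *   static void wakeItem(Waiter *waiter, void *context)
+ *   {
+ *     Item *item = (Item *) waiter;   // safe: waiter is the first member
+ *     // ... resume processing of item, using context as needed ...
+ *   }
+ *
+ *   // Producer: item->waiter.callback = wakeItem;
+ *   //           int result = enqueueWaiter(&queue, &item->waiter);
+ *   // Consumer: while (notifyNextWaiter(&queue, NULL, context)) { }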
+ * + * @param queue The wait queue containing the waiter to notify + * @param callback The function to call to notify the waiter, or NULL + * to invoke the callback field registered in the waiter + * @param context The context to pass to the callback function + * + * @return true if there was a waiter in the queue + **/ +bool notifyNextWaiter(WaitQueue *queue, + WaiterCallback *callback, + void *context); + +/** + * Transfer all waiters from one wait queue to a second queue, emptying the + * first queue. + * + * @param fromQueue The queue containing the waiters to move + * @param toQueue The queue that will receive the waiters from the + * the first queue + **/ +void transferAllWaiters(WaitQueue *fromQueue, WaitQueue *toQueue); + +/** + * Return the waiter that is at the head end of a wait queue. + * + * @param queue The queue from which to get the first waiter + * + * @return The first (oldest) waiter in the queue, or NULL if + * the queue is empty + **/ +Waiter *getFirstWaiter(const WaitQueue *queue); + +/** + * Remove all waiters that match based on the specified matching method and + * append them to a WaitQueue. + * + * @param queue The wait queue to process + * @param matchMethod The method to determine matching + * @param matchContext Contextual info for the match method + * @param matchedQueue A WaitQueue to store matches + * + * @return VDO_SUCCESS or an error code + **/ +int dequeueMatchingWaiters(WaitQueue *queue, + WaiterMatch *matchMethod, + void *matchContext, + WaitQueue *matchedQueue); + +/** + * Remove the first waiter from the head end of a wait queue. The caller will + * be responsible for waking the waiter by invoking the correct callback + * function to resume its execution. + * + * @param queue The wait queue from which to remove the first entry + * + * @return The first (oldest) waiter in the queue, or NULL if + * the queue is empty + **/ +Waiter *dequeueNextWaiter(WaitQueue *queue); + +/** + * Count the number of waiters in a wait queue. + * + * @param queue The wait queue to query + * + * @return the number of waiters in the queue + **/ +__attribute__((warn_unused_result)) +static inline size_t countWaiters(const WaitQueue *queue) +{ + return queue->queueLength; +} + +/** + * Get the waiter after this one, for debug iteration. + * + * @param queue The wait queue + * @param waiter A waiter + * + * @return the next waiter, or NULL + **/ +const Waiter *getNextWaiter(const WaitQueue *queue, const Waiter *waiter) + __attribute__((warn_unused_result)); + +#endif // WAIT_QUEUE_H diff --git a/vdo/kernel/batchProcessor.c b/vdo/kernel/batchProcessor.c new file mode 100644 index 0000000..5845960 --- /dev/null +++ b/vdo/kernel/batchProcessor.c @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/batchProcessor.c#2 $ + */ + +#include "batchProcessor.h" + +#include "memoryAlloc.h" + +#include "constants.h" + +#include "kernelLayer.h" + +/* + * On memory ordering: + * + * The producer thread does: enqueue item on queue (xchg, which is + * implicitly interlocked, then a store), memory barrier, then atomic + * cmpxchg of the state field. The x86 architecture spec says the + * xchg, store, lock-cmpxchg sequence cannot be reordered, but on + * architectures using load-linked and store-conditional for the + * cmpxchg, like AArch64, the LL can be reordered with the store, so + * we add a barrier. + * + * The consumer thread, when it is running out of work, does: read + * queue (find empty), set state, mfence, read queue again just to be + * sure. The set-state and read-queue cannot be reordered with respect + * to the mfence (but without the mfence, the read could be moved + * before the set). + * + * The xchg and mfence impose a total order across processors, and + * each processor sees the stores done by the other processor in the + * required order. If the xchg happens before the mfence, the + * consumer's "read queue again" operation will see the update. If the + * mfence happens first, the producer's "cmpxchg state" will see its + * updated value. + * + * These are the semantics implemented by memory set to WB (write-back + * caching) mode on x86-64. So, the simple analysis is that no wakeups + * should be missed. + * + * It's a little subtler with funnel queues, since one interrupted or + * delayed enqueue operation (see the commentary in funnelQueuePut) + * can cause another, concurrent enqueue operation to complete without + * actually making the entry visible to the consumer. In essence, one + * update makes no new work items visible to the consumer, and the + * other (when it eventually completes) makes two (or more) work items + * visible, and each one ensures that the consumer will process what + * it has made visible. + */ + +typedef enum batchProcessorState { + BATCH_PROCESSOR_IDLE, + BATCH_PROCESSOR_ENQUEUED, +} BatchProcessorState; + +struct batchProcessor { + spinlock_t consumerLock; + FunnelQueue *queue; + KvdoWorkItem workItem; + atomic_t state; + BatchProcessorCallback callback; + void *closure; + KernelLayer *layer; +}; + +static void scheduleBatchProcessing(BatchProcessor *batch); + +/** + * Apply the batch processing function to the accumulated set of + * objects. + * + * Runs in a "CPU queue". + * + * @param [in] item The work item embedded in the BatchProcessor + **/ +static void batchProcessorWork(KvdoWorkItem *item) +{ + BatchProcessor *batch = container_of(item, BatchProcessor, workItem); + spin_lock(&batch->consumerLock); + while (!isFunnelQueueEmpty(batch->queue)) { + batch->callback(batch, batch->closure); + } + atomic_set(&batch->state, BATCH_PROCESSOR_IDLE); + memoryFence(); + bool needReschedule = !isFunnelQueueEmpty(batch->queue); + spin_unlock(&batch->consumerLock); + if (needReschedule) { + scheduleBatchProcessing(batch); + } +} + +/** + * Ensure that the batch-processing function is scheduled to run. + * + * If we're the thread that switches the BatchProcessor state from + * idle to enqueued, we're the thread responsible for actually + * enqueueing it. If some other thread got there first, or it was + * already enqueued, it's not our problem. 
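+ *
+ * Schematically, the handoff looks like this (a sketch of the protocol
+ * described in the memory-ordering comment above, not additional code):
+ *
+ *   producer: funnelQueuePut(queue, item);
+ *             smp_mb();
+ *             if (atomic_cmpxchg(&state, IDLE, ENQUEUED) == IDLE) {
+ *               // enqueue the work item on the CPU queue
+ *             }
+ *
+ *   consumer: // drain the funnel queue
+ *             atomic_set(&state, IDLE);
+ *             memoryFence();
+ *             if (!isFunnelQueueEmpty(queue)) {
+ *               // reschedule batch processing
+ *             }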
+ * + * @param [in] batch The BatchProcessor control data + **/ +static void scheduleBatchProcessing(BatchProcessor *batch) +{ + /* + * We want this to be very fast in the common cases. + * + * In testing on our "mgh" class machines (HP ProLiant DL380p Gen8, + * Intel Xeon E5-2690, 2.9GHz), it appears that under some + * conditions it's a little faster to use a memory fence and then + * read the "state" field, skipping the cmpxchg if the state is + * already set to BATCH_PROCESSOR_ENQUEUED. (Sometimes slightly + * faster still if we prefetch the state field first.) Note that the + * read requires the fence, otherwise it could be executed before + * the preceding store by the FunnelQueue code to the "next" + * pointer, which can, very rarely, result in failing to issue a + * wakeup when needed. + * + * However, the gain is small, and in testing on our older "harvard" + * class machines (Intel Xeon X5680, 3.33GHz) it was a clear win to + * skip all of that and go right for the cmpxchg. + * + * Of course, the tradeoffs may be sensitive to the particular work + * going on, cache pressure, etc. + */ + smp_mb(); + BatchProcessorState oldState + = atomic_cmpxchg(&batch->state, BATCH_PROCESSOR_IDLE, + BATCH_PROCESSOR_ENQUEUED); + bool doSchedule = (oldState == BATCH_PROCESSOR_IDLE); + if (doSchedule) { + enqueueCPUWorkQueue(batch->layer, &batch->workItem); + } +} + +/**********************************************************************/ +int makeBatchProcessor(KernelLayer *layer, + BatchProcessorCallback callback, + void *closure, + BatchProcessor **batchPtr) +{ + BatchProcessor *batch; + + int result = ALLOCATE(1, BatchProcessor, "batchProcessor", &batch); + if (result != UDS_SUCCESS) { + return result; + } + result = makeFunnelQueue(&batch->queue); + if (result != UDS_SUCCESS) { + FREE(batch); + return result; + } + + spin_lock_init(&batch->consumerLock); + setupWorkItem(&batch->workItem, batchProcessorWork, + (KvdoWorkFunction) callback, CPU_Q_ACTION_COMPLETE_KVIO); + atomic_set(&batch->state, BATCH_PROCESSOR_IDLE); + batch->callback = callback; + batch->closure = closure; + batch->layer = layer; + + *batchPtr = batch; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void addToBatchProcessor(BatchProcessor *batch, KvdoWorkItem *item) +{ + funnelQueuePut(batch->queue, &item->workQueueEntryLink); + scheduleBatchProcessing(batch); +} + +/**********************************************************************/ +KvdoWorkItem *nextBatchItem(BatchProcessor *batch) +{ + FunnelQueueEntry *fqEntry = funnelQueuePoll(batch->queue); + if (fqEntry == NULL) { + return NULL; + } + + return container_of(fqEntry, KvdoWorkItem, workQueueEntryLink); +} + +/**********************************************************************/ +void condReschedBatchProcessor(BatchProcessor *batch) +{ + cond_resched_lock(&batch->consumerLock); +} + +/**********************************************************************/ +void freeBatchProcessor(BatchProcessor **batchPtr) +{ + BatchProcessor *batch = *batchPtr; + if (batch) { + memoryFence(); + BUG_ON(atomic_read(&batch->state) == BATCH_PROCESSOR_ENQUEUED); + freeFunnelQueue(batch->queue); + FREE(batch); + *batchPtr = NULL; + } +} diff --git a/vdo/kernel/batchProcessor.h b/vdo/kernel/batchProcessor.h new file mode 100644 index 0000000..5e348c6 --- /dev/null +++ b/vdo/kernel/batchProcessor.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/batchProcessor.h#2 $ + */ + +#ifndef BATCHPROCESSOR_H +#define BATCHPROCESSOR_H + +#include "kernelTypes.h" +#include "util/funnelQueue.h" + +/** + * Control data for managing collections of objects to be operated on + * by a specified function. May be used when the work function is + * lightweight enough or cache-contentious enough that it makes sense + * to try to accumulate multiple objects and operate on them all at + * once in one thread. + * + * The work function is run in one of the kernel layer's "CPU queues", + * and care is taken to ensure that only one invocation can be running + * or scheduled at any given time. It can loop calling nextBatchItem + * repeatedly until there are no more objects to operate on. It should + * also call condReschedBatchProcessor now and then, to play nicely + * with the OS scheduler. + * + * Objects to operate on are manipulated through a FunnelQueueEntry + * object which must be contained within them. + **/ +typedef struct batchProcessor BatchProcessor; + +typedef void (*BatchProcessorCallback)(BatchProcessor *batch, void *closure); + +/** + * Creates a batch-processor control structure. + * + * @param [in] layer The kernel layer data, used to enqueue work items + * @param [in] callback A function to process the accumulated objects + * @param [in] closure A private data pointer for use by the callback + * @param [out] batchPtr Where to store the pointer to the new object + * + * @return UDS_SUCCESS or an error code + **/ +int makeBatchProcessor(KernelLayer *layer, + BatchProcessorCallback callback, + void *closure, + BatchProcessor **batchPtr); + +/** + * Adds an object to the processing queue. + * + *

If the callback function is not currently running or scheduled to be run, + * it gets queued up to run. + * + * @param [in] batch The batch-processor data + * @param [in] item The handle on the new object to add + **/ +void addToBatchProcessor(BatchProcessor *batch, KvdoWorkItem *item); + +/** + * Fetches the next object in the processing queue. + * + * @param [in] batch The batch-processor data + * + * @return An object pointer or NULL + **/ +KvdoWorkItem *nextBatchItem(BatchProcessor *batch) + __attribute__((warn_unused_result)); + +/** + * Free the batch-processor data and null out the pointer. + * + * @param [in,out] batchPtr Where the BatchProcessor pointer is stored + **/ +void freeBatchProcessor(BatchProcessor **batchPtr); + +/** + * Yield control to the scheduler if the kernel has indicated that + * other work needs to run on the current processor. + * + * The data structure is needed so that the spin lock can be + * (conditionally) released and re-acquired. + * + * @param [in] batch The batch-processor data + **/ +void condReschedBatchProcessor(BatchProcessor *batch); + +#endif // BATCHPROCESSOR_H diff --git a/vdo/kernel/bio.c b/vdo/kernel/bio.c new file mode 100644 index 0000000..a8e3a5e --- /dev/null +++ b/vdo/kernel/bio.c @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bio.c#8 $ + */ + +#include "bio.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" + +#include "flush.h" +#include "recoveryJournal.h" + +#include "bioIterator.h" +#include "ioSubmitter.h" + +/** + * Gets the raw buffer from a biovec. 
+ * + * @param biovec The biovec in question + * + * @return the buffer + **/ +static char *getBufferForBiovec(struct bio_vec *biovec) +{ + return (page_address(biovec->bv_page) + biovec->bv_offset); +} + +/**********************************************************************/ +void bioCopyDataIn(BIO *bio, char *dataPtr) +{ + struct bio_vec *biovec; + for (BioIterator iter = createBioIterator(bio); + (biovec = getNextBiovec(&iter)) != NULL; + advanceBioIterator(&iter)) { + memcpy(dataPtr, getBufferForBiovec(biovec), biovec->bv_len); + dataPtr += biovec->bv_len; + } +} + +/**********************************************************************/ +void bioCopyDataOut(BIO *bio, char *dataPtr) +{ + struct bio_vec *biovec; + for (BioIterator iter = createBioIterator(bio); + (biovec = getNextBiovec(&iter)) != NULL; + advanceBioIterator(&iter)) { + memcpy(getBufferForBiovec(biovec), dataPtr, biovec->bv_len); + flush_dcache_page(biovec->bv_page); + dataPtr += biovec->bv_len; + } +} + +/**********************************************************************/ +void setBioOperation(BIO *bio, unsigned int operation) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio->bi_opf &= ~REQ_OP_MASK; + bio->bi_opf |= operation; +#else + + unsigned int OPERATION_MASK = WRITE | REQ_DISCARD | REQ_FLUSH; + + // Clear the relevant bits + bio->bi_rw &= ~OPERATION_MASK; + // Set the operation we care about + bio->bi_rw |= operation; +#endif +} + +/**********************************************************************/ +void freeBio(BIO *bio, KernelLayer *layer) +{ + bio_put(bio); +} + +/**********************************************************************/ +void countBios(AtomicBioStats *bioStats, BIO *bio) +{ + if (isWriteBio(bio)) { + atomic64_inc(&bioStats->write); + } else { + atomic64_inc(&bioStats->read); + } + if (isDiscardBio(bio)) { + atomic64_inc(&bioStats->discard); + } + if (isFlushBio(bio)) { + atomic64_inc(&bioStats->flush); + } + if (isFUABio(bio)) { + atomic64_inc(&bioStats->fua); + } +} + +/** + * The function determines whether a buffer contains all zeroes. + * + * @param buffer The buffer to check + * @param length The length of the buffer + * + * @return true is all zeroes, false otherwise + **/ +static inline bool isAllZeros(const char *buffer, unsigned int length) +{ + /* + * Handle expected common case of even the first word being nonzero, + * without getting into the more expensive (for one iteration) loop + * below. + */ + if (likely(length >= sizeof(uint64_t))) { + if (GET_UNALIGNED(uint64_t, buffer) != 0) { + return false; + } + + unsigned int wordCount = length / sizeof(uint64_t); + + // Unroll to process 64 bytes at a time + unsigned int chunkCount = wordCount / 8; + while (chunkCount-- > 0) { + uint64_t word0 = GET_UNALIGNED(uint64_t, buffer); + uint64_t word1 = GET_UNALIGNED(uint64_t, buffer + 1 * sizeof(uint64_t)); + uint64_t word2 = GET_UNALIGNED(uint64_t, buffer + 2 * sizeof(uint64_t)); + uint64_t word3 = GET_UNALIGNED(uint64_t, buffer + 3 * sizeof(uint64_t)); + uint64_t word4 = GET_UNALIGNED(uint64_t, buffer + 4 * sizeof(uint64_t)); + uint64_t word5 = GET_UNALIGNED(uint64_t, buffer + 5 * sizeof(uint64_t)); + uint64_t word6 = GET_UNALIGNED(uint64_t, buffer + 6 * sizeof(uint64_t)); + uint64_t word7 = GET_UNALIGNED(uint64_t, buffer + 7 * sizeof(uint64_t)); + uint64_t or = (word0 | word1 | word2 | word3 + | word4 | word5 | word6 | word7); + // Prevent compiler from using 8*(cmp;jne). 
+ __asm__ __volatile__ ("" : : "g" (or)); + if (or != 0) { + return false; + } + buffer += 8 * sizeof(uint64_t); + } + wordCount %= 8; + + // Unroll to process 8 bytes at a time. + // (Is this still worthwhile?) + while (wordCount-- > 0) { + if (GET_UNALIGNED(uint64_t, buffer) != 0) { + return false; + } + buffer += sizeof(uint64_t); + } + length %= sizeof(uint64_t); + // Fall through to finish up anything left over. + } + + while (length-- > 0) { + if (*buffer++ != 0) { + return false; + } + } + return true; +} + +/**********************************************************************/ +bool bioIsZeroData(BIO *bio) +{ + struct bio_vec *biovec; + for (BioIterator iter = createBioIterator(bio); + (biovec = getNextBiovec(&iter)) != NULL; + advanceBioIterator(&iter)) { + if (!isAllZeros(getBufferForBiovec(biovec), biovec->bv_len)) { + return false; + } + } + return true; +} + +/**********************************************************************/ +void bioZeroData(BIO *bio) +{ + zero_fill_bio(bio); +} + +/**********************************************************************/ +static void setBioSize(BIO *bio, BlockSize bioSize) +{ +#ifdef USE_BI_ITER + bio->bi_iter.bi_size = bioSize; +#else + bio->bi_size = bioSize; +#endif +} + +/** + * Initialize a bio. + * + * @param bio The bio to initialize + * @param layer The layer to which it belongs. + **/ +static void initializeBio(BIO *bio, KernelLayer *layer) +{ + // Save off important info so it can be set back later + unsigned short vcnt = bio->bi_vcnt; + void *pvt = bio->bi_private; + bio_reset(bio); // Memsets large portion of bio. Reset all needed fields. + bio->bi_private = pvt; + bio->bi_vcnt = vcnt; + bio->bi_end_io = completeAsyncBio; + setBioSector(bio, (sector_t) -1); // Sector will be set later on. + setBioBlockDevice(bio, getKernelLayerBdev(layer)); +} + +/**********************************************************************/ +void resetBio(BIO *bio, KernelLayer *layer) +{ + initializeBio(bio, layer); + setBioSize(bio, VDO_BLOCK_SIZE); +} + +/**********************************************************************/ +int allocateBio(KernelLayer *layer, unsigned int bvecCount, BIO **bioPtr) +{ + BIO *bio = bio_alloc_bioset(GFP_NOIO, bvecCount, layer->bioset); + if (IS_ERR(bio)) { + logError("bio allocation failure %ld", PTR_ERR(bio)); + return PTR_ERR(bio); + } + + initializeBio(bio, layer); + + *bioPtr = bio; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int createBio(KernelLayer *layer, char *data, BIO **bioPtr) +{ + BIO *bio = NULL; + if (data == NULL) { + int result = allocateBio(layer, 0, &bio); + if (result != VDO_SUCCESS) { + return result; + } + + *bioPtr = bio; + return VDO_SUCCESS; + } + + unsigned int len = VDO_BLOCK_SIZE; + unsigned long kaddr = (unsigned long) data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + const int bvecCount = end - start; + + int result = allocateBio(layer, bvecCount, &bio); + if (result != VDO_SUCCESS) { + return result; + } + + int offset = offset_in_page(kaddr); + for (unsigned int i = 0; (i < bvecCount) && (len > 0); i++) { + unsigned int bytes = PAGE_SIZE - offset; + if (bytes > len) { + bytes = len; + } + + struct page *page + = is_vmalloc_addr(data) ? 
vmalloc_to_page(data) : virt_to_page(data); + int bytesAdded = bio_add_page(bio, page, bytes, offset); + if (bytesAdded != bytes) { + freeBio(bio, layer); + return logErrorWithStringError(VDO_BIO_CREATION_FAILED, + "Could only add %i bytes to bio", + bytesAdded); + + } + + data += bytes; + len -= bytes; + offset = 0; + } + + *bioPtr = bio; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void prepareFlushBIO(BIO *bio, + void *context, + struct block_device *device, + bio_end_io_t *endIOCallback) +{ + clearBioOperationAndFlags(bio); + /* + * One would think we could use REQ_OP_FLUSH on new kernels, but some + * layers of the stack don't recognize that as a flush. So do it + * like blkdev_issue_flush() and make it a write+flush. + */ + setBioOperationWrite(bio); + setBioOperationFlagPreflush(bio); + bio->bi_end_io = endIOCallback; + bio->bi_private = context; + bio->bi_vcnt = 0; + setBioBlockDevice(bio, device); + setBioSize(bio, 0); + setBioSector(bio, 0); +} diff --git a/vdo/kernel/bio.h b/vdo/kernel/bio.h new file mode 100644 index 0000000..1ba8234 --- /dev/null +++ b/vdo/kernel/bio.h @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bio.h#6 $ + */ + +#ifndef BIO_H +#define BIO_H + +#include +#include +#include + +#include "kernelTypes.h" + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) +#define USE_BI_ITER 1 +#endif + +/** + * Copy the bio data to a char array. + * + * @param bio The bio to copy the data from + * @param dataPtr The local array to copy the data to + **/ +void bioCopyDataIn(BIO *bio, char *dataPtr); + +/** + * Copy a char array to the bio data. + * + * @param bio The bio to copy the data to + * @param dataPtr The local array to copy the data from + **/ +void bioCopyDataOut(BIO *bio, char *dataPtr); + +/** + * Set the bi_rw or equivalent field of a bio to a particular data + * operation. Intended to be called only by setBioOperationRead() etc. 
+ * + * @param bio The bio to modify + * @param operation The operation to set it to + **/ +void setBioOperation(BIO *bio, unsigned int operation); + +/**********************************************************************/ +static inline void setBioOperationRead(BIO *bio) +{ + setBioOperation(bio, READ); +} + +/**********************************************************************/ +static inline void setBioOperationWrite(BIO *bio) +{ + setBioOperation(bio, WRITE); +} + +/**********************************************************************/ +static inline void clearBioOperationAndFlags(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio->bi_opf = 0; +#else + bio->bi_rw = 0; +#endif +} + +/**********************************************************************/ +static inline void copyBioOperationAndFlags(BIO *to, BIO *from) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + to->bi_opf = from->bi_opf; +#else + to->bi_rw = from->bi_rw; +#endif +} + +/**********************************************************************/ +static inline void setBioOperationFlag(BIO *bio, unsigned int flag) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio->bi_opf |= flag; +#else + bio->bi_rw |= flag; +#endif +} + +/**********************************************************************/ +static inline void clearBioOperationFlag(BIO *bio, unsigned int flag) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio->bi_opf &= ~flag; +#else + bio->bi_rw &= ~flag; +#endif +} + +/**********************************************************************/ +static inline void setBioOperationFlagPreflush(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + setBioOperationFlag(bio, REQ_PREFLUSH); +#else + // Preflushes and empty flushes are not currently distinguished. 
+ setBioOperation(bio, WRITE_FLUSH); +#endif +} + +/**********************************************************************/ +static inline void setBioOperationFlagSync(BIO *bio) +{ + setBioOperationFlag(bio, REQ_SYNC); +} + +/**********************************************************************/ +static inline void clearBioOperationFlagSync(BIO *bio) +{ + clearBioOperationFlag(bio, REQ_SYNC); +} + +/**********************************************************************/ +static inline void setBioOperationFlagFua(BIO *bio) +{ + setBioOperationFlag(bio, REQ_FUA); +} + +/**********************************************************************/ +static inline void clearBioOperationFlagFua(BIO *bio) +{ + clearBioOperationFlag(bio, REQ_FUA); +} + +/**********************************************************************/ +static inline bool isDiscardBio(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + return (bio != NULL) && (bio_op(bio) == REQ_OP_DISCARD); +#else + return (bio != NULL) && ((bio->bi_rw & REQ_DISCARD) != 0); +#endif +} + +/**********************************************************************/ +static inline bool isFlushBio(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + return (bio_op(bio) == REQ_OP_FLUSH) || ((bio->bi_opf & REQ_PREFLUSH) != 0); +#else + return (bio->bi_rw & REQ_FLUSH) != 0; +#endif +} + +/**********************************************************************/ +static inline bool isFUABio(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + return (bio->bi_opf & REQ_FUA) != 0; +#else + return (bio->bi_rw & REQ_FUA) != 0; +#endif +} + +/**********************************************************************/ +static inline bool isReadBio(BIO *bio) +{ + return bio_data_dir(bio) == READ; +} + +/**********************************************************************/ +static inline bool isWriteBio(BIO *bio) +{ + return bio_data_dir(bio) == WRITE; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) +/** + * Get the error from the bio. + * + * @param bio The bio + * + * @return the bio's error if any + **/ +static inline int getBioResult(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,13,0) + return blk_status_to_errno(bio->bi_status); +#else + return bio->bi_error; +#endif +} +#endif // newer than 4.4 + +/** + * Set the block device for a bio. + * + * @param bio The bio to modify + * @param device The new block device for the bio + **/ +static inline void setBioBlockDevice(BIO *bio, struct block_device *device) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,14,0) + bio_set_dev(bio, device); +#else + bio->bi_bdev = device; +#endif +} + +/** + * Get a bio's size. + * + * @param bio The bio + * + * @return the bio's size + **/ +static inline unsigned int getBioSize(BIO *bio) +{ +#ifdef USE_BI_ITER + return bio->bi_iter.bi_size; +#else + return bio->bi_size; +#endif +} + +/** + * Set the bio's sector. + * + * @param bio The bio + * @param sector The sector + **/ +static inline void setBioSector(BIO *bio, sector_t sector) +{ +#ifdef USE_BI_ITER + bio->bi_iter.bi_sector = sector; +#else + bio->bi_sector = sector; +#endif +} + +/** + * Get the bio's sector. + * + * @param bio The bio + * + * @return the sector + **/ +static inline sector_t getBioSector(BIO *bio) +{ +#ifdef USE_BI_ITER + return bio->bi_iter.bi_sector; +#else + return bio->bi_sector; +#endif +} + +/** + * Tell the kernel we've completed processing of this bio. 
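+ *
+ * For example (a sketch only, assuming a 4.4-or-later kernel; the handler
+ * name is hypothetical, and the original bio is assumed to have been stashed
+ * in bi_private by whoever built the lower-level bio), an end-io handler
+ * might propagate a lower-level bio's result to the request it served:
+ *
+ *   static void exampleEndIO(BIO *bio)
+ *   {
+ *     BIO *original = bio->bi_private;
+ *     completeBio(original, getBioResult(bio));
+ *   }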
+ * + * @param bio The bio to complete + * @param error A system error code, or 0 for success + **/ +static inline void completeBio(BIO *bio, int error) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,13,0) + bio->bi_status = errno_to_blk_status(error); + bio_endio(bio); +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) + bio->bi_error = error; + bio_endio(bio); +#else + bio_endio(bio, error); +#endif +} + +/** + * Frees up a bio structure + * + * @param bio The bio to free + * @param layer The layer the bio was created in + **/ +void freeBio(BIO *bio, KernelLayer *layer); + +/** + * Count the statistics for the bios. This is used for calls into VDO and + * for calls out of VDO. + * + * @param bioStats Statistics structure to update + * @param bio The bio + **/ +void countBios(AtomicBioStats *bioStats, BIO *bio); + +/** + * Reset a bio so it can be used again. + * + * @param bio The bio to reset + * @param layer The physical layer + **/ +void resetBio(BIO *bio, KernelLayer *layer); + +/** + * Check to see whether a bio's data are all zeroes. + * + * @param bio The bio + * + * @return true if the bio's data are all zeroes + **/ +bool bioIsZeroData(BIO *bio); + +/** + * Set a bio's data to all zeroes. + * + * @param [in] bio The bio + **/ +void bioZeroData(BIO *bio); + +/** + * Create a new bio structure for kernel buffer storage. + * + * @param [in] layer The physical layer + * @param [in] data The buffer (can be NULL) + * @param [out] bioPtr A pointer to hold new bio + * + * @return VDO_SUCCESS or an error + **/ +int createBio(KernelLayer *layer, char *data, BIO **bioPtr); + +/** + * Prepare a BIO to issue a flush to the device below. + * + * @param bio The flush BIO + * @param context The context for the callback + * @param device The device to flush + * @param endIOCallback The function to call when the flush is complete + **/ +void prepareFlushBIO(BIO *bio, + void *context, + struct block_device *device, + bio_end_io_t *endIOCallback); + +/** + * Perform IO with a bio, waiting for completion and returning its result. + * The bio must already have its sector, block device, and operation set. + * + * @param bio The bio to do IO with + * + * @return The bio result + **/ +static inline int submitBioAndWait(BIO *bio) +{ + submit_bio_wait(bio); + return getBioResult(bio); +} + +#endif /* BIO_H */ diff --git a/vdo/kernel/bioIterator.h b/vdo/kernel/bioIterator.h new file mode 100644 index 0000000..7445261 --- /dev/null +++ b/vdo/kernel/bioIterator.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bioIterator.h#1 $ + */ + +#ifndef BIO_ITERATOR_H +#define BIO_ITERATOR_H + +#include + +#include "bio.h" +#include "kernelTypes.h" + +typedef struct { + BIO *bio; +#ifdef USE_BI_ITER + struct bvec_iter iter; + // Needed so we can store the return value of bio_iter_iovec. + struct bio_vec temp; +#else + int index; +#endif +} BioIterator; + +/** + * Create an iterator over a bio's data. + * + * @param bio The bio to iterate over + * + * @return An iterator over a bio + **/ +static BioIterator createBioIterator(BIO *bio) +{ + BioIterator iterator = { + .bio = bio, +#ifdef USE_BI_ITER + .iter = bio->bi_iter, +#else + .index = bio->bi_idx, +#endif + }; + return iterator; +} + +/** + * Get the next biovec from the iterator, or NULL if there are no more. + * + * @param iterator The iterator from which to get data + * + * @return The next biovec from the iterator, or NULL. + **/ +static struct bio_vec *getNextBiovec(BioIterator *iterator) +{ + BIO *bio = iterator->bio; +#ifdef USE_BI_ITER + if (iterator->iter.bi_size == 0) { + return NULL; + } + + iterator->temp = bio_iter_iovec(bio, iterator->iter); + return &iterator->temp; +#else + if (iterator->index >= bio->bi_vcnt) { + return NULL; + } + return bio_iovec_idx(bio, iterator->index); +#endif +} + +/** + * Advance the iterator to the next biovec in the bio. + * + * @param [in,out] iterator The iterator to advance + **/ +static void advanceBioIterator(BioIterator *iterator) +{ +#ifdef USE_BI_ITER + bio_advance_iter(iterator->bio, &iterator->iter, iterator->temp.bv_len); +#else + iterator->index++; +#endif +} + +#endif /* BIO_ITERATOR_H */ diff --git a/vdo/kernel/bufferPool.c b/vdo/kernel/bufferPool.c new file mode 100644 index 0000000..9c950ca --- /dev/null +++ b/vdo/kernel/bufferPool.c @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bufferPool.c#1 $ + */ + +#include "bufferPool.h" + +#include +#include + +#include "logger.h" +#include "memoryAlloc.h" + +#include "statusCodes.h" + +/* + * For list nodes on the free-object list, the data field describes + * the object available for reuse. + * + * For nodes on the "spare" list, the data field is meaningless; + * they're just nodes available for use when we need to add an object + * pointer to the freeObjectList. + * + * These are both "free lists", in a sense; don't get confused! 
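+ *
+ * A minimal caller-side sketch of the round trip (the pool variable is
+ * assumed to have been created earlier with makeBufferPool):
+ *
+ *   void *buffer;
+ *   if (allocBufferFromPool(pool, &buffer) == VDO_SUCCESS) {
+ *     // The node that carried this pointer out is now parked on
+ *     // spareListNodes; use the buffer, then hand it back. Some spare node
+ *     // (not necessarily the same one) will carry it onto freeObjectList.
+ *     freeBufferToPool(pool, buffer);
+ *   }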
+ */ +typedef struct { + struct list_head list; // links in current list + void *data; // element data, if on free list +} BufferElement; + +struct bufferPool { + const char *name; // Pool name + void *data; // Associated pool data + spinlock_t lock; // Locks this object + unsigned int size; // Total number of buffers + struct list_head freeObjectList; // List of free buffers + struct list_head spareListNodes; // Unused list nodes + unsigned int numBusy; // Number of buffers in use + unsigned int maxBusy; // Maximum value of the above + BufferAllocateFunction *alloc; // Allocate function for buffer data + BufferFreeFunction *free; // Free function for buffer data + BufferDumpFunction *dump; // Dump function for buffer data + BufferElement *bhead; // Array of BufferElement structures + void **objects; +}; + +/*************************************************************************/ +int makeBufferPool(const char *poolName, + unsigned int size, + BufferAllocateFunction *allocateFunction, + BufferFreeFunction *freeFunction, + BufferDumpFunction *dumpFunction, + void *poolData, + BufferPool **poolPtr) +{ + BufferPool *pool; + + int result = ALLOCATE(1, BufferPool, "buffer pool", &pool); + if (result != VDO_SUCCESS) { + logError("buffer pool allocation failure %d", result); + return result; + } + + result = ALLOCATE(size, BufferElement, "buffer pool elements", &pool->bhead); + if (result != VDO_SUCCESS) { + logError("buffer element array allocation failure %d", result); + freeBufferPool(&pool); + return result; + } + + result = ALLOCATE(size, void *, "object pointers", &pool->objects); + if (result != VDO_SUCCESS) { + logError("buffer object array allocation failure %d", result); + freeBufferPool(&pool); + return result; + } + + pool->name = poolName; + pool->alloc = allocateFunction; + pool->free = freeFunction; + pool->dump = dumpFunction; + pool->data = poolData; + pool->size = size; + spin_lock_init(&pool->lock); + INIT_LIST_HEAD(&pool->freeObjectList); + INIT_LIST_HEAD(&pool->spareListNodes); + BufferElement *bh = pool->bhead; + for (int i = 0; i < pool->size; i++) { + result = pool->alloc(pool->data, &bh->data); + if (result != VDO_SUCCESS) { + logError("verify buffer data allocation failure %d", result); + freeBufferPool(&pool); + return result; + } + pool->objects[i] = bh->data; + list_add(&bh->list, &pool->freeObjectList); + bh++; + } + pool->numBusy = pool->maxBusy = 0; + + *poolPtr = pool; + return VDO_SUCCESS; +} + +/*************************************************************************/ +void freeBufferPool(BufferPool **poolPtr) +{ + BufferPool *pool = *poolPtr; + if (pool == NULL) { + return; + } + + ASSERT_LOG_ONLY((pool->numBusy == 0), "freeing busy buffer pool, numBusy=%d", + pool->numBusy); + if (pool->objects != NULL) { + for (int i = 0; i < pool->size; i++) { + if (pool->objects[i] != NULL) { + pool->free(pool->data, pool->objects[i]); + } + } + FREE(pool->objects); + } + FREE(pool->bhead); + FREE(pool); + *poolPtr = NULL; +} + +/*************************************************************************/ +static bool inFreeList(BufferPool *pool, void *data) +{ + struct list_head *node; + list_for_each(node, &pool->freeObjectList) { + if (container_of(node, BufferElement, list)->data == data) { + return true; + } + } + return false; +} + +/*************************************************************************/ +void dumpBufferPool(BufferPool *pool, bool dumpElements) +{ + // In order that syslog can empty its buffer, sleep after 35 elements for + // 4ms (till the second 
clock tick). These numbers chosen in October + // 2012 running on an lfarm. + enum { ELEMENTS_PER_BATCH = 35 }; + enum { SLEEP_FOR_SYSLOG = 4 }; + + if (pool == NULL) { + return; + } + spin_lock(&pool->lock); + logInfo("%s: %u of %u busy (max %u)", pool->name, pool->numBusy, pool->size, + pool->maxBusy); + if (dumpElements && (pool->dump != NULL)) { + int dumped = 0; + for (int i = 0; i < pool->size; i++) { + if (!inFreeList(pool, pool->objects[i])) { + pool->dump(pool->data, pool->objects[i]); + if (++dumped >= ELEMENTS_PER_BATCH) { + spin_unlock(&pool->lock); + dumped = 0; + msleep(SLEEP_FOR_SYSLOG); + spin_lock(&pool->lock); + } + } + } + } + spin_unlock(&pool->lock); +} + +/*************************************************************************/ +int allocBufferFromPool(BufferPool *pool, void **dataPtr) +{ + if (pool == NULL) { + return UDS_INVALID_ARGUMENT; + } + + spin_lock(&pool->lock); + if (unlikely(list_empty(&pool->freeObjectList))) { + spin_unlock(&pool->lock); + logDebug("no free buffers"); + return -ENOMEM; + } + + BufferElement *bh = list_first_entry(&pool->freeObjectList, BufferElement, + list); + list_move(&bh->list, &pool->spareListNodes); + pool->numBusy++; + if (pool->numBusy > pool->maxBusy) { + pool->maxBusy = pool->numBusy; + } + *dataPtr = bh->data; + spin_unlock(&pool->lock); + return VDO_SUCCESS; + +} + +/*************************************************************************/ +static bool freeBufferToPoolInternal(BufferPool *pool, void *data) +{ + if (unlikely(list_empty(&pool->spareListNodes))) { + return false; + } + BufferElement *bh = list_first_entry(&pool->spareListNodes, BufferElement, + list); + list_move(&bh->list, &pool->freeObjectList); + bh->data = data; + pool->numBusy--; + return true; +} + +/*************************************************************************/ +void freeBufferToPool(BufferPool *pool, void *data) +{ + spin_lock(&pool->lock); + bool success = freeBufferToPoolInternal(pool, data); + spin_unlock(&pool->lock); + if (!success) { + logDebug("trying to add to free list when already full"); + } +} + +/*************************************************************************/ +void freeBuffersToPool(BufferPool *pool, void **data, int count) +{ + spin_lock(&pool->lock); + bool success = true; + for (int i = 0; (i < count) && success; i++) { + success = freeBufferToPoolInternal(pool, data[i]); + } + spin_unlock(&pool->lock); + if (!success) { + logDebug("trying to add to free list when already full"); + } +} diff --git a/vdo/kernel/bufferPool.h b/vdo/kernel/bufferPool.h new file mode 100644 index 0000000..9c505c9 --- /dev/null +++ b/vdo/kernel/bufferPool.h @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bufferPool.h#1 $ + */ +#ifndef BUFFERPOOL_H +#define BUFFERPOOL_H + +/* + * We need bug.h because in 3.10, kernel.h (indirectly) defines + * ARRAY_SIZE as a macro which (indirectly and conditionally) uses + * BUILD_BUG_ON_ZERO, which is defined in bug.h, which is *not* + * included. In earlier versions like 3.2 it Just Worked. + */ +#include +#include +#include + +typedef struct bufferPool BufferPool; + +typedef int BufferAllocateFunction(void *poolData, void **dataPtr); +typedef void BufferFreeFunction(void *poolData, void *data); +typedef void BufferDumpFunction(void *poolData, void *data); + +/** + * Creates a generic pool of buffer data. The elements in the pool are + * allocated up front and placed on a free list, which manages the + * reuse of the individual buffers in the pool. + * + * @param [in] poolName Name of the pool + * @param [in] size The number of elements to create for this pool + * @param [in] allocateFunction The function to call to create the actual data + * for each element + * @param [in] freeFunction The function to call to free the actual data + * for each element + * @param [in] dumpFunction The function to call to dump the actual data + * for each element into the log + * @param [in] poolData A pointer to the pool's associated data + * @param [out] poolPtr A pointer to hold the pool that was created + * + * @return a success or error code + */ +int makeBufferPool(const char *poolName, + unsigned int size, + BufferAllocateFunction *allocateFunction, + BufferFreeFunction *freeFunction, + BufferDumpFunction *dumpFunction, + void *poolData, + BufferPool **poolPtr) + __attribute__((warn_unused_result)); + +/** + * Free a buffer pool and null out the reference to it. This will free + * all the elements of the pool as well. + * + * @param [in] poolPtr The reference to the pool to free + **/ +void freeBufferPool(BufferPool **poolPtr); + +/** + * Dump a buffer pool to the log. + * + * @param [in] pool The buffer pool to allocate from + * @param [in] dumpElements True for complete output, or false for a + * one-line summary + **/ +void dumpBufferPool(BufferPool *pool, bool dumpElements); + +/** + * Acquires a free buffer from the free list of the pool and + * returns it's associated data. + * + * @param [in] pool The buffer pool to allocate from + * @param [out] dataPtr A pointer to hold the buffer data + * + * @return a success or error code + */ +int allocBufferFromPool(BufferPool *pool, void **dataPtr) + __attribute__((warn_unused_result)); + +/** + * Returns a buffer to the free list of a pool + * + * @param [in] pool The buffer pool to return the buffer to + * @param [in] data The buffer data to return + */ +void freeBufferToPool(BufferPool *pool, void *data); + +/** + * Returns a set of buffers to the free list of a pool + * + * @param [in] pool The buffer pool to return the buffer to + * @param [in] data The buffer data to return + * @param [in] count Number of entries in the data array + */ +void freeBuffersToPool(BufferPool *pool, void **data, int count); + +/** + * Control structure for freeing (releasing back to the pool) pointers + * in batches. + * + * Since the objects stored in a buffer pool are completely opaque, + * some external data structure is needed to manage a collection of + * them. This is a simple helper for doing that, since we're freeing + * batches of objects in a couple different places. 
Within the pool + * itself there's a pair of linked lists, but getting at them requires + * the locking that we're trying to minimize. + * + * We collect pointers until the array is full or until there are no + * more available, and we call freeBuffersToPool to release a batch + * all at once. + **/ +typedef struct freeBufferPointers { + BufferPool *pool; + int index; + void *pointers[30]; // size is arbitrary +} FreeBufferPointers; + +/** + * Initialize the control structure for batching buffer pointers to be + * released to their pool. + * + * @param [out] fbp The (caller-allocated) control structure + * @param [in] pool The buffer pool to return objects to. + **/ +static inline void initFreeBufferPointers(FreeBufferPointers *fbp, + BufferPool *pool) +{ + fbp->index = 0; + fbp->pool = pool; +} + +/** + * Release any buffers left in the collection. + * + * @param [in] fbp The control structure + **/ +static inline void freeBufferPointers(FreeBufferPointers *fbp) +{ + freeBuffersToPool(fbp->pool, fbp->pointers, fbp->index); + fbp->index = 0; +} + +/** + * Add another buffer pointer to the collection, and if we're full, + * release the whole batch to the pool. + * + * @param [in] fbp The control structure + * @param [in] pointer The buffer pointer to release + **/ +static inline void addFreeBufferPointer(FreeBufferPointers *fbp, + void *pointer) +{ + fbp->pointers[fbp->index] = pointer; + fbp->index++; + if (fbp->index == ARRAY_SIZE(fbp->pointers)) { + freeBufferPointers(fbp); + } +} + +#endif /* BUFFERPOOL_H */ diff --git a/vdo/kernel/dataKVIO.c b/vdo/kernel/dataKVIO.c new file mode 100644 index 0000000..ba9c8e8 --- /dev/null +++ b/vdo/kernel/dataKVIO.c @@ -0,0 +1,1192 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dataKVIO.c#18 $ + */ + +#include "dataKVIO.h" + + +#include "logger.h" +#include "memoryAlloc.h" +#include "murmur/MurmurHash3.h" + +#include "dataVIO.h" +#include "compressedBlock.h" +#include "hashLock.h" +#include "lz4.h" + +#include "bio.h" +#include "dedupeIndex.h" +#include "kvdoFlush.h" +#include "kvio.h" +#include "ioSubmitter.h" +#include "vdoCommon.h" +#include "verify.h" + +static void dumpPooledDataKVIO(void *poolData, void *data); + +enum { + WRITE_PROTECT_FREE_POOL = 0, + WP_DATA_KVIO_SIZE = (sizeof(DataKVIO) + PAGE_SIZE - 1 + - ((sizeof(DataKVIO) + PAGE_SIZE - 1) + % PAGE_SIZE)) +}; + +/** + * Alter the write-access permission to a page of memory, so that + * objects in the free pool may no longer be modified. + * + * To do: Deny read access as well. 
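+ *
+ * For instance (numbers purely illustrative): if sizeof(DataKVIO) were 5000
+ * and PAGE_SIZE were 4096, WP_DATA_KVIO_SIZE above would work out to 8192,
+ * i.e. the structure size rounded up to a whole number of pages, which
+ * satisfies the multiple-of-PAGE_SIZE requirement on byteCount below.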
+ * + * @param address The starting address to protect, which must be on a + * page boundary + * @param byteCount The number of bytes to protect, which must be a multiple + * of the page size + * @param mode The write protection mode (true means read-only) + **/ +static __always_inline void +setWriteProtect(void *address, + size_t byteCount, + bool mode __attribute__((unused))) +{ + BUG_ON((((long) address) % PAGE_SIZE) != 0); + BUG_ON((byteCount % PAGE_SIZE) != 0); + BUG(); // only works in internal code, sorry +} + +/**********************************************************************/ +static void maybeLogDataKVIOTrace(DataKVIO *dataKVIO) +{ + if (dataKVIO->kvio.layer->traceLogging) { + logKvioTrace(&dataKVIO->kvio); + } +} + +/** + * First tracing hook for VIO completion. + * + * If the SystemTap script vdotrace.stp is in use, it does stage 1 of + * its processing here. We must not call addTraceRecord between the + * two tap functions. + * + * @param dataKVIO The VIO we're finishing up + **/ +static void kvioCompletionTap1(DataKVIO *dataKVIO) +{ + /* + * Ensure that dataKVIO doesn't get optimized out, even under inline + * expansion. Also, make sure the compiler has to emit debug info + * for baseTraceLocation, which some of our SystemTap scripts will + * use here. + * + * First, make it look as though all memory could be clobbered; then + * require that a value be read into a register. That'll force at + * least one instruction to exist (so SystemTap can hook in) where + * dataKVIO is live. We use a field that the caller would've + * accessed recently anyway, so it may be cached. + */ + barrier(); + __asm__ __volatile__("" + : + : "g" (dataKVIO), "g" (baseTraceLocation), + "r" (dataKVIO->kvio.layer)); +} + +/** + * Second tracing hook for VIO completion. + * + * The SystemTap script vdotrace.stp splits its VIO-completion work + * into two stages, to reduce lock contention for script variables. + * Hence, it needs two hooks in the code. + * + * @param dataKVIO The VIO we're finishing up + **/ +static void kvioCompletionTap2(DataKVIO *dataKVIO) +{ + // Hack to ensure variable doesn't get optimized out. 
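+  // The empty asm statement below names dataKVIO and the layer pointer
+  // as inputs without emitting any instructions; that is enough to keep
+  // the values observable at this point for a SystemTap probe.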
+ barrier(); + __asm__ __volatile__("" : : "g" (dataKVIO), "r" (dataKVIO->kvio.layer)); +} + +/**********************************************************************/ +static void kvdoAcknowledgeDataKVIO(DataKVIO *dataKVIO) +{ + KernelLayer *layer = dataKVIO->kvio.layer; + ExternalIORequest *externalIORequest = &dataKVIO->externalIORequest; + BIO *bio = externalIORequest->bio; + if (bio == NULL) { + return; + } + + externalIORequest->bio = NULL; + + int error + = mapToSystemError(dataVIOAsCompletion(&dataKVIO->dataVIO)->result); + bio->bi_end_io = externalIORequest->endIO; + bio->bi_private = externalIORequest->private; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio->bi_opf = externalIORequest->rw; +#else + bio->bi_rw = externalIORequest->rw; +#endif + + countBios(&layer->biosAcknowledged, bio); + if (dataKVIO->isPartial) { + countBios(&layer->biosAcknowledgedPartial, bio); + } + + + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + completeBio(bio, error); +} + +/**********************************************************************/ +static noinline void cleanDataKVIO(DataKVIO *dataKVIO, FreeBufferPointers *fbp) +{ + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + kvdoAcknowledgeDataKVIO(dataKVIO); + + KVIO *kvio = dataKVIOAsKVIO(dataKVIO); + kvio->bio = NULL; + + if (unlikely(kvio->vio->trace != NULL)) { + maybeLogDataKVIOTrace(dataKVIO); + kvioCompletionTap1(dataKVIO); + kvioCompletionTap2(dataKVIO); + freeTraceToPool(kvio->layer, kvio->vio->trace); + } + + addFreeBufferPointer(fbp, dataKVIO); +} + +/**********************************************************************/ +void returnDataKVIOBatchToPool(BatchProcessor *batch, void *closure) +{ + KernelLayer *layer = closure; + uint32_t count = 0; + ASSERT_LOG_ONLY(batch != NULL, "batch not null"); + ASSERT_LOG_ONLY(layer != NULL, "layer not null"); + + FreeBufferPointers fbp; + initFreeBufferPointers(&fbp, layer->dataKVIOPool); + + KvdoWorkItem *item; + while ((item = nextBatchItem(batch)) != NULL) { + cleanDataKVIO(workItemAsDataKVIO(item), &fbp); + condReschedBatchProcessor(batch); + count++; + } + + if (fbp.index > 0) { + freeBufferPointers(&fbp); + } + + completeManyRequests(layer, count); +} + +/**********************************************************************/ +static void kvdoAcknowledgeThenCompleteDataKVIO(KvdoWorkItem *item) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(item); + kvdoAcknowledgeDataKVIO(dataKVIO); + addToBatchProcessor(dataKVIO->kvio.layer->dataKVIOReleaser, item); +} + +/**********************************************************************/ +void kvdoCompleteDataKVIO(VDOCompletion *completion) +{ + DataKVIO *dataKVIO = dataVIOAsDataKVIO(asDataVIO(completion)); + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + if (useBioAckQueue(layer) && USE_BIO_ACK_QUEUE_FOR_READ + && (dataKVIO->externalIORequest.bio != NULL)) { + launchDataKVIOOnBIOAckQueue(dataKVIO, kvdoAcknowledgeThenCompleteDataKVIO, + NULL, BIO_ACK_Q_ACTION_ACK); + } else { + addToBatchProcessor(layer->dataKVIOReleaser, + workItemFromDataKVIO(dataKVIO)); + } +} + +/** + * Copy the uncompressed data from a compressed block read into the user + * bio which requested the read. 
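+ *
+ * Three cases are handled: a read-modify-write copies the block into
+ * the dataBlock buffer so it is ready for the write phase, a partial
+ * read defers the copy to its completion callback, and a full-block
+ * read copies the data straight into the user bio and acknowledges it.
+ *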
+ * + * @param workItem The DataKVIO which requested the read + **/ +static void copyReadBlockData(KvdoWorkItem *workItem) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(workItem); + + // For a read-modify-write, copy the data into the dataBlock buffer so it + // will be set up for the write phase. + if (isReadModifyWriteVIO(dataKVIO->kvio.vio)) { + bioCopyDataOut(getBIOFromDataKVIO(dataKVIO), dataKVIO->readBlock.data); + kvdoEnqueueDataVIOCallback(dataKVIO); + return; + } + + // For a partial read, the callback will copy the requested data from the + // read block. + if (dataKVIO->isPartial) { + kvdoEnqueueDataVIOCallback(dataKVIO); + return; + } + + // For a full block read, copy the data to the bio and acknowledge. + bioCopyDataOut(getBIOFromDataKVIO(dataKVIO), dataKVIO->readBlock.data); + kvdoAcknowledgeDataVIO(&dataKVIO->dataVIO); +} + +/** + * Finish reading data for a compressed block. + * + * @param dataKVIO The DataKVIO which requested the read + **/ +static void readDataKVIOReadBlockCallback(DataKVIO *dataKVIO) +{ + if (dataKVIO->readBlock.status != VDO_SUCCESS) { + setCompletionResult(dataVIOAsCompletion(&dataKVIO->dataVIO), + dataKVIO->readBlock.status); + kvdoEnqueueDataVIOCallback(dataKVIO); + return; + } + + launchDataKVIOOnCPUQueue(dataKVIO, copyReadBlockData, NULL, + CPU_Q_ACTION_COMPRESS_BLOCK); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) +/** + * Complete and reset a bio that was supplied by the user and then used for a + * read (so that we can complete it with the user's callback). + * + * @param bio The bio to complete + **/ +static void resetUserBio(BIO *bio) +#else +/** + * Complete and reset a bio that was supplied by the user and then used for a + * read (so that we can complete it with the user's callback). + * + * @param bio The bio to complete + * @param error Possible error from underlying block device + **/ +static void resetUserBio(BIO *bio, int error) +#endif +{ +#if ((LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0)) \ + && (LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0))) + // This is a user bio, and the device just called bio_endio() on it, so + // we need to re-increment bi_remaining so we too can call bio_endio(). + atomic_inc(&bio->bi_remaining); +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) + completeAsyncBio(bio); +#else + completeAsyncBio(bio, error); +#endif +} + +/** + * Uncompress the data that's just been read and then call back the requesting + * DataKVIO. + * + * @param workItem The DataKVIO requesting the data + **/ +static void uncompressReadBlock(KvdoWorkItem *workItem) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(workItem); + ReadBlock *readBlock = &dataKVIO->readBlock; + BlockSize blockSize = VDO_BLOCK_SIZE; + + // The DataKVIO's scratch block will be used to contain the + // uncompressed data. 
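+  // A compressed physical block packs multiple fragments; the mapping
+  // state encodes which fragment belongs to this logical block, and
+  // getCompressedBlockFragment turns that into an (offset, size) pair
+  // within the block. Decompression must then yield exactly one full
+  // block of data, or the fragment is rejected as invalid.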
+ uint16_t fragmentOffset, fragmentSize; + char *compressedData = readBlock->data; + int result = getCompressedBlockFragment(readBlock->mappingState, + compressedData, blockSize, + &fragmentOffset, + &fragmentSize); + if (result != VDO_SUCCESS) { + logDebug("%s: frag err %d", __func__, result); + readBlock->status = result; + readBlock->callback(dataKVIO); + return; + } + + char *fragment = compressedData + fragmentOffset; + int size = LZ4_uncompress_unknownOutputSize(fragment, dataKVIO->scratchBlock, + fragmentSize, blockSize); + if (size == blockSize) { + readBlock->data = dataKVIO->scratchBlock; + } else { + logDebug("%s: lz4 error", __func__); + readBlock->status = VDO_INVALID_FRAGMENT; + } + + readBlock->callback(dataKVIO); +} + +/** + * Now that we have gotten the data from storage, uncompress the data if + * necessary and then call back the requesting DataKVIO. + * + * @param dataKVIO The DataKVIO requesting the data + * @param result The result of the read operation + **/ +static void completeRead(DataKVIO *dataKVIO, int result) +{ + ReadBlock *readBlock = &dataKVIO->readBlock; + readBlock->status = result; + + if ((result == VDO_SUCCESS) && isCompressed(readBlock->mappingState)) { + launchDataKVIOOnCPUQueue(dataKVIO, uncompressReadBlock, NULL, + CPU_Q_ACTION_COMPRESS_BLOCK); + return; + } + + readBlock->callback(dataKVIO); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) +/** + * Callback for a bio doing a read. + * + * @param bio The bio + */ +static void readBioCallback(BIO *bio) +#else +/** + * Callback for a bio doing a read. + * + * @param bio The bio + * @param result The result of the read operation + */ +static void readBioCallback(BIO *bio, int result) +#endif +{ + KVIO *kvio = (KVIO *) bio->bi_private; + DataKVIO *dataKVIO = kvioAsDataKVIO(kvio); + dataKVIO->readBlock.data = dataKVIO->readBlock.buffer; + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + countCompletedBios(bio); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) + completeRead(dataKVIO, getBioResult(bio)); +#else + completeRead(dataKVIO, result); +#endif +} + +/**********************************************************************/ +void kvdoReadBlock(DataVIO *dataVIO, + PhysicalBlockNumber location, + BlockMappingState mappingState, + BioQAction action, + DataKVIOCallback callback) +{ + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + ReadBlock *readBlock = &dataKVIO->readBlock; + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + + readBlock->callback = callback; + readBlock->status = VDO_SUCCESS; + readBlock->mappingState = mappingState; + + BUG_ON(getBIOFromDataKVIO(dataKVIO)->bi_private != &dataKVIO->kvio); + // Read the data directly from the device using the read bio. 
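+  // The read goes through the DataKVIO's dedicated readBlock bio and
+  // buffer, so the caller's original bio is left untouched until the
+  // data has been (possibly) uncompressed and copied out.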
+ BIO *bio = readBlock->bio; + resetBio(bio, layer); + setBioSector(bio, blockToSector(layer, location)); + setBioOperationRead(bio); + bio->bi_end_io = readBioCallback; + submitBio(bio, action); +} + +/**********************************************************************/ +void kvdoReadDataVIO(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(!isWriteVIO(dataVIOAsVIO(dataVIO)), + "operation set correctly for data read"); + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION("$F;io=readData")); + + if (isCompressed(dataVIO->mapped.state)) { + kvdoReadBlock(dataVIO, dataVIO->mapped.pbn, dataVIO->mapped.state, + BIO_Q_ACTION_COMPRESSED_DATA, readDataKVIOReadBlockCallback); + return; + } + + KVIO *kvio = dataVIOAsKVIO(dataVIO); + BIO *bio = kvio->bio; + bio->bi_end_io = resetUserBio; + setBioSector(bio, blockToSector(kvio->layer, dataVIO->mapped.pbn)); + submitBio(bio, BIO_Q_ACTION_DATA); +} + +/**********************************************************************/ +static void kvdoAcknowledgeDataKVIOThenContinue(KvdoWorkItem *item) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(item); + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + kvdoAcknowledgeDataKVIO(dataKVIO); + // Even if we're not using bio-ack threads, we may be in the wrong + // base-code thread. + kvdoEnqueueDataVIOCallback(dataKVIO); +} + +/**********************************************************************/ +void kvdoAcknowledgeDataVIO(DataVIO *dataVIO) +{ + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + + // If the remaining discard work is not completely processed by this VIO, + // don't acknowledge it yet. + if (isDiscardBio(dataKVIO->externalIORequest.bio) + && (dataKVIO->remainingDiscard + > (VDO_BLOCK_SIZE - dataKVIO->offset))) { + invokeCallback(dataVIOAsCompletion(dataVIO)); + return; + } + + // We've finished with the KVIO; acknowledge completion of the bio to the + // kernel. 
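+  // The acknowledgement either hops to the dedicated bio-ack work queue,
+  // if one is configured, or runs immediately on the current thread.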
+ if (useBioAckQueue(layer)) { + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + launchDataKVIOOnBIOAckQueue(dataKVIO, kvdoAcknowledgeDataKVIOThenContinue, + NULL, BIO_ACK_Q_ACTION_ACK); + } else { + kvdoAcknowledgeDataKVIOThenContinue(workItemFromDataKVIO(dataKVIO)); + } +} + +/**********************************************************************/ +void kvdoWriteDataVIO(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(isWriteVIO(dataVIOAsVIO(dataVIO)), + "kvdoWriteDataVIO() called on write DataVIO"); + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION("$F;io=writeData;j=normal")); + + KVIO *kvio = dataVIOAsKVIO(dataVIO); + BIO *bio = kvio->bio; + setBioOperationWrite(bio); + setBioSector(bio, blockToSector(kvio->layer, dataVIO->newMapped.pbn)); + submitBio(bio, BIO_Q_ACTION_DATA); +} + +/**********************************************************************/ +void kvdoModifyWriteDataVIO(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + BIO *bio = dataKVIO->externalIORequest.bio; + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + resetBio(dataKVIO->dataBlockBio, layer); + + if (!isDiscardBio(bio)) { + bioCopyDataIn(bio, dataKVIO->dataBlock + dataKVIO->offset); + } else { + memset(dataKVIO->dataBlock + dataKVIO->offset, '\0', + min(dataKVIO->remainingDiscard, + (DiscardSize) (VDO_BLOCK_SIZE - dataKVIO->offset))); + } + + dataVIO->isZeroBlock = bioIsZeroData(dataKVIO->dataBlockBio); + dataKVIO->dataBlockBio->bi_private = &dataKVIO->kvio; + copyBioOperationAndFlags(dataKVIO->dataBlockBio, bio); + // Make the bio a write, not (potentially) a discard. + setBioOperationWrite(dataKVIO->dataBlockBio); +} + +/**********************************************************************/ +void kvdoZeroDataVIO(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION("zeroDataVIO;io=readData")); + bioZeroData(dataVIOAsKVIO(dataVIO)->bio); +} + +/**********************************************************************/ +void kvdoCopyDataVIO(DataVIO *source, DataVIO *destination) +{ + dataVIOAddTraceRecord(destination, THIS_LOCATION(NULL)); + bioCopyDataOut(dataVIOAsKVIO(destination)->bio, + dataVIOAsDataKVIO(source)->dataBlock); +} + +/**********************************************************************/ +static void kvdoCompressWork(KvdoWorkItem *item) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(item); + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + + char *context = getWorkQueuePrivateData(); + if (unlikely(context == NULL)) { + uint32_t index = atomicAdd32(&layer->compressionContextIndex, 1) - 1; + BUG_ON(index >= layer->deviceConfig->threadCounts.cpuThreads); + context = layer->compressionContext[index]; + setWorkQueuePrivateData(context); + } + + int size = LZ4_compress_ctx_limitedOutput(context, dataKVIO->dataBlock, + dataKVIO->scratchBlock, + VDO_BLOCK_SIZE, + VDO_BLOCK_SIZE); + DataVIO *dataVIO = &dataKVIO->dataVIO; + if (size > 0) { + // The scratch block will be used to contain the compressed data. + dataVIO->compression.data = dataKVIO->scratchBlock; + dataVIO->compression.size = size; + } else { + // Use block size plus one as an indicator for uncompressible data. 
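+    // (A real compressed size can never exceed VDO_BLOCK_SIZE, so this
+    // sentinel cannot be mistaken for genuine compressor output.)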
+ dataVIO->compression.size = VDO_BLOCK_SIZE + 1; + } + + kvdoEnqueueDataVIOCallback(dataKVIO); +} + +/**********************************************************************/ +void kvdoCompressDataVIO(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, + THIS_LOCATION("compressDataVIO;" + "io=compress;cb=compress")); + + /* + * If the orignal bio was a discard, but we got this far because the discard + * was a partial one (r/m/w), and it is part of a larger discard, we cannot + * compress this VIO. We need to make sure the VIO completes ASAP. + */ + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + if (isDiscardBio(dataKVIO->externalIORequest.bio) + && (dataKVIO->remainingDiscard > 0)) { + dataVIO->compression.size = VDO_BLOCK_SIZE + 1; + kvdoEnqueueDataVIOCallback(dataKVIO); + return; + } + + launchDataKVIOOnCPUQueue(dataKVIO, kvdoCompressWork, NULL, + CPU_Q_ACTION_COMPRESS_BLOCK); +} + +/** + * Construct a DataKVIO. + * + * @param [in] layer The physical layer + * @param [in] bio The bio to associate with this DataKVIO + * @param [out] dataKVIOPtr A pointer to hold the new DataKVIO + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int makeDataKVIO(KernelLayer *layer, BIO *bio, DataKVIO **dataKVIOPtr) +{ + DataKVIO *dataKVIO; + int result = allocBufferFromPool(layer->dataKVIOPool, (void **) &dataKVIO); + if (result != VDO_SUCCESS) { + return logErrorWithStringError(result, "data kvio allocation failure"); + } + + if (WRITE_PROTECT_FREE_POOL) { + setWriteProtect(dataKVIO, WP_DATA_KVIO_SIZE, false); + } + + KVIO *kvio = &dataKVIO->kvio; + kvio->vio = dataVIOAsVIO(&dataKVIO->dataVIO); + memset(&kvio->enqueueable, 0, sizeof(KvdoEnqueueable)); + memset(&dataKVIO->dedupeContext.pendingList, 0, sizeof(struct list_head)); + memset(&dataKVIO->dataVIO, 0, sizeof(DataVIO)); + kvio->bioToSubmit = NULL; + bio_list_init(&kvio->biosMerged); + + // The dataBlock is only needed for writes and some partial reads. + if (isWriteBio(bio) || (getBioSize(bio) < VDO_BLOCK_SIZE)) { + resetBio(dataKVIO->dataBlockBio, layer); + } + + initializeKVIO(kvio, layer, VIO_TYPE_DATA, VIO_PRIORITY_DATA, NULL, bio); + *dataKVIOPtr = dataKVIO; + return VDO_SUCCESS; +} + +/** + * Creates a new DataVIO structure. A DataVIO represents a single logical + * block of data. It is what most VDO operations work with. This function also + * creates a wrapping DataKVIO structure that is used when we want to + * physically read or write the data associated with the DataVIO. + * + * @param [in] layer The physical layer + * @param [in] bio The BIO from the request the new DataKVIO will + * service + * @param [in] arrivalTime The arrival time of the BIO + * @param [out] dataKVIOPtr A pointer to hold the new DataKVIO + * + * @return VDO_SUCCESS or an error + **/ +static int kvdoCreateKVIOFromBio(KernelLayer *layer, + BIO *bio, + Jiffies arrivalTime, + DataKVIO **dataKVIOPtr) +{ + ExternalIORequest externalIORequest = { + .bio = bio, + .private = bio->bi_private, + .endIO = bio->bi_end_io, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + .rw = bio->bi_opf, +#else + .rw = bio->bi_rw, +#endif + }; + + // We will handle FUA at the end of the request (after we restore the + // bi_rw field from externalIORequest.rw). 
+ clearBioOperationFlagFua(bio); + + DataKVIO *dataKVIO = NULL; + int result = makeDataKVIO(layer, bio, &dataKVIO); + if (result != VDO_SUCCESS) { + return result; + } + + dataKVIO->externalIORequest = externalIORequest; + dataKVIO->offset = sectorToBlockOffset(layer, getBioSector(bio)); + dataKVIO->isPartial = ((getBioSize(bio) < VDO_BLOCK_SIZE) + || (dataKVIO->offset != 0)); + + if (dataKVIO->isPartial) { + countBios(&layer->biosInPartial, bio); + } else { + /* + * Note that we unconditionally fill in the dataBlock array for + * non-read operations. There are places like kvdoCopyVIO that may + * look at kvio->dataBlock for a zero block (and maybe for + * discards?). We could skip filling in dataBlock for such cases, + * but only once we're sure all such places are fixed to check the + * isZeroBlock flag first. + */ + if (isDiscardBio(bio)) { + /* + * This is a discard/trim operation. This is treated much like the zero + * block, but we keep different stats and distinguish it in the block + * map. + */ + memset(dataKVIO->dataBlock, 0, VDO_BLOCK_SIZE); + } else if (bio_data_dir(bio) == WRITE) { + dataKVIO->dataVIO.isZeroBlock = bioIsZeroData(bio); + // Copy the bio data to a char array so that we can continue to use + // the data after we acknowledge the bio. + bioCopyDataIn(bio, dataKVIO->dataBlock); + } + } + + if (dataKVIO->isPartial || isWriteBio(bio)) { + /* + * dataKVIO->bio will point at kvio->dataBlockBio for all writes and + * partial block I/O so the rest of the kernel code doesn't need to + * make a decision as to what to use. + */ + dataKVIO->dataBlockBio->bi_private = &dataKVIO->kvio; + if (dataKVIO->isPartial && isWriteBio(bio)) { + clearBioOperationAndFlags(dataKVIO->dataBlockBio); + setBioOperationRead(dataKVIO->dataBlockBio); + } else { + copyBioOperationAndFlags(dataKVIO->dataBlockBio, bio); + } + dataKVIOAsKVIO(dataKVIO)->bio = dataKVIO->dataBlockBio; + dataKVIO->readBlock.data = dataKVIO->dataBlock; + } + + setBioBlockDevice(bio, getKernelLayerBdev(layer)); + bio->bi_end_io = completeAsyncBio; + *dataKVIOPtr = dataKVIO; + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void launchDataKVIOWork(KvdoWorkItem *item) +{ + runCallback(vioAsCompletion(workItemAsKVIO(item)->vio)); +} + +/** + * Continue discard processing for requests that span multiple physical blocks. + * If all have been processed the KVIO is completed. If we have already seen + * an error, we skip the rest of the discard and fail immediately. + * + *

Invoked in a request-queue thread after the discard of a block has + * completed. + * + * @param completion A completion representing the discard KVIO + **/ +static void kvdoContinueDiscardKVIO(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + dataKVIO->remainingDiscard + -= min(dataKVIO->remainingDiscard, + (DiscardSize) (VDO_BLOCK_SIZE - dataKVIO->offset)); + if ((completion->result != VDO_SUCCESS) + || (dataKVIO->remainingDiscard == 0)) { + if (dataKVIO->hasDiscardPermit) { + limiterRelease(&layer->discardLimiter); + dataKVIO->hasDiscardPermit = false; + } + kvdoCompleteDataKVIO(completion); + return; + } + + BIO *bio = getBIOFromDataKVIO(dataKVIO); + resetBio(bio, layer); + dataKVIO->isPartial = (dataKVIO->remainingDiscard < VDO_BLOCK_SIZE); + dataKVIO->offset = 0; + + VIOOperation operation; + if (dataKVIO->isPartial) { + operation = VIO_READ_MODIFY_WRITE; + setBioOperationRead(bio); + } else { + operation = VIO_WRITE; + } + + if (requestorSetFUA(dataKVIO)) { + operation |= VIO_FLUSH_AFTER; + } + + prepareDataVIO(dataVIO, dataVIO->logical.lbn + 1, operation, + !dataKVIO->isPartial, kvdoContinueDiscardKVIO); + enqueueDataKVIO(dataKVIO, launchDataKVIOWork, completion->callback, + REQ_Q_ACTION_MAP_BIO); +} + +/** + * Finish a partial read. + * + * @param completion The partial read KVIO + **/ +static void kvdoCompletePartialRead(VDOCompletion *completion) +{ + DataKVIO *dataKVIO = dataVIOAsDataKVIO(asDataVIO(completion)); + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + + bioCopyDataOut(dataKVIO->externalIORequest.bio, + dataKVIO->readBlock.data + dataKVIO->offset); + kvdoCompleteDataKVIO(completion); + return; +} + +/**********************************************************************/ +int kvdoLaunchDataKVIOFromBio(KernelLayer *layer, + BIO *bio, + uint64_t arrivalTime, + bool hasDiscardPermit) +{ + + DataKVIO *dataKVIO = NULL; + int result = kvdoCreateKVIOFromBio(layer, bio, arrivalTime, &dataKVIO); + if (unlikely(result != VDO_SUCCESS)) { + logInfo("%s: KVIO allocation failure", __func__); + if (hasDiscardPermit) { + limiterRelease(&layer->discardLimiter); + } + limiterRelease(&layer->requestLimiter); + return mapToSystemError(result); + } + + /* + * Discards behave very differently than other requests when coming + * in from device-mapper. We have to be able to handle any size discards + * and with various sector offsets within a block. 
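+   * For example, with a 4 KB block size, a 12 KB discard starting 1 KB
+   * into a block is processed one block at a time: a read-modify-write
+   * that zeroes the last 3 KB of the first block, then (via
+   * kvdoContinueDiscardKVIO above) trims of the next two whole blocks,
+   * and finally a read-modify-write that zeroes the first 1 KB of the
+   * last block.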
+ */ + KVIO *kvio = &dataKVIO->kvio; + VDOAction *callback = kvdoCompleteDataKVIO; + VIOOperation operation = VIO_WRITE; + bool isTrim = false; + if (isDiscardBio(bio)) { + dataKVIO->hasDiscardPermit = hasDiscardPermit; + dataKVIO->remainingDiscard = getBioSize(bio); + callback = kvdoContinueDiscardKVIO; + if (dataKVIO->isPartial) { + operation = VIO_READ_MODIFY_WRITE; + } else { + isTrim = true; + } + } else if (dataKVIO->isPartial) { + if (bio_data_dir(bio) == READ) { + callback = kvdoCompletePartialRead; + operation = VIO_READ; + } else { + operation = VIO_READ_MODIFY_WRITE; + } + } else if (bio_data_dir(bio) == READ) { + operation = VIO_READ; + } + + if (requestorSetFUA(dataKVIO)) { + operation |= VIO_FLUSH_AFTER; + } + + LogicalBlockNumber lbn + = sectorToBlock(layer, getBioSector(bio) - layer->startingSectorOffset); + prepareDataVIO(&dataKVIO->dataVIO, lbn, operation, isTrim, callback); + enqueueKVIO(kvio, launchDataKVIOWork, vioAsCompletion(kvio->vio)->callback, + REQ_Q_ACTION_MAP_BIO); + return VDO_SUCCESS; +} + +/** + * Hash a DataKVIO and set its chunk name. + * + * @param item The DataKVIO to be hashed + **/ +static void kvdoHashDataWork(KvdoWorkItem *item) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(item); + DataVIO *dataVIO = &dataKVIO->dataVIO; + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + + MurmurHash3_x64_128(dataKVIO->dataBlock, VDO_BLOCK_SIZE, 0x62ea60be, + &dataVIO->chunkName); + dataKVIO->dedupeContext.chunkName = &dataVIO->chunkName; + + kvdoEnqueueDataVIOCallback(dataKVIO); +} + +/**********************************************************************/ +void kvdoHashDataVIO(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + launchDataKVIOOnCPUQueue(dataVIOAsDataKVIO(dataVIO), kvdoHashDataWork, NULL, + CPU_Q_ACTION_HASH_BLOCK); +} + +/**********************************************************************/ +void kvdoCheckForDuplication(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, + THIS_LOCATION("checkForDuplication;dup=post")); + ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, + "zero block not checked for duplication"); + ASSERT_LOG_ONLY(dataVIO->newMapped.state != MAPPING_STATE_UNMAPPED, + "discard not checked for duplication"); + + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + if (hasAllocation(dataVIO)) { + postDedupeAdvice(dataKVIO); + } else { + // This block has not actually been written (presumably because we are + // full), so attempt to dedupe without posting bogus advice. + queryDedupeAdvice(dataKVIO); + } +} + +/**********************************************************************/ +void kvdoUpdateDedupeAdvice(DataVIO *dataVIO) +{ + updateDedupeAdvice(dataVIOAsDataKVIO(dataVIO)); +} + +/** + * Implements BufferFreeFunction. + **/ +static void freePooledDataKVIO(void *poolData, void *data) +{ + if (data == NULL) { + return; + } + + DataKVIO *dataKVIO = (DataKVIO *) data; + KernelLayer *layer = (KernelLayer *) poolData; + if (WRITE_PROTECT_FREE_POOL) { + setWriteProtect(dataKVIO, WP_DATA_KVIO_SIZE, false); + } + + if (dataKVIO->dataBlockBio != NULL) { + freeBio(dataKVIO->dataBlockBio, layer); + } + + if (dataKVIO->readBlock.bio != NULL) { + freeBio(dataKVIO->readBlock.bio, layer); + } + + FREE(dataKVIO->readBlock.buffer); + FREE(dataKVIO->dataBlock); + FREE(dataKVIO->scratchBlock); + FREE(dataKVIO); +} + +/** + * Allocate a DataKVIO. This function is the internals of makePooledDataKVIO(). 
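+ *
+ * Besides the DataKVIO itself, this sets up the dataBlock buffer and its
+ * bio, the readBlock buffer and its bio, and the scratchBlock used for
+ * compression; if any step fails, everything allocated so far is torn
+ * down through freePooledDataKVIO.
+ *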
+ * + * @param [in] layer The layer in which the DataKVIO will operate + * @param [out] dataKVIOPtr A pointer to hold the newly allocated DataKVIO + * + * @return VDO_SUCCESS or an error + **/ +static int allocatePooledDataKVIO(KernelLayer *layer, DataKVIO **dataKVIOPtr) +{ + DataKVIO *dataKVIO; + int result; + if (WRITE_PROTECT_FREE_POOL) { + STATIC_ASSERT(WP_DATA_KVIO_SIZE >= sizeof(DataKVIO)); + result = allocateMemory(WP_DATA_KVIO_SIZE, 0, __func__, &dataKVIO); + if (result == VDO_SUCCESS) { + BUG_ON((((size_t) dataKVIO) & (PAGE_SIZE - 1)) != 0); + } + } else { + result = ALLOCATE(1, DataKVIO, __func__, &dataKVIO); + } + + if (result != VDO_SUCCESS) { + return logErrorWithStringError(result, "DataKVIO allocation failure"); + } + + STATIC_ASSERT(VDO_BLOCK_SIZE <= PAGE_SIZE); + result = allocateMemory(VDO_BLOCK_SIZE, 0, "kvio data", + &dataKVIO->dataBlock); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(layer, dataKVIO); + return logErrorWithStringError(result, "DataKVIO data allocation failure"); + } + + result = createBio(layer, dataKVIO->dataBlock, &dataKVIO->dataBlockBio); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(layer, dataKVIO); + return logErrorWithStringError(result, + "DataKVIO data bio allocation failure"); + } + + result = allocateMemory(VDO_BLOCK_SIZE, 0, "kvio read buffer", + &dataKVIO->readBlock.buffer); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(layer, dataKVIO); + return logErrorWithStringError(result, + "DataKVIO read allocation failure"); + } + + result = createBio(layer, dataKVIO->readBlock.buffer, + &dataKVIO->readBlock.bio); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(layer, dataKVIO); + return logErrorWithStringError(result, + "DataKVIO read bio allocation failure"); + } + + dataKVIO->readBlock.bio->bi_private = &dataKVIO->kvio; + + result = allocateMemory(VDO_BLOCK_SIZE, 0, "kvio scratch", + &dataKVIO->scratchBlock); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(layer, dataKVIO); + return logErrorWithStringError(result, + "DataKVIO scratch allocation failure"); + } + + *dataKVIOPtr = dataKVIO; + return VDO_SUCCESS; +} + +/** + * Implements BufferAllocateFunction. + **/ +static int makePooledDataKVIO(void *poolData, void **dataPtr) +{ + DataKVIO *dataKVIO = NULL; + int result = allocatePooledDataKVIO((KernelLayer *) poolData, &dataKVIO); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(poolData, dataKVIO); + return result; + } + + *dataPtr = dataKVIO; + return VDO_SUCCESS; +} + +/** + * Dump out the waiters on each DataVIO in the DataVIO buffer pool. + * + * @param queue The queue to check (logical or physical) + * @param waitOn The label to print for queue (logical or physical) + **/ +static void dumpVIOWaiters(WaitQueue *queue, char *waitOn) +{ + Waiter *first = getFirstWaiter(queue); + if (first == NULL) { + return; + } + + DataVIO *dataVIO = waiterAsDataVIO(first); + logInfo(" %s is locked. Waited on by: VIO %" PRIptr " pbn %" PRIu64 + " lbn %llu d-pbn %llu lastOp %s", + waitOn, dataVIO, getDataVIOAllocation(dataVIO), + dataVIO->logical.lbn, dataVIO->duplicate.pbn, + getOperationName(dataVIO)); + + Waiter *waiter; + for (waiter = first->nextWaiter; + waiter != first; + waiter = waiter->nextWaiter) { + dataVIO = waiterAsDataVIO(waiter); + logInfo(" ... 
and : VIO %" PRIptr " pbn %llu lbn %" + PRIu64 " d-pbn %llu lastOp %s", + dataVIO, getDataVIOAllocation(dataVIO), dataVIO->logical.lbn, + dataVIO->duplicate.pbn, getOperationName(dataVIO)); + } +} + +/** + * Encode various attributes of a VIO as a string of one-character flags for + * dump logging. This encoding is for logging brevity: + * + * R => VIO completion result not VDO_SUCCESS + * W => VIO is on a wait queue + * D => VIO is a duplicate + * + *
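+ * For example, a duplicate VIO that is also parked on a wait queue is
+ * rendered as " WD".
+ *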

The common case of no flags set will result in an empty, null-terminated + * buffer. If any flags are encoded, the first character in the string will be + * a space character. + * + * @param dataVIO The VIO to encode + * @param buffer The buffer to receive a null-terminated string of encoded + * flag character + **/ +static void encodeVIODumpFlags(DataVIO *dataVIO, char buffer[8]) +{ + char *pFlag = buffer; + *pFlag++ = ' '; + if (dataVIOAsCompletion(dataVIO)->result != VDO_SUCCESS) { + *pFlag++ = 'R'; + } + if (dataVIOAsAllocatingVIO(dataVIO)->waiter.nextWaiter != NULL) { + *pFlag++ = 'W'; + } + if (dataVIO->isDuplicate) { + *pFlag++ = 'D'; + } + if (pFlag == &buffer[1]) { + // No flags, so remove the blank space. + pFlag = buffer; + } + *pFlag = '\0'; +} + +/** + * Dump out info on a DataKVIO from the DataKVIO pool. + * + *

Implements BufferDumpFunction. + * + * @param poolData The pool data + * @param data The DataKVIO to dump + **/ +static void dumpPooledDataKVIO(void *poolData __attribute__((unused)), + void *data) +{ + DataKVIO *dataKVIO = (DataKVIO *) data; + DataVIO *dataVIO = &dataKVIO->dataVIO; + + /* + * This just needs to be big enough to hold a queue (thread) name + * and a function name (plus a separator character and NUL). The + * latter is limited only by taste. + * + * In making this static, we're assuming only one "dump" will run at + * a time. If more than one does run, the log output will be garbled + * anyway. + */ + static char vioWorkItemDumpBuffer[100 + MAX_QUEUE_NAME_LEN]; + /* + * We're likely to be logging a couple thousand of these lines, and + * in some circumstances syslogd may have trouble keeping up, so + * keep it BRIEF rather than user-friendly. + */ + dumpWorkItemToBuffer(&dataKVIO->kvio.enqueueable.workItem, + vioWorkItemDumpBuffer, sizeof(vioWorkItemDumpBuffer)); + // Another static buffer... + // log10(256) = 2.408+, round up: + enum { DECIMAL_DIGITS_PER_UINT64_T = (int) (1 + 2.41 * sizeof(uint64_t)) }; + static char vioBlockNumberDumpBuffer[sizeof("P L D") + + 3 * DECIMAL_DIGITS_PER_UINT64_T]; + if (dataVIO->isDuplicate) { + snprintf(vioBlockNumberDumpBuffer, sizeof(vioBlockNumberDumpBuffer), + "P%llu L%llu D%llu", + getDataVIOAllocation(dataVIO), dataVIO->logical.lbn, + dataVIO->duplicate.pbn); + } else if (hasAllocation(dataVIO)) { + snprintf(vioBlockNumberDumpBuffer, sizeof(vioBlockNumberDumpBuffer), + "P%llu L%llu", + getDataVIOAllocation(dataVIO), dataVIO->logical.lbn); + } else { + snprintf(vioBlockNumberDumpBuffer, sizeof(vioBlockNumberDumpBuffer), + "L%llu", + dataVIO->logical.lbn); + } + + static char vioFlushGenerationBuffer[sizeof(" FG") + + DECIMAL_DIGITS_PER_UINT64_T] = ""; + if (dataVIO->flushGeneration != 0) { + snprintf(vioFlushGenerationBuffer, sizeof(vioFlushGenerationBuffer), + " FG%llu", dataVIO->flushGeneration); + } + + // Encode VIO attributes as a string of one-character flags, usually empty. 
+ static char flagsDumpBuffer[8]; + encodeVIODumpFlags(dataVIO, flagsDumpBuffer); + + logInfo(" kvio %" PRIptr " %s%s %s %s%s", + dataKVIO, vioBlockNumberDumpBuffer, vioFlushGenerationBuffer, + getOperationName(dataVIO), vioWorkItemDumpBuffer, flagsDumpBuffer); + // might want info on: wantAlbireoAnswer / operation / status + // might want info on: bio / bioToSubmit / biosMerged + + dumpVIOWaiters(&dataVIO->logical.waiters, "lbn"); + + // might want to dump more info from VIO here +} + +/**********************************************************************/ +int makeDataKVIOBufferPool(KernelLayer *layer, + uint32_t poolSize, + BufferPool **bufferPoolPtr) +{ + return makeBufferPool("DataKVIO Pool", poolSize, + makePooledDataKVIO, freePooledDataKVIO, + dumpPooledDataKVIO, layer, bufferPoolPtr); +} + +/**********************************************************************/ +DataLocation getDedupeAdvice(const DedupeContext *context) +{ + DataKVIO *dataKVIO = container_of(context, DataKVIO, dedupeContext); + return (DataLocation) { + .state = dataKVIO->dataVIO.newMapped.state, + .pbn = dataKVIO->dataVIO.newMapped.pbn, + }; +} + +/**********************************************************************/ +void setDedupeAdvice(DedupeContext *context, const DataLocation *advice) +{ + DataKVIO *dataKVIO = container_of(context, DataKVIO, dedupeContext); + receiveDedupeAdvice(&dataKVIO->dataVIO, advice); +} diff --git a/vdo/kernel/dataKVIO.h b/vdo/kernel/dataKVIO.h new file mode 100644 index 0000000..c3989f4 --- /dev/null +++ b/vdo/kernel/dataKVIO.h @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dataKVIO.h#5 $ + */ + +#ifndef DATA_KVIO_H +#define DATA_KVIO_H + +#include "dataVIO.h" +#include "kvio.h" +#include "uds-block.h" + +typedef struct { + /* + * The BIO which was received from the device mapper to initiate an I/O + * request. This field will be non-NULL only until the request is + * acknowledged. + */ + BIO *bio; + // Cached copies of fields from the bio which will need to be reset after + // we're done. + void *private; + void *endIO; + // This is a copy of the bi_rw field of the BIO which sadly is not just + // a boolean read-write flag, but also includes other flag bits. + unsigned long rw; +} ExternalIORequest; + +/* Dedupe support */ +struct dedupeContext { + UdsRequest udsRequest; + struct list_head pendingList; + Jiffies submissionTime; + Atomic32 requestState; + int status; + bool isPending; + /** Hash of the associated VIO (NULL if not calculated) */ + const UdsChunkName *chunkName; +}; + +typedef struct { + /** + * A pointer to a block that holds the data from the last read operation. + **/ + char *data; + /** + * Temporary storage for doing reads from the underlying device. 
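+   * The data pointer above may be redirected at the DataKVIO's
+   * scratchBlock once a compressed fragment has been inflated; buffer
+   * always holds the raw bytes that were read from the device.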
+ **/ + char *buffer; + /** + * A bio structure wrapping the buffer. + **/ + BIO *bio; + /** + * Callback to invoke after completing the read I/O operation. + **/ + DataKVIOCallback callback; + /** + * Mapping state passed to kvdoReadBlock(), used to determine whether + * the data must be uncompressed. + **/ + BlockMappingState mappingState; + /** + * The result code of the read attempt. + **/ + int status; +} ReadBlock; + +struct dataKVIO { + /* The embedded base code's DataVIO */ + DataVIO dataVIO; + /* The embedded KVIO */ + KVIO kvio; + /* The BIO from the request which is being serviced by this KVIO. */ + ExternalIORequest externalIORequest; + /* Dedupe */ + DedupeContext dedupeContext; + /* Read cache */ + ReadBlock readBlock; + /* partial block support */ + BlockSize offset; + bool isPartial; + /* discard support */ + bool hasDiscardPermit; + DiscardSize remainingDiscard; + /** + * A copy of user data written, so we can do additional processing + * (dedupe, compression) after acknowledging the I/O operation and + * thus losing access to the original data. + * + * Also used as buffer space for read-modify-write cycles when + * emulating smaller-than-blockSize I/O operations. + **/ + char *dataBlock; + /** A bio structure describing the #dataBlock buffer. */ + BIO *dataBlockBio; + /** A block used as output during compression or uncompression. */ + char *scratchBlock; +}; + +/** + * Convert a KVIO to a DataKVIO. + * + * @param kvio The KVIO to convert + * + * @return The KVIO as a DataKVIO + **/ +static inline DataKVIO *kvioAsDataKVIO(KVIO *kvio) +{ + ASSERT_LOG_ONLY(isData(kvio), "KVIO is a DataKVIO"); + return container_of(kvio, DataKVIO, kvio); +} + +/** + * Convert a DataKVIO to a KVIO. + * + * @param dataKVIO The DataKVIO to convert + * + * @return The DataKVIO as a KVIO + **/ +static inline KVIO *dataKVIOAsKVIO(DataKVIO *dataKVIO) +{ + return &dataKVIO->kvio; +} + +/** + * Returns a pointer to the DataKVIO wrapping a DataVIO. + * + * @param dataVIO the DataVIO + * + * @return the DataKVIO + **/ +static inline DataKVIO *dataVIOAsDataKVIO(DataVIO *dataVIO) +{ + return container_of(dataVIO, DataKVIO, dataVIO); +} + +/** + * Returns a pointer to the KVIO associated with a DataVIO. + * + * @param dataVIO the DataVIO + * + * @return the KVIO + **/ +static inline KVIO *dataVIOAsKVIO(DataVIO *dataVIO) +{ + return dataKVIOAsKVIO(dataVIOAsDataKVIO(dataVIO)); +} + +/** + * Returns a pointer to the DataKVIO wrapping a work item. + * + * @param item the work item + * + * @return the DataKVIO + **/ +static inline DataKVIO *workItemAsDataKVIO(KvdoWorkItem *item) +{ + return kvioAsDataKVIO(workItemAsKVIO(item)); +} + +/** + * Get the WorkItem from a DataKVIO. + * + * @param dataKVIO The DataKVIO + * + * @return the DataKVIO's work item + **/ +static inline KvdoWorkItem *workItemFromDataKVIO(DataKVIO *dataKVIO) +{ + return &dataKVIOAsKVIO(dataKVIO)->enqueueable.workItem; +} + +/** + * Get the BIO from a DataKVIO. + * + * @param dataKVIO The DataKVIO from which to get the BIO + * + * @return The DataKVIO's BIO + **/ +static inline BIO *getBIOFromDataKVIO(DataKVIO *dataKVIO) +{ + return dataKVIOAsKVIO(dataKVIO)->bio; +} + +/** + * Get the KernelLayer from a DataKVIO. + * + * @param dataKVIO The DataKVIO from which to get the KernelLayer + * + * @return The DataKVIO's KernelLayer + **/ +static inline KernelLayer *getLayerFromDataKVIO(DataKVIO *dataKVIO) +{ + return dataKVIOAsKVIO(dataKVIO)->layer; +} + +/** + * Set up and enqueue a DataKVIO's work item to be processed in the base code + * context. 
+ * + * @param dataKVIO The DataKVIO with the work item to be run + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + **/ +static inline void enqueueDataKVIO(DataKVIO *dataKVIO, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + enqueueKVIO(dataKVIOAsKVIO(dataKVIO), work, statsFunction, action); +} + +/** + * Enqueue a DataKVIO on a work queue. + * + * @param queue The queue + * @param dataKVIO The DataKVIO + **/ +static inline void enqueueDataKVIOWork(KvdoWorkQueue *queue, + DataKVIO *dataKVIO) +{ + enqueueKVIOWork(queue, dataKVIOAsKVIO(dataKVIO)); +} + +/** + * Add a trace record for the current source location. + * + * @param dataKVIO The DataKVIO structure to be updated + * @param location The source-location descriptor to be recorded + **/ +static inline void dataKVIOAddTraceRecord(DataKVIO *dataKVIO, + TraceLocation location) +{ + dataVIOAddTraceRecord(&dataKVIO->dataVIO, location); +} + +/** + * Set up and enqueue a DataKVIO on the CPU queue. + * + * @param dataKVIO The DataKVIO to set up + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + **/ +static inline void launchDataKVIOOnCPUQueue(DataKVIO *dataKVIO, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + KVIO *kvio = dataKVIOAsKVIO(dataKVIO); + launchKVIO(kvio, work, statsFunction, action, kvio->layer->cpuQueue); +} + +/** + * Set up and enqueue a DataKVIO on the BIO Ack queue. + * + * @param dataKVIO The DataKVIO to set up + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + **/ +static inline void launchDataKVIOOnBIOAckQueue(DataKVIO *dataKVIO, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + KVIO *kvio = dataKVIOAsKVIO(dataKVIO); + launchKVIO(kvio, work, statsFunction, action, kvio->layer->bioAckQueue); +} + +/** + * Move a DataKVIO back to the base threads. + * + * @param dataKVIO The DataKVIO to enqueue + **/ +static inline void kvdoEnqueueDataVIOCallback(DataKVIO *dataKVIO) +{ + kvdoEnqueueVIOCallback(dataKVIOAsKVIO(dataKVIO)); +} + +/** + * Check whether the external request bio had FUA set. + * + * @param dataKVIO The DataKVIO to check + * + * @return true if the external request bio had FUA set + **/ +static inline bool requestorSetFUA(DataKVIO *dataKVIO) +{ + return ((dataKVIO->externalIORequest.rw & REQ_FUA) == REQ_FUA); +} + +/** + * Associate a KVIO with a BIO passed in from the block layer, and start + * processing the KVIO. + * + * If setting up a KVIO fails, a message is logged, and the limiter permits + * (request and maybe discard) released, but the caller is responsible for + * disposing of the bio. + * + * @param layer The physical layer + * @param bio The bio for which to create KVIO + * @param arrivalTime The time (in jiffies) when the external request + * entered the device mapbio function + * @param hasDiscardPermit Whether we got a permit from the discardLimiter + * of the kernel layer + * + * @return VDO_SUCCESS or a system error code + **/ +int kvdoLaunchDataKVIOFromBio(KernelLayer *layer, + BIO *bio, + Jiffies arrivalTime, + bool hasDiscardPermit) + __attribute__((warn_unused_result)); + +/** + * Return a batch of DataKVIOs to the pool. 
+ * + *

Implements BatchProcessorCallback. + * + * @param batch The batch processor + * @param closure The kernal layer + **/ +void returnDataKVIOBatchToPool(BatchProcessor *batch, void *closure); + +/** + * Implements DataVIOZeroer. + * + * @param dataVIO The DataVIO to zero + **/ +void kvdoZeroDataVIO(DataVIO *dataVIO); + +/** + * Implements DataCopier. + * + * @param source The DataVIO to copy from + * @param destination The DataVIO to copy to + **/ +void kvdoCopyDataVIO(DataVIO *source, DataVIO *destination); + +/** + * Fetch the data for a block from storage. The fetched data will be + * uncompressed when the callback is called, and the result of the read + * operation will be stored in the ReadBlock's status field. On success, + * the data will be in the ReadBlock's data pointer. + * + * @param dataVIO The DataVIO to read a block in for + * @param location The physical block number to read from + * @param mappingState The mapping state of the block to read + * @param action The bio queue action + * @param callback The function to call when the read is done + **/ +void kvdoReadBlock(DataVIO *dataVIO, + PhysicalBlockNumber location, + BlockMappingState mappingState, + BioQAction action, + DataKVIOCallback callback); + +/** + * Implements DataReader. + * + * @param dataVIO The DataVIO to read + **/ +void kvdoReadDataVIO(DataVIO *dataVIO); + +/** + * Implements DataWriter. + * + * @param dataVIO The DataVIO to write + **/ +void kvdoWriteDataVIO(DataVIO *dataVIO); + +/** + * Implements DataModifier. + * + * @param dataVIO The DataVIO to modify + **/ +void kvdoModifyWriteDataVIO(DataVIO *dataVIO); + +/** + * Implements DataHasher. + * + * @param dataVIO The DataVIO to hash + **/ +void kvdoHashDataVIO(DataVIO *dataVIO); + +/** + * Implements DuplicationChecker. + * + * @param dataVIO The DataVIO containing the block to check + **/ +void kvdoCheckForDuplication(DataVIO *dataVIO); + +/** + * Implements DataAcknowledger. + * + * @param dataVIO The DataVIO to acknowledge + **/ +void kvdoAcknowledgeDataVIO(DataVIO *dataVIO); + +/** + * Implements DataCompressor. + * + * @param dataVIO The DataVIO to compress + **/ +void kvdoCompressDataVIO(DataVIO *dataVIO); + +/** + * Implements AlbireoUpdater. + * + * @param dataVIO The DataVIO which needs to change the entry for its data + **/ +void kvdoUpdateDedupeAdvice(DataVIO *dataVIO); + +/** + * Allocate a buffer pool of DataKVIOs. + * + * @param [in] layer The layer in which the DataKVIOs will operate + * @param [in] poolSize The number of DataKVIOs in the pool + * @param [out] bufferPoolPtr A pointer to hold the new buffer pool + * + * @return VDO_SUCCESS or an error + **/ +int makeDataKVIOBufferPool(KernelLayer *layer, + uint32_t poolSize, + BufferPool **bufferPoolPtr) + __attribute__((warn_unused_result)); + +/** + * Get the state needed to generate UDS metadata from the DataKVIO + * associated with a DedupeContext. + * + * @param context The DedupeContext + * + * @return the advice to store in the UDS index + **/ +DataLocation getDedupeAdvice(const DedupeContext *context) + __attribute__((warn_unused_result)); + +/** + * Set the result of a dedupe query for the DataKVIO associated with a + * DedupeContext. 
+ * + * @param context The context receiving advice + * @param advice A data location at which the chunk named in the context + * might be stored (will be NULL if no advice was found) + **/ +void setDedupeAdvice(DedupeContext *context, const DataLocation *advice); + +#endif /* DATA_KVIO_H */ diff --git a/vdo/kernel/deadlockQueue.c b/vdo/kernel/deadlockQueue.c new file mode 100644 index 0000000..2350b35 --- /dev/null +++ b/vdo/kernel/deadlockQueue.c @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deadlockQueue.c#1 $ + */ + +#include "deadlockQueue.h" + +/**********************************************************************/ +void initializeDeadlockQueue(DeadlockQueue *queue) +{ + spin_lock_init(&queue->lock); + bio_list_init(&queue->list); +} + +/**********************************************************************/ +void addToDeadlockQueue(DeadlockQueue *queue, BIO *bio, Jiffies arrivalTime) +{ + spin_lock(&queue->lock); + if (bio_list_empty(&queue->list)) { + /* + * If we get more than one pending at once, this will be inaccurate for + * some of them. Oh well. If we've gotten here, we're trying to avoid a + * deadlock; stats are a secondary concern. + */ + queue->arrivalTime = arrivalTime; + } + bio_list_add(&queue->list, bio); + spin_unlock(&queue->lock); +} diff --git a/vdo/kernel/deadlockQueue.h b/vdo/kernel/deadlockQueue.h new file mode 100644 index 0000000..85e0b46 --- /dev/null +++ b/vdo/kernel/deadlockQueue.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deadlockQueue.h#1 $ + */ + +#ifndef DEADLOCK_QUEUE_H +#define DEADLOCK_QUEUE_H + +#include + +#include "bio.h" + +/** + * A holding space for incoming bios if we're not able to block until VIOs + * become available to process them. + **/ +typedef struct deadlockQueue { + /* Protection for the other fields. */ + spinlock_t lock; + /* List of bios we had to accept but don't have VIOs for. 
*/ + struct bio_list list; + /* + * Arrival time to use for statistics tracking for the above bios, since we + * haven't the space to store individual arrival times for each. + */ + Jiffies arrivalTime; +} DeadlockQueue; + +/** + * Initialize the DeadlockQueue structure. + * + * @param queue The structure to initialize + **/ +void initializeDeadlockQueue(DeadlockQueue *queue); + +/** + * Add an incoming bio to the list of saved-up bios we're not ready to start + * processing yet. + * + * This excess buffering on top of what the caller implements is generally a + * bad idea, and should be used only when necessary, such as to avoid a + * possible deadlock situation. + * + * @param queue The incoming-bio queue structure + * @param bio The new incoming bio to save + * @param arrivalTime The arrival time of this new bio + **/ +void addToDeadlockQueue(DeadlockQueue *queue, BIO *bio, Jiffies arrivalTime); + +/** + * Pull an incoming bio off the queue. + * + * The arrival time returned may be incorrect if multiple bios were saved, as + * there is no per-bio storage used, only one saved arrival time for the whole + * queue. + * + * @param [in] queue The incoming-bio queue + * @param [out] arrivalTime The arrival time to use for this bio + * + * @return a BIO pointer, or NULL if none were queued + **/ +static inline BIO *pollDeadlockQueue(DeadlockQueue *queue, + Jiffies *arrivalTime) +{ + spin_lock(&queue->lock); + BIO *bio = bio_list_pop(&queue->list); + if (unlikely(bio != NULL)) { + *arrivalTime = queue->arrivalTime; + } + spin_unlock(&queue->lock); + return bio; +} + +#endif // DEADLOCK_QUEUE_H diff --git a/vdo/kernel/dedupeIndex.c b/vdo/kernel/dedupeIndex.c new file mode 100644 index 0000000..811cd93 --- /dev/null +++ b/vdo/kernel/dedupeIndex.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dedupeIndex.c#1 $ + */ + +#include "dedupeIndex.h" + +#include "numeric.h" + +#include "udsIndex.h" + +// These times are in milliseconds +unsigned int albireoTimeoutInterval = 5000; +unsigned int minAlbireoTimerInterval = 100; + +// These times are in jiffies +Jiffies albireoTimeoutJiffies = 0; +static Jiffies minAlbireoTimerJiffies = 0; + +/**********************************************************************/ +Jiffies getAlbireoTimeout(Jiffies startJiffies) +{ + return maxULong(startJiffies + albireoTimeoutJiffies, + jiffies + minAlbireoTimerJiffies); +} + +/**********************************************************************/ +void setAlbireoTimeoutInterval(unsigned int value) +{ + // Arbitrary maximum value is two minutes + if (value > 120000) { + value = 120000; + } + // Arbitrary minimum value is 2 jiffies + Jiffies albJiffies = msecs_to_jiffies(value); + if (albJiffies < 2) { + albJiffies = 2; + value = jiffies_to_msecs(albJiffies); + } + albireoTimeoutInterval = value; + albireoTimeoutJiffies = albJiffies; +} + +/**********************************************************************/ +void setMinAlbireoTimerInterval(unsigned int value) +{ + // Arbitrary maximum value is one second + if (value > 1000) { + value = 1000; + } + + // Arbitrary minimum value is 2 jiffies + Jiffies minJiffies = msecs_to_jiffies(value); + if (minJiffies < 2) { + minJiffies = 2; + value = jiffies_to_msecs(minJiffies); + } + + minAlbireoTimerInterval = value; + minAlbireoTimerJiffies = minJiffies; +} + +/**********************************************************************/ +int makeDedupeIndex(DedupeIndex **indexPtr, KernelLayer *layer) +{ + if (albireoTimeoutJiffies == 0) { + setAlbireoTimeoutInterval(albireoTimeoutInterval); + } + + if (minAlbireoTimerJiffies == 0) { + setMinAlbireoTimerInterval(minAlbireoTimerInterval); + } + + return makeUDSIndex(layer, indexPtr); +} diff --git a/vdo/kernel/dedupeIndex.h b/vdo/kernel/dedupeIndex.h new file mode 100644 index 0000000..31d7631 --- /dev/null +++ b/vdo/kernel/dedupeIndex.h @@ -0,0 +1,372 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dedupeIndex.h#5 $ + */ + +#ifndef DEDUPE_INDEX_H +#define DEDUPE_INDEX_H + +#include "dataKVIO.h" + +struct dedupeIndex { + + /** + * Do the dedupe section of dmsetup message vdo0 0 dump ... + * + * @param index The dedupe index + * @param showQueue true to dump a dedupe work queue + **/ + void (*dump)(DedupeIndex *index, bool showQueue); + + /** + * Free a dedupe index. The "finish" method must have been called + * first. 
+ * + * @param index The dedupe index + **/ + void (*free)(DedupeIndex *index); + + /** + * Get the name of the deduplication state + * + * @param index The dedupe index + * + * @return the dedupe state name + **/ + const char *(*getDedupeStateName)(DedupeIndex *index); + + /** + * Get the index statistics + * + * @param index The dedupe index + * @param stats The index statistics + **/ + void (*getStatistics)(DedupeIndex *index, IndexStatistics *stats); + + /** + * Process a dmsetup message directed to the index. + * + * @param index The dedupe index + * @param name The message name + * + * @return 0 or an error code + **/ + int (*message)(DedupeIndex *index, const char *name); + + /** + * Look up the chunkname of the DataKVIO. If found, return the PBN + * previously associated with the name. If not found, associate the + * new PBN with the name. + * + * @param dataKVIO The DataKVIO + **/ + void (*post)(DataKVIO *dataKVIO); + + /** + * Look up the chunkname of the DataKVIO. If found, return the PBN + * previously associated with the name. If not found, do nothing. + * + * @param dataKVIO The DataKVIO + **/ + void (*query)(DataKVIO *dataKVIO); + + /** + * Start the dedupe index. + * + * @param index The dedupe index + * @param createFlag If true, create a new index without first attempting + * to load an existing index + **/ + void (*start)(DedupeIndex *index, bool createFlag); + + /** + * Stop the dedupe index. May be called by any thread, but will wait for + * the shutdown to be completed. + * + * @param index The dedupe index + **/ + void (*stop)(DedupeIndex *index); + + /** + * Suspend the dedupe index. If there are any outstanding index + * requests, wait for them to finish. If the index is doing any + * asynchronous writing, wait for the I/O to complete. If the index + * is not open yet and we are doing a rebuild of the master index, + * pause the rebuild so that it can be resumed later. May be called + * from any thread. + * + * @param index The dedupe index + * @param saveFlag True if we should save the index + **/ + void (*suspend)(DedupeIndex *index, bool saveFlag); + + /** + * Resume a suspended dedupe index. May be called from any thread. + * + * @param index The dedupe index + **/ + void (*resume)(DedupeIndex *index); + + /** + * Finish the dedupe index; shuts it down for good and prepares to + * free resources. After this point, no more requests may be sent to + * it. + * + * @param index The dedupe index + **/ + void (*finish)(DedupeIndex *index); + + /** + * Look up the chunkname of the DataKVIO and associate the new PBN with the + * name. + * + * @param dataKVIO The DataKVIO + **/ + void (*update)(DataKVIO *dataKVIO); +}; + +/** + * Make a dedupe index + * + * @param indexPtr dedupe index returned here + * @param layer the kernel layer + * + * @return VDO_SUCCESS or an error code + **/ +int makeDedupeIndex(DedupeIndex **indexPtr, KernelLayer *layer) + __attribute__((warn_unused_result)); + + +/** + * Do the dedupe section of dmsetup message vdo0 0 dump ... 
+ * + * @param index The dedupe index + * @param showQueue true to dump a dedupe work queue + **/ +static inline void dumpDedupeIndex(DedupeIndex *index, bool showQueue) +{ + index->dump(index, showQueue); +} + +/** + * Free the dedupe index + * + * @param index The dedupe index + **/ +static inline void freeDedupeIndex(DedupeIndex **index) +{ + if (*index != NULL) { + (*index)->free(*index); + *index = NULL; + } +} + +/** + * Get the name of the deduplication state + * + * @param index The dedupe index + * + * @return the dedupe state name + **/ +static inline const char *getDedupeStateName(DedupeIndex *index) +{ + return index->getDedupeStateName(index); +} + +/** + * Get the index statistics + * + * @param index The dedupe index + * @param stats The index statistics + **/ +static inline void getIndexStatistics(DedupeIndex *index, + IndexStatistics *stats) +{ + return index->getStatistics(index, stats); +} + +/** + * Return from a dedupe operation by invoking the callback function + * + * @param dataKVIO The DataKVIO + **/ +static inline void invokeDedupeCallback(DataKVIO *dataKVIO) +{ + + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION("$F($dup);cb=dedupe($dup)")); + kvdoEnqueueDataVIOCallback(dataKVIO); +} + +/** + * Process a dmsetup message directed to the index. + * + * @param index The dedupe index + * @param name The message name + * + * @return 0 or an error code + **/ +static inline int messageDedupeIndex(DedupeIndex *index, const char *name) +{ + return index->message(index, name); +} + +/** + * Look up the chunkname of the DataKVIO and identify duplicated chunks. + * + * @param dataKVIO The DataKVIO. These fields are used: + * dedupeContext.chunkName is the chunk name. + * The advice to offer to the index will be obtained + * via getDedupeAdvice(). The advice found in the index + * (or NULL if none) will be returned via setDedupeAdvice(). + * dedupeContext.status is set to the return status code of + * any asynchronous index processing. + **/ +static inline void postDedupeAdvice(DataKVIO *dataKVIO) +{ + KernelLayer *layer = dataKVIOAsKVIO(dataKVIO)->layer; + layer->dedupeIndex->post(dataKVIO); +} + +/** + * Look up the chunkname of the DataKVIO and identify duplicated chunks. + * + * @param dataKVIO The DataKVIO. These fields are used: + * dedupeContext.chunkName is the chunk name. + * The advice found in the index (or NULL if none) will + * be returned via setDedupeAdvice(). + * dedupeContext.status is set to the return status code of + * any asynchronous index processing. + **/ +static inline void queryDedupeAdvice(DataKVIO *dataKVIO) +{ + KernelLayer *layer = dataKVIOAsKVIO(dataKVIO)->layer; + layer->dedupeIndex->query(dataKVIO); +} + +/** + * Start the dedupe index. + * + * @param index The dedupe index + * @param createFlag If true, create a new index without first attempting + * to load an existing index + **/ +static inline void startDedupeIndex(DedupeIndex *index, bool createFlag) +{ + index->start(index, createFlag); +} + +/** + * Stop the dedupe index. May be called by any thread, but will wait for + * the shutdown to be completed. + * + * @param index The dedupe index + **/ +static inline void stopDedupeIndex(DedupeIndex *index) +{ + return index->stop(index); +} + +/** + * Suspend the dedupe index. If there are any outstanding index + * requests, wait for them to finish. If the index is doing any + * asynchronous writing, wait for the I/O to complete. 
If the index is + * not open yet and we are doing a rebuild of the master index, pause + * the rebuild so that it can be resumed later. May be called from any + * thread. + * + * @param index The dedupe index + * @param saveFlag True if we should save the index + **/ +static inline void suspendDedupeIndex(DedupeIndex *index, bool saveFlag) +{ + index->suspend(index, saveFlag); +} + +/** + * Resume a suspended dedupe index. May be called from any thread. + * + * @param index The dedupe index + **/ +static inline void resumeDedupeIndex(DedupeIndex *index) +{ + index->resume(index); +} + +/** + * Finish the dedupe index. + * + * @param index The dedupe index + **/ +static inline void finishDedupeIndex(DedupeIndex *index) +{ + return index->finish(index); +} + +/** + * Look up the chunkname of the DataKVIO and associate the new PBN with the + * name. + * + * @param dataKVIO The DataKVIO. These fields are used: + * dedupeContext.chunkName is the chunk name. + * The advice to offer to the index will be obtained + * via getDedupeAdvice(). dedupeContext.status is set to the + * return status code of any asynchronous index processing. + **/ +static inline void updateDedupeAdvice(DataKVIO *dataKVIO) +{ + KernelLayer *layer = dataKVIOAsKVIO(dataKVIO)->layer; + layer->dedupeIndex->update(dataKVIO); +} + +// Interval (in milliseconds or jiffies) from submission until switching to +// fast path and skipping Albireo. +extern unsigned int albireoTimeoutInterval; +extern Jiffies albireoTimeoutJiffies; + +// Minimum time interval (in milliseconds) between timer invocations to +// check for requests waiting for Albireo that should now time out. +extern unsigned int minAlbireoTimerInterval; + +/** + * Calculate the actual end of a timer, taking into account the absolute + * start time and the present time. + * + * @param startJiffies The absolute start time, in jiffies + * + * @return the absolute end time for the timer, in jiffies + **/ +Jiffies getAlbireoTimeout(Jiffies startJiffies); + +/** + * Set the interval from submission until switching to fast path and + * skipping Albireo. + * + * @param value The number of milliseconds + **/ +void setAlbireoTimeoutInterval(unsigned int value); + +/** + * Set the minimum time interval between timer invocations to check for + * requests waiting for Albireo that should now time out. + * + * @param value The number of milliseconds + **/ +void setMinAlbireoTimerInterval(unsigned int value); + +#endif /* DEDUPE_INDEX_H */ diff --git a/vdo/kernel/deviceConfig.c b/vdo/kernel/deviceConfig.c new file mode 100644 index 0000000..08e864c --- /dev/null +++ b/vdo/kernel/deviceConfig.c @@ -0,0 +1,769 @@ +/** + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deviceConfig.c#14 $ + */ + +#include "deviceConfig.h" + +#include + +#include "logger.h" +#include "memoryAlloc.h" +#include "stringUtils.h" + +#include "kernelLayer.h" +#include "vdoStringUtils.h" + +#include "constants.h" + +enum { + // If we bump this, update the arrays below + TABLE_VERSION = 2, + // Limits used when parsing thread-count config spec strings + BIO_ROTATION_INTERVAL_LIMIT = 1024, + LOGICAL_THREAD_COUNT_LIMIT = 60, + PHYSICAL_THREAD_COUNT_LIMIT = 16, + THREAD_COUNT_LIMIT = 100, + // XXX The bio-submission queue configuration defaults are temporarily + // still being defined here until the new runtime-based thread + // configuration has been fully implemented for managed VDO devices. + + // How many bio submission work queues to use + DEFAULT_NUM_BIO_SUBMIT_QUEUES = 4, + // How often to rotate between bio submission work queues + DEFAULT_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL = 64, +}; + +// arrays for handling different table versions +static const uint8_t REQUIRED_ARGC[] = {10, 12, 9}; +static const uint8_t POOL_NAME_ARG_INDEX[] = {8, 10, 8}; + +/** + * Decide the version number from argv. + * + * @param [in] argc The number of table values + * @param [in] argv The array of table values + * @param [out] errorPtr A pointer to return a error string in + * @param [out] versionPtr A pointer to return the version + * + * @return VDO_SUCCESS or an error code + **/ +static int getVersionNumber(int argc, + char **argv, + char **errorPtr, + TableVersion *versionPtr) +{ + // version, if it exists, is in a form of V + if (sscanf(argv[0], "V%u", versionPtr) == 1) { + if (*versionPtr < 1 || *versionPtr > TABLE_VERSION) { + *errorPtr = "Unknown version number detected"; + return VDO_BAD_CONFIGURATION; + } + } else { + // V0 actually has no version number in the table string + *versionPtr = 0; + } + + // V0 and V1 have no optional parameters. There will always be + // a parameter for thread config, even if its a "." to show + // its an empty list. + if (*versionPtr <= 1) { + if (argc != REQUIRED_ARGC[*versionPtr]) { + *errorPtr = "Incorrect number of arguments for version"; + return VDO_BAD_CONFIGURATION; + } + } else if (argc < REQUIRED_ARGC[*versionPtr]) { + *errorPtr = "Incorrect number of arguments for version"; + return VDO_BAD_CONFIGURATION; + } + + if (*versionPtr != TABLE_VERSION) { + logWarning("Detected version mismatch between kernel module and tools " + " kernel: %d, tool: %d", TABLE_VERSION, *versionPtr); + logWarning("Please consider upgrading management tools to match kernel."); + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +int getPoolNameFromArgv(int argc, + char **argv, + char **errorPtr, + char **poolNamePtr) +{ + TableVersion version; + int result = getVersionNumber(argc, argv, errorPtr, &version); + if (result != VDO_SUCCESS) { + return result; + } + *poolNamePtr = argv[POOL_NAME_ARG_INDEX[version]]; + return VDO_SUCCESS; +} + +/** + * Resolve the config with write policy, physical size, and other unspecified + * fields based on the device, if needed. 
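+ *
+ * In particular, a write policy of "auto" is resolved to async or sync
+ * depending on whether the underlying device advertises flush support,
+ * and for version 0 tables the physical block count is derived from the
+ * size of the underlying device.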
+ * + * @param [in,out] config The config possibly missing values + * @param [in] verbose Whether to log about the underlying device + **/ +static void resolveConfigWithDevice(DeviceConfig *config, + bool verbose) +{ + struct dm_dev *dev = config->ownedDevice; + struct request_queue *requestQueue = bdev_get_queue(dev->bdev); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,7,0) + bool flushSupported + = ((requestQueue->queue_flags & (1ULL << QUEUE_FLAG_WC)) != 0); + bool fuaSupported + = ((requestQueue->queue_flags & (1ULL << QUEUE_FLAG_FUA)) != 0); +#else + bool flushSupported = ((requestQueue->flush_flags & REQ_FLUSH) == REQ_FLUSH); + bool fuaSupported = ((requestQueue->flush_flags & REQ_FUA) == REQ_FUA); +#endif + if (verbose) { + logInfo("underlying device, REQ_FLUSH: %s, REQ_FUA: %s", + (flushSupported ? "supported" : "not supported"), + (fuaSupported ? "supported" : "not supported")); + } else { + // We should probably always log, but need to make sure that makes sense + // before changing behavior. + } + + if (config->writePolicy == WRITE_POLICY_AUTO) { + config->writePolicy + = (flushSupported ? WRITE_POLICY_ASYNC : WRITE_POLICY_SYNC); + logInfo("Using write policy %s automatically.", + getConfigWritePolicyString(config)); + } else { + logInfo("Using write policy %s.", getConfigWritePolicyString(config)); + } + + if (flushSupported && (config->writePolicy == WRITE_POLICY_SYNC)) { + logWarning("WARNING: Running in sync mode atop a device supporting flushes" + " is dangerous!"); + } + + if (config->version == 0) { + uint64_t deviceSize = i_size_read(dev->bdev->bd_inode); + config->physicalBlocks = deviceSize / VDO_BLOCK_SIZE; + } +} + +/** + * Parse a two-valued option into a bool. + * + * @param [in] boolStr The string value to convert to a bool + * @param [in] trueStr The string value which should be converted to true + * @param [in] falseStr The string value which should be converted to false + * @param [out] boolPtr A pointer to return the bool value in + * + * @return VDO_SUCCESS or an error if boolStr is neither trueStr nor falseStr + **/ +__attribute__((warn_unused_result)) +static inline int parseBool(const char *boolStr, + const char *trueStr, + const char *falseStr, + bool *boolPtr) +{ + bool value = false; + if (strcmp(boolStr, trueStr) == 0) { + value = true; + } else if (strcmp(boolStr, falseStr) == 0) { + value = false; + } else { + return VDO_BAD_CONFIGURATION; + } + + *boolPtr = value; + return VDO_SUCCESS; +} + +/** + * Process one component of a thread parameter configuration string and + * update the configuration data structure. + * + * If the thread count requested is invalid, a message is logged and + * -EINVAL returned. If the thread name is unknown, a message is logged + * but no error is returned. 
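+ *
+ * Unknown names are tolerated so that newer user-space tools can pass
+ * parameters that an older kernel module does not yet understand.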
+ * + * @param threadParamType The type of thread specified + * @param count The thread count requested + * @param config The configuration data structure to update + * + * @return VDO_SUCCESS or -EINVAL + **/ +static int processOneThreadConfigSpec(const char *threadParamType, + unsigned int count, + ThreadCountConfig *config) +{ + // Handle limited thread parameters + if (strcmp(threadParamType, "bioRotationInterval") == 0) { + if (count == 0) { + logError("thread config string error:" + " 'bioRotationInterval' of at least 1 is required"); + return -EINVAL; + } else if (count > BIO_ROTATION_INTERVAL_LIMIT) { + logError("thread config string error:" + " 'bioRotationInterval' cannot be higher than %d", + BIO_ROTATION_INTERVAL_LIMIT); + return -EINVAL; + } + config->bioRotationInterval = count; + return VDO_SUCCESS; + } else if (strcmp(threadParamType, "logical") == 0) { + if (count > LOGICAL_THREAD_COUNT_LIMIT) { + logError("thread config string error: at most %d 'logical' threads" + " are allowed", + LOGICAL_THREAD_COUNT_LIMIT); + return -EINVAL; + } + config->logicalZones = count; + return VDO_SUCCESS; + } else if (strcmp(threadParamType, "physical") == 0) { + if (count > PHYSICAL_THREAD_COUNT_LIMIT) { + logError("thread config string error: at most %d 'physical' threads" + " are allowed", + PHYSICAL_THREAD_COUNT_LIMIT); + return -EINVAL; + } + config->physicalZones = count; + return VDO_SUCCESS; + } else { + // Handle other thread count parameters + if (count > THREAD_COUNT_LIMIT) { + logError("thread config string error: at most %d '%s' threads" + " are allowed", + THREAD_COUNT_LIMIT, threadParamType); + return -EINVAL; + } + + if (strcmp(threadParamType, "hash") == 0) { + config->hashZones = count; + return VDO_SUCCESS; + } else if (strcmp(threadParamType, "cpu") == 0) { + if (count == 0) { + logError("thread config string error:" + " at least one 'cpu' thread required"); + return -EINVAL; + } + config->cpuThreads = count; + return VDO_SUCCESS; + } else if (strcmp(threadParamType, "ack") == 0) { + config->bioAckThreads = count; + return VDO_SUCCESS; + } else if (strcmp(threadParamType, "bio") == 0) { + if (count == 0) { + logError("thread config string error:" + " at least one 'bio' thread required"); + return -EINVAL; + } + config->bioThreads = count; + return VDO_SUCCESS; + } + } + + // Don't fail, just log. This will handle version mismatches between + // user mode tools and kernel. + logInfo("unknown thread parameter type \"%s\"", threadParamType); + return VDO_SUCCESS; +} + +/** + * Parse one component of a thread parameter configuration string and + * update the configuration data structure. 
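+ *
+ * For example, a spec of "bio=4" requests four bio submission threads,
+ * and "logical=2" requests two logical zones.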
+ * + * @param spec The thread parameter specification string + * @param config The configuration data to be updated + **/ +static int parseOneThreadConfigSpec(const char *spec, + ThreadCountConfig *config) +{ + char **fields; + int result = splitString(spec, '=', &fields); + if (result != UDS_SUCCESS) { + return result; + } + if ((fields[0] == NULL) || (fields[1] == NULL) || (fields[2] != NULL)) { + logError("thread config string error:" + " expected thread parameter assignment, saw \"%s\"", + spec); + freeStringArray(fields); + return -EINVAL; + } + + unsigned int count; + result = stringToUInt(fields[1], &count); + if (result != UDS_SUCCESS) { + logError("thread config string error: integer value needed, found \"%s\"", + fields[1]); + freeStringArray(fields); + return result; + } + + result = processOneThreadConfigSpec(fields[0], count, config); + freeStringArray(fields); + return result; +} + +/** + * Parse the configuration string passed and update the specified + * counts and other parameters of various types of threads to be created. + * + * The configuration string should contain one or more comma-separated specs + * of the form "typename=number"; the supported type names are "cpu", "ack", + * "bio", "bioRotationInterval", "logical", "physical", and "hash". + * + * If an error occurs during parsing of a single key/value pair, we deem + * it serious enough to stop further parsing. + * + * This function can't set the "reason" value the caller wants to pass + * back, because we'd want to format it to say which field was + * invalid, and we can't allocate the "reason" strings dynamically. So + * if an error occurs, we'll log the details and pass back an error. + * + * @param string Thread parameter configuration string + * @param config The thread configuration data to update + * + * @return VDO_SUCCESS or -EINVAL or -ENOMEM + **/ +static int parseThreadConfigString(const char *string, + ThreadCountConfig *config) +{ + int result = VDO_SUCCESS; + + char **specs; + if (strcmp(".", string) != 0) { + result = splitString(string, ',', &specs); + if (result != UDS_SUCCESS) { + return result; + } + for (unsigned int i = 0; specs[i] != NULL; i++) { + result = parseOneThreadConfigSpec(specs[i], config); + if (result != VDO_SUCCESS) { + break; + } + } + freeStringArray(specs); + } + return result; +} + +/** + * Process one component of an optional parameter string and + * update the configuration data structure. + * + * If the value requested is invalid, a message is logged and + * -EINVAL returned. If the key is unknown, a message is logged + * but no error is returned. 
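+ *
+ * For example, a "maxDiscard" key sets the maximum discard size in VDO
+ * blocks; any key not handled here is passed on to the thread-count
+ * parser above.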
+ * + * @param key The optional parameter key name + * @param value The optional parameter value + * @param config The configuration data structure to update + * + * @return VDO_SUCCESS or -EINVAL + **/ +static int processOneKeyValuePair(const char *key, + unsigned int value, + DeviceConfig *config) +{ + // Non thread optional parameters + if (strcmp(key, "maxDiscard") == 0) { + if (value == 0) { + logError("optional parameter error:" + " at least one max discard block required"); + return -EINVAL; + } + // Max discard sectors in blkdev_issue_discard is UINT_MAX >> 9 + if (value > (UINT_MAX / VDO_BLOCK_SIZE)) { + logError("optional parameter error: at most %d max discard" + " blocks are allowed", UINT_MAX / VDO_BLOCK_SIZE); + return -EINVAL; + } + config->maxDiscardBlocks = value; + return VDO_SUCCESS; + } + // Handles unknown key names + return processOneThreadConfigSpec(key, value, &config->threadCounts); +} + +/** + * Parse one key/value pair and update the configuration + * data structure. + * + * @param key The optional key name + * @param value The optional value + * @param config The configuration data to be updated + * + * @return VDO_SUCCESS or error + **/ +static int parseOneKeyValuePair(const char *key, + const char *value, + DeviceConfig *config) +{ + if (strcmp(key, "deduplication") == 0) { + return parseBool(value, "on", "off", &config->deduplication); + } + + // The remaining arguments must have integral values. + unsigned int count; + int result = stringToUInt(value, &count); + if (result != UDS_SUCCESS) { + logError("optional config string error: integer value needed, found \"%s\"", + value); + return result; + } + return processOneKeyValuePair(key, count, config); +} + +/** + * Parse all key/value pairs from a list of arguments. + * + * If an error occurs during parsing of a single key/value pair, we deem + * it serious enough to stop further parsing. + * + * This function can't set the "reason" value the caller wants to pass + * back, because we'd want to format it to say which field was + * invalid, and we can't allocate the "reason" strings dynamically. So + * if an error occurs, we'll log the details and return the error. + * + * @param argc The total number of arguments in list + * @param argv The list of key/value pairs + * @param config The device configuration data to update + * + * @return VDO_SUCCESS or error + **/ +static int parseKeyValuePairs(int argc, + char **argv, + DeviceConfig *config) +{ + int result = VDO_SUCCESS; + while (argc) { + result = parseOneKeyValuePair(argv[0], argv[1], config); + if (result != VDO_SUCCESS) { + break; + } + + argc -= 2; + argv += 2; + } + + return result; +} + +/** + * Parse the configuration string passed in for optional arguments. + * + * For V0/V1 configurations, there will only be one optional parameter; + * the thread configuration. The configuration string should contain + * one or more comma-separated specs of the form "typename=number"; the + * supported type names are "cpu", "ack", "bio", "bioRotationInterval", + * "logical", "physical", and "hash". + * + * For V2 configurations and beyond, there could be any number of + * arguments. They should contain one or more key/value pairs + * separated by a space. 
+ * + * @param argSet The structure holding the arguments to parse + * @param errorPtr Pointer to a buffer to hold the error string + * @param config Pointer to device configuration data to update + * + * @return VDO_SUCCESS or error + */ +int parseOptionalArguments(struct dm_arg_set *argSet, + char **errorPtr, + DeviceConfig *config) +{ + int result = VDO_SUCCESS; + + if (config->version == 0 || config->version == 1) { + result = parseThreadConfigString(argSet->argv[0], + &config->threadCounts); + if (result != VDO_SUCCESS) { + *errorPtr = "Invalid thread-count configuration"; + return VDO_BAD_CONFIGURATION; + } + } else { + if ((argSet->argc % 2) != 0) { + *errorPtr = "Odd number of optional arguments given but they" + " should be pairs"; + return VDO_BAD_CONFIGURATION; + } + result = parseKeyValuePairs(argSet->argc, argSet->argv, config); + if (result != VDO_SUCCESS) { + *errorPtr = "Invalid optional argument configuration"; + return VDO_BAD_CONFIGURATION; + } + } + return result; +} + +/** + * Handle a parsing error. + * + * @param configPtr A pointer to the config to free + * @param errorPtr A place to store a constant string about the error + * @param errorStr A constant string to store in errorPtr + **/ +static void handleParseError(DeviceConfig **configPtr, + char **errorPtr, + char *errorStr) +{ + freeDeviceConfig(configPtr); + *errorPtr = errorStr; +} + +/**********************************************************************/ +int parseDeviceConfig(int argc, + char **argv, + struct dm_target *ti, + bool verbose, + DeviceConfig **configPtr) +{ + char **errorPtr = &ti->error; + DeviceConfig *config = NULL; + int result = ALLOCATE(1, DeviceConfig, "DeviceConfig", &config); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Could not allocate config structure"); + return VDO_BAD_CONFIGURATION; + } + + config->owningTarget = ti; + initializeRing(&config->configNode); + + // Save the original string. + result = joinStrings(argv, argc, ' ', &config->originalString); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Could not populate string"); + return VDO_BAD_CONFIGURATION; + } + + // Set defaults. + // + // XXX Defaults for bioThreads and bioRotationInterval are currently defined + // using the old configuration scheme of constants. These values are relied + // upon for performance testing on MGH machines currently. + // This should be replaced with the normally used testing defaults being + // defined in the file-based thread-configuration settings. The values used + // as defaults internally should really be those needed for VDO in its + // default shipped-product state. + config->threadCounts = (ThreadCountConfig) { + .bioAckThreads = 1, + .bioThreads = DEFAULT_NUM_BIO_SUBMIT_QUEUES, + .bioRotationInterval = DEFAULT_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL, + .cpuThreads = 1, + .logicalZones = 0, + .physicalZones = 0, + .hashZones = 0, + }; + config->maxDiscardBlocks = 1; + config->deduplication = true; + + struct dm_arg_set argSet; + + argSet.argc = argc; + argSet.argv = argv; + + result = getVersionNumber(argc, argv, errorPtr, &config->version); + if (result != VDO_SUCCESS) { + // getVersionNumber sets errorPtr itself. + handleParseError(&config, errorPtr, *errorPtr); + return result; + } + // Move the arg pointer forward only if the argument was there. 
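+  // (Illustrative values only: a V2 table line looks roughly like
+  // "V2 /dev/sdb1 20971520 4096 32768 16380 on auto vdo0 maxDiscard 1500",
+  // whereas a V0 line starts directly with the parent device name.)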
+ if (config->version >= 1) { + dm_shift_arg(&argSet); + } + + result = duplicateString(dm_shift_arg(&argSet), "parent device name", + &config->parentDeviceName); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Could not copy parent device name"); + return VDO_BAD_CONFIGURATION; + } + + // Get the physical blocks, if known. + if (config->version >= 1) { + result = kstrtoull(dm_shift_arg(&argSet), 10, &config->physicalBlocks); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Invalid physical block count"); + return VDO_BAD_CONFIGURATION; + } + } + + // Get the logical block size and validate + bool enable512e; + result = parseBool(dm_shift_arg(&argSet), "512", "4096", &enable512e); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Invalid logical block size"); + return VDO_BAD_CONFIGURATION; + } + config->logicalBlockSize = (enable512e ? 512 : 4096); + + // Skip past the two no longer used read cache options. + if (config->version <= 1) { + dm_consume_args(&argSet, 2); + } + + // Get the page cache size. + result = stringToUInt(dm_shift_arg(&argSet), &config->cacheSize); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Invalid block map page cache size"); + return VDO_BAD_CONFIGURATION; + } + + // Get the block map era length. + result = stringToUInt(dm_shift_arg(&argSet), &config->blockMapMaximumAge); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Invalid block map maximum age"); + return VDO_BAD_CONFIGURATION; + } + + // Get the MD RAID5 optimization mode and validate + result = parseBool(dm_shift_arg(&argSet), "on", "off", + &config->mdRaid5ModeEnabled); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Invalid MD RAID5 mode"); + return VDO_BAD_CONFIGURATION; + } + + // Get the write policy and validate. + if (strcmp(argSet.argv[0], "async") == 0) { + config->writePolicy = WRITE_POLICY_ASYNC; + } else if (strcmp(argSet.argv[0], "async-unsafe") == 0) { + config->writePolicy = WRITE_POLICY_ASYNC_UNSAFE; + } else if (strcmp(argSet.argv[0], "sync") == 0) { + config->writePolicy = WRITE_POLICY_SYNC; + } else if (strcmp(argSet.argv[0], "auto") == 0) { + config->writePolicy = WRITE_POLICY_AUTO; + } else { + handleParseError(&config, errorPtr, "Invalid write policy"); + return VDO_BAD_CONFIGURATION; + } + dm_shift_arg(&argSet); + + // Make sure the enum to get the pool name from argv directly is still in + // sync with the parsing of the table line. + if (&argSet.argv[0] != &argv[POOL_NAME_ARG_INDEX[config->version]]) { + handleParseError(&config, errorPtr, "Pool name not in expected location"); + return VDO_BAD_CONFIGURATION; + } + + // Get the address where the albserver is running. Check for validation + // is done in dedupe.c code during startKernelLayer call + result = duplicateString(dm_shift_arg(&argSet), "pool name", + &config->poolName); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Could not copy pool name"); + return VDO_BAD_CONFIGURATION; + } + + // Get the optional arguments and validate. + result = parseOptionalArguments(&argSet, errorPtr, config); + if (result != VDO_SUCCESS) { + // parseOptionalArguments sets errorPtr itself. + handleParseError(&config, errorPtr, *errorPtr); + return result; + } + + /* + * Logical, physical, and hash zone counts can all be zero; then we get one + * thread doing everything, our older configuration. If any zone count is + * non-zero, the others must be as well. 
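+   * For example, setting logical=4, physical=2, and hash=1 is accepted,
+   * but setting only logical=4 is rejected by the check below.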
+ */ + if (((config->threadCounts.logicalZones == 0) + != (config->threadCounts.physicalZones == 0)) + || ((config->threadCounts.physicalZones == 0) + != (config->threadCounts.hashZones == 0)) + ) { + handleParseError(&config, errorPtr, + "Logical, physical, and hash zones counts must all be" + " zero or all non-zero"); + return VDO_BAD_CONFIGURATION; + } + + result = dm_get_device(ti, config->parentDeviceName, + dm_table_get_mode(ti->table), &config->ownedDevice); + if (result != 0) { + logError("couldn't open device \"%s\": error %d", + config->parentDeviceName, result); + handleParseError(&config, errorPtr, "Unable to open storage device"); + return VDO_BAD_CONFIGURATION; + } + + resolveConfigWithDevice(config, verbose); + + *configPtr = config; + return result; +} + +/**********************************************************************/ +void freeDeviceConfig(DeviceConfig **configPtr) +{ + if (configPtr == NULL) { + return; + } + + DeviceConfig *config = *configPtr; + if (config == NULL) { + *configPtr = NULL; + return; + } + + if (config->ownedDevice != NULL) { + dm_put_device(config->owningTarget, config->ownedDevice); + } + + FREE(config->poolName); + FREE(config->parentDeviceName); + FREE(config->originalString); + + // Reduce the chance a use-after-free (as in BZ 1669960) happens to work. + memset(config, 0, sizeof(*config)); + + FREE(config); + *configPtr = NULL; +} + +/**********************************************************************/ +const char *getConfigWritePolicyString(DeviceConfig *config) +{ + switch (config->writePolicy) { + case WRITE_POLICY_AUTO: + return "auto"; + case WRITE_POLICY_ASYNC: + return "async"; + case WRITE_POLICY_ASYNC_UNSAFE: + return "async-unsafe"; + case WRITE_POLICY_SYNC: + return "sync"; + default: + return "unknown"; + } +} + +/**********************************************************************/ +void setDeviceConfigLayer(DeviceConfig *config, KernelLayer *layer) +{ + unspliceRingNode(&config->configNode); + if (layer != NULL) { + pushRingNode(&layer->deviceConfigRing, &config->configNode); + } + config->layer = layer; +} diff --git a/vdo/kernel/deviceConfig.h b/vdo/kernel/deviceConfig.h new file mode 100644 index 0000000..36199dd --- /dev/null +++ b/vdo/kernel/deviceConfig.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deviceConfig.h#11 $ + */ +#ifndef DEVICE_CONFIG_H +#define DEVICE_CONFIG_H + +#include + +#include "ringNode.h" + +#include "kernelTypes.h" + +// This structure is memcmp'd for equality. Keep it +// packed and don't add any fields that are not +// properly set in both extant and parsed configs. 
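+// (Padding bytes would have indeterminate contents, which could make two
+// logically identical configs compare unequal.)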
+typedef struct { + int bioAckThreads; + int bioThreads; + int bioRotationInterval; + int cpuThreads; + int logicalZones; + int physicalZones; + int hashZones; +} __attribute__((packed)) ThreadCountConfig; + +typedef uint32_t TableVersion; + +typedef struct { + struct dm_target *owningTarget; + struct dm_dev *ownedDevice; + KernelLayer *layer; + /** All configs referencing a layer are kept on a ring in the layer */ + RingNode configNode; + char *originalString; + TableVersion version; + char *parentDeviceName; + BlockCount physicalBlocks; + unsigned int logicalBlockSize; + WritePolicy writePolicy; + unsigned int cacheSize; + unsigned int blockMapMaximumAge; + bool mdRaid5ModeEnabled; + bool deduplication; + char *poolName; + ThreadCountConfig threadCounts; + BlockCount maxDiscardBlocks; +} DeviceConfig; + +/** + * Convert a RingNode to the DeviceConfig that contains it. + * + * @param node The RingNode to convert + * + * @return The DeviceConfig wrapping the RingNode + **/ +static inline DeviceConfig *asDeviceConfig(RingNode *node) +{ + if (node == NULL) { + return NULL; + } + return (DeviceConfig *) ((byte *) node - offsetof(DeviceConfig, configNode)); +} + +/** + * Grab a pointer to the pool name out of argv. + * + * @param [in] argc The number of table values + * @param [in] argv The array of table values + * @param [out] errorPtr A pointer to return a error string in + * @param [out] poolNamePtr A pointer to return the pool name + * + * @return VDO_SUCCESS or an error code + **/ +int getPoolNameFromArgv(int argc, + char **argv, + char **errorPtr, + char **poolNamePtr) + __attribute__((warn_unused_result)); + +/** + * Convert the dmsetup table into a DeviceConfig. + * + * @param [in] argc The number of table values + * @param [in] argv The array of table values + * @param [in] ti The target structure for this table + * @param [in] verbose Whether to log about the underlying device + * @param [out] configPtr A pointer to return the allocated config + * + * @return VDO_SUCCESS or an error code + **/ +int parseDeviceConfig(int argc, + char **argv, + struct dm_target *ti, + bool verbose, + DeviceConfig **configPtr) + __attribute__((warn_unused_result)); + +/** + * Free a device config created by parseDeviceConfig(). + * + * @param configPtr The pointer holding the config, which will be nulled + **/ +void freeDeviceConfig(DeviceConfig **configPtr); + +/** + * Get the text describing the write policy. + * + * @param config The device config + * + * @returns a pointer to a string describing the write policy + **/ +const char *getConfigWritePolicyString(DeviceConfig *config) + __attribute__((warn_unused_result)); + +/** + * Acquire or release a reference from the config to a kernel layer. + * + * @param config The config in question + * @param layer The kernel layer in question + **/ +void setDeviceConfigLayer(DeviceConfig *config, KernelLayer *layer); + +#endif // DEVICE_CONFIG_H diff --git a/vdo/kernel/deviceRegistry.c b/vdo/kernel/deviceRegistry.c new file mode 100644 index 0000000..13764b4 --- /dev/null +++ b/vdo/kernel/deviceRegistry.c @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deviceRegistry.c#3 $ + */ + +#include "deviceRegistry.h" + +#include +#include +#include + +#include "memoryAlloc.h" + +/* + * We don't expect this set to ever get really large, so a linked list + * is adequate. We can use a PointerMap if we need to later. + */ +typedef struct { + struct list_head links; + rwlock_t lock; +} DeviceRegistry; + +typedef struct { + struct list_head links; + KernelLayer *layer; +} RegisteredDevice; + +static DeviceRegistry registry; + +/**********************************************************************/ +void initializeDeviceRegistryOnce(void) +{ + INIT_LIST_HEAD(®istry.links); + rwlock_init(®istry.lock); +} + +/** + * Implements LayerFilter. + **/ +static bool layerIsEqual(KernelLayer *layer, void *context) +{ + return ((void *) layer == context); +} + +/** + * Find a layer in the registry if it exists there. Must be called holding + * the lock. + * + * @param filter The filter function to apply to devices + * @param context A bit of context to provide the filter. + * + * @return the layer object found, if any + **/ +__attribute__((warn_unused_result)) +static KernelLayer *filterLayersLocked(LayerFilter *filter, void *context) +{ + RegisteredDevice *device; + list_for_each_entry(device, ®istry.links, links) { + if (filter(device->layer, context)) { + return device->layer; + } + } + return NULL; +} + +/**********************************************************************/ +int addLayerToDeviceRegistry(KernelLayer *layer) +{ + RegisteredDevice *newDevice; + int result = ALLOCATE(1, RegisteredDevice, __func__, &newDevice); + if (result != VDO_SUCCESS) { + return result; + } + + INIT_LIST_HEAD(&newDevice->links); + newDevice->layer = layer; + + write_lock(®istry.lock); + KernelLayer *oldLayer = filterLayersLocked(layerIsEqual, layer); + result = ASSERT(oldLayer == NULL, "Layer not already registered"); + if (result == VDO_SUCCESS) { + list_add_tail(&newDevice->links, ®istry.links); + } + write_unlock(®istry.lock); + + return result; +} + +/**********************************************************************/ +void removeLayerFromDeviceRegistry(KernelLayer *layer) +{ + write_lock(®istry.lock); + RegisteredDevice *device = NULL; + list_for_each_entry(device, ®istry.links, links) { + if (device->layer == layer) { + list_del_init(&device->links); + FREE(device); + break; + } + } + write_unlock(®istry.lock); +} + +/**********************************************************************/ +KernelLayer *findLayerMatching(LayerFilter *filter, void *context) +{ + read_lock(®istry.lock); + KernelLayer *layer = filterLayersLocked(filter, context); + read_unlock(®istry.lock); + return layer; +} diff --git a/vdo/kernel/deviceRegistry.h b/vdo/kernel/deviceRegistry.h new file mode 100644 index 0000000..94c1635 --- /dev/null +++ b/vdo/kernel/deviceRegistry.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deviceRegistry.h#2 $ + */ + +#ifndef DEVICE_REGISTRY_H +#define DEVICE_REGISTRY_H + +#include "kernelTypes.h" + +/** + * Initialize the necessary structures for the device registry. + **/ +void initializeDeviceRegistryOnce(void); + +/** + * Add a layer to the device registry. The layer must not already exist in the + * registry. + * + * @param layer The layer to add + * + * @return VDO_SUCCESS or an error + **/ +int addLayerToDeviceRegistry(KernelLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Remove a layer from the device registry. + * + * @param layer The layer to remove + **/ +void removeLayerFromDeviceRegistry(KernelLayer *layer); + +/** + * Find and return the first (if any) layer matching a given filter function. + * + * @param filter The filter function to apply to layers + * @param context A bit of context to provide the filter. + **/ +KernelLayer *findLayerMatching(LayerFilter *filter, void *context); + +#endif // DEVICE_REGISTRY_H diff --git a/vdo/kernel/dmvdo.c b/vdo/kernel/dmvdo.c new file mode 100644 index 0000000..a6c7b98 --- /dev/null +++ b/vdo/kernel/dmvdo.c @@ -0,0 +1,889 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dmvdo.c#42 $ + */ + +#include "dmvdo.h" + +#include + +#include "logger.h" +#include "memoryAlloc.h" + +#include "constants.h" +#include "ringNode.h" +#include "threadConfig.h" +#include "vdo.h" + +#include "dedupeIndex.h" +#include "deviceRegistry.h" +#include "dump.h" +#include "instanceNumber.h" +#include "ioSubmitter.h" +#include "kernelLayer.h" +#include "kvdoFlush.h" +#include "memoryUsage.h" +#include "statusProcfs.h" +#include "stringUtils.h" +#include "sysfs.h" +#include "threadDevice.h" +#include "threadRegistry.h" + +struct kvdoDevice kvdoDevice; // global driver state (poorly named) + +/* + * Pre kernel version 4.3, we use the functionality in blkdev_issue_discard + * and the value in max_discard_sectors to split large discards into smaller + * ones. 
4.3 to 4.18 kernels have removed the code in blkdev_issue_discard + * and so in place of that, we use the code in device mapper itself to + * split the discards. Unfortunately, it uses the same value to split large + * discards as it does to split large data bios. + * + * In kernel version 4.18, support for splitting discards was added + * back into blkdev_issue_discard. Since this mode of splitting + * (based on max_discard_sectors) is preferable to splitting always + * on 4k, we are turning off the device mapper splitting from 4.18 + * on. + */ +#define HAS_NO_BLKDEV_SPLIT LINUX_VERSION_CODE >= KERNEL_VERSION(4,3,0) \ + && LINUX_VERSION_CODE < KERNEL_VERSION(4,18,0) + +/**********************************************************************/ + +/** + * Get the kernel layer associated with a dm target structure. + * + * @param ti The dm target structure + * + * @return The kernel layer, or NULL. + **/ +static KernelLayer *getKernelLayerForTarget(struct dm_target *ti) +{ + return ((DeviceConfig *) ti->private)->layer; +} + +/** + * Begin VDO processing of a bio. This is called by the device mapper + * through the "map" function, and has resulted from a call to either + * submit_bio or generic_make_request. + * + * @param ti The dm_target. We only need the "private" member to give + * us the KernelLayer. + * @param bio The bio. + * + * @return One of these values: + * + * negative A negative value is an error code. + * Usually -EIO. + * + * DM_MAPIO_SUBMITTED VDO will take care of this I/O, either + * processing it completely and calling + * bio_endio, or forwarding it onward by + * calling generic_make_request. + * + * DM_MAPIO_REMAPPED VDO has modified the bio and the device + * mapper will immediately forward the bio + * onward using generic_make_request. + * + * DM_MAPIO_REQUEUE We do not use this. It is used by device + * mapper devices to defer an I/O request + * during suspend/resume processing. + **/ +static int vdoMapBio(struct dm_target *ti, BIO *bio) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + return kvdoMapBio(layer, bio); +} + +/**********************************************************************/ +static void vdoIoHints(struct dm_target *ti, struct queue_limits *limits) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + + limits->logical_block_size = layer->deviceConfig->logicalBlockSize; + limits->physical_block_size = VDO_BLOCK_SIZE; + + // The minimum io size for random io + blk_limits_io_min(limits, VDO_BLOCK_SIZE); + // The optimal io size for streamed/sequential io + blk_limits_io_opt(limits, VDO_BLOCK_SIZE); + + /* + * Sets the maximum discard size that will be passed into VDO. This value + * comes from a table line value passed in during dmsetup create. + * + * The value 1024 is the largest usable value on HD systems. A 2048 sector + * discard on a busy HD system takes 31 seconds. We should use a value no + * higher than 1024, which takes 15 to 16 seconds on a busy HD system. + * + * But using large values results in 120 second blocked task warnings in + * /var/log/kern.log. In order to avoid these warnings, we choose to use the + * smallest reasonable value. See VDO-3062 and VDO-3087. + * + * We allow setting of the value for max_discard_sectors even in situations + * where we only split on 4k (see comments for HAS_NO_BLKDEV_SPLIT) as the + * value is still used in other code, like sysfs display of queue limits and + * most especially in dm-thin to determine whether to pass down discards. 
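+   *
+   * Note that maxDiscardBlocks is expressed in VDO blocks and is converted
+   * to 512-byte sectors (VDO_SECTORS_PER_BLOCK per block) below.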
+ */ + limits->max_discard_sectors + = layer->deviceConfig->maxDiscardBlocks * VDO_SECTORS_PER_BLOCK; + + limits->discard_granularity = VDO_BLOCK_SIZE; +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0) + limits->discard_zeroes_data = 1; +#endif +} + +/**********************************************************************/ +static int vdoIterateDevices(struct dm_target *ti, + iterate_devices_callout_fn fn, + void *data) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + sector_t len = blockToSector(layer, layer->deviceConfig->physicalBlocks); + + return fn(ti, layer->deviceConfig->ownedDevice, 0, len, data); +} + +/* + * Status line is: + * + * + */ + +/**********************************************************************/ +static void vdoStatus(struct dm_target *ti, + status_type_t status_type, + unsigned int status_flags, + char *result, + unsigned int maxlen) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + char nameBuffer[BDEVNAME_SIZE]; + // N.B.: The DMEMIT macro uses the variables named "sz", "result", "maxlen". + int sz = 0; + + switch (status_type) { + case STATUSTYPE_INFO: + // Report info for dmsetup status + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + VDOStatistics *stats = &layer->vdoStatsStorage; + DMEMIT("/dev/%s %s %s %s %s %llu %llu", + bdevname(getKernelLayerBdev(layer), nameBuffer), + stats->mode, + stats->inRecoveryMode ? "recovering" : "-", + getDedupeStateName(layer->dedupeIndex), + getKVDOCompressing(&layer->kvdo) ? "online" : "offline", + stats->dataBlocksUsed + stats->overheadBlocksUsed, + stats->physicalBlocks); + mutex_unlock(&layer->statsMutex); + break; + + case STATUSTYPE_TABLE: + // Report the string actually specified in the beginning. + DMEMIT("%s", ((DeviceConfig *) ti->private)->originalString); + break; + } + +// spin_unlock_irqrestore(&layer->lock, flags); +} + + +/** + * Get the size of the underlying device, in blocks. + * + * @param [in] layer The layer + * + * @return The size in blocks + **/ +static BlockCount getUnderlyingDeviceBlockCount(KernelLayer *layer) +{ + uint64_t physicalSize = i_size_read(getKernelLayerBdev(layer)->bd_inode); + return physicalSize / VDO_BLOCK_SIZE; +} + +/**********************************************************************/ +static int vdoPrepareToGrowLogical(KernelLayer *layer, char *sizeString) +{ + BlockCount logicalCount; + if (sscanf(sizeString, "%llu", &logicalCount) != 1) { + logWarning("Logical block count \"%s\" is not a number", sizeString); + return -EINVAL; + } + + if (logicalCount > MAXIMUM_LOGICAL_BLOCKS) { + logWarning("Logical block count \"%llu\" exceeds the maximum (%" + PRIu64 ")", logicalCount, MAXIMUM_LOGICAL_BLOCKS); + return -EINVAL; + } + + return prepareToResizeLogical(layer, logicalCount); +} + +/** + * Process a dmsetup message now that we know no other message is being + * processed. + * + * @param layer The layer to which the message was sent + * @param argc The argument count of the message + * @param argv The arguments to the message + * + * @return -EINVAL if the message is unrecognized or the result of processing + * the message + **/ +__attribute__((warn_unused_result)) +static int processVDOMessageLocked(KernelLayer *layer, + unsigned int argc, + char **argv) +{ + // Messages with variable numbers of arguments. 
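+  // Any message whose name starts with "x-" is forwarded, along with all
+  // of its arguments, to the extended command handler in the KVDO layer.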
+ if (strncasecmp(argv[0], "x-", 2) == 0) { + int result = performKVDOExtendedCommand(&layer->kvdo, argc, argv); + if (result == VDO_UNKNOWN_COMMAND) { + logWarning("unknown extended command '%s' to dmsetup message", argv[0]); + result = -EINVAL; + } + + return result; + } + + // Messages with fixed numbers of arguments. + switch (argc) { + case 1: + if (strcasecmp(argv[0], "sync-dedupe") == 0) { + waitForNoRequestsActive(layer); + return 0; + } + + if (strcasecmp(argv[0], "trace-on") == 0) { + logInfo("Tracing on"); + layer->traceLogging = true; + return 0; + } + + if (strcasecmp(argv[0], "trace-off") == 0) { + logInfo("Tracing off"); + layer->traceLogging = false; + return 0; + } + + if (strcasecmp(argv[0], "prepareToGrowPhysical") == 0) { + return prepareToResizePhysical(layer, + getUnderlyingDeviceBlockCount(layer)); + } + + if (strcasecmp(argv[0], "growPhysical") == 0) { + // The actual growPhysical will happen when the device is resumed. + + if (layer->deviceConfig->version != 0) { + // XXX Uncomment this branch when new VDO manager is updated to not + // send this message. + + // Old style message on new style table is unexpected; it means the + // user started the VDO with new manager and is growing with old. + // logInfo("Mismatch between growPhysical method and table version."); + // return -EINVAL; + } else { + layer->deviceConfig->physicalBlocks + = getUnderlyingDeviceBlockCount(layer); + } + return 0; + } + + break; + + case 2: + if (strcasecmp(argv[0], "compression") == 0) { + if (strcasecmp(argv[1], "on") == 0) { + setKVDOCompressing(&layer->kvdo, true); + return 0; + } + + if (strcasecmp(argv[1], "off") == 0) { + setKVDOCompressing(&layer->kvdo, false); + return 0; + } + + logWarning("invalid argument '%s' to dmsetup compression message", + argv[1]); + return -EINVAL; + } + + if (strcasecmp(argv[0], "prepareToGrowLogical") == 0) { + return vdoPrepareToGrowLogical(layer, argv[1]); + } + + break; + + + default: + break; + } + + logWarning("unrecognized dmsetup message '%s' received", argv[0]); + return -EINVAL; +} + +/** + * Process a dmsetup message. If the message is a dump, just do it. Otherwise, + * check that no other message is being processed, and only proceed if so. + * + * @param layer The layer to which the message was sent + * @param argc The argument count of the message + * @param argv The arguments to the message + * + * @return -EBUSY if another message is being processed or the result of + * processsing the message + **/ +__attribute__((warn_unused_result)) +static int processVDOMessage(KernelLayer *layer, + unsigned int argc, + char **argv) +{ + /* + * All messages which may be processed in parallel with other messages should + * be handled here before the atomic check below. Messages which should be + * exclusive should be processed in processVDOMessageLocked(). + */ + + // Dump messages should always be processed + if (strcasecmp(argv[0], "dump") == 0) { + return vdoDump(layer, argc, argv, "dmsetup message"); + } + + if (argc == 1) { + if (strcasecmp(argv[0], "dump-on-shutdown") == 0) { + layer->dumpOnShutdown = true; + return 0; + } + + // Index messages should always be processed + if ((strcasecmp(argv[0], "index-close") == 0) + || (strcasecmp(argv[0], "index-create") == 0) + || (strcasecmp(argv[0], "index-disable") == 0) + || (strcasecmp(argv[0], "index-enable") == 0)) { + return messageDedupeIndex(layer->dedupeIndex, argv[0]); + } + + // XXX - the "connect" messages are misnamed for the kernel index. 
These + // messages should go away when all callers have been fixed to use + // "index-enable" or "index-disable". + if (strcasecmp(argv[0], "reconnect") == 0) { + return messageDedupeIndex(layer->dedupeIndex, "index-enable"); + } + + if (strcasecmp(argv[0], "connect") == 0) { + return messageDedupeIndex(layer->dedupeIndex, "index-enable"); + } + + if (strcasecmp(argv[0], "disconnect") == 0) { + return messageDedupeIndex(layer->dedupeIndex, "index-disable"); + } + } + + if (!compareAndSwapBool(&layer->processingMessage, false, true)) { + return -EBUSY; + } + + int result = processVDOMessageLocked(layer, argc, argv); + atomicStoreBool(&layer->processingMessage, false); + return result; +} + +/**********************************************************************/ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,17,0) +static int vdoMessage(struct dm_target *ti, + unsigned int argc, + char **argv, + char *resultBuffer, + unsigned int maxlen) +#else +static int vdoMessage(struct dm_target *ti, unsigned int argc, char **argv) +#endif +{ + if (argc == 0) { + logWarning("unspecified dmsetup message"); + return -EINVAL; + } + + KernelLayer *layer = getKernelLayerForTarget(ti); + RegisteredThread allocatingThread, instanceThread; + registerAllocatingThread(&allocatingThread, NULL); + registerThreadDevice(&instanceThread, layer); + int result = processVDOMessage(layer, argc, argv); + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + return mapToSystemError(result); +} + +/** + * Configure the dm_target with our capabilities. + * + * @param ti The device mapper target representing our device + * @param layer The kernel layer to get the write policy from + **/ +static void configureTargetCapabilities(struct dm_target *ti, + KernelLayer *layer) +{ + ti->discards_supported = 1; + + /** + * This may appear to indicate we don't support flushes in sync mode. + * However, dm will set up the request queue to accept flushes if any + * device in the stack accepts flushes. Hence if the device under VDO + * accepts flushes, we will receive flushes. + **/ + ti->flush_supported = shouldProcessFlush(layer); + ti->num_discard_bios = 1; + ti->num_flush_bios = 1; + + // If this value changes, please make sure to update the + // value for maxDiscardSectors accordingly. + BUG_ON(dm_set_target_max_io_len(ti, VDO_SECTORS_PER_BLOCK) != 0); + +/* + * Please see comments above where the macro is defined. + */ +#if HAS_NO_BLKDEV_SPLIT + ti->split_discard_bios = 1; +#endif +} + +/** + * Handle a vdoInitialize failure, freeing all appropriate structures. + * + * @param ti The device mapper target representing our device + * @param threadConfig The thread config (possibly NULL) + * @param layer The kernel layer (possibly NULL) + * @param instance The instance number to be released + * @param why The reason for failure + **/ +static void cleanupInitialize(struct dm_target *ti, + ThreadConfig *threadConfig, + KernelLayer *layer, + unsigned int instance, + char *why) +{ + if (threadConfig != NULL) { + freeThreadConfig(&threadConfig); + } + if (layer != NULL) { + // This releases the instance number too. + freeKernelLayer(layer); + } else { + // With no KernelLayer taking ownership we have to release explicitly. 
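+    // (freeKernelLayer() cannot be used here because no layer was created;
+    // releasing the instance number directly avoids leaking it.)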
+ releaseKVDOInstance(instance); + } + + ti->error = why; +} + +/** + * Initializes a single VDO instance and loads the data from disk + * + * @param ti The device mapper target representing our device + * @param instance The device instantiation counter + * @param config The parsed config for the instance + * + * @return VDO_SUCCESS or an error code + * + **/ +static int vdoInitialize(struct dm_target *ti, + unsigned int instance, + DeviceConfig *config) +{ + logInfo("loading device '%s'", config->poolName); + + uint64_t blockSize = VDO_BLOCK_SIZE; + uint64_t logicalSize = to_bytes(ti->len); + BlockCount logicalBlocks = logicalSize / blockSize; + + logDebug("Logical block size = %llu", + (uint64_t) config->logicalBlockSize); + logDebug("Logical blocks = %llu", logicalBlocks); + logDebug("Physical block size = %llu", (uint64_t) blockSize); + logDebug("Physical blocks = %llu", config->physicalBlocks); + logDebug("Block map cache blocks = %u", config->cacheSize); + logDebug("Block map maximum age = %u", config->blockMapMaximumAge); + logDebug("MD RAID5 mode = %s", (config->mdRaid5ModeEnabled + ? "on" : "off")); + logDebug("Write policy = %s", getConfigWritePolicyString(config)); + logDebug("Deduplication = %s", (config->deduplication + ? "on" : "off")); + + // The threadConfig will be copied by the VDO if it's successfully + // created. + VDOLoadConfig loadConfig = { + .cacheSize = config->cacheSize, + .threadConfig = NULL, + .writePolicy = config->writePolicy, + .maximumAge = config->blockMapMaximumAge, + }; + + char *failureReason; + KernelLayer *layer; + int result = makeKernelLayer(ti->begin, instance, config, + &kvdoDevice.kobj, &loadConfig.threadConfig, + &failureReason, &layer); + if (result != VDO_SUCCESS) { + logError("Could not create kernel physical layer. (VDO error %d," + " message %s)", result, failureReason); + cleanupInitialize(ti, loadConfig.threadConfig, NULL, instance, + failureReason); + return result; + } + + // Now that we have read the geometry, we can finish setting up the + // VDOLoadConfig. + setLoadConfigFromGeometry(&layer->geometry, &loadConfig); + + if (config->cacheSize < (2 * MAXIMUM_USER_VIOS + * loadConfig.threadConfig->logicalZoneCount)) { + logWarning("Insufficient block map cache for logical zones"); + cleanupInitialize(ti, loadConfig.threadConfig, layer, instance, + "Insufficient block map cache for logical zones"); + return VDO_BAD_CONFIGURATION; + } + + // Henceforth it is the kernel layer's responsibility to clean up the + // ThreadConfig. + result = preloadKernelLayer(layer, &loadConfig, &failureReason); + if (result != VDO_SUCCESS) { + logError("Could not start kernel physical layer. 
(VDO error %d," + " message %s)", result, failureReason); + cleanupInitialize(ti, NULL, layer, instance, failureReason); + return result; + } + + setDeviceConfigLayer(config, layer); + setKernelLayerActiveConfig(layer, config); + ti->private = config; + configureTargetCapabilities(ti, layer); + return VDO_SUCCESS; +} + +/**********************************************************************/ +static int vdoCtr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int result = VDO_SUCCESS; + + RegisteredThread allocatingThread; + registerAllocatingThread(&allocatingThread, NULL); + + const char *deviceName = dm_device_name(dm_table_get_md(ti->table)); + KernelLayer *oldLayer = findLayerMatching(layerIsNamed, (void *)deviceName); + unsigned int instance; + if (oldLayer == NULL) { + result = allocateKVDOInstance(&instance); + if (result != VDO_SUCCESS) { + unregisterAllocatingThread(); + return -ENOMEM; + } + } else { + instance = oldLayer->instance; + } + + RegisteredThread instanceThread; + registerThreadDeviceID(&instanceThread, &instance); + + bool verbose = (oldLayer == NULL); + DeviceConfig *config = NULL; + result = parseDeviceConfig(argc, argv, ti, verbose, &config); + if (result != VDO_SUCCESS) { + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + if (oldLayer == NULL) { + releaseKVDOInstance(instance); + } + return -EINVAL; + } + + // Is there already a device of this name? + if (oldLayer != NULL) { + /* + * To preserve backward compatibility with old VDO Managers, we need to + * allow this to happen when either suspended or not. We could assert + * that if the config is version 0, we are suspended, and if not, we + * are not, but we can't do that till new VDO Manager does the right + * order. + */ + logInfo("preparing to modify device '%s'", config->poolName); + result = prepareToModifyKernelLayer(oldLayer, config, &ti->error); + if (result != VDO_SUCCESS) { + result = mapToSystemError(result); + freeDeviceConfig(&config); + } else { + setDeviceConfigLayer(config, oldLayer); + ti->private = config; + configureTargetCapabilities(ti, oldLayer); + } + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + return result; + } + + result = vdoInitialize(ti, instance, config); + if (result != VDO_SUCCESS) { + // vdoInitialize calls into various VDO routines, so map error + result = mapToSystemError(result); + freeDeviceConfig(&config); + } + + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + return result; +} + +/**********************************************************************/ +static void vdoDtr(struct dm_target *ti) +{ + DeviceConfig *config = ti->private; + KernelLayer *layer = config->layer; + + setDeviceConfigLayer(config, NULL); + + if (isRingEmpty(&layer->deviceConfigRing)) { + // This was the last config referencing the layer. Free it. + unsigned int instance = layer->instance; + RegisteredThread allocatingThread, instanceThread; + registerThreadDeviceID(&instanceThread, &instance); + registerAllocatingThread(&allocatingThread, NULL); + + waitForNoRequestsActive(layer); + logInfo("stopping device '%s'", config->poolName); + + if (layer->dumpOnShutdown) { + vdoDumpAll(layer, "device shutdown"); + } + + freeKernelLayer(layer); + logInfo("device '%s' stopped", config->poolName); + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + } else if (config == layer->deviceConfig) { + // The layer still references this config. Give it a reference to a + // config that isn't being destroyed. 
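+    // (isRingEmpty() returned false above, so the ring still holds at
+    // least one other config that can take over as the active one.)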
+ layer->deviceConfig = asDeviceConfig(layer->deviceConfigRing.next); + } + + freeDeviceConfig(&config); + ti->private = NULL; +} + +/**********************************************************************/ +static void vdoPresuspend(struct dm_target *ti) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + RegisteredThread instanceThread; + registerThreadDevice(&instanceThread, layer); + if (dm_noflush_suspending(ti)) { + layer->noFlushSuspend = true; + } + unregisterThreadDeviceID(); +} + +/**********************************************************************/ +static void vdoPostsuspend(struct dm_target *ti) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + RegisteredThread instanceThread; + registerThreadDevice(&instanceThread, layer); + const char *poolName = layer->deviceConfig->poolName; + logInfo("suspending device '%s'", poolName); + int result = suspendKernelLayer(layer); + if (result == VDO_SUCCESS) { + logInfo("device '%s' suspended", poolName); + } else { + logError("suspend of device '%s' failed with error: %d", poolName, result); + } + layer->noFlushSuspend = false; + unregisterThreadDeviceID(); +} + +/**********************************************************************/ +static int vdoPreresume(struct dm_target *ti) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + DeviceConfig *config = ti->private; + RegisteredThread instanceThread; + + BlockCount backingBlocks = getUnderlyingDeviceBlockCount(layer); + if (backingBlocks < config->physicalBlocks) { + logError("resume of device '%s' failed: backing device has %" PRIu64 + " blocks but VDO physical size is %llu blocks", + config->poolName, backingBlocks, config->physicalBlocks); + return -EINVAL; + } + + registerThreadDevice(&instanceThread, layer); + + if (getKernelLayerState(layer) == LAYER_STARTING) { + // This is the first time this device has been resumed, so run it. + logInfo("starting device '%s'", config->poolName); + char *failureReason; + int result = startKernelLayer(layer, &failureReason); + if (result != VDO_SUCCESS) { + logError("Could not run kernel physical layer. (VDO error %d," + " message %s)", result, failureReason); + setKVDOReadOnly(&layer->kvdo, result); + unregisterThreadDeviceID(); + return mapToSystemError(result); + } + + logInfo("device '%s' started", config->poolName); + } + + logInfo("resuming device '%s'", config->poolName); + + // This is a noop if nothing has changed, and by calling it every time + // we capture old-style growPhysicals, which change the config in place. 
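+  // (See the "growPhysical" dmsetup message handler: with a version 0
+  // table it updates config->physicalBlocks in place, and that change is
+  // only committed here, at resume time.)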
+ int result = modifyKernelLayer(layer, config); + if (result != VDO_SUCCESS) { + logErrorWithStringError(result, "Commit of modifications to device '%s'" + " failed", config->poolName); + setKernelLayerActiveConfig(layer, config); + setKVDOReadOnly(&layer->kvdo, result); + } else { + setKernelLayerActiveConfig(layer, config); + result = resumeKernelLayer(layer); + if (result != VDO_SUCCESS) { + logError("resume of device '%s' failed with error: %d", + layer->deviceConfig->poolName, result); + } + } + unregisterThreadDeviceID(); + return mapToSystemError(result); +} + +/**********************************************************************/ +static void vdoResume(struct dm_target *ti) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + RegisteredThread instanceThread; + registerThreadDevice(&instanceThread, layer); + logInfo("device '%s' resumed", layer->deviceConfig->poolName); + unregisterThreadDeviceID(); +} + +/* + * If anything changes that affects how user tools will interact + * with vdo, update the version number and make sure + * documentation about the change is complete so tools can + * properly update their management code. + */ +static struct target_type vdoTargetBio = { + .features = DM_TARGET_SINGLETON, + .name = "vdo", + .version = {6, 2, 3}, + .module = THIS_MODULE, + .ctr = vdoCtr, + .dtr = vdoDtr, + .io_hints = vdoIoHints, + .iterate_devices = vdoIterateDevices, + .map = vdoMapBio, + .message = vdoMessage, + .status = vdoStatus, + .presuspend = vdoPresuspend, + .postsuspend = vdoPostsuspend, + .preresume = vdoPreresume, + .resume = vdoResume, +}; + +static bool dmRegistered = false; +static bool sysfsInitialized = false; + +/**********************************************************************/ +static void vdoDestroy(void) +{ + logDebug("in %s", __func__); + + kvdoDevice.status = SHUTTING_DOWN; + + if (sysfsInitialized) { + vdoPutSysfs(&kvdoDevice.kobj); + } + vdoDestroyProcfs(); + + kvdoDevice.status = UNINITIALIZED; + + if (dmRegistered) { + dm_unregister_target(&vdoTargetBio); + } + + cleanUpInstanceNumberTracking(); + + logInfo("unloaded version %s", CURRENT_VERSION); +} + +/**********************************************************************/ +static int __init vdoInit(void) +{ + int result = 0; + + initializeThreadDeviceRegistry(); + initializeStandardErrorBlocks(); + initializeDeviceRegistryOnce(); + logInfo("loaded version %s", CURRENT_VERSION); + + result = dm_register_target(&vdoTargetBio); + if (result < 0) { + logError("dm_register_target failed %d", result); + vdoDestroy(); + return result; + } + dmRegistered = true; + + kvdoDevice.status = UNINITIALIZED; + + vdoInitProcfs(); + /* + * Set up global sysfs stuff + */ + result = vdoInitSysfs(&kvdoDevice.kobj); + if (result < 0) { + logError("sysfs initialization failed %d", result); + vdoDestroy(); + // vdoInitSysfs only returns system error codes + return result; + } + sysfsInitialized = true; + + initWorkQueueOnce(); + initializeTraceLoggingOnce(); + initKernelVDOOnce(); + initializeInstanceNumberTracking(); + + kvdoDevice.status = READY; + return result; +} + +/**********************************************************************/ +static void __exit vdoExit(void) +{ + vdoDestroy(); +} + +module_init(vdoInit); +module_exit(vdoExit); + +MODULE_DESCRIPTION(DM_NAME " target for transparent deduplication"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); +MODULE_VERSION(CURRENT_VERSION); diff --git a/vdo/kernel/dmvdo.h b/vdo/kernel/dmvdo.h new file mode 100644 index 0000000..a71e39d --- 
/dev/null +++ b/vdo/kernel/dmvdo.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dmvdo.h#2 $ + */ + +#ifndef DMVDO_H +#define DMVDO_H + +#include +#include +#include + +#include "kernelLayer.h" + +typedef enum { + UNINITIALIZED = 0, + READY, + SHUTTING_DOWN, +} KVDOStatus; + +/* + * The internal representation of our device. + */ +struct kvdoDevice { + KVDOStatus status; + struct kobject kobj; +}; + +extern struct kvdoDevice kvdoDevice; + +#endif /* DMVDO_H */ diff --git a/vdo/kernel/dump.c b/vdo/kernel/dump.c new file mode 100644 index 0000000..b9b02e2 --- /dev/null +++ b/vdo/kernel/dump.c @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dump.c#2 $ + */ + +#include "dump.h" + +#include + +#include "memoryAlloc.h" +#include "typeDefs.h" + +#include "constants.h" +#include "vdo.h" + +#include "dedupeIndex.h" +#include "histogram.h" +#include "ioSubmitter.h" +#include "logger.h" + +enum dumpOptions { + // WorkQueues + SHOW_ALBIREO_QUEUE, + SHOW_BIO_ACK_QUEUE, + SHOW_BIO_QUEUE, + SHOW_CPU_QUEUES, + SHOW_REQUEST_QUEUE, + // MemoryPools + SHOW_VIO_POOL, + // Others + SHOW_VDO_STATUS, + // This one means an option overrides the "default" choices, instead + // of altering them. 
+ SKIP_DEFAULT +}; + +enum dumpOptionFlags { + // WorkQueues + FLAG_SHOW_ALBIREO_QUEUE = (1 << SHOW_ALBIREO_QUEUE), + FLAG_SHOW_BIO_ACK_QUEUE = (1 << SHOW_BIO_ACK_QUEUE), + FLAG_SHOW_BIO_QUEUE = (1 << SHOW_BIO_QUEUE), + FLAG_SHOW_CPU_QUEUES = (1 << SHOW_CPU_QUEUES), + FLAG_SHOW_REQUEST_QUEUE = (1 << SHOW_REQUEST_QUEUE), + // MemoryPools + FLAG_SHOW_VIO_POOL = (1 << SHOW_VIO_POOL), + // Others + FLAG_SHOW_VDO_STATUS = (1 << SHOW_VDO_STATUS), + // Special + FLAG_SKIP_DEFAULT = (1 << SKIP_DEFAULT) + }; + +enum { + FLAGS_ALL_POOLS = (FLAG_SHOW_VIO_POOL), + FLAGS_ALL_QUEUES = (FLAG_SHOW_REQUEST_QUEUE + | FLAG_SHOW_ALBIREO_QUEUE + | FLAG_SHOW_BIO_ACK_QUEUE + | FLAG_SHOW_BIO_QUEUE + | FLAG_SHOW_CPU_QUEUES), + FLAGS_ALL_THREADS = (FLAGS_ALL_QUEUES), + DEFAULT_DUMP_FLAGS = (FLAGS_ALL_THREADS | FLAG_SHOW_VDO_STATUS) +}; + +/**********************************************************************/ +static inline bool isArgString(const char *arg, const char *thisOption) +{ + // device-mapper convention seems to be case-independent options + return strncasecmp(arg, thisOption, strlen(thisOption)) == 0; +} + +/**********************************************************************/ +static void doDump(KernelLayer *layer, + unsigned int dumpOptionsRequested, + const char *why) +{ + logInfo("%s dump triggered via %s", THIS_MODULE->name, why); + // XXX Add in number of outstanding requests being processed by vdo + uint32_t active, maximum; + getLimiterValuesAtomically(&layer->requestLimiter, &active, &maximum); + int64_t outstanding = atomic64_read(&layer->biosSubmitted) + - atomic64_read(&layer->biosCompleted); + logInfo("%" PRIu32 " device requests outstanding (max %" PRIu32 "), " + "%" PRId64 " bio requests outstanding, poolName '%s'", + active, maximum, outstanding, layer->deviceConfig->poolName); + if ((dumpOptionsRequested & FLAG_SHOW_REQUEST_QUEUE) != 0) { + dumpKVDOWorkQueue(&layer->kvdo); + } + if ((dumpOptionsRequested & FLAG_SHOW_BIO_QUEUE) != 0) { + dumpBioWorkQueue(layer->ioSubmitter); + } + if (useBioAckQueue(layer) + && ((dumpOptionsRequested & FLAG_SHOW_BIO_ACK_QUEUE) != 0)) { + dumpWorkQueue(layer->bioAckQueue); + } + if ((dumpOptionsRequested & FLAG_SHOW_CPU_QUEUES) != 0) { + dumpWorkQueue(layer->cpuQueue); + } + dumpDedupeIndex(layer->dedupeIndex, + (dumpOptionsRequested & FLAG_SHOW_ALBIREO_QUEUE) != 0); + dumpBufferPool(layer->dataKVIOPool, + (dumpOptionsRequested & FLAG_SHOW_VIO_POOL) != 0); + if ((dumpOptionsRequested & FLAG_SHOW_VDO_STATUS) != 0) { + // Options should become more fine-grained when we have more to + // display here. + dumpKVDOStatus(&layer->kvdo); + } + reportMemoryUsage(); + logInfo("end of %s dump", THIS_MODULE->name); +} + +/**********************************************************************/ +static int parseDumpOptions(unsigned int argc, + char * const *argv, + unsigned int *dumpOptionsRequestedPtr) +{ + unsigned int dumpOptionsRequested = 0; + + static const struct { + const char *name; + unsigned int flags; + } optionNames[] = { + // Should "albireo" mean sending queue + receiving thread + outstanding? 
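+    // (isArgString() matches by prefix, so longer names such as "bioack"
+    // must be listed before names that are their prefixes, like "bio".)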
+ { "dedupe", FLAG_SKIP_DEFAULT | FLAG_SHOW_ALBIREO_QUEUE }, + { "dedupeq", FLAG_SKIP_DEFAULT | FLAG_SHOW_ALBIREO_QUEUE }, + { "kvdodedupeq", FLAG_SKIP_DEFAULT | FLAG_SHOW_ALBIREO_QUEUE }, + { "bioack", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_ACK_QUEUE }, + { "kvdobioackq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_ACK_QUEUE }, + { "bioackq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_ACK_QUEUE }, + { "bio", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_QUEUE }, + { "kvdobioq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_QUEUE }, + { "bioq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_QUEUE }, + { "cpu", FLAG_SKIP_DEFAULT | FLAG_SHOW_CPU_QUEUES }, + { "kvdocpuq", FLAG_SKIP_DEFAULT | FLAG_SHOW_CPU_QUEUES }, + { "cpuq", FLAG_SKIP_DEFAULT | FLAG_SHOW_CPU_QUEUES }, + { "request", FLAG_SKIP_DEFAULT | FLAG_SHOW_REQUEST_QUEUE }, + { "kvdoreqq", FLAG_SKIP_DEFAULT | FLAG_SHOW_REQUEST_QUEUE }, + { "reqq", FLAG_SKIP_DEFAULT | FLAG_SHOW_REQUEST_QUEUE }, + { "viopool", FLAG_SKIP_DEFAULT | FLAG_SHOW_VIO_POOL }, + { "vdo", FLAG_SKIP_DEFAULT | FLAG_SHOW_VDO_STATUS }, + + { "pools", FLAG_SKIP_DEFAULT | FLAGS_ALL_POOLS }, + { "queues", FLAG_SKIP_DEFAULT | FLAGS_ALL_QUEUES }, + { "threads", FLAG_SKIP_DEFAULT | FLAGS_ALL_THREADS }, + { "default", FLAG_SKIP_DEFAULT | DEFAULT_DUMP_FLAGS }, + { "all", ~0 }, + }; + + bool optionsOkay = true; + for (int i = 1; i < argc; i++) { + int j; + for (j = 0; j < COUNT_OF(optionNames); j++) { + if (isArgString(argv[i], optionNames[j].name)) { + dumpOptionsRequested |= optionNames[j].flags; + break; + } + } + if (j == COUNT_OF(optionNames)) { + logWarning("dump option name '%s' unknown", argv[i]); + optionsOkay = false; + } + } + if (!optionsOkay) { + return -EINVAL; + } + if ((dumpOptionsRequested & FLAG_SKIP_DEFAULT) == 0) { + dumpOptionsRequested |= DEFAULT_DUMP_FLAGS; + } + *dumpOptionsRequestedPtr = dumpOptionsRequested; + return 0; +} + +/**********************************************************************/ +int vdoDump(KernelLayer *layer, + unsigned int argc, + char * const *argv, + const char *why) +{ + unsigned int dumpOptionsRequested = 0; + int result = parseDumpOptions(argc, argv, &dumpOptionsRequested); + if (result != 0) { + return result; + } + doDump(layer, dumpOptionsRequested, why); + return 0; +} + +/**********************************************************************/ +void vdoDumpAll(KernelLayer *layer, const char *why) +{ + doDump(layer, ~0, why); +} diff --git a/vdo/kernel/dump.h b/vdo/kernel/dump.h new file mode 100644 index 0000000..5187d4f --- /dev/null +++ b/vdo/kernel/dump.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dump.h#1 $ + */ + +#ifndef DUMP_H +#define DUMP_H + +#include "kernelLayer.h" + +/** + * Dump internal state and/or statistics to the kernel log, as + * specified by zero or more string arguments. + * + * @param layer The kernel layer + * @param argc Number of arguments + * @param argv The argument list + * @param why Reason for doing the dump + **/ +int vdoDump(KernelLayer *layer, + unsigned int argc, + char * const *argv, + const char *why); + +/** + * Dump lots of internal state and statistics to the kernel log. + * Identical to "dump all", without each caller needing to set up the + * argument list. + * + * @param layer The kernel layer + * @param why Reason for doing the dump + **/ +void vdoDumpAll(KernelLayer *layer, const char *why); + +#endif // DUMP_H diff --git a/vdo/kernel/errors.c b/vdo/kernel/errors.c new file mode 100644 index 0000000..dc9303e --- /dev/null +++ b/vdo/kernel/errors.c @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/errors.c#2 $ + */ + +#include "errors.h" + +#include +#include +#include + +#include "permassert.h" +#include "statusCodes.h" + +static const struct errorInfo errorList[] = { + { "UDS_UNINITIALIZED", "UDS library is not initialized" }, + { "UDS_SHUTTINGDOWN", "UDS library is shutting down" }, + { "UDS_EMODULE_LOAD", "Could not load modules" }, + { "UDS_ENOTHREADS", "Could not create a new thread" }, + { "UDS_NOCONTEXT", "Could not find the requested library context" }, + { "UDS_DISABLED", "UDS library context is disabled" }, + { "UDS_CORRUPT_FILE", "Corrupt file" }, + { "UDS_UNKNOWN_ERROR", "Unknown error" }, + { "UDS_GRID_NO_SERVERS", "No servers in grid configuration" }, + { "UDS_GRID_CONFIG_INCONSISTENT", "Grid configuration inconsistent" }, + { "UDS_UNSUPPORTED_VERSION", "Unsupported version" }, + { "UDS_NO_INDEXSESSION", "Index session not known" }, + { "UDS_CORRUPT_DATA", "Index data in memory is corrupt" }, + { "UDS_SHORT_READ", "Could not read requested number of bytes" }, + { "UDS_AI_ERROR", "Network address and service translation error" }, + { "UDS_RESOURCE_LIMIT_EXCEEDED", "Internal resource limits exceeded" }, + { "UDS_WRONG_CONTEXT_TYPE", "Context type mismatch" }, + { "UDS_BLOCK_ADDRESS_REQUIRED", "A block address is required" }, + { "UDS_CHUNK_DATA_REQUIRED", "Block data is required" }, + { "UDS_CHUNK_NAME_REQUIRED", "A chunk name is required" }, + { "UDS_CONF_PTR_REQUIRED", "A configuration pointer is required" }, + { "UDS_INDEX_STATS_PTR_REQUIRED", "An index stats pointer is required" }, + { "UDS_CONTEXT_STATS_PTR_REQUIRED", "A context stats pointer is required" }, + { "UDS_CONTEXT_PTR_REQUIRED", "A context pointer is required" }, + { "UDS_FILEID_REQUIRED", "A file ID is required" 
}, + { "UDS_STREAM_REQUIRED", "A stream is required" }, + { "UDS_STREAMID_REQUIRED", "A stream ID is required" }, + { "UDS_STREAM_PTR_REQUIRED", "A stream pointer is required" }, + { "UDS_INVALID_MEMORY_SIZE", + "Configured memory too small or unsupported size" }, + { "UDS_INVALID_METADATA_SIZE", "Invalid metadata size" }, + { "UDS_INDEX_NAME_REQUIRED", "An index name is required" }, + { "UDS_CONF_REQUIRED", "A configuration is required" }, + { "UDS_BAD_FILE_DESCRIPTOR", "Bad file descriptor" }, + { "UDS_INDEX_EXISTS", "Index already exists" }, + { "UDS_REQUESTS_OUT_OF_RANGE", "Maximum request value out of range" }, + { "UDS_BAD_NAMESPACE", "Bad namespace" }, + { "UDS_MIGRATOR_MISMATCH", + "Migrator arguments do not match reader arguments" }, + { "UDS_NO_INDEX", "No index found" }, + { "UDS_BAD_CHECKPOINT_FREQUENCY", "Checkpoint frequency out of range" }, + { "UDS_WRONG_INDEX_CONFIG", "Wrong type of index configuration" }, + { "UDS_INDEX_PATH_NOT_DIR", "Index path does not point to a directory" }, + { "UDS_ALREADY_OPEN", "Open invoked on already opened connection" }, + { "UDS_CALLBACK_ALREADY_REGISTERED", "Callback already registered" }, + { "UDS_INDEX_PATH_TOO_LONG", "Index path too long" }, + { "UDS_END_OF_FILE", "Unexpected end of file" }, + { "UDS_INDEX_NOT_SAVED_CLEANLY", "Index not saved cleanly" }, +}; + +static const struct errorInfo internalErrorList[] = { + { "UDS_PROTOCOL_ERROR", "Client/server protocol error" }, + { "UDS_OVERFLOW", "Index overflow" }, + { "UDS_FILLDONE", "Fill phase done" }, + { "UDS_INVALID_ARGUMENT", "Invalid argument passed to internal routine" }, + { "UDS_BAD_STATE", "UDS data structures are in an invalid state" }, + { "UDS_DUPLICATE_NAME", + "Attempt to enter the same name into a delta index twice" }, + { "UDS_UNEXPECTED_RESULT", "Unexpected result from internal routine" }, + { "UDS_INJECTED_ERROR", "Injected error" }, + { "UDS_ASSERTION_FAILED", "Assertion failed" }, + { "UDS_UNSCANNABLE", "Unscannable" }, + { "UDS_QUEUED", "Request queued" }, + { "UDS_QUEUE_ALREADY_CONNECTED", "Queue already connected" }, + { "UDS_BAD_FILL_PHASE", "Fill phase not supported" }, + { "UDS_BUFFER_ERROR", "Buffer error" }, + { "UDS_CONNECTION_LOST", "Lost connection to peer" }, + { "UDS_TIMEOUT", "A time out has occurred" }, + { "UDS_NO_DIRECTORY", "Expected directory is missing" }, + { "UDS_CHECKPOINT_INCOMPLETE", "Checkpoint not completed" }, + { "UDS_INVALID_RUN_ID", "Invalid albGenTest server run ID" }, + { "UDS_RUN_CANCELED", "albGenTest server run canceled" }, + { "UDS_ALREADY_REGISTERED", "error range already registered" }, +}; + +/** Error attributes - or into top half of error code */ +enum { + UDS_UNRECOVERABLE = (1 << 17) +}; + +typedef struct errorBlock { + const char *name; + int base; + int last; + int max; + const ErrorInfo *infos; +} ErrorBlock; + +enum { + MAX_ERROR_BLOCKS = 6 // needed for testing +}; + +static struct errorInformation { + int allocated; + int count; + ErrorBlock blocks[MAX_ERROR_BLOCKS]; +} registeredErrors; + +/**********************************************************************/ +void initializeStandardErrorBlocks(void) +{ + registeredErrors.allocated = MAX_ERROR_BLOCKS; + registeredErrors.count = 0; + + + registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { + .name = "UDS Error", + .base = UDS_ERROR_CODE_BASE, + .last = UDS_ERROR_CODE_LAST, + .max = UDS_ERROR_CODE_BLOCK_END, + .infos = errorList, + }; + + registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { + .name = "UDS Internal Error", + .base = 
UDS_INTERNAL_ERROR_CODE_BASE, + .last = UDS_INTERNAL_ERROR_CODE_LAST, + .max = UDS_INTERNAL_ERROR_CODE_BLOCK_END, + .infos = internalErrorList, + }; + + registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { + .name = THIS_MODULE->name, + .base = VDO_BLOCK_START, + .last = VDO_STATUS_CODE_LAST, + .max = VDO_BLOCK_END, + .infos = vdoStatusList, + }; +} + +/** + * Fetch the error info (if any) for the error number. + * + * @param errnum the error number + * @param infoPtr the place to store the info for this error (if known), + * otherwise set to NULL + * + * @return the name of the error block (if known), NULL otherwise + **/ +static const char *getErrorInfo(int errnum, const ErrorInfo **infoPtr) +{ + for (ErrorBlock *block = registeredErrors.blocks; + block < registeredErrors.blocks + registeredErrors.count; + ++block) { + if ((errnum >= block->base) && (errnum < block->last)) { + if (infoPtr != NULL) { + *infoPtr = block->infos + (errnum - block->base); + } + return block->name; + } else if ((errnum >= block->last) && (errnum < block->max)) { + if (infoPtr != NULL) { + *infoPtr = NULL; + } + return block->name; + } + } + if (infoPtr != NULL) { + *infoPtr = NULL; + } + return NULL; +} + +/*****************************************************************************/ +const char *stringError(int errnum, char *buf, size_t buflen) +{ + if (buf == NULL) { + return NULL; + } + + const ErrorInfo *info = NULL; + const char *blockName = getErrorInfo(errnum, &info); + + if (blockName != NULL) { + if (info != NULL) { + snprintf(buf, buflen, "%s: %s", blockName, info->message); + } else { + snprintf(buf, buflen, "Unknown %s %d", blockName, errnum); + } + } else { + snprintf(buf, buflen, "System error %d", errnum); + } + return buf; +} + +/*****************************************************************************/ +const char *stringErrorName(int errnum, char *buf, size_t buflen) +{ + const ErrorInfo *info = NULL; + const char *blockName = getErrorInfo(errnum, &info); + + if (blockName != NULL) { + if (info != NULL) { + snprintf(buf, buflen, "%s: %s", blockName, info->name); + } else { + snprintf(buf, buflen, "Unknown %s %d", blockName, errnum); + } + } else { + snprintf(buf, buflen, "System error %d", errnum); + } + return buf; +} + +/*****************************************************************************/ +int makeUnrecoverable(int resultCode) +{ + return ((resultCode == UDS_SUCCESS) + ? 
resultCode + : (resultCode | UDS_UNRECOVERABLE)); +} + +/*****************************************************************************/ +int sansUnrecoverable(int resultCode) +{ + return resultCode & ~UDS_UNRECOVERABLE; +} + +/*****************************************************************************/ +bool isUnrecoverable(int resultCode) +{ + return (bool)(resultCode & UDS_UNRECOVERABLE); +} + +/*****************************************************************************/ +int registerErrorBlock(const char *blockName, + int firstError, + int lastReservedError, + const ErrorInfo *infos, + size_t infoSize) +{ + int result = ASSERT(firstError < lastReservedError, + "bad error block range"); + if (result != UDS_SUCCESS) { + return result; + } + + if (registeredErrors.count == registeredErrors.allocated) { + // could reallocate and grow, but should never happen + return UDS_OVERFLOW; + } + + for (ErrorBlock *block = registeredErrors.blocks; + block < registeredErrors.blocks + registeredErrors.count; + ++block) { + if (strcmp(blockName, block->name) == 0) { + return UDS_DUPLICATE_NAME; + } + // check for overlap in error ranges + if ((firstError < block->max) && (lastReservedError > block->base)) { + return UDS_ALREADY_REGISTERED; + } + } + + registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { + .name = blockName, + .base = firstError, + .last = firstError + (infoSize / sizeof(ErrorInfo)), + .max = lastReservedError, + .infos = infos + }; + + return UDS_SUCCESS; +} diff --git a/vdo/kernel/errors.h b/vdo/kernel/errors.h new file mode 100644 index 0000000..acfb777 --- /dev/null +++ b/vdo/kernel/errors.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/errors.h#1 $ + */ + +#ifndef ERRORS_H +#define ERRORS_H + +#include +#include "uds-error.h" + +enum udsInternalErrorCodes { + /** Used as a base value for reporting internal errors */ + UDS_INTERNAL_ERROR_CODE_BASE = 66560, + /** Client/server protocol framing error */ + UDS_PROTOCOL_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 0, + /** Index overflow */ + UDS_OVERFLOW = UDS_INTERNAL_ERROR_CODE_BASE + 1, + /** Fill phase done (intended for albfill only) */ + UDS_FILLDONE = UDS_INTERNAL_ERROR_CODE_BASE + 2, + /** Invalid argument passed to internal routine */ + UDS_INVALID_ARGUMENT = UDS_INTERNAL_ERROR_CODE_BASE + 3, + /** UDS data structures are in an invalid state */ + UDS_BAD_STATE = UDS_INTERNAL_ERROR_CODE_BASE + 4, + /** Attempt to enter the same name into an internal structure twice */ + UDS_DUPLICATE_NAME = UDS_INTERNAL_ERROR_CODE_BASE + 5, + /** An internal protocol violation between system components */ + UDS_UNEXPECTED_RESULT = UDS_INTERNAL_ERROR_CODE_BASE + 6, + /** An error created by test case processing */ + UDS_INJECTED_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 7, + /** An assertion failed */ + UDS_ASSERTION_FAILED = UDS_INTERNAL_ERROR_CODE_BASE + 8, + /** A file or stream is not scannable with the current scanner */ + UDS_UNSCANNABLE = UDS_INTERNAL_ERROR_CODE_BASE + 9, + /** Not an actual error, but reporting that the result will be delayed */ + UDS_QUEUED = UDS_INTERNAL_ERROR_CODE_BASE + 10, + /** Queue already connected */ + UDS_QUEUE_ALREADY_CONNECTED = UDS_INTERNAL_ERROR_CODE_BASE + 11, + /** Fill phase not supported */ + UDS_BAD_FILL_PHASE = UDS_INTERNAL_ERROR_CODE_BASE + 12, + /** A problem has occurred with a Buffer */ + UDS_BUFFER_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 13, + /** A network connection was lost */ + UDS_CONNECTION_LOST = UDS_INTERNAL_ERROR_CODE_BASE + 14, + /** A time out has occurred */ + UDS_TIMEOUT = UDS_INTERNAL_ERROR_CODE_BASE + 15, + /** No directory was found where one was expected */ + UDS_NO_DIRECTORY = UDS_INTERNAL_ERROR_CODE_BASE + 16, + /** Checkpoint not completed */ + UDS_CHECKPOINT_INCOMPLETE = UDS_INTERNAL_ERROR_CODE_BASE + 17, + /** Invalid albGenTest server run ID */ + UDS_INVALID_RUN_ID = UDS_INTERNAL_ERROR_CODE_BASE + 18, + /** albGenTest server run canceled */ + UDS_RUN_CANCELED = UDS_INTERNAL_ERROR_CODE_BASE + 19, + /** this error range has already been registered */ + UDS_ALREADY_REGISTERED = UDS_INTERNAL_ERROR_CODE_BASE + 20, + /** One more than the last UDS_INTERNAL error code */ + UDS_INTERNAL_ERROR_CODE_LAST, + /** One more than the last error this block will ever use */ + UDS_INTERNAL_ERROR_CODE_BLOCK_END = UDS_INTERNAL_ERROR_CODE_BASE + 440 +}; + +enum { + ERRBUF_SIZE = 128 // default size for buffer passed to stringError +}; + +const char *stringError(int errnum, char *buf, size_t buflen); +const char *stringErrorName(int errnum, char *buf, size_t buflen); + +int makeUnrecoverable(int resultCode) __attribute__((warn_unused_result)); +bool isUnrecoverable(int resultCode) __attribute__((warn_unused_result)); +int sansUnrecoverable(int resultCode) __attribute__((warn_unused_result)); + +typedef struct errorInfo { + const char *name; + const char *message; +} ErrorInfo; + +/** + * Initialize UDS error code blocks. + * + * @note Must be called once, before any of the other routines in this + * file. + **/ +void initializeStandardErrorBlocks(void); + +/** + * Register an error code block for stringError and stringErrorName. 
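+ *
+ * A hypothetical registration (all names here are invented for
+ * illustration) might look like:
+ *
+ *   registerErrorBlock("Example Error", EXAMPLE_ERROR_BASE,
+ *                      EXAMPLE_ERROR_BASE + 100, exampleErrorInfos,
+ *                      sizeof(exampleErrorInfos));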
+ * + * @param blockName the name of the block of error codes + * @param firstError the first error code in the block + * @param lastReservedError one past the highest possible error in the block + * @param infos a pointer to the error info array for the block + * @param infoSize the size of the error info array, which determines + * the last actual error for which information is + * available + * + * @return a success or error code, particularly UDS_DUPLICATE_NAME if the + * block name is already present, or UDS_ALREADY_REGISTERED if a + * block with the specified error code is present + **/ +int registerErrorBlock(const char *blockName, + int firstError, + int lastReservedError, + const ErrorInfo *infos, + size_t infoSize); + +#endif /* ERRORS_H */ diff --git a/vdo/kernel/histogram.c b/vdo/kernel/histogram.c new file mode 100644 index 0000000..0e1a6ae --- /dev/null +++ b/vdo/kernel/histogram.c @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/histogram.c#2 $ + */ + +#include + +#include "memoryAlloc.h" +#include "typeDefs.h" + +#include "histogram.h" +#include "logger.h" +#include "numUtils.h" + +/* + * Set NO_BUCKETS to streamline the histogram code by reducing it to + * tracking just minimum, maximum, mean, etc. Only one bucket counter + * (the final one for "bigger" values) will be used, no range checking + * is needed to find the right bucket, and no histogram will be + * reported. With newer compilers, the histogram output code will be + * optimized out. + */ +enum { + NO_BUCKETS = 1 +}; + +/* + * Support histogramming in the VDO code. + * + * This is not a complete and general histogram package. It follows the XP + * practice of implementing the "customer" requirements, and no more. We can + * support other requirements after we know what they are. + * + * The code was originally borrowed from Albireo, and includes both linear and + * logarithmic histograms. VDO only uses the logarithmic histograms. + * + * All samples are uint64_t values. + * + * A unit conversion option is supported internally to allow sample values to + * be supplied in "jiffies" and results to be reported via /sys in + * milliseconds. Depending on the system configuration, this could mean a + * factor of four (a bucket for values of 1 jiffy is reported as 4-7 + * milliseconds). In theory it could be a non-integer ratio (including less + * than one), but as the x86-64 platforms we've encountered appear to use 1 or + * 4 milliseconds per jiffy, we don't support non-integer values yet. + * + * All internal processing uses the values as passed to enterHistogramSample. + * Conversions only affect the values seen or input through the /sys interface, + * including possibly rounding a "limit" value entered. 
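+ *
+ * As a worked example (assuming 4 milliseconds per jiffy): a sample of 3
+ * jiffies is counted internally in bucket 3, whose bounds are reported
+ * through /sys as 12 - 15 milliseconds.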
+ */ + +struct histogram { + // These fields are ordered so that enterHistogramSample touches + // only the first cache line. + atomic64_t *counters; // Counter for each bucket + uint64_t limit; // We want to know how many samples are larger + atomic64_t sum; // Sum of all the samples + atomic64_t count; // Number of samples + atomic64_t minimum; // Minimum value + atomic64_t maximum; // Maximum value + atomic64_t unacceptable; // Number of samples that exceed the limit + int numBuckets; // The number of buckets + bool logFlag; // True if the y scale should be logarithmic + // These fields are used only when reporting results. + const char *label; // Histogram label + const char *countedItems; // Name for things being counted + const char *metric; // Term for value used to divide into buckets + const char *sampleUnits; // Unit for measuring metric; NULL for count + unsigned int conversionFactor; // Converts input units to reporting units + struct kobject kobj; +}; + +/* + * Fixed table defining the top value for each bucket of a logarithmic + * histogram. We arbitrarily limit the histogram to 12 orders of magnitude. + */ +enum { MAX_LOG_SIZE = 12 }; +static const uint64_t bottomValue[1 + 10 * MAX_LOG_SIZE] = { + // 0 to 10 - The first 10 buckets are linear + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + // 10 to 100 - From this point on, the Nth entry of the table is + // floor(exp10((double)N/10.0)). + 12, 15, 19, 25, 31, 39, 50, 63, 79, 100, + // 100 to 1K + 125, 158, 199, 251, 316, 398, 501, 630, 794, 1000, + // 1K to 10K + 1258, 1584, 1995, 2511, 3162, 3981, 5011, 6309, 7943, 10000, + // 10K to 100K + 12589, 15848, 19952, 25118, 31622, 39810, 50118, 63095, 79432, 100000, + // 100K to 1M + 125892, 158489, 199526, 251188, 316227, + 398107, 501187, 630957, 794328, 1000000, + // 1M to 10M + 1258925, 1584893, 1995262, 2511886, 3162277, + 3981071, 5011872, 6309573, 7943282, 10000000, + // 10M to 100M + 12589254, 15848931, 19952623, 25118864, 31622776, + 39810717, 50118723, 63095734, 79432823, 100000000, + // 100M to 1G + 125892541, 158489319, 199526231, 251188643, 316227766, + 398107170, 501187233, 630957344, 794328234, 1000000000, + // 1G to 10G + 1258925411L, 1584893192L, 1995262314L, 2511886431L, 3162277660L, + 3981071705L, 5011872336L, 6309573444L, 7943282347L, 10000000000L, + // 10G to 100G + 12589254117L, 15848931924L, 19952623149L, 25118864315L, 31622776601L, + 39810717055L, 50118723362L, 63095734448L, 79432823472L, 100000000000L, + // 100G to 1T + 125892541179L, 158489319246L, 199526231496L, 251188643150L, 316227766016L, + 398107170553L, 501187233627L, 630957344480L, 794328234724L, 1000000000000L, +}; + +/***********************************************************************/ +static unsigned int divideRoundingToNearest(uint64_t number, uint64_t divisor) +{ + number += divisor / 2; + return number / divisor; +} + +/***********************************************************************/ +static int maxBucket(Histogram *h) +{ + int max = h->numBuckets; + while ((max >= 0) && (atomic64_read(&h->counters[max]) == 0)) { + max--; + } + // max == -1 means that there were no samples + return max; +} + +/***********************************************************************/ + +typedef struct { + struct attribute attr; + ssize_t (*show)(Histogram *h, char *buf); + ssize_t (*store)(Histogram *h, const char *buf, size_t length); +} HistogramAttribute; + +/***********************************************************************/ +static void histogramKobjRelease(struct kobject *kobj) +{ + Histogram *h = 
container_of(kobj, Histogram, kobj); + FREE(h->counters); + FREE(h); +} + +/***********************************************************************/ +static ssize_t histogramShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + HistogramAttribute *ha = container_of(attr, HistogramAttribute, attr); + if (ha->show == NULL) { + return -EINVAL; + } + Histogram *h = container_of(kobj, Histogram, kobj); + return ha->show(h, buf); +} + +/***********************************************************************/ +static ssize_t histogramStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + HistogramAttribute *ha = container_of(attr, HistogramAttribute, attr); + if (ha->show == NULL) { + return -EINVAL; + } + Histogram *h = container_of(kobj, Histogram, kobj); + return ha->store(h, buf, length); +} + +/***********************************************************************/ +static ssize_t histogramShowCount(Histogram *h, char *buf) +{ + int64_t count = atomic64_read(&h->count); + return sprintf(buf, "%" PRId64 "\n", count); +} + +/***********************************************************************/ +static ssize_t histogramShowHistogram(Histogram *h, char *buffer) +{ + /* + * We're given one page in which to write. The caller logs a complaint if we + * report that we've written too much, so we'll truncate to PAGE_SIZE-1. + */ + size_t bufferSize = PAGE_SIZE; + bool bars = true; + ssize_t length = 0; + int max = maxBucket(h); + // If max is -1, we'll fall through to reporting the total of zero. + + enum { BAR_SIZE = 50 }; + char bar[BAR_SIZE + 2]; + bar[0] = ' '; + memset(bar + 1, '=', BAR_SIZE); + bar[BAR_SIZE + 1] = '\0'; + + uint64_t total = 0; + for (int i = 0; i <= max; i++) { + total += atomic64_read(&h->counters[i]); + } + + length += snprintf(buffer, bufferSize, "%s Histogram - number of %s by %s", + h->label, h->countedItems, h->metric); + if (length >= (bufferSize - 1)) { + return bufferSize - 1; + } + if (h->sampleUnits != NULL) { + length += snprintf(buffer + length, bufferSize - length, " (%s)", + h->sampleUnits); + if (length >= (bufferSize - 1)) { + return bufferSize - 1; + } + } + length += snprintf(buffer + length, bufferSize - length, "\n"); + if (length >= (bufferSize - 1)) { + return bufferSize - 1; + } + for (int i = 0; i <= max; i++) { + uint64_t value = atomic64_read(&h->counters[i]); + + unsigned int barLength; + if (bars && (total != 0)) { + // +1 for the space at the beginning + barLength = (divideRoundingToNearest(value * BAR_SIZE, total) + 1); + if (barLength == 1) { + // Don't bother printing just the initial space. 
+ barLength = 0; + } + } else { + // 0 means skip the space and the bar + barLength = 0; + } + + if (h->logFlag) { + if (i == h->numBuckets) { + length += snprintf(buffer + length, bufferSize - length, "%-16s", + "Bigger"); + } else { + unsigned int lower = h->conversionFactor * bottomValue[i]; + unsigned int upper = h->conversionFactor * bottomValue[i + 1] - 1; + length += snprintf(buffer + length, bufferSize - length, "%6u - %7u", + lower, upper); + } + } else { + if (i == h->numBuckets) { + length += snprintf(buffer + length, bufferSize - length, "%6s", + "Bigger"); + } else { + length += snprintf(buffer + length, bufferSize - length, "%6d", i); + } + } + if (length >= (bufferSize - 1)) { + return bufferSize - 1; + } + length += snprintf(buffer + length, bufferSize - length, + " : %12llu%.*s\n", value, barLength, bar); + if (length >= (bufferSize - 1)) { + return bufferSize - 1; + } + } + + length += snprintf(buffer + length, bufferSize - length, + "total %llu\n", total); + return minSizeT(bufferSize - 1, length); +} + +/***********************************************************************/ +static ssize_t histogramShowMaximum(Histogram *h, char *buf) +{ + // Maximum is initialized to 0. + unsigned long value = atomic64_read(&h->maximum); + return sprintf(buf, "%lu\n", h->conversionFactor * value); +} + +/***********************************************************************/ +static ssize_t histogramShowMinimum(Histogram *h, char *buf) +{ + // Minimum is initialized to -1. + unsigned long value = ((atomic64_read(&h->count) > 0) + ? atomic64_read(&h->minimum) + : 0); + return sprintf(buf, "%lu\n", h->conversionFactor * value); +} + +/***********************************************************************/ +static ssize_t histogramShowLimit(Histogram *h, char *buf) +{ + // Display the limit in the reporting units + return sprintf(buf, "%u\n", (unsigned int)(h->conversionFactor * h->limit)); +} + +/***********************************************************************/ +static ssize_t histogramStoreLimit(Histogram *h, + const char *buf, + size_t length) +{ + unsigned int value; + if ((length > 12) || (sscanf(buf, "%u", &value) != 1)) { + return -EINVAL; + } + /* + * Convert input from reporting units (e.g., milliseconds) to internal + * recording units (e.g., jiffies). + * + * computeBucketCount could also be called "divideRoundingUp". 
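+   *
+   * For example, with a conversionFactor of 4 (milliseconds per jiffy), a
+   * limit entered as 10 is stored as computeBucketCount(10, 4) == 3.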
+ */ + h->limit = computeBucketCount(value, h->conversionFactor); + atomic64_set(&h->unacceptable, 0); + return length; +} + +/***********************************************************************/ +static ssize_t histogramShowMean(Histogram *h, char *buf) +{ + uint64_t count = atomic64_read(&h->count); + if (count == 0) { + return sprintf(buf, "0/0\n"); + } + // Compute mean, scaled up by 1000, in reporting units + unsigned long sumTimes1000InReportingUnits + = h->conversionFactor * atomic64_read(&h->sum) * 1000; + unsigned int meanTimes1000 + = divideRoundingToNearest(sumTimes1000InReportingUnits, count); + // Print mean with fractional part + return sprintf(buf, "%u.%03u\n", meanTimes1000 / 1000, + meanTimes1000 % 1000); +} + +/***********************************************************************/ +static ssize_t histogramShowUnacceptable(Histogram *h, char *buf) +{ + int64_t count = atomic64_read(&h->unacceptable); + return sprintf(buf, "%" PRId64 "\n", count); +} + +/***********************************************************************/ +static ssize_t histogramShowLabel(Histogram *h, char *buf) +{ + return sprintf(buf, "%s\n", h->label); +} + +/***********************************************************************/ +static ssize_t histogramShowUnit(Histogram *h, char *buf) +{ + if (h->sampleUnits != NULL) { + return sprintf(buf, "%s\n", h->sampleUnits); + } else { + *buf = 0; + return 0; + } +} + +/***********************************************************************/ + +static struct sysfs_ops histogramSysfsOps = { + .show = histogramShow, + .store = histogramStore, +}; + +static HistogramAttribute countAttribute = { + .attr = { .name = "count", .mode = 0444, }, + .show = histogramShowCount, +}; + +static HistogramAttribute histogramAttribute = { + .attr = { .name = "histogram", .mode = 0444, }, + .show = histogramShowHistogram, +}; + +static HistogramAttribute labelAttribute = { + .attr = { .name = "label", .mode = 0444, }, + .show = histogramShowLabel, +}; + +static HistogramAttribute maximumAttribute = { + .attr = { .name = "maximum", .mode = 0444, }, + .show = histogramShowMaximum, +}; + +static HistogramAttribute minimumAttribute = { + .attr = { .name = "minimum", .mode = 0444, }, + .show = histogramShowMinimum, +}; + +static HistogramAttribute limitAttribute = { + .attr = { .name = "limit", .mode = 0644, }, + .show = histogramShowLimit, + .store = histogramStoreLimit, +}; + +static HistogramAttribute meanAttribute = { + .attr = { .name = "mean", .mode = 0444, }, + .show = histogramShowMean, +}; + +static HistogramAttribute unacceptableAttribute = { + .attr = { .name = "unacceptable", .mode = 0444, }, + .show = histogramShowUnacceptable, +}; + +static HistogramAttribute unitAttribute = { + .attr = { .name = "unit", .mode = 0444, }, + .show = histogramShowUnit, +}; + +// "Real" histogram plotting. 
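+// These attributes apply when real buckets are allocated; the bucketless
+// variant further below exposes only count, label, maximum, mean, minimum,
+// and unit.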
+static struct attribute *histogramAttributes[] = { + &countAttribute.attr, + &histogramAttribute.attr, + &labelAttribute.attr, + &limitAttribute.attr, + &maximumAttribute.attr, + &meanAttribute.attr, + &minimumAttribute.attr, + &unacceptableAttribute.attr, + &unitAttribute.attr, + NULL, +}; + +static struct kobj_type histogramKobjType = { + .release = histogramKobjRelease, + .sysfs_ops = &histogramSysfsOps, + .default_attrs = histogramAttributes, +}; + +static struct attribute *bucketlessHistogramAttributes[] = { + &countAttribute.attr, + &labelAttribute.attr, + &maximumAttribute.attr, + &meanAttribute.attr, + &minimumAttribute.attr, + &unitAttribute.attr, + NULL, +}; + +static struct kobj_type bucketlessHistogramKobjType = { + .release = histogramKobjRelease, + .sysfs_ops = &histogramSysfsOps, + .default_attrs = bucketlessHistogramAttributes, +}; + +/***********************************************************************/ +static Histogram *makeHistogram(struct kobject *parent, + const char *name, + const char *label, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int numBuckets, + unsigned long conversionFactor, + bool logFlag) +{ + Histogram *h; + if (ALLOCATE(1, Histogram, "histogram", &h) != UDS_SUCCESS) { + return NULL; + } + + if (NO_BUCKETS) { + numBuckets = 0; // plus 1 for "bigger" bucket + } + + if (numBuckets <= 10) { + /* + * The first buckets in a "logarithmic" histogram are still + * linear, but the bucket-search mechanism is a wee bit slower + * than for linear, so change the type. + */ + logFlag = false; + } + + h->label = label; + h->countedItems = countedItems; + h->metric = metric; + h->sampleUnits = sampleUnits; + h->logFlag = logFlag; + h->numBuckets = numBuckets; + h->conversionFactor = conversionFactor; + atomic64_set(&h->minimum, -1UL); + + if (ALLOCATE(h->numBuckets + 1, atomic64_t, "histogram counters", + &h->counters) != UDS_SUCCESS) { + histogramKobjRelease(&h->kobj); + return NULL; + } + + kobject_init(&h->kobj, + ((numBuckets > 0) + ? &histogramKobjType + : &bucketlessHistogramKobjType)); + if (kobject_add(&h->kobj, parent, name) != 0) { + histogramKobjRelease(&h->kobj); + return NULL; + } + return h; +} + +/***********************************************************************/ +Histogram *makeLinearHistogram(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int size) +{ + return makeHistogram(parent, name, initLabel, countedItems, + metric, sampleUnits, size, 1, false); +} + + +/** + * Intermediate routine for creating logarithmic histograms. + * + * Limits the histogram size, and computes the bucket count from the + * orders-of-magnitude count. + * + * @param parent The parent kobject. + * @param name The short name of the histogram. This label is + * used for the sysfs node. + * @param initLabel The label for the sampled data. This label is used + * when we plot the data. + * @param countedItems A name (plural) for the things being counted. + * @param metric The measure being used to divide samples into + * buckets. + * @param sampleUnits The units (plural) for the metric, or NULL if it's + * a simple counter. + * @param logSize The number of buckets. There are buckets for a + * range of sizes up to 10^logSize, and an extra + * bucket for larger samples. + * @param conversionFactor Unit conversion factor for reporting. 
+ * + * @return the histogram + **/ +static Histogram * +makeLogarithmicHistogramWithConversionFactor(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int logSize, + uint64_t conversionFactor) +{ + if (logSize > MAX_LOG_SIZE) { + logSize = MAX_LOG_SIZE; + } + return makeHistogram(parent, name, + initLabel, countedItems, metric, sampleUnits, + 10 * logSize, conversionFactor, true); +} + +/***********************************************************************/ +Histogram *makeLogarithmicHistogram(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int logSize) +{ + return makeLogarithmicHistogramWithConversionFactor(parent, name, initLabel, + countedItems, + metric, sampleUnits, + logSize, 1); +} + +/***********************************************************************/ +Histogram *makeLogarithmicJiffiesHistogram(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + int logSize) +{ + /* + * If these fail, we have a jiffy duration that is not an integral number of + * milliseconds, and the unit conversion code needs updating. + */ + STATIC_ASSERT(HZ <= MSEC_PER_SEC); + STATIC_ASSERT((MSEC_PER_SEC % HZ) == 0); + return makeLogarithmicHistogramWithConversionFactor(parent, name, initLabel, + countedItems, + metric, "milliseconds", + logSize, + jiffies_to_msecs(1)); +} + +/***********************************************************************/ +void enterHistogramSample(Histogram *h, uint64_t sample) +{ + int bucket; + if (h->logFlag) { + int lo = 0; + int hi = h->numBuckets; + while (lo < hi) { + int middle = (lo + hi) / 2; + if (sample < bottomValue[middle + 1]) { + hi = middle; + } else { + lo = middle + 1; + } + } + bucket = lo; + } else { + bucket = sample < h->numBuckets ? sample : h->numBuckets; + } + atomic64_inc(&h->counters[bucket]); + atomic64_inc(&h->count); + atomic64_add(sample, &h->sum); + if ((h->limit > 0) && (sample > h->limit)) { + atomic64_inc(&h->unacceptable); + } + + /* + * Theoretically this could loop a lot; in practice it should rarely + * do more than a single read, with no memory barrier, from a cache + * line we've already referenced above. + */ + uint64_t oldMaximum = atomic64_read(&h->maximum); + while (oldMaximum < sample) { + uint64_t readValue = atomic64_cmpxchg(&h->maximum, oldMaximum, sample); + if (readValue == oldMaximum) { + break; + } + oldMaximum = readValue; + } + + uint64_t oldMinimum = atomic64_read(&h->minimum); + while (oldMinimum > sample) { + uint64_t readValue = atomic64_cmpxchg(&h->minimum, oldMinimum, sample); + if (readValue == oldMinimum) { + break; + } + oldMinimum = readValue; + } +} + +/***********************************************************************/ +void freeHistogram(Histogram **hp) +{ + if (*hp != NULL) { + Histogram *h = *hp; + kobject_put(&h->kobj); + *hp = NULL; + } +} diff --git a/vdo/kernel/histogram.h b/vdo/kernel/histogram.h new file mode 100644 index 0000000..a177e0a --- /dev/null +++ b/vdo/kernel/histogram.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/histogram.h#1 $ + */ + +#ifndef HISTOGRAM_H +#define HISTOGRAM_H + +#include + +typedef struct histogram Histogram; + +/** + * Allocate and initialize a histogram that uses linearly sized buckets. + * + * The histogram label reported via /sys is constructed from several of the + * values passed here; it will be something like "Init Label Histogram - number + * of countedItems grouped by metric (sampleUnits)", e.g., "Flush Forwarding + * Histogram - number of flushes grouped by latency (milliseconds)". Thus + * countedItems and sampleUnits should be plural. + * + * The sampleUnits string will also be reported separately via another /sys + * entry to aid in programmatic processing of the results, so the strings used + * should be consistent (e.g., always "milliseconds" and not "ms" for + * milliseconds). + * + * @param parent The parent kobject. + * @param name The short name of the histogram. This label is used + * for the sysfs node. + * @param initLabel The label for the sampled data. This label is used + * when we plot the data. + * @param countedItems A name (plural) for the things being counted. + * @param metric The measure being used to divide samples into buckets. + * @param sampleUnits The unit (plural) for the metric, or NULL if it's a + * simple counter. + * @param size The number of buckets. There are buckets for every + * value from 0 up to size (but not including) size. + * There is an extra bucket for larger samples. + * + * @return the histogram + **/ +Histogram *makeLinearHistogram(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int size); + +/** + * Allocate and initialize a histogram that uses logarithmically sized + * buckets. + * + * @param parent The parent kobject. + * @param name The short name of the histogram. This label is used + * for the sysfs node. + * @param initLabel The label for the sampled data. This label is used + * when we plot the data. + * @param countedItems A name (plural) for the things being counted. + * @param metric The measure being used to divide samples into buckets. + * @param sampleUnits The unit (plural) for the metric, or NULL if it's a + * simple counter. + * @param logSize The number of buckets. There are buckets for a range + * of sizes up to 10^logSize, and an extra bucket for + * larger samples. + * + * @return the histogram + **/ +Histogram *makeLogarithmicHistogram(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int logSize); + +/** + * Allocate and initialize a histogram that uses logarithmically sized + * buckets. Values are entered that count in jiffies, and they are + * reported in milliseconds. + * + * @param parent The parent kobject. + * @param name The short name of the histogram. This label is used + * for the sysfs node. + * @param initLabel The label for the sampled data. 
This label is used + * when we plot the data. + * @param countedItems A name (plural) for the things being counted. + * @param metric The measure being used to divide samples into buckets. + * @param logSize The number of buckets. There are buckets for a range + * of sizes up to 10^logSize, and an extra bucket for + * larger samples. + * + * @return the histogram + **/ +Histogram *makeLogarithmicJiffiesHistogram(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + int logSize); + +/** + * Enter a sample into a histogram + * + * @param h The histogram + * @param sample The sample + **/ +void enterHistogramSample(Histogram *h, uint64_t sample); + +/** + * Free a histogram and null out the reference to it. + * + * @param hp The reference to the histogram. + **/ +void freeHistogram(Histogram **hp); + +#endif /* HISTOGRAM_H */ diff --git a/vdo/kernel/instanceNumber.c b/vdo/kernel/instanceNumber.c new file mode 100644 index 0000000..178fd92 --- /dev/null +++ b/vdo/kernel/instanceNumber.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/instanceNumber.c#1 $ + */ + +#include "instanceNumber.h" + +#include +#include + +#include "memoryAlloc.h" +#include "numUtils.h" +#include "permassert.h" + +/* + * Track in-use instance numbers using a flat bit array. + * + * O(n) run time isn't ideal, but if we have 1000 VDO devices in use + * simultaneously we still only need to scan 16 words, so it's not + * likely to be a big deal compared to other resource usage. + */ + +enum { + /** + * This minimum size for the bit array creates a numbering space of 0-999, + * which allows successive starts of the same volume to have different + * instance numbers in any reasonably-sized test. Changing instances on + * restart allows vdoMonReport to detect that the ephemeral stats have reset + * to zero. + **/ + BIT_COUNT_MINIMUM = 1000, + /** Grow the bit array by this many bits when needed */ + BIT_COUNT_INCREMENT = 100, +}; + +static struct mutex instanceNumberLock; +static unsigned int bitCount; +static unsigned long *words; +static unsigned int instanceCount; +static unsigned int nextInstance; + +/** + * Return the number of bytes needed to store a bit array of the specified + * capacity in an array of unsigned longs. + * + * @param bitCount The number of bits the array must hold + * + * @return the number of bytes needed for the array reperesentation + **/ +static size_t getBitArraySize(unsigned int bitCount) +{ + // Round up to a multiple of the word size and convert to a byte count. 
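+  // For example, the BIT_COUNT_MINIMUM (1000) bit minimum with 64-bit longs
+  // works out to the 16 words mentioned above, i.e. 128 bytes.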
+ return (computeBucketCount(bitCount, BITS_PER_LONG) * sizeof(unsigned long)); +} + +/** + * Re-allocate the bitmap word array so there will more instance numbers that + * can be allocated. Since the array is initially NULL, this also initializes + * the array the first time we allocate an instance number. + * + * @return UDS_SUCCESS or an error code from the allocation + **/ +static int growBitArray(void) +{ + unsigned int newCount = maxUInt(bitCount + BIT_COUNT_INCREMENT, + BIT_COUNT_MINIMUM); + unsigned long *newWords; + int result = reallocateMemory(words, + getBitArraySize(bitCount), + getBitArraySize(newCount), + "instance number bit array", + &newWords); + if (result != UDS_SUCCESS) { + return result; + } + + bitCount = newCount; + words = newWords; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int allocateKVDOInstanceLocked(unsigned int *instancePtr) +{ + // If there are no unallocated instances, grow the bit array. + if (instanceCount >= bitCount) { + int result = growBitArray(); + if (result != UDS_SUCCESS) { + return result; + } + } + + // There must be a zero bit somewhere now. Find it, starting just after the + // last instance allocated. + unsigned int instance = find_next_zero_bit(words, bitCount, nextInstance); + if (instance >= bitCount) { + // Nothing free after nextInstance, so wrap around to instance zero. + instance = find_first_zero_bit(words, bitCount); + int result = ASSERT(instance < bitCount, "impossibly, no zero bit found"); + if (result != UDS_SUCCESS) { + return result; + } + } + + __set_bit(instance, words); + instanceCount += 1; + nextInstance = instance + 1; + *instancePtr = instance; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int allocateKVDOInstance(unsigned int *instancePtr) +{ + mutex_lock(&instanceNumberLock); + int result = allocateKVDOInstanceLocked(instancePtr); + mutex_unlock(&instanceNumberLock); + return result; +} + +/**********************************************************************/ +void releaseKVDOInstance(unsigned int instance) +{ + mutex_lock(&instanceNumberLock); + if (instance >= bitCount) { + ASSERT_LOG_ONLY(false, "instance number %u must be less than bit count %u", + instance, bitCount); + } else if (test_bit(instance, words) == 0) { + ASSERT_LOG_ONLY(false, "instance number %u must be allocated", instance); + } else { + __clear_bit(instance, words); + instanceCount -= 1; + } + mutex_unlock(&instanceNumberLock); +} + +/**********************************************************************/ +void initializeInstanceNumberTracking(void) +{ + mutex_init(&instanceNumberLock); +} + +/**********************************************************************/ +void cleanUpInstanceNumberTracking(void) +{ + ASSERT_LOG_ONLY(instanceCount == 0, + "should have no instance numbers still in use, but have %u", + instanceCount); + FREE(words); + words = NULL; + bitCount = 0; + instanceCount = 0; + nextInstance = 0; + mutex_destroy(&instanceNumberLock); +} diff --git a/vdo/kernel/instanceNumber.h b/vdo/kernel/instanceNumber.h new file mode 100644 index 0000000..6d96bad --- /dev/null +++ b/vdo/kernel/instanceNumber.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/instanceNumber.h#1 $ + */ + +/** + * Allocate an instance number. + * + * @param [out] instancePtr An integer to hold the allocated instance number + * + * @result UDS_SUCCESS or an error code + **/ +int allocateKVDOInstance(unsigned int *instancePtr); + +/** + * Release an instance number previously allocated. + * + * @param instance The instance number to release + **/ +void releaseKVDOInstance(unsigned int instance); + +/** + * Initialize the instance-number tracking data structures. + **/ +void initializeInstanceNumberTracking(void); + +/** + * Free up the instance-number tracking data structures. + **/ +void cleanUpInstanceNumberTracking(void); diff --git a/vdo/kernel/ioSubmitter.c b/vdo/kernel/ioSubmitter.c new file mode 100644 index 0000000..036bf25 --- /dev/null +++ b/vdo/kernel/ioSubmitter.c @@ -0,0 +1,668 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/ioSubmitter.c#8 $ + */ + +#include "ioSubmitter.h" + +#include + +#include "memoryAlloc.h" + +#include "bio.h" +#include "dataKVIO.h" +#include "kernelLayer.h" +#include "logger.h" + +enum { + /* + * Whether to use bio merging code. + * + * Merging I/O requests in the request queue below us is helpful for + * many devices, and VDO does a good job sometimes of shuffling up + * the I/O order (too much for some simple I/O schedulers to sort + * out) as we deal with dedupe advice etc. The bio map tracks the + * yet-to-be-submitted I/O requests by block number so that we can + * collect together and submit sequential I/O operations that should + * be easy to merge. (So we don't actually *merge* them here, we + * just arrange them so that merging can happen.) + * + * For some devices, merging may not help, and we may want to turn + * off this code and save compute/spinlock cycles. + */ + USE_BIOMAP = 1, +}; + +/* + * Submission of bio operations to the underlying storage device will + * go through a separate work queue thread (or more than one) to + * prevent blocking in other threads if the storage device has a full + * queue. The plug structure allows that thread to do better batching + * of requests to make the I/O more efficient. 
+ * + * When multiple worker threads are used, a thread is chosen for a + * I/O operation submission based on the PBN, so a given PBN will + * consistently wind up on the same thread. Flush operations are + * assigned round-robin. + * + * The map (protected by the mutex) collects pending I/O operations so + * that the worker thread can reorder them to try to encourage I/O + * request merging in the request queue underneath. + */ +typedef struct bioQueueData { + KvdoWorkQueue *queue; + struct blk_plug plug; + IntMap *map; + struct mutex lock; + unsigned int queueNumber; +} BioQueueData; + +struct ioSubmitter { + unsigned int numBioQueuesUsed; + unsigned int bioQueueRotationInterval; + unsigned int bioQueueRotor; + BioQueueData bioQueueData[]; +}; + +/**********************************************************************/ +static void startBioQueue(void *ptr) +{ + BioQueueData *bioQueueData = (BioQueueData *)ptr; + blk_start_plug(&bioQueueData->plug); +} + +/**********************************************************************/ +static void finishBioQueue(void *ptr) +{ + BioQueueData *bioQueueData = (BioQueueData *)ptr; + blk_finish_plug(&bioQueueData->plug); +} + +static const KvdoWorkQueueType bioQueueType = { + .start = startBioQueue, + .finish = finishBioQueue, + .actionTable = { + { .name = "bio_compressed_data", + .code = BIO_Q_ACTION_COMPRESSED_DATA, + .priority = 0 }, + { .name = "bio_data", + .code = BIO_Q_ACTION_DATA, + .priority = 0 }, + { .name = "bio_flush", + .code = BIO_Q_ACTION_FLUSH, + .priority = 2 }, + { .name = "bio_high", + .code = BIO_Q_ACTION_HIGH, + .priority = 2 }, + { .name = "bio_metadata", + .code = BIO_Q_ACTION_METADATA, + .priority = 1 }, + { .name = "bio_readcache", + .code = BIO_Q_ACTION_READCACHE, + .priority = 0 }, + { .name = "bio_verify", + .code = BIO_Q_ACTION_VERIFY, + .priority = 1 }, + }, +}; + +/** + * Check that we're running normally (i.e., not in an + * interrupt-servicing context) in an IOSubmitter bio thread. + **/ +static void assertRunningInBioQueue(void) +{ + ASSERT_LOG_ONLY(!in_interrupt(), "not in interrupt context"); + ASSERT_LOG_ONLY(strnstr(current->comm, "bioQ", TASK_COMM_LEN) != NULL, + "running in bio submission work queue thread"); +} + +/** + * Returns the BioQueueData pointer associated with the current thread. + * Results are undefined if called from any other thread. + * + * @return the BioQueueData pointer + **/ +static inline BioQueueData *getCurrentBioQueueData(void) +{ + BioQueueData *bioQueueData = (BioQueueData *) getWorkQueuePrivateData(); + // Does it look like a bio queue thread? + BUG_ON(bioQueueData == NULL); + BUG_ON(bioQueueData->queue != getCurrentWorkQueue()); + return bioQueueData; +} + +/**********************************************************************/ +static inline IOSubmitter *bioQueueToSubmitter(BioQueueData *bioQueue) +{ + BioQueueData *firstBioQueue = bioQueue - bioQueue->queueNumber; + IOSubmitter *submitter = container_of(firstBioQueue, IOSubmitter, + bioQueueData[0]); + return submitter; +} + +/** + * Return the bio thread number handling the specified physical block + * number. 
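+ *
+ * For example (illustrative numbers only), with numBioQueuesUsed = 4 and
+ * bioQueueRotationInterval = 64, PBNs 0-63 map to queue 0, PBNs 64-127 to
+ * queue 1, PBNs 128-191 to queue 2, PBNs 192-255 to queue 3, and PBN 256
+ * wraps around to queue 0 again.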
+ * + * @param ioSubmitter The I/O submitter data + * @param pbn The physical block number + * + * @return read cache zone number + **/ +static unsigned int bioQueueNumberForPBN(IOSubmitter *ioSubmitter, + PhysicalBlockNumber pbn) +{ + unsigned int bioQueueIndex + = ((pbn + % (ioSubmitter->numBioQueuesUsed + * ioSubmitter->bioQueueRotationInterval)) + / ioSubmitter->bioQueueRotationInterval); + + return bioQueueIndex; +} + +/** + * Check that we're running normally (i.e., not in an + * interrupt-servicing context) in an IOSubmitter bio thread. Also + * require that the thread we're running on is the correct one for the + * supplied physical block number. + * + * @param pbn The PBN that should have been used in thread selection + **/ +static void assertRunningInBioQueueForPBN(PhysicalBlockNumber pbn) +{ + assertRunningInBioQueue(); + + BioQueueData *thisQueue = getCurrentBioQueueData(); + IOSubmitter *submitter = bioQueueToSubmitter(thisQueue); + unsigned int computedQueueNumber = bioQueueNumberForPBN(submitter, pbn); + ASSERT_LOG_ONLY(thisQueue->queueNumber == computedQueueNumber, + "running in correct bio queue (%u vs %u) for PBN %llu", + thisQueue->queueNumber, computedQueueNumber, pbn); +} + +/** + * Increments appropriate counters for bio completions + * + * @param kvio the kvio associated with the bio + * @param bio the bio to count + */ +static void countAllBiosCompleted(KVIO *kvio, BIO *bio) +{ + KernelLayer *layer = kvio->layer; + if (isData(kvio)) { + countBios(&layer->biosOutCompleted, bio); + return; + } + + countBios(&layer->biosMetaCompleted, bio); + if (kvio->vio->type == VIO_TYPE_RECOVERY_JOURNAL) { + countBios(&layer->biosJournalCompleted, bio); + } else if (kvio->vio->type == VIO_TYPE_BLOCK_MAP) { + countBios(&layer->biosPageCacheCompleted, bio); + } +} + +/**********************************************************************/ +void countCompletedBios(BIO *bio) +{ + KVIO *kvio = (KVIO *)bio->bi_private; + KernelLayer *layer = kvio->layer; + atomic64_inc(&layer->biosCompleted); + countAllBiosCompleted(kvio, bio); +} + +/**********************************************************************/ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) +void completeAsyncBio(BIO *bio) +#else +void completeAsyncBio(BIO *bio, int error) +#endif +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) + int error = getBioResult(bio); +#endif + KVIO *kvio = (KVIO *) bio->bi_private; + kvioAddTraceRecord(kvio, THIS_LOCATION("$F($io);cb=io($io)")); + countCompletedBios(bio); + if ((error == 0) && isData(kvio) && isReadVIO(kvio->vio)) { + DataKVIO *dataKVIO = kvioAsDataKVIO(kvio); + if (!isCompressed(dataKVIO->dataVIO.mapped.state) + && !dataKVIO->isPartial) { + kvdoAcknowledgeDataVIO(&dataKVIO->dataVIO); + return; + } + } + kvdoContinueKvio(kvio, error); +} + +/** + * Determines which bio counter to use + * + * @param kvio the kvio associated with the bio + * @param bio the bio to count + */ +static void countAllBios(KVIO *kvio, BIO *bio) +{ + KernelLayer *layer = kvio->layer; + if (isData(kvio)) { + countBios(&layer->biosOut, bio); + return; + } + + countBios(&layer->biosMeta, bio); + if (kvio->vio->type == VIO_TYPE_RECOVERY_JOURNAL) { + countBios(&layer->biosJournal, bio); + } else if (kvio->vio->type == VIO_TYPE_BLOCK_MAP) { + countBios(&layer->biosPageCache, bio); + } +} + +/** + * Update stats and tracing info, then submit the supplied bio to the + * OS for processing. 
+ * + * @param kvio The KVIO associated with the bio + * @param bio The bio to submit to the OS + * @param location Call site location for tracing + **/ +static void sendBioToDevice(KVIO *kvio, BIO *bio, TraceLocation location) +{ + assertRunningInBioQueueForPBN(kvio->vio->physical); + + atomic64_inc(&kvio->layer->biosSubmitted); + countAllBios(kvio, bio); + kvioAddTraceRecord(kvio, location); + bio->bi_next = NULL; + generic_make_request(bio); +} + +/** + * Submits a bio to the underlying block device. May block if the + * device is busy. + * + * For metadata or if USE_BIOMAP is disabled, kvio->bioToSubmit holds + * the BIO pointer to submit to the target device. For normal + * data when USE_BIOMAP is enabled, kvio->biosMerged is the list of + * all bios collected together in this group; all of them get + * submitted. In both cases, the bi_end_io callback is invoked when + * each I/O operation completes. + * + * @param item The work item in the KVIO "owning" either the bio to + * submit, or the head of the bio_list to be submitted. + **/ +static void processBioMap(KvdoWorkItem *item) +{ + assertRunningInBioQueue(); + KVIO *kvio = workItemAsKVIO(item); + /* + * XXX Make these paths more regular: Should bi_bdev be set here, or + * in the caller, or in the callback function? Should we call + * finishBioQueue for the biomap case on old kernels? + */ + if (USE_BIOMAP && isData(kvio)) { + // We need to make sure to do two things here: + // 1. Use each bio's kvio when submitting. Any other kvio is not safe + // 2. Detach the bio list from the kvio before submitting, because it + // could get reused/free'd up before all bios are submitted. + BioQueueData *bioQueueData = getWorkQueuePrivateData(); + BIO *bio = NULL; + mutex_lock(&bioQueueData->lock); + if (!bio_list_empty(&kvio->biosMerged)) { + intMapRemove(bioQueueData->map, getBioSector(kvio->biosMerged.head)); + intMapRemove(bioQueueData->map, getBioSector(kvio->biosMerged.tail)); + } + bio = kvio->biosMerged.head; + bio_list_init(&kvio->biosMerged); + mutex_unlock(&bioQueueData->lock); + // Somewhere in the list we'll be submitting the current "kvio", + // so drop our handle on it now. + kvio = NULL; + + while (bio != NULL) { + KVIO *kvioBio = bio->bi_private; + BIO *next = bio->bi_next; + bio->bi_next = NULL; + setBioBlockDevice(bio, getKernelLayerBdev(kvioBio->layer)); + sendBioToDevice(kvioBio, bio, THIS_LOCATION("$F($io)")); + bio = next; + } + } else { + sendBioToDevice(kvio, kvio->bioToSubmit, THIS_LOCATION("$F($io)")); + } +} + +/** + * This function will attempt to find an already queued bio that the current + * bio can be merged with. There are two types of merging possible, forward + * and backward, which are distinguished by a flag that uses kernel + * elevator terminology. 
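+ *
+ * As an illustrative example (assuming the usual 4 KB VDO block, so
+ * VDO_SECTORS_PER_BLOCK is 8): for a bio starting at sector 4096, a back
+ * merge looks up sector 4088, hoping to find a queued kvio whose merged
+ * list ends there so this bio can follow its tail, while a front merge
+ * looks up sector 4104, hoping to find one whose merged list starts there
+ * so this bio can precede its head.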
+ * + * @param map The bio map to use for merging + * @param kvio The kvio we want to merge + * @param mergeType The type of merging we want to try + * + * @return the kvio to merge to, NULL if no merging is possible + */ +static KVIO *getMergeableLocked(IntMap *map, + KVIO *kvio, + unsigned int mergeType) +{ + BIO *bio = kvio->bioToSubmit; + sector_t mergeSector = getBioSector(bio); + switch (mergeType) { + case ELEVATOR_BACK_MERGE: + mergeSector -= VDO_SECTORS_PER_BLOCK; + break; + case ELEVATOR_FRONT_MERGE: + mergeSector += VDO_SECTORS_PER_BLOCK; + break; + } + + KVIO *kvioMerge = intMapGet(map, mergeSector); + + if (kvioMerge != NULL) { + if (!areWorkItemActionsEqual(&kvio->enqueueable.workItem, + &kvioMerge->enqueueable.workItem)) { + return NULL; + } else if (bio_data_dir(bio) != bio_data_dir(kvioMerge->bioToSubmit)) { + return NULL; + } else if (bio_list_empty(&kvioMerge->biosMerged)) { + return NULL; + } else { + switch (mergeType) { + case ELEVATOR_BACK_MERGE: + if (getBioSector(kvioMerge->biosMerged.tail) != mergeSector) { + return NULL; + } + break; + case ELEVATOR_FRONT_MERGE: + if (getBioSector(kvioMerge->biosMerged.head) != mergeSector) { + return NULL; + } + break; + } + } + } + + return kvioMerge; +} + +/**********************************************************************/ +static inline unsigned int advanceBioRotor(IOSubmitter *bioData) +{ + unsigned int index = bioData->bioQueueRotor++ + % (bioData->numBioQueuesUsed + * bioData->bioQueueRotationInterval); + index /= bioData->bioQueueRotationInterval; + return index; +} + +/**********************************************************************/ +static bool tryBioMapMerge(BioQueueData *bioQueueData, KVIO *kvio, BIO *bio) +{ + bool merged = false; + + mutex_lock(&bioQueueData->lock); + KVIO *prevKvio = getMergeableLocked(bioQueueData->map, kvio, + ELEVATOR_BACK_MERGE); + KVIO *nextKvio = getMergeableLocked(bioQueueData->map, kvio, + ELEVATOR_FRONT_MERGE); + if (prevKvio == nextKvio) { + nextKvio = NULL; + } + int result; + if ((prevKvio == NULL) && (nextKvio == NULL)) { + // no merge. just add to bioQueue + result = intMapPut(bioQueueData->map, getBioSector(bio), kvio, true, NULL); + // We don't care about failure of intMapPut in this case. + result = result; + mutex_unlock(&bioQueueData->lock); + } else { + if (nextKvio == NULL) { + // Only prev. merge to prev's tail + intMapRemove(bioQueueData->map, getBioSector(prevKvio->biosMerged.tail)); + bio_list_merge(&prevKvio->biosMerged, &kvio->biosMerged); + result = intMapPut(bioQueueData->map, + getBioSector(prevKvio->biosMerged.head), + prevKvio, true, NULL); + result = intMapPut(bioQueueData->map, + getBioSector(prevKvio->biosMerged.tail), + prevKvio, true, NULL); + } else { + // Only next. merge to next's head + // + // Handle "next merge" and "gap fill" cases the same way so as to + // reorder bios in a way that's compatible with using funnel queues + // in work queues. This avoids removing an existing work item. + intMapRemove(bioQueueData->map, getBioSector(nextKvio->biosMerged.head)); + bio_list_merge_head(&nextKvio->biosMerged, &kvio->biosMerged); + result = intMapPut(bioQueueData->map, + getBioSector(nextKvio->biosMerged.head), + nextKvio, true, NULL); + result = intMapPut(bioQueueData->map, + getBioSector(nextKvio->biosMerged.tail), + nextKvio, true, NULL); + } + + // We don't care about failure of intMapPut in this case. 
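+    // (The self-assignment below is presumably only here to quiet
+    // "set but not used" warnings; the failure really is ignored.)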
+ result = result; + mutex_unlock(&bioQueueData->lock); + merged = true; + } + return merged; +} + +/**********************************************************************/ +static BioQueueData *bioQueueDataForPBN(IOSubmitter *ioSubmitter, + PhysicalBlockNumber pbn) +{ + unsigned int bioQueueIndex = bioQueueNumberForPBN(ioSubmitter, pbn); + return &ioSubmitter->bioQueueData[bioQueueIndex]; +} + +/**********************************************************************/ +void submitBio(BIO *bio, BioQAction action) +{ + KVIO *kvio = bio->bi_private; + kvio->bioToSubmit = bio; + setupKVIOWork(kvio, processBioMap, (KvdoWorkFunction) bio->bi_end_io, + action); + + KernelLayer *layer = kvio->layer; + BioQueueData *bioQueueData + = bioQueueDataForPBN(layer->ioSubmitter, kvio->vio->physical); + + kvioAddTraceRecord(kvio, THIS_LOCATION("$F($io)")); + + bio->bi_next = NULL; + bio_list_init(&kvio->biosMerged); + bio_list_add(&kvio->biosMerged, bio); + + /* + * Enabling of MD RAID5 mode optimizes performance for MD RAID5 storage + * configurations. It clears the bits for sync I/O RW flags on data block + * bios and sets the bits for sync I/O RW flags on all journal-related + * bios. + * + * This increases the frequency of full-stripe writes by altering flags of + * submitted bios. For workloads with write requests this increases the + * likelihood that the MD RAID5 device will update a full stripe instead of + * a partial stripe, thereby avoiding making read requests to the underlying + * physical storage for purposes of parity chunk calculations. + * + * Setting the sync-flag on journal-related bios is expected to reduce + * latency on journal updates submitted to an MD RAID5 device. + */ + if (layer->deviceConfig->mdRaid5ModeEnabled) { + if (isData(kvio)) { + // Clear the bits for sync I/O RW flags on data block bios. + clearBioOperationFlagSync(bio); + } else if ((kvio->vio->type == VIO_TYPE_RECOVERY_JOURNAL) + || (kvio->vio->type == VIO_TYPE_SLAB_JOURNAL)) { + // Set the bits for sync I/O RW flags on all journal-related and + // slab-journal-related bios. + setBioOperationFlagSync(bio); + } + } + + /* + * Try to use the bio map to submit this bio earlier if we're already sending + * IO for an adjacent block. If we can't use an existing pending bio, enqueue + * an operation to run in a bio submission thread appropriate to the + * indicated physical block number. 
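+   * (bioQueueNumberForPBN() above is what makes that thread choice.)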
+ */ + + bool merged = false; + if (USE_BIOMAP && isData(kvio)) { + merged = tryBioMapMerge(bioQueueData, kvio, bio); + } + if (!merged) { + enqueueKVIOWork(bioQueueData->queue, kvio); + } +} + +/**********************************************************************/ +static int initializeBioQueue(BioQueueData *bioQueueData, + const char *threadNamePrefix, + const char *queueName, + unsigned int queueNumber, + KernelLayer *layer) +{ +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,38) + bioQueueData->bdev = layer->dev->bdev; +#endif + bioQueueData->queueNumber = queueNumber; + + return makeWorkQueue(threadNamePrefix, queueName, &layer->wqDirectory, + layer, bioQueueData, &bioQueueType, 1, + &bioQueueData->queue); +} + +/**********************************************************************/ +int makeIOSubmitter(const char *threadNamePrefix, + unsigned int threadCount, + unsigned int rotationInterval, + unsigned int maxRequestsActive, + KernelLayer *layer, + IOSubmitter **ioSubmitterPtr) +{ + IOSubmitter *ioSubmitter; + int result = ALLOCATE_EXTENDED(IOSubmitter, + threadCount, + BioQueueData, + "bio submission data", + &ioSubmitter); + if (result != UDS_SUCCESS) { + return result; + } + + // Setup for each bio-submission work queue + char queueName[MAX_QUEUE_NAME_LEN]; + ioSubmitter->bioQueueRotationInterval = rotationInterval; + for (unsigned int i=0; i < threadCount; i++) { + BioQueueData *bioQueueData = &ioSubmitter->bioQueueData[i]; + snprintf(queueName, sizeof(queueName), "bioQ%u", i); + + if (USE_BIOMAP) { + mutex_init(&bioQueueData->lock); + /* + * One I/O operation per request, but both first & last sector numbers. + * + * If requests are assigned to threads round-robin, they should + * be distributed quite evenly. But if they're assigned based on + * PBN, things can sometimes be very uneven. So for now, we'll + * assume that all requests *may* wind up on one thread, and + * thus all in the same map. + */ + result = makeIntMap(maxRequestsActive * 2, 0, &bioQueueData->map); + if (result != 0) { + // Clean up the partially initialized bio-queue entirely and + // indicate that initialization failed. + logError("bio map initialization failed %d", result); + cleanupIOSubmitter(ioSubmitter); + freeIOSubmitter(ioSubmitter); + return result; + } + } + + result = initializeBioQueue(bioQueueData, + threadNamePrefix, + queueName, + i, + layer); + if (result != VDO_SUCCESS) { + // Clean up the partially initialized bio-queue entirely and + // indicate that initialization failed. 
+ if (USE_BIOMAP) { + freeIntMap(&ioSubmitter->bioQueueData[i].map); + } + logError("bio queue initialization failed %d", result); + cleanupIOSubmitter(ioSubmitter); + freeIOSubmitter(ioSubmitter); + return result; + } + + ioSubmitter->numBioQueuesUsed++; + } + + *ioSubmitterPtr = ioSubmitter; + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void cleanupIOSubmitter(IOSubmitter *ioSubmitter) +{ + for (int i=ioSubmitter->numBioQueuesUsed - 1; i >= 0; i--) { + finishWorkQueue(ioSubmitter->bioQueueData[i].queue); + } +} + +/**********************************************************************/ +void freeIOSubmitter(IOSubmitter *ioSubmitter) +{ + for (int i = ioSubmitter->numBioQueuesUsed - 1; i >= 0; i--) { + ioSubmitter->numBioQueuesUsed--; + freeWorkQueue(&ioSubmitter->bioQueueData[i].queue); + if (USE_BIOMAP) { + freeIntMap(&ioSubmitter->bioQueueData[i].map); + } + } + FREE(ioSubmitter); +} + +/**********************************************************************/ +void dumpBioWorkQueue(IOSubmitter *ioSubmitter) +{ + for (int i=0; i < ioSubmitter->numBioQueuesUsed; i++) { + dumpWorkQueue(ioSubmitter->bioQueueData[i].queue); + } +} + + +/**********************************************************************/ +void enqueueBioWorkItem(IOSubmitter *ioSubmitter, KvdoWorkItem *workItem) +{ + unsigned int bioQueueIndex = advanceBioRotor(ioSubmitter); + enqueueWorkQueue(ioSubmitter->bioQueueData[bioQueueIndex].queue, + workItem); +} + diff --git a/vdo/kernel/ioSubmitter.h b/vdo/kernel/ioSubmitter.h new file mode 100644 index 0000000..c4fb5ce --- /dev/null +++ b/vdo/kernel/ioSubmitter.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/ioSubmitter.h#4 $ + */ + +#ifndef IOSUBMITTER_H +#define IOSUBMITTER_H + +#include + +#include "kernelLayer.h" +#include "kvio.h" + +/** + * Does all the appropriate accounting for bio completions + * + * @param bio the bio to count + **/ +void countCompletedBios(BIO *bio); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) +/** + * Completes a bio relating to a kvio, causing the completion callback + * to be invoked. + * + * This is used as the bi_end_io function for most of the bios created + * within VDO and submitted to the storage device. Exceptions are the + * flush code and the read-block code, both of which need to regain + * control in the kernel layer after the I/O is completed. + * + * @param bio The bio to complete + **/ +void completeAsyncBio(BIO *bio); +#else +/** + * Completes a bio relating to a kvio, causing the completion callback + * to be invoked. + * + * This is used as the bi_end_io function for most of the bios created + * within VDO and submitted to the storage device. 
Exceptions are the + * flush code and the read-block code, both of which need to regain + * control in the kernel layer after the I/O is completed. + * + * @param bio The bio to complete + * @param error Possible error from underlying block device + **/ +void completeAsyncBio(BIO *bio, int error); +#endif + +/** + * Create a IOSubmitter structure for a new physical layer. + * + * @param [in] threadNamePrefix The per-device prefix to use in process names + * @param [in] threadCount Number of bio-submission threads to set up + * @param [in] rotationInterval Interval to use when rotating between + * bio-submission threads when enqueuing work + * items + * @param [in] maxRequestsActive Number of bios for merge tracking + * @param [in] layer The kernel layer + * @param [out] ioSubmitter Pointer to the new data structure + * + * @return VDO_SUCCESS or an error + **/ +int makeIOSubmitter(const char *threadNamePrefix, + unsigned int threadCount, + unsigned int rotationInterval, + unsigned int maxRequestsActive, + KernelLayer *layer, + IOSubmitter **ioSubmitter); + +/** + * Tear down the IOSubmitter fields as needed for a physical layer. + * + * @param [in] ioSubmitter The I/O submitter data to tear down + **/ +void cleanupIOSubmitter(IOSubmitter *ioSubmitter); + +/** + * Free the IOSubmitter fields and structure as needed for a + * physical layer. This must be called after + * cleanupIOSubmitter(). It is used to release resources late in + * the shutdown process to avoid or reduce the chance of race + * conditions. + * + * @param [in] ioSubmitter The I/O submitter data to destroy + **/ +void freeIOSubmitter(IOSubmitter *ioSubmitter); + +/** + * Dump info to the kernel log about the work queue used by the + * physical layer. For debugging only. + * + * @param [in] ioSubmitter The I/O submitter data + **/ +void dumpBioWorkQueue(IOSubmitter *ioSubmitter); + + +/** + * Enqueue a work item to run in the work queue(s) used for bio + * submissions from the physical layer. + * + * Outside of IOSubmitter, used only for finishing processing of empty + * flush bios by sending them to the storage device. + * + * @param ioSubmitter The I/O submitter data to update + * @param workItem The new work item to run + **/ +void enqueueBioWorkItem(IOSubmitter *ioSubmitter, KvdoWorkItem *workItem); + +/** + * Submit bio but don't block. + * + * Submits the bio to a helper work queue which sits in a loop + * submitting bios. The worker thread may block if the target device + * is busy, which is why we don't want to do the submission in the + * original calling thread. + * + * The bi_private field of the bio must point to a KVIO associated + * with the operation. The bi_end_io callback is invoked when the I/O + * operation completes. + * + * @param bio the block I/O operation descriptor to submit + * @param action the action code specifying the priority for the operation + **/ +void submitBio(BIO *bio, BioQAction action); + +#endif // IOSUBMITTER_H diff --git a/vdo/kernel/kernelLayer.c b/vdo/kernel/kernelLayer.c new file mode 100644 index 0000000..8d4d4ed --- /dev/null +++ b/vdo/kernel/kernelLayer.c @@ -0,0 +1,1409 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelLayer.c#38 $ + */ + +#include "kernelLayer.h" + +#include +#include +#include +#include +#include + +#include "logger.h" +#include "memoryAlloc.h" +#include "murmur/MurmurHash3.h" + +#include "lz4.h" +#include "releaseVersions.h" +#include "volumeGeometry.h" +#include "statistics.h" +#include "vdo.h" + +#include "bio.h" +#include "dataKVIO.h" +#include "dedupeIndex.h" +#include "deviceConfig.h" +#include "deviceRegistry.h" +#include "instanceNumber.h" +#include "ioSubmitter.h" +#include "kvdoFlush.h" +#include "kvio.h" +#include "poolSysfs.h" +#include "statusProcfs.h" +#include "stringUtils.h" +#include "verify.h" + +enum { + DEDUPE_TIMEOUT_REPORT_INTERVAL = 1000, +}; + +static const KvdoWorkQueueType bioAckQType = { + .actionTable = { + { .name = "bio_ack", + .code = BIO_ACK_Q_ACTION_ACK, + .priority = 0 }, + }, +}; + +static const KvdoWorkQueueType cpuQType = { + .actionTable = { + { .name = "cpu_complete_kvio", + .code = CPU_Q_ACTION_COMPLETE_KVIO, + .priority = 0 }, + { .name = "cpu_compress_block", + .code = CPU_Q_ACTION_COMPRESS_BLOCK, + .priority = 0 }, + { .name = "cpu_hash_block", + .code = CPU_Q_ACTION_HASH_BLOCK, + .priority = 0 }, + { .name = "cpu_event_reporter", + .code = CPU_Q_ACTION_EVENT_REPORTER, + .priority = 0 }, + }, +}; + +// 2000 is half the number of entries currently in our page cache, +// to allow for each in-progress operation to update two pages. +int defaultMaxRequestsActive = 2000; + +/**********************************************************************/ +static CRC32Checksum kvdoUpdateCRC32(CRC32Checksum crc, + const byte *buffer, + size_t length) +{ + /* + * The kernel's CRC 32 implementation does not do pre- and post- + * conditioning, so do it ourselves. + */ + return crc32(crc ^ 0xffffffff, buffer, length) ^ 0xffffffff; +} + +/**********************************************************************/ +static BlockCount kvdoGetBlockCount(PhysicalLayer *header) +{ + return asKernelLayer(header)->deviceConfig->physicalBlocks; +} + +/**********************************************************************/ +bool layerIsNamed(KernelLayer *layer, void *context) +{ + struct dm_target *ti = layer->deviceConfig->owningTarget; + const char *deviceName = dm_device_name(dm_table_get_md(ti->table)); + return (strcmp(deviceName, (const char *) context) == 0); +} + +/** + * Implements LayerFilter. 
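+ *
+ * For example, makeKernelLayer() below passes this filter to
+ * findLayerMatching() so that a new target cannot share its storage device
+ * with an already-running VDO.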
+ **/ +static bool layerUsesDevice(KernelLayer *layer, void *context) +{ + DeviceConfig *config = context; + return (layer->deviceConfig->ownedDevice->bdev->bd_dev + == config->ownedDevice->bdev->bd_dev); +} + +int mapToSystemError(int error) +{ + // 0 is success, negative a system error code + if (likely(error <= 0)) { + return error; + } + if (error < 1024) { + // errno macro used without negating - may be a minor bug + return -error; + } + // VDO or UDS error + char errorName[80], errorMessage[ERRBUF_SIZE]; + switch (sansUnrecoverable(error)) { + case VDO_NO_SPACE: + return -ENOSPC; + case VDO_READ_ONLY: + return -EIO; + default: + logInfo("%s: mapping internal status code %d (%s: %s) to EIO", + __func__, error, + stringErrorName(error, errorName, sizeof(errorName)), + stringError(error, errorMessage, sizeof(errorMessage))); + return -EIO; + } +} + +/**********************************************************************/ +static void setKernelLayerState(KernelLayer *layer, KernelLayerState newState) +{ + atomicStore32(&layer->state, newState); +} + +/**********************************************************************/ +void waitForNoRequestsActive(KernelLayer *layer) +{ + // Do nothing if there are no requests active. This check is not necessary + // for correctness but does reduce log message traffic. + if (limiterIsIdle(&layer->requestLimiter)) { + return; + } + + // We have to make sure to flush the packer before waiting. We do this + // by turning off compression, which also means no new entries coming in + // while waiting will end up in the packer. + bool wasCompressing = setKVDOCompressing(&layer->kvdo, false); + // Now wait for there to be no active requests + limiterWaitForIdle(&layer->requestLimiter); + // Reset the compression state after all requests are done + if (wasCompressing) { + setKVDOCompressing(&layer->kvdo, true); + } +} + +/** + * Start processing a new data KVIO based on the supplied bio, but from within + * a VDO thread context, when we're not allowed to block. Using this path at + * all suggests a bug or erroneous usage, but we special-case it to avoid a + * deadlock that can apparently result. Message will be logged to alert the + * administrator that something has gone wrong, while we attempt to continue + * processing other requests. + * + * If a request permit can be acquired immediately, kvdoLaunchDataKVIOFromBio + * will be called. (If the bio is a discard operation, a permit from the + * discard limiter will be requested but the call will be made with or without + * it.) If the request permit is not available, the bio will be saved on a list + * to be launched later. Either way, this function will not block, and will + * take responsibility for processing the bio. + * + * @param layer The kernel layer + * @param bio The bio to launch + * @param arrivalTime The arrival time of the bio + * + * @return DM_MAPIO_SUBMITTED or a system error code + **/ +static int launchDataKVIOFromVDOThread(KernelLayer *layer, + BIO *bio, + Jiffies arrivalTime) +{ + logWarning("kvdoMapBio called from within a VDO thread!"); + /* + * We're not yet entirely sure what circumstances are causing this situation + * in [ESC-638], but it does appear to be happening and causing VDO to + * deadlock. 
+ * + * Somehow kvdoMapBio is being called from generic_make_request which is + * being called from the VDO code to pass a flush on down to the underlying + * storage system; we've got 2000 requests in progress, so we have to wait + * for one to complete, but none can complete while the bio thread is blocked + * from passing more I/O requests down. Near as we can tell, the flush bio + * should always have gotten updated to point to the storage system, so we + * shouldn't be calling back into VDO unless something's gotten messed up + * somewhere. + * + * To side-step this case, if the limiter says we're busy *and* we're running + * on one of VDO's own threads, we'll drop the I/O request in a special queue + * for processing as soon as KVIOs become free. + * + * We don't want to do this in general because it leads to unbounded + * buffering, arbitrarily high latencies, inability to push back in a way the + * caller can take advantage of, etc. If someone wants huge amounts of + * buffering on top of VDO, they're welcome to access it through the kernel + * page cache or roll their own. + */ + if (!limiterPoll(&layer->requestLimiter)) { + addToDeadlockQueue(&layer->deadlockQueue, bio, arrivalTime); + logWarning("queued an I/O request to avoid deadlock!"); + + return DM_MAPIO_SUBMITTED; + } + + bool hasDiscardPermit + = (isDiscardBio(bio) && limiterPoll(&layer->discardLimiter)); + int result = kvdoLaunchDataKVIOFromBio(layer, bio, arrivalTime, + hasDiscardPermit); + // Succeed or fail, kvdoLaunchDataKVIOFromBio owns the permit(s) now. + if (result != VDO_SUCCESS) { + return result; + } + + return DM_MAPIO_SUBMITTED; +} + +/**********************************************************************/ +int kvdoMapBio(KernelLayer *layer, BIO *bio) +{ + Jiffies arrivalTime = jiffies; + KernelLayerState state = getKernelLayerState(layer); + ASSERT_LOG_ONLY(state == LAYER_RUNNING, + "kvdoMapBio should not be called while in state %d", state); + + // Count all incoming bios. + countBios(&layer->biosIn, bio); + + // Handle empty bios. Empty flush bios are not associated with a VIO. + if (isFlushBio(bio)) { + if (ASSERT(getBioSize(bio) == 0, "Flush bio is size 0") != VDO_SUCCESS) { + // We expect flushes to be of size 0. + return -EINVAL; + } + if (shouldProcessFlush(layer)) { + launchKVDOFlush(layer, bio); + return DM_MAPIO_SUBMITTED; + } else { + // We're not acknowledging this bio now, but we'll never touch it + // again, so this is the last chance to account for it. + countBios(&layer->biosAcknowledged, bio); + atomic64_inc(&layer->flushOut); + setBioBlockDevice(bio, getKernelLayerBdev(layer)); + return DM_MAPIO_REMAPPED; + } + } + + if (ASSERT(getBioSize(bio) != 0, "Data bio is not size 0") != VDO_SUCCESS) { + // We expect non-flushes to be non-zero in size. + return -EINVAL; + } + + if (isDiscardBio(bio) && isReadBio(bio)) { + // Read and Discard should never occur together + return -EIO; + } + + KvdoWorkQueue *currentWorkQueue = getCurrentWorkQueue(); + if ((currentWorkQueue != NULL) + && (layer == getWorkQueueOwner(currentWorkQueue))) { + /* + * This prohibits sleeping during I/O submission to VDO from its own + * thread. 
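+     * (See launchDataKVIOFromVDOThread() above for the deadlock this
+     * avoids and for how the request is queued instead of blocking.)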
+ */ + return launchDataKVIOFromVDOThread(layer, bio, arrivalTime); + } + bool hasDiscardPermit = false; + if (isDiscardBio(bio)) { + limiterWaitForOneFree(&layer->discardLimiter); + hasDiscardPermit = true; + } + limiterWaitForOneFree(&layer->requestLimiter); + + int result = kvdoLaunchDataKVIOFromBio(layer, bio, arrivalTime, + hasDiscardPermit); + // Succeed or fail, kvdoLaunchDataKVIOFromBio owns the permit(s) now. + if (result != VDO_SUCCESS) { + return result; + } + + return DM_MAPIO_SUBMITTED; +} + +/**********************************************************************/ +struct block_device *getKernelLayerBdev(const KernelLayer *layer) +{ + return layer->deviceConfig->ownedDevice->bdev; +} + +/**********************************************************************/ +void completeManyRequests(KernelLayer *layer, uint32_t count) +{ + // If we had to buffer some requests to avoid deadlock, release them now. + while (count > 0) { + Jiffies arrivalTime = 0; + BIO *bio = pollDeadlockQueue(&layer->deadlockQueue, &arrivalTime); + if (likely(bio == NULL)) { + break; + } + + bool hasDiscardPermit + = (isDiscardBio(bio) && limiterPoll(&layer->discardLimiter)); + int result = kvdoLaunchDataKVIOFromBio(layer, bio, arrivalTime, + hasDiscardPermit); + if (result != VDO_SUCCESS) { + completeBio(bio, result); + } + // Succeed or fail, kvdoLaunchDataKVIOFromBio owns the permit(s) now. + count--; + } + // Notify the limiter, so it can wake any blocked processes. + if (count > 0) { + limiterReleaseMany(&layer->requestLimiter, count); + } +} + +/**********************************************************************/ +static void reportEvents(PeriodicEventReporter *reporter) +{ + atomic_set(&reporter->workItemQueued, 0); + uint64_t newValue = atomic64_read(&reporter->value); + uint64_t difference = newValue - reporter->lastReportedValue; + if (difference != 0) { + logDebug(reporter->format, difference); + reporter->lastReportedValue = newValue; + } +} + +/**********************************************************************/ +static void reportEventsWork(KvdoWorkItem *item) +{ + PeriodicEventReporter *reporter = container_of(item, PeriodicEventReporter, + workItem); + reportEvents(reporter); +} + +/**********************************************************************/ +static void initPeriodicEventReporter(PeriodicEventReporter *reporter, + const char *format, + unsigned long reportingInterval, + KernelLayer *layer) +{ + setupWorkItem(&reporter->workItem, reportEventsWork, NULL, + CPU_Q_ACTION_EVENT_REPORTER); + reporter->format = format; + reporter->reportingInterval = msecs_to_jiffies(reportingInterval); + reporter->layer = layer; +} + +/**********************************************************************/ +static void addEventCount(PeriodicEventReporter *reporter, unsigned int count) +{ + if (count > 0) { + atomic64_add(count, &reporter->value); + int oldWorkItemQueued = atomic_xchg(&reporter->workItemQueued, 1); + if (oldWorkItemQueued == 0) { + enqueueWorkQueueDelayed(reporter->layer->cpuQueue, + &reporter->workItem, + jiffies + reporter->reportingInterval); + } + } +} + +/**********************************************************************/ +static void stopPeriodicEventReporter(PeriodicEventReporter *reporter) +{ + reportEvents(reporter); +} + +/**********************************************************************/ +void kvdoReportDedupeTimeout(KernelLayer *layer, unsigned int expiredCount) +{ + addEventCount(&layer->albireoTimeoutReporter, expiredCount); +} + 
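+/*
+ * A minimal sketch of how the reporter machinery above is meant to be used,
+ * based on the albireoTimeoutReporter wiring in this file; "myReporter" and
+ * the format string are purely illustrative names, not part of the module:
+ *
+ *   initPeriodicEventReporter(&myReporter, "saw %llu events",
+ *                             DEDUPE_TIMEOUT_REPORT_INTERVAL, layer);
+ *   ...
+ *   addEventCount(&myReporter, 1);           // hot path: atomic add plus at
+ *                                            // most one delayed work item
+ *   ...
+ *   stopPeriodicEventReporter(&myReporter);  // log any final unreported count
+ *
+ * Because reporting is deferred to the CPU work queue and limited to roughly
+ * one log line per reporting interval, callers can report events from fast
+ * paths without blocking on the logger.
+ */
+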
+/**********************************************************************/ +static int kvdoCreateEnqueueable(VDOCompletion *completion) +{ + KvdoEnqueueable *kvdoEnqueueable; + int result = ALLOCATE(1, KvdoEnqueueable, "kvdoEnqueueable", + &kvdoEnqueueable); + if (result != VDO_SUCCESS) { + logError("kvdoEnqueueable allocation failure %d", result); + return result; + } + kvdoEnqueueable->enqueueable.completion = completion; + completion->enqueueable = &kvdoEnqueueable->enqueueable; + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void kvdoDestroyEnqueueable(Enqueueable **enqueueablePtr) +{ + Enqueueable *enqueueable = *enqueueablePtr; + if (enqueueable != NULL) { + KvdoEnqueueable *kvdoEnqueueable + = container_of(enqueueable, KvdoEnqueueable, enqueueable); + FREE(kvdoEnqueueable); + *enqueueablePtr = NULL; + } +} + +/** + * Implements BufferAllocator. + **/ +static int kvdoAllocateIOBuffer(PhysicalLayer *layer __attribute__((unused)), + size_t bytes, + const char *why, + char **bufferPtr) +{ + return ALLOCATE(bytes, char, why, bufferPtr); +} + +/** + * Implements ExtentReader. Exists only for the geometry block; is unset after + * it is read. + **/ +static int kvdoSynchronousRead(PhysicalLayer *layer, + PhysicalBlockNumber startBlock, + size_t blockCount, + char *buffer, + size_t *blocksRead) +{ + if (blockCount != 1) { + return VDO_NOT_IMPLEMENTED; + } + + KernelLayer *kernelLayer = asKernelLayer(layer); + + BIO *bio; + int result = createBio(kernelLayer, buffer, &bio); + if (result != VDO_SUCCESS) { + return result; + } + setBioBlockDevice(bio, getKernelLayerBdev(kernelLayer)); + setBioSector(bio, blockToSector(kernelLayer, startBlock)); + setBioOperationRead(bio); + result = submitBioAndWait(bio); + if (result != 0) { + logErrorWithStringError(result, "synchronous read failed"); + result = -EIO; + } + freeBio(bio, kernelLayer); + + if (result != VDO_SUCCESS) { + return result; + } + if (blocksRead != NULL) { + *blocksRead = blockCount; + } + return VDO_SUCCESS; +} + +/** + * Implements VIODestructor. + **/ +static void kvdoFreeVIO(VIO **vioPtr) +{ + VIO *vio = *vioPtr; + if (vio == NULL) { + return; + } + + BUG_ON(isDataVIO(vio)); + + if (isCompressedWriteVIO(vio)) { + CompressedWriteKVIO *compressedWriteKVIO + = allocatingVIOAsCompressedWriteKVIO(vioAsAllocatingVIO(vio)); + freeCompressedWriteKVIO(&compressedWriteKVIO); + } else { + MetadataKVIO *metadataKVIO = vioAsMetadataKVIO(vio); + freeMetadataKVIO(&metadataKVIO); + } + + *vioPtr = NULL; +} + +/**********************************************************************/ +static WritePolicy kvdoGetWritePolicy(PhysicalLayer *common) +{ + KernelLayer *layer = asKernelLayer(common); + return getKVDOWritePolicy(&layer->kvdo); +} + +/** + * Function that is called when a synchronous operation is completed. We let + * the waiting thread know it can continue. + * + *

+ * Implements OperationComplete.
+ *
+ * @param common  The kernel layer
+ **/
+static void kvdoCompleteSyncOperation(PhysicalLayer *common)
+{
+  KernelLayer *layer = asKernelLayer(common);
+  complete(&layer->callbackSync);
+}
+
+/**
+ * Wait for a synchronous operation to complete.
+ *
+ *

Implements OperationWaiter. + * + * @param common The kernel layer + **/ +static void waitForSyncOperation(PhysicalLayer *common) +{ + KernelLayer *layer = asKernelLayer(common); + // Using the "interruptible" interface means that Linux will not log a + // message when we wait for more than 120 seconds. + while (wait_for_completion_interruptible(&layer->callbackSync) != 0) { + // However, if we get a signal in a user-mode process, we could + // spin... + msleep(1); + } +} + +/** + * Make the bio set for allocating new bios. + * + * @param layer The kernel layer + * + * @returns VDO_SUCCESS if bio set created, error code otherwise + **/ +static int makeDedupeBioSet(KernelLayer *layer) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,18,0) + int result = ALLOCATE(1, struct bio_set, "bio set", &layer->bioset); + if (result != VDO_SUCCESS) { + return result; + } + + result = bioset_init(layer->bioset, 0, 0, BIOSET_NEED_BVECS); + if (result != 0) { + return result; + } +#else +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,13,0) + layer->bioset = bioset_create(0, 0, BIOSET_NEED_BVECS); +#else + layer->bioset = bioset_create(0, 0); +#endif + if (layer->bioset == NULL) { + return -ENOMEM; + } +#endif + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeKernelLayer(uint64_t startingSector, + unsigned int instance, + DeviceConfig *config, + struct kobject *parentKobject, + ThreadConfig **threadConfigPointer, + char **reason, + KernelLayer **layerPtr) +{ + // VDO-3769 - Set a generic reason so we don't ever return garbage. + *reason = "Unspecified error"; + + KernelLayer *oldLayer = findLayerMatching(layerUsesDevice, config); + if (oldLayer != NULL) { + logError("Existing layer named %s already uses device %s", + oldLayer->deviceConfig->poolName, + oldLayer->deviceConfig->parentDeviceName); + *reason = "Cannot share storage device with already-running VDO"; + return VDO_BAD_CONFIGURATION; + } + + /* + * Part 1 - Allocate the kernel layer, its essential parts, and setup up the + * sysfs node. These must come first so that the sysfs node works correctly + * through the freeing of the kernel layer. After this part you must use + * freeKernelLayer. + */ + KernelLayer *layer; + int result = ALLOCATE(1, KernelLayer, "VDO configuration", &layer); + if (result != UDS_SUCCESS) { + *reason = "Cannot allocate VDO configuration"; + return result; + } + + // Allow the base VDO to allocate buffers and construct or destroy + // enqueuables as part of its allocation. + layer->common.allocateIOBuffer = kvdoAllocateIOBuffer; + layer->common.createEnqueueable = kvdoCreateEnqueueable; + layer->common.destroyEnqueueable = kvdoDestroyEnqueueable; + + result = allocateVDO(&layer->common, &layer->kvdo.vdo); + if (result != VDO_SUCCESS) { + *reason = "Cannot allocate VDO"; + FREE(layer); + return result; + } + + // After this point, calling kobject_put on kobj will decrement its + // reference count, and when the count goes to 0 the KernelLayer will + // be freed. 
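+  // (The sysfs error paths immediately below therefore release the layer
+  // with kobject_put() rather than FREE().)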
+ kobject_init(&layer->kobj, &kernelLayerKobjType); + result = kobject_add(&layer->kobj, parentKobject, config->poolName); + if (result != 0) { + *reason = "Cannot add sysfs node"; + kobject_put(&layer->kobj); + return result; + } + kobject_init(&layer->wqDirectory, &workQueueDirectoryKobjType); + result = kobject_add(&layer->wqDirectory, &layer->kobj, "work_queues"); + if (result != 0) { + *reason = "Cannot add sysfs node"; + kobject_put(&layer->wqDirectory); + kobject_put(&layer->kobj); + return result; + } + + /* + * Part 2 - Do all the simple initialization. These initializations have no + * order dependencies and can be done in any order, but freeKernelLayer() + * cannot be called until all the simple layer properties are set. + * + * The KernelLayer structure starts as all zeros. Pointer initializations + * consist of replacing a NULL pointer with a non-NULL pointer, which can be + * easily undone by freeing all of the non-NULL pointers (using the proper + * free routine). + */ + setKernelLayerState(layer, LAYER_SIMPLE_THINGS_INITIALIZED); + + initializeDeadlockQueue(&layer->deadlockQueue); + + int requestLimit = defaultMaxRequestsActive; + initializeLimiter(&layer->requestLimiter, requestLimit); + initializeLimiter(&layer->discardLimiter, requestLimit * 3 / 4); + + layer->allocationsAllowed = true; + layer->instance = instance; + layer->deviceConfig = config; + layer->startingSectorOffset = startingSector; + initializeRing(&layer->deviceConfigRing); + + layer->common.updateCRC32 = kvdoUpdateCRC32; + layer->common.getBlockCount = kvdoGetBlockCount; + layer->common.getWritePolicy = kvdoGetWritePolicy; + layer->common.createMetadataVIO = kvdoCreateMetadataVIO; + layer->common.createCompressedWriteVIO = kvdoCreateCompressedWriteVIO; + layer->common.freeVIO = kvdoFreeVIO; + layer->common.completeFlush = kvdoCompleteFlush; + layer->common.enqueue = kvdoEnqueue; + layer->common.waitForAdminOperation = waitForSyncOperation; + layer->common.completeAdminOperation = kvdoCompleteSyncOperation; + layer->common.getCurrentThreadID = kvdoGetCurrentThreadID; + layer->common.zeroDataVIO = kvdoZeroDataVIO; + layer->common.compareDataVIOs = kvdoCompareDataVIOs; + layer->common.copyData = kvdoCopyDataVIO; + layer->common.readData = kvdoReadDataVIO; + layer->common.writeData = kvdoWriteDataVIO; + layer->common.writeCompressedBlock = kvdoWriteCompressedBlock; + layer->common.readMetadata = kvdoSubmitMetadataVIO; + layer->common.writeMetadata = kvdoSubmitMetadataVIO; + layer->common.applyPartialWrite = kvdoModifyWriteDataVIO; + layer->common.flush = kvdoFlushVIO; + layer->common.hashData = kvdoHashDataVIO; + layer->common.checkForDuplication = kvdoCheckForDuplication; + layer->common.verifyDuplication = kvdoVerifyDuplication; + layer->common.acknowledgeDataVIO = kvdoAcknowledgeDataVIO; + layer->common.compressDataVIO = kvdoCompressDataVIO; + layer->common.updateAlbireo = kvdoUpdateDedupeAdvice; + + spin_lock_init(&layer->flushLock); + mutex_init(&layer->statsMutex); + bio_list_init(&layer->waitingFlushes); + + result = addLayerToDeviceRegistry(layer); + if (result != VDO_SUCCESS) { + *reason = "Cannot add layer to device registry"; + freeKernelLayer(layer); + return result; + } + + snprintf(layer->threadNamePrefix, sizeof(layer->threadNamePrefix), "%s%u", + THIS_MODULE->name, instance); + + result = makeThreadConfig(config->threadCounts.logicalZones, + config->threadCounts.physicalZones, + config->threadCounts.hashZones, + threadConfigPointer); + if (result != VDO_SUCCESS) { + *reason = "Cannot create 
thread configuration"; + freeKernelLayer(layer); + return result; + } + + logInfo("zones: %d logical, %d physical, %d hash; base threads: %d", + config->threadCounts.logicalZones, + config->threadCounts.physicalZones, + config->threadCounts.hashZones, + (*threadConfigPointer)->baseThreadCount); + + result = makeBatchProcessor(layer, returnDataKVIOBatchToPool, layer, + &layer->dataKVIOReleaser); + if (result != UDS_SUCCESS) { + *reason = "Cannot allocate KVIO-freeing batch processor"; + freeKernelLayer(layer); + return result; + } + + // Spare KVDOFlush, so that we will always have at least one available + result = makeKVDOFlush(&layer->spareKVDOFlush); + if (result != UDS_SUCCESS) { + *reason = "Cannot allocate KVDOFlush record"; + freeKernelLayer(layer); + return result; + } + + // BIO pool (needed before the geometry block) + result = makeDedupeBioSet(layer); + if (result != VDO_SUCCESS) { + *reason = "Cannot allocate dedupe bioset"; + freeKernelLayer(layer); + return result; + } + + // Read the geometry block so we know how to set up the index. Allow it to + // do synchronous reads. + layer->common.reader = kvdoSynchronousRead; + result = loadVolumeGeometry(&layer->common, &layer->geometry); + layer->common.reader = NULL; + if (result != VDO_SUCCESS) { + *reason = "Could not load geometry block"; + freeKernelLayer(layer); + return result; + } + + // Albireo Timeout Reporter + initPeriodicEventReporter(&layer->albireoTimeoutReporter, + "Albireo timeout on %llu requests", + DEDUPE_TIMEOUT_REPORT_INTERVAL, layer); + + // Dedupe Index + BUG_ON(layer->threadNamePrefix[0] == '\0'); + result = makeDedupeIndex(&layer->dedupeIndex, layer); + if (result != UDS_SUCCESS) { + *reason = "Cannot initialize dedupe index"; + freeKernelLayer(layer); + return result; + } + + // Compression context storage + result = ALLOCATE(config->threadCounts.cpuThreads, char *, "LZ4 context", + &layer->compressionContext); + if (result != VDO_SUCCESS) { + *reason = "cannot allocate LZ4 context"; + freeKernelLayer(layer); + return result; + } + for (int i = 0; i < config->threadCounts.cpuThreads; i++) { + result = ALLOCATE(LZ4_context_size(), char, "LZ4 context", + &layer->compressionContext[i]); + if (result != VDO_SUCCESS) { + *reason = "cannot allocate LZ4 context"; + freeKernelLayer(layer); + return result; + } + } + + + /* + * Part 3 - Do initializations that depend upon other previous + * initializations, but have no order dependencies at freeing time. + * Order dependencies for initialization are identified using BUG_ON. + */ + setKernelLayerState(layer, LAYER_BUFFER_POOLS_INITIALIZED); + + // Trace pool + BUG_ON(layer->requestLimiter.limit <= 0); + result = traceKernelLayerInit(layer); + if (result != VDO_SUCCESS) { + *reason = "Cannot initialize trace data"; + freeKernelLayer(layer); + return result; + } + + // KVIO and VIO pool + BUG_ON(layer->deviceConfig->logicalBlockSize <= 0); + BUG_ON(layer->requestLimiter.limit <= 0); + BUG_ON(layer->bioset == NULL); + BUG_ON(layer->deviceConfig->ownedDevice == NULL); + result = makeDataKVIOBufferPool(layer, layer->requestLimiter.limit, + &layer->dataKVIOPool); + if (result != VDO_SUCCESS) { + *reason = "Cannot allocate vio data"; + freeKernelLayer(layer); + return result; + } + + /* + * Part 4 - Do initializations that depend upon other previous + * initialization, that may have order dependencies at freeing time. + * These are mostly starting up the workqueue threads. 
+ */ + + // Base-code thread, etc + result = initializeKVDO(&layer->kvdo, *threadConfigPointer, reason); + if (result != VDO_SUCCESS) { + freeKernelLayer(layer); + return result; + } + + setKernelLayerState(layer, LAYER_REQUEST_QUEUE_INITIALIZED); + + // Bio queue + result = makeIOSubmitter(layer->threadNamePrefix, + config->threadCounts.bioThreads, + config->threadCounts.bioRotationInterval, + layer->requestLimiter.limit, + layer, + &layer->ioSubmitter); + if (result != VDO_SUCCESS) { + // If initialization of the bio-queues failed, they are cleaned + // up already, so just free the rest of the kernel layer. + freeKernelLayer(layer); + *reason = "bio submission initialization failed"; + return result; + } + setKernelLayerState(layer, LAYER_BIO_DATA_INITIALIZED); + + // Bio ack queue + if (useBioAckQueue(layer)) { + result = makeWorkQueue(layer->threadNamePrefix, "ackQ", + &layer->wqDirectory, layer, layer, &bioAckQType, + config->threadCounts.bioAckThreads, + &layer->bioAckQueue); + if (result != VDO_SUCCESS) { + *reason = "bio ack queue initialization failed"; + freeKernelLayer(layer); + return result; + } + } + + setKernelLayerState(layer, LAYER_BIO_ACK_QUEUE_INITIALIZED); + + // CPU Queues + result = makeWorkQueue(layer->threadNamePrefix, "cpuQ", &layer->wqDirectory, + layer, NULL, &cpuQType, + config->threadCounts.cpuThreads, &layer->cpuQueue); + if (result != VDO_SUCCESS) { + *reason = "Albireo CPU queue initialization failed"; + freeKernelLayer(layer); + return result; + } + + setKernelLayerState(layer, LAYER_CPU_QUEUE_INITIALIZED); + + *layerPtr = layer; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int prepareToModifyKernelLayer(KernelLayer *layer, + DeviceConfig *config, + char **errorPtr) +{ + DeviceConfig *extantConfig = layer->deviceConfig; + if (config->owningTarget->begin != extantConfig->owningTarget->begin) { + *errorPtr = "Starting sector cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (strcmp(config->parentDeviceName, extantConfig->parentDeviceName) != 0) { + *errorPtr = "Underlying device cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (config->logicalBlockSize != extantConfig->logicalBlockSize) { + *errorPtr = "Logical block size cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (config->cacheSize != extantConfig->cacheSize) { + *errorPtr = "Block map cache size cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (config->blockMapMaximumAge != extantConfig->blockMapMaximumAge) { + *errorPtr = "Block map maximum age cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (config->mdRaid5ModeEnabled != extantConfig->mdRaid5ModeEnabled) { + *errorPtr = "mdRaid5Mode cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (memcmp(&config->threadCounts, &extantConfig->threadCounts, + sizeof(ThreadCountConfig)) != 0) { + *errorPtr = "Thread configuration cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + // Below here are the actions to take when a non-immutable property changes. + + if (config->writePolicy != extantConfig->writePolicy) { + // Nothing needs doing right now for a write policy change. 
+ } + + if (config->owningTarget->len != extantConfig->owningTarget->len) { + size_t logicalBytes = to_bytes(config->owningTarget->len); + if ((logicalBytes % VDO_BLOCK_SIZE) != 0) { + *errorPtr = "Logical size must be a multiple of 4096"; + return VDO_PARAMETER_MISMATCH; + } + + int result = prepareToResizeLogical(layer, logicalBytes / VDO_BLOCK_SIZE); + if (result != VDO_SUCCESS) { + *errorPtr = "Device prepareToGrowLogical failed"; + return result; + } + } + + if (config->physicalBlocks != extantConfig->physicalBlocks) { + int result = prepareToResizePhysical(layer, config->physicalBlocks); + if (result != VDO_SUCCESS) { + if (result == VDO_TOO_MANY_SLABS) { + *errorPtr = "Device prepareToGrowPhysical failed (specified physical" + " size too big based on formatted slab size)"; + } else { + *errorPtr = "Device prepareToGrowPhysical failed"; + } + return result; + } + } + + return VDO_SUCCESS; +} + +/********************************************************************** + * Modify the pool name of the device. + * + * @param layer The kernel layer + * @param oldName The old pool name + * @param newName The new pool name + * + * @return VDO_SUCCESS or an error + * + */ +int modifyPoolName(KernelLayer *layer, char *oldName, char *newName) +{ + // We use pool name for sysfs and procfs. Rename them accordingly + logInfo("Modify pool name from %s to %s", oldName, newName); + + void *procfsPrivate; + int result = vdoCreateProcfsEntry(layer, newName, &procfsPrivate); + if (result != VDO_SUCCESS) { + return result; + } + + result = kobject_rename(&layer->kobj, newName); + if (result != 0) { + vdoDestroyProcfsEntry(newName, procfsPrivate); + return result; + } + + void *tmpProcfs = layer->procfsPrivate; + layer->procfsPrivate = procfsPrivate; + + vdoDestroyProcfsEntry(oldName, tmpProcfs); + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int modifyKernelLayer(KernelLayer *layer, + DeviceConfig *config) +{ + KernelLayerState state = getKernelLayerState(layer); + if (state == LAYER_RUNNING) { + return VDO_SUCCESS; + } else if (state != LAYER_SUSPENDED) { + logError("pre-resume invoked while in unexpected kernel layer state %d", + state); + return -EINVAL; + } + + setKernelLayerState(layer, LAYER_RESUMING); + + DeviceConfig *extantConfig = layer->deviceConfig; + + // A failure here is unrecoverable. So there is no problem if it happens. + + if (config->writePolicy != extantConfig->writePolicy) { + /* + * Ordinarily, when going from async to sync, we must flush any metadata + * written. However, because the underlying storage must have gone into + * sync mode before we suspend VDO, and suspending VDO concludes by + * issuing a flush, all metadata written before the suspend is flushed + * by the suspend and all metadata between the suspend and the write + * policy change is written to synchronous storage. + */ + logInfo("Modifying device '%s' write policy from %s to %s", + config->poolName, getConfigWritePolicyString(extantConfig), + getConfigWritePolicyString(config)); + setWritePolicy(layer->kvdo.vdo, config->writePolicy); + } + + if (config->owningTarget->len != extantConfig->owningTarget->len) { + size_t logicalBytes = to_bytes(config->owningTarget->len); + int result = resizeLogical(layer, logicalBytes / VDO_BLOCK_SIZE); + if (result != VDO_SUCCESS) { + return result; + } + } + + // Grow physical if the version is 0, so we can't tell if we + // got an old-style growPhysical command, or if size changed. 
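+  //
+  // In other words: a version 0 table line carries no reliable signal for an
+  // explicit growPhysical request, so the resize below is attempted whenever
+  // the version is 0, in addition to whenever the configured physical size
+  // actually differs.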
+ if ((config->physicalBlocks != extantConfig->physicalBlocks) + || (config->version == 0)) { + int result = resizePhysical(layer, config->physicalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + } + + if (strcmp(config->poolName, extantConfig->poolName) != 0) { + logInfo("Modifying device '%s' pool name from %s to %s", + config->poolName, extantConfig->poolName, config->poolName); + int result = modifyPoolName(layer, extantConfig->poolName, + config->poolName); + if (result != VDO_SUCCESS) { + return result; + } + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeKernelLayer(KernelLayer *layer) +{ + // This is not the cleanest implementation, but given the current timing + // uncertainties in the shutdown process for work queues, we need to + // store information to enable a late-in-process deallocation of + // funnel-queue data structures in work queues. + bool usedBioAckQueue = false; + bool usedCpuQueue = false; + bool usedKVDO = false; + bool releaseInstance = false; + + KernelLayerState state = getKernelLayerState(layer); + switch (state) { + case LAYER_STOPPING: + logError("re-entered freeKernelLayer while stopping"); + break; + + case LAYER_RUNNING: + suspendKernelLayer(layer); + // fall through + + case LAYER_STARTING: + case LAYER_RESUMING: + case LAYER_SUSPENDED: + stopKernelLayer(layer); + // fall through + + case LAYER_STOPPED: + case LAYER_CPU_QUEUE_INITIALIZED: + finishWorkQueue(layer->cpuQueue); + usedCpuQueue = true; + releaseInstance = true; + // fall through + + case LAYER_BIO_ACK_QUEUE_INITIALIZED: + if (useBioAckQueue(layer)) { + finishWorkQueue(layer->bioAckQueue); + usedBioAckQueue = true; + } + // fall through + + case LAYER_BIO_DATA_INITIALIZED: + cleanupIOSubmitter(layer->ioSubmitter); + // fall through + + case LAYER_REQUEST_QUEUE_INITIALIZED: + finishKVDO(&layer->kvdo); + usedKVDO = true; + // fall through + + case LAYER_BUFFER_POOLS_INITIALIZED: + freeBufferPool(&layer->dataKVIOPool); + freeBufferPool(&layer->traceBufferPool); + // fall through + + case LAYER_SIMPLE_THINGS_INITIALIZED: + if (layer->compressionContext != NULL) { + for (int i = 0; i < layer->deviceConfig->threadCounts.cpuThreads; i++) { + FREE(layer->compressionContext[i]); + } + FREE(layer->compressionContext); + } + if (layer->dedupeIndex != NULL) { + finishDedupeIndex(layer->dedupeIndex); + } + FREE(layer->spareKVDOFlush); + layer->spareKVDOFlush = NULL; + freeBatchProcessor(&layer->dataKVIOReleaser); + removeLayerFromDeviceRegistry(layer); + break; + + default: + logError("Unknown Kernel Layer state: %d", state); + } + + // Late deallocation of resources in work queues. + if (usedCpuQueue) { + freeWorkQueue(&layer->cpuQueue); + } + if (usedBioAckQueue) { + freeWorkQueue(&layer->bioAckQueue); + } + if (layer->ioSubmitter) { + freeIOSubmitter(layer->ioSubmitter); + } + if (usedKVDO) { + destroyKVDO(&layer->kvdo); + } + if (layer->bioset != NULL) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,18,0) + bioset_exit(layer->bioset); + FREE(layer->bioset); +#else + bioset_free(layer->bioset); +#endif + layer->bioset = NULL; + } + + freeDedupeIndex(&layer->dedupeIndex); + + stopPeriodicEventReporter(&layer->albireoTimeoutReporter); + if (releaseInstance) { + releaseKVDOInstance(layer->instance); + } + + // The call to kobject_put on the kobj sysfs node will decrement its + // reference count; when the count goes to zero the VDO object and + // the kernel layer object will be freed as a side effect. 
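+  //
+  // Note the ordering: wqDirectory was added as a child of kobj, so it is
+  // put first; releasing the child also drops the reference it holds on its
+  // parent.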
+ kobject_put(&layer->wqDirectory); + kobject_put(&layer->kobj); +} + +/**********************************************************************/ +static void poolStatsRelease(struct kobject *kobj) +{ + KernelLayer *layer = container_of(kobj, KernelLayer, statsDirectory); + complete(&layer->statsShutdown); +} + +/**********************************************************************/ +int preloadKernelLayer(KernelLayer *layer, + const VDOLoadConfig *loadConfig, + char **reason) +{ + if (getKernelLayerState(layer) != LAYER_CPU_QUEUE_INITIALIZED) { + *reason = "preloadKernelLayer() may only be invoked after initialization"; + return UDS_BAD_STATE; + } + + setKernelLayerState(layer, LAYER_STARTING); + int result = preloadKVDO(&layer->kvdo, &layer->common, loadConfig, + layer->vioTraceRecording, reason); + if (result != VDO_SUCCESS) { + stopKernelLayer(layer); + return result; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int startKernelLayer(KernelLayer *layer, char **reason) +{ + if (getKernelLayerState(layer) != LAYER_STARTING) { + *reason = "Cannot start kernel from non-starting state"; + stopKernelLayer(layer); + return UDS_BAD_STATE; + } + + int result = startKVDO(&layer->kvdo, &layer->common, reason); + if (result != VDO_SUCCESS) { + stopKernelLayer(layer); + return result; + } + + setKernelLayerState(layer, LAYER_RUNNING); + static struct kobj_type statsDirectoryKobjType = { + .release = poolStatsRelease, + .sysfs_ops = &poolStatsSysfsOps, + .default_attrs = poolStatsAttrs, + }; + kobject_init(&layer->statsDirectory, &statsDirectoryKobjType); + result = kobject_add(&layer->statsDirectory, &layer->kobj, "statistics"); + if (result != 0) { + *reason = "Cannot add sysfs statistics node"; + stopKernelLayer(layer); + return result; + } + layer->statsAdded = true; + + if (layer->deviceConfig->deduplication) { + // Don't try to load or rebuild the index first (and log scary error + // messages) if this is known to be a newly-formatted volume. + startDedupeIndex(layer->dedupeIndex, wasNew(layer->kvdo.vdo)); + } + + result = vdoCreateProcfsEntry(layer, layer->deviceConfig->poolName, + &layer->procfsPrivate); + if (result != VDO_SUCCESS) { + *reason = "Could not create proc filesystem entry"; + stopKernelLayer(layer); + return result; + } + + layer->allocationsAllowed = false; + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void stopKernelLayer(KernelLayer *layer) +{ + layer->allocationsAllowed = true; + + // Stop services that need to gather VDO statistics from the worker threads. + if (layer->statsAdded) { + layer->statsAdded = false; + init_completion(&layer->statsShutdown); + kobject_put(&layer->statsDirectory); + wait_for_completion(&layer->statsShutdown); + } + vdoDestroyProcfsEntry(layer->deviceConfig->poolName, layer->procfsPrivate); + + switch (getKernelLayerState(layer)) { + case LAYER_RUNNING: + suspendKernelLayer(layer); + // fall through + + case LAYER_SUSPENDED: + setKernelLayerState(layer, LAYER_STOPPING); + stopDedupeIndex(layer->dedupeIndex); + // fall through + + case LAYER_STOPPING: + case LAYER_STOPPED: + default: + setKernelLayerState(layer, LAYER_STOPPED); + } +} + +/**********************************************************************/ +int suspendKernelLayer(KernelLayer *layer) +{ + // It's important to note any error here does not actually stop device-mapper + // from suspending the device. All this work is done post suspend. 
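+  //
+  // For context: the device-mapper .postsuspend hook returns void, so an
+  // error reported here cannot abort the suspend. A caller wiring this into
+  // the target might look roughly like the following (callback and helper
+  // names are illustrative, not taken from this file):
+  //
+  //   static void vdoPostsuspend(struct dm_target *ti)
+  //   {
+  //     KernelLayer *layer = getLayerForTarget(ti);   // illustrative helper
+  //     int result = suspendKernelLayer(layer);
+  //     if (result != VDO_SUCCESS) {
+  //       logError("suspend of device '%s' failed %d",
+  //                layer->deviceConfig->poolName, result);
+  //     }
+  //   }
+  //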
+ KernelLayerState state = getKernelLayerState(layer); + if (state == LAYER_SUSPENDED) { + return VDO_SUCCESS; + } + if (state != LAYER_RUNNING) { + logError("Suspend invoked while in unexpected kernel layer state %d", + state); + return -EINVAL; + } + + /* + * Attempt to flush all I/O before completing post suspend work. This is + * needed so that changing write policy upon resume is safe. Also, we think + * a suspended device is expected to have persisted all data written before + * the suspend, even if it hasn't been flushed yet. + */ + waitForNoRequestsActive(layer); + int result = synchronousFlush(layer); + if (result != VDO_SUCCESS) { + setKVDOReadOnly(&layer->kvdo, result); + } + + /* + * Suspend the VDO, writing out all dirty metadata if the no-flush flag + * was not set on the dmsetup suspend call. This will ensure that we don't + * have cause to write while suspended [VDO-4402]. + */ + int suspendResult = suspendKVDO(&layer->kvdo); + if (result == VDO_SUCCESS) { + result = suspendResult; + } + + suspendDedupeIndex(layer->dedupeIndex, !layer->noFlushSuspend); + setKernelLayerState(layer, LAYER_SUSPENDED); + return result; +} + +/**********************************************************************/ +int resumeKernelLayer(KernelLayer *layer) +{ + if (getKernelLayerState(layer) == LAYER_RUNNING) { + return VDO_SUCCESS; + } + + resumeDedupeIndex(layer->dedupeIndex); + int result = resumeKVDO(&layer->kvdo); + if (result != VDO_SUCCESS) { + return result; + } + + setKernelLayerState(layer, LAYER_RUNNING); + return VDO_SUCCESS; +} + +/***********************************************************************/ +int prepareToResizePhysical(KernelLayer *layer, BlockCount physicalCount) +{ + logInfo("Preparing to resize physical to %llu", physicalCount); + // Allocations are allowed and permissible through this non-VDO thread, + // since IO triggered by this allocation to VDO can finish just fine. + int result = kvdoPrepareToGrowPhysical(&layer->kvdo, physicalCount); + if (result != VDO_SUCCESS) { + // kvdoPrepareToGrowPhysical logs errors. + if (result == VDO_PARAMETER_MISMATCH) { + // If we don't trap this case, mapToSystemError() will remap it to -EIO, + // which is misleading and ahistorical. + return -EINVAL; + } else { + return result; + } + } + + logInfo("Done preparing to resize physical"); + return VDO_SUCCESS; +} + +/***********************************************************************/ +int resizePhysical(KernelLayer *layer, BlockCount physicalCount) +{ + // We must not mark the layer as allowing allocations when it is suspended + // lest an allocation attempt block on writing IO to the suspended VDO. + int result = kvdoResizePhysical(&layer->kvdo, physicalCount); + if (result != VDO_SUCCESS) { + // kvdoResizePhysical logs errors + return result; + } + return VDO_SUCCESS; +} + +/***********************************************************************/ +int prepareToResizeLogical(KernelLayer *layer, BlockCount logicalCount) +{ + logInfo("Preparing to resize logical to %llu", logicalCount); + // Allocations are allowed and permissible through this non-VDO thread, + // since IO triggered by this allocation to VDO can finish just fine. 
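+  //
+  // This is normally reached from a device-mapper table reload that specifies
+  // a larger logical size, followed by a resume; roughly (the device name and
+  // table arguments below are illustrative):
+  //
+  //   dmsetup reload vdo0 --table "0 <new-larger-sector-count> vdo <args...>"
+  //   dmsetup resume vdo0
+  //
+  // The reload path runs prepareToModifyKernelLayer(), which lands here, and
+  // the pre-resume path then performs the actual grow via resizeLogical() in
+  // modifyKernelLayer().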
+ int result = kvdoPrepareToGrowLogical(&layer->kvdo, logicalCount); + if (result != VDO_SUCCESS) { + // kvdoPrepareToGrowLogical logs errors + return result; + } + + logInfo("Done preparing to resize logical"); + return VDO_SUCCESS; +} + +/***********************************************************************/ +int resizeLogical(KernelLayer *layer, BlockCount logicalCount) +{ + logInfo("Resizing logical to %llu", logicalCount); + // We must not mark the layer as allowing allocations when it is suspended + // lest an allocation attempt block on writing IO to the suspended VDO. + int result = kvdoResizeLogical(&layer->kvdo, logicalCount); + if (result != VDO_SUCCESS) { + // kvdoResizeLogical logs errors + return result; + } + + logInfo("Logical blocks now %llu", logicalCount); + return VDO_SUCCESS; +} + diff --git a/vdo/kernel/kernelLayer.h b/vdo/kernel/kernelLayer.h new file mode 100644 index 0000000..4e0bf8c --- /dev/null +++ b/vdo/kernel/kernelLayer.h @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelLayer.h#18 $ + */ + +#ifndef KERNELLAYER_H +#define KERNELLAYER_H + +#include + +#include "atomic.h" +#include "constants.h" +#include "flush.h" +#include "intMap.h" +#include "physicalLayer.h" +#include "ringNode.h" +#include "volumeGeometry.h" +#include "waitQueue.h" + +#include "batchProcessor.h" +#include "bufferPool.h" +#include "deadlockQueue.h" +#include "deviceConfig.h" +#include "histogram.h" +#include "kernelStatistics.h" +#include "kernelTypes.h" +#include "kernelVDO.h" +#include "ktrace.h" +#include "limiter.h" +#include "statistics.h" +#include "workQueue.h" + +enum { + VDO_SECTORS_PER_BLOCK = (VDO_BLOCK_SIZE >> SECTOR_SHIFT) +}; + +typedef enum { + LAYER_SIMPLE_THINGS_INITIALIZED, + LAYER_BUFFER_POOLS_INITIALIZED, + LAYER_REQUEST_QUEUE_INITIALIZED, + LAYER_CPU_QUEUE_INITIALIZED, + LAYER_BIO_ACK_QUEUE_INITIALIZED, + LAYER_BIO_DATA_INITIALIZED, + LAYER_STARTING, + LAYER_RUNNING, + LAYER_SUSPENDED, + LAYER_STOPPING, + LAYER_STOPPED, + LAYER_RESUMING, +} KernelLayerState; + +/* Keep BIO statistics atomically */ +struct atomicBioStats { + atomic64_t read; // Number of not REQ_WRITE bios + atomic64_t write; // Number of REQ_WRITE bios + atomic64_t discard; // Number of REQ_DISCARD bios + atomic64_t flush; // Number of REQ_FLUSH bios + atomic64_t fua; // Number of REQ_FUA bios +}; + +// Data managing the reporting of Albireo timeouts +typedef struct periodicEventReporter { + uint64_t lastReportedValue; + const char *format; + atomic64_t value; + Jiffies reportingInterval; // jiffies + /* + * Just an approximation. 
If nonzero, then either the work item has + * been queued to run, or some other thread currently has + * responsibility for enqueueing it, or the reporter function is + * running but hasn't looked at the current value yet. + * + * If this is set, don't set the timer again, because we don't want + * the work item queued twice. Use an atomic xchg or cmpxchg to + * test-and-set it, and an atomic store to clear it. + */ + atomic_t workItemQueued; + KvdoWorkItem workItem; + KernelLayer *layer; +} PeriodicEventReporter; + +static inline uint64_t getEventCount(PeriodicEventReporter *reporter) +{ + return atomic64_read(&reporter->value); +} + +/** + * The VDO representation of the target device + **/ +struct kernelLayer { + PhysicalLayer common; + // Layer specific info + DeviceConfig *deviceConfig; + /** A ring of all DeviceConfigs referencing this layer */ + RingNode deviceConfigRing; + char threadNamePrefix[MAX_QUEUE_NAME_LEN]; + struct kobject kobj; + struct kobject wqDirectory; + struct kobject statsDirectory; + /** + * A counter value to attach to thread names and log messages to + * identify the individual device. + **/ + unsigned int instance; + /** Contains the current KernelLayerState, which rarely changes */ + Atomic32 state; + bool noFlushSuspend; + bool allocationsAllowed; + AtomicBool processingMessage; + /** Limit the number of requests that are being processed. */ + Limiter requestLimiter; + Limiter discardLimiter; + KVDO kvdo; + /** Incoming bios we've had to buffer to avoid deadlock. */ + DeadlockQueue deadlockQueue; + // for REQ_FLUSH processing + struct bio_list waitingFlushes; + KVDOFlush *spareKVDOFlush; + spinlock_t flushLock; + Jiffies flushArrivalTime; + /** + * Bio submission manager used for sending bios to the storage + * device. + **/ + IOSubmitter *ioSubmitter; + /** + * Work queue (possibly with multiple threads) for miscellaneous + * CPU-intensive, non-blocking work. + **/ + KvdoWorkQueue *cpuQueue; + /** N blobs of context data for LZ4 code, one per CPU thread. */ + char **compressionContext; + Atomic32 compressionContextIndex; + /** Optional work queue for calling bio_endio. */ + KvdoWorkQueue *bioAckQueue; + /** Underlying block device info. */ + uint64_t startingSectorOffset; + VolumeGeometry geometry; + // Memory allocation + BufferPool *dataKVIOPool; + struct bio_set *bioset; + // Albireo specific info + DedupeIndex *dedupeIndex; + // Statistics + atomic64_t biosSubmitted; + atomic64_t biosCompleted; + atomic64_t dedupeContextBusy; + atomic64_t flushOut; + AtomicBioStats biosIn; + AtomicBioStats biosInPartial; + AtomicBioStats biosOut; + AtomicBioStats biosOutCompleted; + AtomicBioStats biosAcknowledged; + AtomicBioStats biosAcknowledgedPartial; + AtomicBioStats biosMeta; + AtomicBioStats biosMetaCompleted; + AtomicBioStats biosJournal; + AtomicBioStats biosPageCache; + AtomicBioStats biosJournalCompleted; + AtomicBioStats biosPageCacheCompleted; + // for reporting Albireo timeouts + PeriodicEventReporter albireoTimeoutReporter; + // Debugging + /* Whether to dump VDO state on shutdown */ + bool dumpOnShutdown; + /** + * Whether we should collect tracing info. (Actually, this controls + * allocations; non-null record pointers cause recording.) + **/ + bool vioTraceRecording; + SampleCounter traceSampleCounter; + /* Should we log tracing info? */ + bool traceLogging; + /* Storage for trace data. */ + BufferPool *traceBufferPool; + /* Private storage for procfs. 
*/ + void *procfsPrivate; + /* For returning batches of DataKVIOs to their pool */ + BatchProcessor *dataKVIOReleaser; + + // Administrative operations + /* The object used to wait for administrative operations to complete */ + struct completion callbackSync; + + // Statistics reporting + /* Protects the *statsStorage structs */ + struct mutex statsMutex; + /* Used when shutting down the sysfs statistics */ + struct completion statsShutdown;; + /* true if sysfs statistics directory is set up */ + bool statsAdded; + /* Used to gather statistics without allocating memory */ + VDOStatistics vdoStatsStorage; + KernelStatistics kernelStatsStorage; +}; + +typedef enum bioQAction { + BIO_Q_ACTION_COMPRESSED_DATA, + BIO_Q_ACTION_DATA, + BIO_Q_ACTION_FLUSH, + BIO_Q_ACTION_HIGH, + BIO_Q_ACTION_METADATA, + BIO_Q_ACTION_READCACHE, + BIO_Q_ACTION_VERIFY +} BioQAction; + +typedef enum cpuQAction { + CPU_Q_ACTION_COMPLETE_KVIO, + CPU_Q_ACTION_COMPRESS_BLOCK, + CPU_Q_ACTION_EVENT_REPORTER, + CPU_Q_ACTION_HASH_BLOCK, +} CPUQAction; + +typedef enum bioAckQAction { + BIO_ACK_Q_ACTION_ACK, +} BioAckQAction; + +typedef void (*DedupeShutdownCallbackFunction)(KernelLayer *layer); + +/* + * Wrapper for the Enqueueable object, to associate it with a kernel + * layer work item. + */ +typedef struct kvdoEnqueueable { + KvdoWorkItem workItem; + Enqueueable enqueueable; +} KvdoEnqueueable; + +/** + * Implements LayerFilter. + **/ +bool layerIsNamed(KernelLayer *layer, void *context) + __attribute__((warn_unused_result)); + +/** + * Creates a kernel specific physical layer to be used by VDO + * + * @param startingSector The sector offset of our table entry in the + * DM device + * @param instance Device instantiation counter + * @param parentKobject The parent sysfs node + * @param config The device configuration + * @param threadConfigPointer Where to store the new threadConfig handle + * @param reason The reason for any failure during this call + * @param layerPtr A pointer to hold the created layer + * + * @return VDO_SUCCESS or an error + **/ +int makeKernelLayer(uint64_t startingSector, + unsigned int instance, + DeviceConfig *config, + struct kobject *parentKobject, + ThreadConfig **threadConfigPointer, + char **reason, + KernelLayer **layerPtr) + __attribute__((warn_unused_result)); + +/** + * Prepare to modify a kernel layer. + * + * @param layer The layer to modify + * @param config The new device configuration + * @param errorPtr A pointer to store the reason for any failure + * + * @return VDO_SUCCESS or an error + **/ +int prepareToModifyKernelLayer(KernelLayer *layer, + DeviceConfig *config, + char **errorPtr) + __attribute__((warn_unused_result)); + +/** + * Modify a kernel physical layer. + * + * @param layer The layer to modify + * @param config The new device configuration + * + * @return VDO_SUCCESS or an error + **/ +int modifyKernelLayer(KernelLayer *layer, + DeviceConfig *config) + __attribute__((warn_unused_result)); + +/** + * Free a kernel physical layer. + * + * @param layer The layer, which must have been created by + * makeKernelLayer + **/ +void freeKernelLayer(KernelLayer *layer); + +/** + * Make and configure a kernel layer. This method does not alter the VDO state + * on disk. It should be run from the VDO constructor for devices which have + * not been started. 
+ * + * @param layer The kernel layer + * @param loadConfig Load-time parameters for the VDO + * @param reason The reason for any failure during this call + * + * @return VDO_SUCCESS or an error + * + * @note redundant starts are silently ignored + **/ +int preloadKernelLayer(KernelLayer *layer, + const VDOLoadConfig *loadConfig, + char **reason); + +/** + * Start the kernel layer. This method finishes bringing a VDO online now that + * a table is being resumed for the first time. + * + * @param layer The kernel layer + * @param reason The reason for any failure during this call + * + * @return VDO_SUCCESS or an error + **/ +int startKernelLayer(KernelLayer *layer, char **reason); + +/** + * Stop the kernel layer. + * + * @param layer The kernel layer + **/ +void stopKernelLayer(KernelLayer *layer); + +/** + * Suspend the kernel layer. + * + * @param layer The kernel layer + * + * @return VDO_SUCCESS or an error + **/ +int suspendKernelLayer(KernelLayer *layer); + +/** + * Resume the kernel layer. + * + * @param layer The kernel layer + * + * @return VDO_SUCCESS or an error + **/ +int resumeKernelLayer(KernelLayer *layer); + +/** + * Get the kernel layer state. + * + * @param layer The kernel layer + * + * @return the instantaneously correct kernel layer state + **/ +static inline KernelLayerState getKernelLayerState(const KernelLayer *layer) +{ + return atomicLoad32(&layer->state); +} + +/** + * Function call to begin processing a bio passed in from the block layer + * + * @param layer The physical layer + * @param bio The bio from the block layer + * + * @return value to return from the VDO map function. Either an error code + * or DM_MAPIO_REMAPPED or DM_MAPPED_SUBMITTED (see vdoMapBio for + * details). + **/ +int kvdoMapBio(KernelLayer *layer, BIO *bio); + +/** + * Convert a generic PhysicalLayer to a kernelLayer. + * + * @param layer The PhysicalLayer to convert + * + * @return The PhysicalLayer as a KernelLayer + **/ +static inline KernelLayer *asKernelLayer(PhysicalLayer *layer) +{ + return container_of(layer, KernelLayer, common); +} + +/** + * Convert a block number (or count) to a (512-byte-)sector number. + * + * The argument type is sector_t to force conversion to the type we + * want, although the actual values passed are of various integral + * types. It's just too easy to forget and do the multiplication + * without casting, resulting in 32-bit arithmetic that accidentally + * produces wrong results in devices over 2TB (2**32 sectors). + * + * @param [in] layer the physical layer + * @param [in] blockNumber the block number/count + * + * @return the sector number/count + **/ +static inline sector_t blockToSector(KernelLayer *layer, sector_t blockNumber) +{ + return (blockNumber * VDO_SECTORS_PER_BLOCK); +} + +/** + * Convert a sector number (or count) to a block number. Does not + * check to make sure the sector number is an integral number of + * blocks. + * + * @param [in] layer the physical layer + * @param [in] sectorNumber the sector number/count + * + * @return the block number/count + **/ +static inline sector_t sectorToBlock(KernelLayer *layer, sector_t sectorNumber) +{ + return (sectorNumber / VDO_SECTORS_PER_BLOCK); +} + +/** + * Convert a sector number to an offset within a block. 
+ * + * @param [in] layer the physical layer + * @param [in] sectorNumber the sector number + * + * @return the offset within the block + **/ +static inline BlockSize sectorToBlockOffset(KernelLayer *layer, + sector_t sectorNumber) +{ + unsigned int sectorsPerBlockMask = VDO_SECTORS_PER_BLOCK - 1; + return to_bytes(sectorNumber & sectorsPerBlockMask); +} + +/** + * Get the block device object currently underlying a kernel layer. + * + * @param layer The kernel layer in question + * + * @return The block device object under the layer + **/ +struct block_device *getKernelLayerBdev(const KernelLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Set the layer's active config. + * + * @param layer The kernel layer in question + * @param config The config in question + **/ +static inline void setKernelLayerActiveConfig(KernelLayer *layer, + DeviceConfig *config) +{ + layer->deviceConfig = config; +} + +/** + * Given an error code, return a value we can return to the OS. The + * input error code may be a system-generated value (such as -EIO), an + * errno macro used in our code (such as EIO), or a UDS or VDO status + * code; the result must be something the rest of the OS can consume + * (negative errno values such as -EIO, in the case of the kernel). + * + * @param error the error code to convert + * + * @return a system error code value + **/ +int mapToSystemError(int error); + +/** + * Record and eventually report that some number of dedupe requests + * reached their expiration time without getting an answer, so we + * timed out on them. + * + * This is called in a timer context, so it shouldn't do the reporting + * directly. + * + * @param layer The kernel layer for the device + * @param expiredCount The number of expired requests we timed out on + **/ +void kvdoReportDedupeTimeout(KernelLayer *layer, unsigned int expiredCount); + +/** + * Wait until there are no requests in progress. + * + * @param layer The kernel layer for the device + **/ +void waitForNoRequestsActive(KernelLayer *layer); + +/** + * Enqueues an item on our internal "cpu queues". Since there is more than + * one, we rotate through them in hopes of creating some general balance. + * + * @param layer The kernel layer + * @param item The work item to enqueue + */ +static inline void enqueueCPUWorkQueue(KernelLayer *layer, KvdoWorkItem *item) +{ + enqueueWorkQueue(layer->cpuQueue, item); +} + +/** + * Adjust parameters to prepare to use a larger physical space. + * The size must be larger than the current size. + * + * @param layer the kernel layer + * @param physicalCount the new physical size in blocks + * + * @return VDO_SUCCESS or an error + */ +int prepareToResizePhysical(KernelLayer *layer, BlockCount physicalCount); + +/** + * Adjusts parameters to reflect resizing the underlying device. + * The size must be larger than the current size. + * + * @param layer the kernel layer + * @param physicalCount the new physical count in blocks + * + * @return VDO_SUCCESS or an error + */ +int resizePhysical(KernelLayer *layer, BlockCount physicalCount); + +/** + * Adjust parameters to prepare to present a larger logical space. + * The size must be larger than the current size. + * + * @param layer the kernel layer + * @param logicalCount the new logical size in blocks + * + * @return VDO_SUCCESS or an error + */ +int prepareToResizeLogical(KernelLayer *layer, BlockCount logicalCount); + +/** + * Adjust parameters to present a larger logical space. + * The size must be larger than the current size. 
+ * + * @param layer the kernel layer + * @param logicalCount the new logical size in blocks + * + * @return VDO_SUCCESS or an error + */ +int resizeLogical(KernelLayer *layer, BlockCount logicalCount); + +/** + * Indicate whether the kernel layer is configured to use a separate + * work queue for acknowledging received and processed bios. + * + * Note that this directly controls handling of write operations, but + * the compile-time flag USE_BIO_ACK_QUEUE_FOR_READ is also checked + * for read operations. + * + * @param layer The kernel layer + * + * @return Whether a bio-acknowledgement work queue is in use + **/ +static inline bool useBioAckQueue(KernelLayer *layer) +{ + return layer->deviceConfig->threadCounts.bioAckThreads > 0; +} + +/** + * Update bookkeeping for the completion of some number of requests, so that + * more incoming requests can be accepted. + * + * @param layer The kernel layer + * @param count The number of completed requests + **/ +void completeManyRequests(KernelLayer *layer, uint32_t count); + +#endif /* KERNELLAYER_H */ diff --git a/vdo/kernel/kernelStatistics.h b/vdo/kernel/kernelStatistics.h new file mode 100644 index 0000000..a5c1210 --- /dev/null +++ b/vdo/kernel/kernelStatistics.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef KERNEL_STATISTICS_H +#define KERNEL_STATISTICS_H + +#include "header.h" +#include "types.h" + +typedef struct { + /** Number of not REQ_WRITE bios */ + uint64_t read; + /** Number of REQ_WRITE bios */ + uint64_t write; + /** Number of REQ_DISCARD bios */ + uint64_t discard; + /** Number of REQ_FLUSH bios */ + uint64_t flush; + /** Number of REQ_FUA bios */ + uint64_t fua; +} BioStats; + +typedef struct { + /** Tracked bytes currently allocated. */ + uint64_t bytesUsed; + /** Maximum tracked bytes allocated. 
*/ + uint64_t peakBytesUsed; +} MemoryUsage; + +/** UDS index statistics */ +typedef struct { + /** Number of chunk names stored in the index */ + uint64_t entriesIndexed; + /** Number of post calls that found an existing entry */ + uint64_t postsFound; + /** Number of post calls that added a new entry */ + uint64_t postsNotFound; + /** Number of query calls that found an existing entry */ + uint64_t queriesFound; + /** Number of query calls that added a new entry */ + uint64_t queriesNotFound; + /** Number of update calls that found an existing entry */ + uint64_t updatesFound; + /** Number of update calls that added a new entry */ + uint64_t updatesNotFound; + /** Current number of dedupe queries that are in flight */ + uint32_t currDedupeQueries; + /** Maximum number of dedupe queries that have been in flight */ + uint32_t maxDedupeQueries; +} IndexStatistics; + +typedef struct { + uint32_t version; + uint32_t releaseVersion; + /** The VDO instance */ + uint32_t instance; + /** Current number of active VIOs */ + uint32_t currentVIOsInProgress; + /** Maximum number of active VIOs */ + uint32_t maxVIOs; + /** Number of times the UDS index was too slow in responding */ + uint64_t dedupeAdviceTimeouts; + /** Number of flush requests submitted to the storage device */ + uint64_t flushOut; + /** Logical block size */ + uint64_t logicalBlockSize; + /** Bios submitted into VDO from above */ + BioStats biosIn; + BioStats biosInPartial; + /** Bios submitted onward for user data */ + BioStats biosOut; + /** Bios submitted onward for metadata */ + BioStats biosMeta; + BioStats biosJournal; + BioStats biosPageCache; + BioStats biosOutCompleted; + BioStats biosMetaCompleted; + BioStats biosJournalCompleted; + BioStats biosPageCacheCompleted; + BioStats biosAcknowledged; + BioStats biosAcknowledgedPartial; + /** Current number of bios in progress */ + BioStats biosInProgress; + /** Memory usage stats. */ + MemoryUsage memoryUsage; + /** The statistics for the UDS index */ + IndexStatistics index; +} KernelStatistics; + +/** + * Get the root for all stats proc files. + * + * @return The proc root + **/ +static inline const char *getProcRoot(void) { + return "vdo"; +} + +/** + * Get the proc file path for reading KernelStatistics. + * + * @return The proc file path + **/ +static inline const char *getKernelStatisticsProcFile(void) { + return "kernel_stats"; +} + +#endif /* not KERNEL_STATISTICS_H */ diff --git a/vdo/kernel/kernelTypes.h b/vdo/kernel/kernelTypes.h new file mode 100644 index 0000000..b338440 --- /dev/null +++ b/vdo/kernel/kernelTypes.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelTypes.h#3 $ + */ + +#ifndef KERNEL_TYPES_H +#define KERNEL_TYPES_H + +#include "types.h" + +/** + * The size of a discard request in bytes. + **/ +typedef uint32_t DiscardSize; + +/** + * A time in jiffies. + **/ +typedef uint64_t Jiffies; + +/** + * A timeout in jiffies. + **/ +typedef int64_t TimeoutJiffies; + +typedef struct atomicBioStats AtomicBioStats; +typedef struct bio BIO; +typedef struct dataKVIO DataKVIO; +typedef struct dedupeContext DedupeContext; +typedef struct dedupeIndex DedupeIndex; +typedef struct ioSubmitter IOSubmitter; +typedef struct kernelLayer KernelLayer; +typedef struct kvdo KVDO; +typedef struct kvdoFlush KVDOFlush; +typedef struct kvdoWorkItem KvdoWorkItem; +typedef struct kvdoWorkQueue KvdoWorkQueue; +typedef struct kvio KVIO; + +typedef void (*KVIOCallback)(KVIO *kvio); +typedef void (*DataKVIOCallback)(DataKVIO *dataKVIO); +typedef void (*KvdoWorkFunction)(KvdoWorkItem *workItem); + +/** + * Method type for layer matching methods. + * + * A LayerFilter method returns false if the layer doesn't match. + **/ +typedef bool LayerFilter(KernelLayer *layer, void *context); + +#endif /* KERNEL_TYPES_H */ diff --git a/vdo/kernel/kernelVDO.c b/vdo/kernel/kernelVDO.c new file mode 100644 index 0000000..5e1a72e --- /dev/null +++ b/vdo/kernel/kernelVDO.c @@ -0,0 +1,578 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelVDO.c#7 $ + */ + +#include "kernelVDOInternals.h" + +#include + +#include "memoryAlloc.h" + +#include "statistics.h" +#include "threadConfig.h" +#include "vdo.h" +#include "vdoDebug.h" +#include "vdoLoad.h" +#include "vdoResize.h" +#include "vdoResizeLogical.h" +#include "vdoResume.h" +#include "vdoSuspend.h" + +#include "kernelLayer.h" +#include "kvio.h" +#include "logger.h" + +enum { PARANOID_THREAD_CONSISTENCY_CHECKS = 0 }; + +/**********************************************************************/ +static void startKVDORequestQueue(void *ptr) +{ + KVDOThread *thread = ptr; + KVDO *kvdo = thread->kvdo; + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + registerAllocatingThread(&thread->allocatingThread, + &layer->allocationsAllowed); + setWorkQueuePrivateData(thread); +} + +/**********************************************************************/ +static void finishKVDORequestQueue(void *ptr) +{ + unregisterAllocatingThread(); +} + +/**********************************************************************/ +static const KvdoWorkQueueType requestQueueType = { + .start = startKVDORequestQueue, + .finish = finishKVDORequestQueue, + .actionTable = { + { .name = "req_completion", + .code = REQ_Q_ACTION_COMPLETION, + .priority = 1 }, + { .name = "req_flush", + .code = REQ_Q_ACTION_FLUSH, + .priority = 2 }, + { .name = "req_map_bio", + .code = REQ_Q_ACTION_MAP_BIO, + .priority = 0 }, + { .name = "req_sync", + .code = REQ_Q_ACTION_SYNC, + .priority = 2 }, + { .name = "req_vio_callback", + .code = REQ_Q_ACTION_VIO_CALLBACK, + .priority = 1 }, + }, +}; + +/**********************************************************************/ +int initializeKVDO(KVDO *kvdo, + const ThreadConfig *threadConfig, + char **reason) +{ + unsigned int baseThreads = threadConfig->baseThreadCount; + int result = ALLOCATE(baseThreads, KVDOThread, + "request processing work queue", + &kvdo->threads); + if (result != VDO_SUCCESS) { + *reason = "Cannot allocation thread structures"; + return result; + } + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + for (kvdo->initializedThreadCount = 0; + kvdo->initializedThreadCount < baseThreads; + kvdo->initializedThreadCount++) { + KVDOThread *thread = &kvdo->threads[kvdo->initializedThreadCount]; + + thread->kvdo = kvdo; + thread->threadID = kvdo->initializedThreadCount; + + char queueName[MAX_QUEUE_NAME_LEN]; + // Copy only LEN - 1 bytes and ensure NULL termination. 
+ getVDOThreadName(threadConfig, kvdo->initializedThreadCount, + queueName, sizeof(queueName)); + int result = makeWorkQueue(layer->threadNamePrefix, queueName, + &layer->wqDirectory, layer, thread, + &requestQueueType, 1, &thread->requestQueue); + if (result != VDO_SUCCESS) { + *reason = "Cannot initialize request queue"; + while (kvdo->initializedThreadCount > 0) { + unsigned int threadToDestroy = kvdo->initializedThreadCount - 1; + thread = &kvdo->threads[threadToDestroy]; + finishWorkQueue(thread->requestQueue); + freeWorkQueue(&thread->requestQueue); + kvdo->initializedThreadCount--; + } + FREE(kvdo->threads); + return result; + } + + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +int preloadKVDO(KVDO *kvdo, + PhysicalLayer *common, + const VDOLoadConfig *loadConfig, + bool vioTraceRecording, + char **reason) +{ + KernelLayer *layer = asKernelLayer(common); + init_completion(&layer->callbackSync); + int result = prepareToLoadVDO(kvdo->vdo, loadConfig); + if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { + *reason = "Cannot load metadata from device"; + return result; + } + + setVDOTracingFlags(kvdo->vdo, vioTraceRecording); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int startKVDO(KVDO *kvdo, PhysicalLayer *common, char **reason) +{ + KernelLayer *layer = asKernelLayer(common); + init_completion(&layer->callbackSync); + int result = performVDOLoad(kvdo->vdo); + if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { + *reason = "Cannot load metadata from device"; + return result; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int suspendKVDO(KVDO *kvdo) +{ + if (kvdo->vdo == NULL) { + return VDO_SUCCESS; + } + + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + init_completion(&layer->callbackSync); + int result = performVDOSuspend(kvdo->vdo, !layer->noFlushSuspend); + if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { + char errorName[80] = ""; + char errorMessage[ERRBUF_SIZE] = ""; + logError("%s: Suspend device failed %d (%s: %s)", + __func__, result, + stringErrorName(result, errorName, sizeof(errorName)), + stringError(result, errorMessage, sizeof(errorMessage))); + return result; + } + + // Convert VDO_READ_ONLY to VDO_SUCCESS since a read-only suspension still + // leaves the VDO suspended. 
+ return VDO_SUCCESS; +} + +/**********************************************************************/ +int resumeKVDO(KVDO *kvdo) +{ + if (kvdo->vdo == NULL) { + return VDO_SUCCESS; + } + + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + init_completion(&layer->callbackSync); + return performVDOResume(kvdo->vdo); +} + +/**********************************************************************/ +void finishKVDO(KVDO *kvdo) +{ + for (int i = 0; i < kvdo->initializedThreadCount; i++) { + finishWorkQueue(kvdo->threads[i].requestQueue); + } +} + +/**********************************************************************/ +void destroyKVDO(KVDO *kvdo) +{ + destroyVDO(kvdo->vdo); + for (int i = 0; i < kvdo->initializedThreadCount; i++) { + freeWorkQueue(&kvdo->threads[i].requestQueue); + } + FREE(kvdo->threads); + kvdo->threads = NULL; +} + + +/**********************************************************************/ +void dumpKVDOWorkQueue(KVDO *kvdo) +{ + for (int i = 0; i < kvdo->initializedThreadCount; i++) { + dumpWorkQueue(kvdo->threads[i].requestQueue); + } +} + +/**********************************************************************/ +typedef struct { + KvdoWorkItem workItem; + KVDO *kvdo; + void *data; + struct completion *completion; +} SyncQueueWork; + +/** + * Initiate an arbitrary asynchronous base-code operation and wait for + * it. + * + * An async queue operation is performed and we wait for completion. + * + * @param kvdo The kvdo data handle + * @param action The operation to perform + * @param data Unique data that can be used by the operation + * @param threadID The thread on which to perform the operation + * @param completion The completion to wait on + * + * @return VDO_SUCCESS of an error code + **/ +static void performKVDOOperation(KVDO *kvdo, + KvdoWorkFunction action, + void *data, + ThreadID threadID, + struct completion *completion) +{ + SyncQueueWork sync; + + memset(&sync, 0, sizeof(sync)); + setupWorkItem(&sync.workItem, action, NULL, REQ_Q_ACTION_SYNC); + sync.kvdo = kvdo; + sync.data = data; + sync.completion = completion; + + init_completion(completion); + enqueueKVDOWork(kvdo, &sync.workItem, threadID); + wait_for_completion(completion); +} + +/**********************************************************************/ +typedef struct { + bool enable; + bool wasEnabled; +} VDOCompressData; + +/** + * Does the work of calling the base code to set compress state, then + * tells the function waiting on completion to go ahead. 
+ * + * @param item The work item + **/ +static void setCompressingWork(KvdoWorkItem *item) +{ + SyncQueueWork *work = container_of(item, SyncQueueWork, workItem); + VDOCompressData *data = (VDOCompressData *)work->data; + data->wasEnabled = setVDOCompressing(getVDO(work->kvdo), data->enable); + complete(work->completion); +} + +/***********************************************************************/ +bool setKVDOCompressing(KVDO *kvdo, bool enableCompression) +{ + struct completion compressWait; + VDOCompressData data; + data.enable = enableCompression; + performKVDOOperation(kvdo, setCompressingWork, &data, + getPackerZoneThread(getThreadConfig(kvdo->vdo)), + &compressWait); + return data.wasEnabled; +} + +/**********************************************************************/ +typedef struct { + int result; +} VDOReadOnlyData; + +/**********************************************************************/ +static void enterReadOnlyModeWork(KvdoWorkItem *item) +{ + SyncQueueWork *work = container_of(item, SyncQueueWork, workItem); + VDOReadOnlyData *data = work->data; + makeVDOReadOnly(getVDO(work->kvdo), data->result); + complete(work->completion); +} + +/***********************************************************************/ +void setKVDOReadOnly(KVDO *kvdo, int result) +{ + struct completion readOnlyWait; + VDOReadOnlyData data; + data.result = result; + performKVDOOperation(kvdo, enterReadOnlyModeWork, &data, + getAdminThread(getThreadConfig(kvdo->vdo)), + &readOnlyWait); +} + +/** + * Does the work of calling the vdo statistics gathering tool + * + * @param item The work item + **/ +static void getVDOStatisticsWork(KvdoWorkItem *item) +{ + SyncQueueWork *work = container_of(item, SyncQueueWork, workItem); + VDOStatistics *stats = (VDOStatistics *)work->data; + getVDOStatistics(getVDO(work->kvdo), stats); + complete(work->completion); +} + +/***********************************************************************/ +void getKVDOStatistics(KVDO *kvdo, VDOStatistics *stats) +{ + struct completion statsWait; + memset(stats, 0, sizeof(VDOStatistics)); + performKVDOOperation(kvdo, getVDOStatisticsWork, stats, + getAdminThread(getThreadConfig(kvdo->vdo)), + &statsWait); +} + +/** + * A structure to invoke an arbitrary VDO action. + **/ +typedef struct vdoActionData { + VDOAction *action; + VDOCompletion *vdoCompletion; + struct completion waiter; +} VDOActionData; + +/** + * Initialize a VDOActionData structure so that the specified action + * can be invoked on the specified completion. + * + * @param data A VDOActionData. + * @param action The VDOAction to execute. + * @param vdoCompletion The VDO completion upon which the action acts. + **/ +static void initializeVDOActionData(VDOActionData *data, + VDOAction *action, + VDOCompletion *vdoCompletion) +{ + *data = (VDOActionData) { + .action = action, + .vdoCompletion = vdoCompletion, + }; +} + +/** + * The VDO callback that completes the KVDO completion. + * + * @param vdoCompletion The VDO completion which was acted upon. + **/ +static void finishVDOAction(VDOCompletion *vdoCompletion) +{ + SyncQueueWork *work = vdoCompletion->parent; + complete(work->completion); +} + +/** + * Perform a VDO base code action as specified by a VDOActionData. + * + * Sets the completion callback and parent inside the VDOActionData + * so that the corresponding kernel completion is completed when + * the VDO completion is. + * + * @param item A KVDO work queue item. 
+ **/ +static void performVDOActionWork(KvdoWorkItem *item) +{ + SyncQueueWork *work = container_of(item, SyncQueueWork, workItem); + VDOActionData *data = work->data; + ThreadID id = getPhysicalLayer()->getCurrentThreadID(); + + setCallbackWithParent(data->vdoCompletion, finishVDOAction, id, work); + data->action(data->vdoCompletion); +} + +/**********************************************************************/ +int performKVDOExtendedCommand(KVDO *kvdo, int argc, char **argv) +{ + VDOActionData data; + VDOCommandCompletion cmd; + + int result = initializeVDOCommandCompletion(&cmd, getVDO(kvdo), argc, argv); + if (result != VDO_SUCCESS) { + return result; + } + + initializeVDOActionData(&data, executeVDOExtendedCommand, &cmd.completion); + performKVDOOperation(kvdo, performVDOActionWork, &data, + getAdminThread(getThreadConfig(kvdo->vdo)), + &data.waiter); + + return destroyVDOCommandCompletion(&cmd); +} + +/**********************************************************************/ +void dumpKVDOStatus(KVDO *kvdo) +{ + dumpVDOStatus(kvdo->vdo); +} + +/**********************************************************************/ +bool getKVDOCompressing(KVDO *kvdo) +{ + return getVDOCompressing(kvdo->vdo); +} + +/**********************************************************************/ +int kvdoPrepareToGrowPhysical(KVDO *kvdo, BlockCount physicalCount) +{ + VDO *vdo = kvdo->vdo; + return prepareToGrowPhysical(vdo, physicalCount); +} + +/**********************************************************************/ +int kvdoResizePhysical(KVDO *kvdo, BlockCount physicalCount) +{ + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + init_completion(&layer->callbackSync); + int result = performGrowPhysical(kvdo->vdo, physicalCount); + if (result != VDO_SUCCESS) { + logError("resize operation failed, result = %d", result); + return result; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int kvdoPrepareToGrowLogical(KVDO *kvdo, BlockCount logicalCount) +{ + VDO *vdo = kvdo->vdo; + return prepareToGrowLogical(vdo, logicalCount); +} + +/**********************************************************************/ +int kvdoResizeLogical(KVDO *kvdo, BlockCount logicalCount) +{ + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + init_completion(&layer->callbackSync); + int result = performGrowLogical(kvdo->vdo, logicalCount); + if (result != VDO_SUCCESS) { + logError("grow logical operation failed, result = %d", result); + } + + return result; +} + +/**********************************************************************/ +WritePolicy getKVDOWritePolicy(KVDO *kvdo) +{ + return getWritePolicy(kvdo->vdo); +} + +/**********************************************************************/ +void enqueueKVDOThreadWork(KVDOThread *thread, + KvdoWorkItem *item) +{ + enqueueWorkQueue(thread->requestQueue, item); +} + +/**********************************************************************/ +void enqueueKVDOWork(KVDO *kvdo, KvdoWorkItem *item, ThreadID threadID) +{ + enqueueKVDOThreadWork(&kvdo->threads[threadID], item); +} + +/**********************************************************************/ +void enqueueKVIO(KVIO *kvio, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + ThreadID threadID = vioAsCompletion(kvio->vio)->callbackThreadID; + BUG_ON(threadID >= kvio->layer->kvdo.initializedThreadCount); + launchKVIO(kvio, work, statsFunction, action, + kvio->layer->kvdo.threads[threadID].requestQueue); +} + 
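/*
 * [Editor's note -- illustrative sketch, not part of this patch.]
 *
 * The work functions above all follow the performKVDOOperation() pattern:
 * package the arguments in a small struct, hop to the proper base thread,
 * make the base-code call there, then complete() the kernel completion
 * that performKVDOOperation() is blocked on.  The sketch below shows the
 * shape a new synchronous operation would take.  queryVDOState() and
 * VDOStateData are hypothetical names used only for illustration; the
 * real callers (setKVDOCompressing, setKVDOReadOnly, getKVDOStatistics)
 * have exactly this structure.
 */
typedef struct {
  int state;                       // filled in on the base thread
} VDOStateData;

static void queryVDOStateWork(KvdoWorkItem *item)
{
  SyncQueueWork *work = container_of(item, SyncQueueWork, workItem);
  VDOStateData  *data = work->data;
  // Hypothetical base-code call, made on the admin thread.
  data->state = queryVDOState(getVDO(work->kvdo));
  // Wake the thread blocked in performKVDOOperation().
  complete(work->completion);
}

int getKVDOState(KVDO *kvdo)
{
  struct completion stateWait;
  VDOStateData      data;
  performKVDOOperation(kvdo, queryVDOStateWork, &data,
                       getAdminThread(getThreadConfig(kvdo->vdo)),
                       &stateWait);
  return data.state;
}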
+/**********************************************************************/ +static void kvdoEnqueueWork(KvdoWorkItem *workItem) +{ + KvdoEnqueueable *kvdoEnqueueable = container_of(workItem, + KvdoEnqueueable, + workItem); + runCallback(kvdoEnqueueable->enqueueable.completion); +} + +/**********************************************************************/ +void kvdoEnqueue(Enqueueable *enqueueable) +{ + KvdoEnqueueable *kvdoEnqueueable = container_of(enqueueable, + KvdoEnqueueable, + enqueueable); + KernelLayer *layer = asKernelLayer(enqueueable->completion->layer); + ThreadID threadID = enqueueable->completion->callbackThreadID; + if (ASSERT(threadID < layer->kvdo.initializedThreadCount, + "threadID %u (completion type %d) is less than thread count %u", + threadID, enqueueable->completion->type, + layer->kvdo.initializedThreadCount) != UDS_SUCCESS) { + BUG(); + } + + if (enqueueable->completion->type == VIO_COMPLETION) { + vioAddTraceRecord(asVIO(enqueueable->completion), + THIS_LOCATION("$F($cb)")); + } + setupWorkItem(&kvdoEnqueueable->workItem, kvdoEnqueueWork, + (KvdoWorkFunction) enqueueable->completion->callback, + REQ_Q_ACTION_COMPLETION); + enqueueKVDOThreadWork(&layer->kvdo.threads[threadID], + &kvdoEnqueueable->workItem); +} + +/**********************************************************************/ +ThreadID kvdoGetCurrentThreadID(void) +{ + KVDOThread *thread = getWorkQueuePrivateData(); + if (thread == NULL) { + return INVALID_THREAD_ID; + } + + ThreadID threadID = thread->threadID; + if (PARANOID_THREAD_CONSISTENCY_CHECKS) { + KVDO *kvdo = thread->kvdo; + KernelLayer *kernelLayer = asKernelLayer(getPhysicalLayer()); + BUG_ON(&kernelLayer->kvdo != kvdo); + BUG_ON(threadID >= kvdo->initializedThreadCount); + BUG_ON(thread != &kvdo->threads[threadID]); + } + return threadID; +} + +/**********************************************************************/ +static PhysicalLayer *getKernelPhysicalLayer(void) +{ + KVDOThread *thread = getWorkQueuePrivateData(); + if (thread == NULL) { + return NULL; + } + KVDO *kvdo = thread->kvdo; + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + return &layer->common; +} + +void initKernelVDOOnce(void) +{ + registerPhysicalLayerGetter(getKernelPhysicalLayer); +} diff --git a/vdo/kernel/kernelVDO.h b/vdo/kernel/kernelVDO.h new file mode 100644 index 0000000..b65534d --- /dev/null +++ b/vdo/kernel/kernelVDO.h @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelVDO.h#4 $ + */ + +#ifndef KERNEL_VDO_H +#define KERNEL_VDO_H + +#include "completion.h" +#include "kernelTypes.h" +#include "threadRegistry.h" +#include "workQueue.h" + +typedef struct { + KVDO *kvdo; + ThreadID threadID; + KvdoWorkQueue *requestQueue; + RegisteredThread allocatingThread; +} KVDOThread; + +struct kvdo { + KVDOThread *threads; + ThreadID initializedThreadCount; + KvdoWorkItem workItem; + VDOAction *action; + VDOCompletion *completion; + // Base-code device info + VDO *vdo; +}; + +typedef enum reqQAction { + REQ_Q_ACTION_COMPLETION, + REQ_Q_ACTION_FLUSH, + REQ_Q_ACTION_MAP_BIO, + REQ_Q_ACTION_SYNC, + REQ_Q_ACTION_VIO_CALLBACK +} ReqQAction; + +/** + * Initialize the base code interface. + * + * @param [in] kvdo The KVDO to be initialized + * @param [in] threadConfig The base-code thread configuration + * @param [out] reason The reason for failure + * + * @return VDO_SUCCESS or an error code + **/ +int initializeKVDO(KVDO *kvdo, + const ThreadConfig *threadConfig, + char **reason); + +/** + * Load the VDO state from disk but don't alter the on-disk state. This method + * is ultimately called from the constructor for devices which have not been + * resumed. + * + * @param [in] kvdo The KVDO to be started + * @param [in] common The physical layer pointer + * @param [in] loadConfig Load-time parameters for the VDO + * @param [in] vioTraceRecording Debug flag to store + * @param [out] reason The reason for failure + **/ +int preloadKVDO(KVDO *kvdo, + PhysicalLayer *common, + const VDOLoadConfig *loadConfig, + bool vioTraceRecording, + char **reason); + +/** + * Starts the base VDO instance associated with the kernel layer. This method + * is ultimately called from preresume the first time an instance is resumed. + * + * @param [in] kvdo The KVDO to be started + * @param [in] common The physical layer pointer + * @param [out] reason The reason for failure + * + * @return VDO_SUCCESS if started, otherwise error + */ +int startKVDO(KVDO *kvdo, PhysicalLayer *common, char **reason); + +/** + * Suspend the base VDO instance associated with the kernel layer. + * + * @param kvdo The KVDO to be suspended + * + * @return VDO_SUCCESS if stopped, otherwise error + **/ +int suspendKVDO(KVDO *kvdo); + +/** + * Resume the base VDO instance associated with the kernel layer. + * + * @param kvdo The KVDO to be resumed + * + * @return VDO_SUCCESS or an error + **/ +int resumeKVDO(KVDO *kvdo); + +/** + * Shut down the base code interface. The kvdo object must first be + * stopped. + * + * @param kvdo The KVDO to be shut down + **/ +void finishKVDO(KVDO *kvdo); + +/** + * Free up storage of the base code interface. The KVDO object must + * first have been "finished". + * + * @param kvdo The KVDO object to be destroyed + **/ +void destroyKVDO(KVDO *kvdo); + + +/** + * Dump to the kernel log any work-queue info associated with the base + * code. + * + * @param kvdo The KVDO object to be examined + **/ +void dumpKVDOWorkQueue(KVDO *kvdo); + +/** + * Get the VDO pointer for a kvdo object + * + * @param kvdo The KVDO object + * + * @return the VDO pointer + */ +static inline VDO *getVDO(KVDO *kvdo) +{ + return kvdo->vdo; +} + +/** + * Set whether compression is enabled. 
+ * + * @param kvdo The KVDO object + * @param enableCompression The new compression mode + * + * @return state of compression before new value is set + **/ +bool setKVDOCompressing(KVDO *kvdo, bool enableCompression); + +/** + * Get the current compression mode + * + * @param kvdo The KVDO object to be queried + * + * @return whether compression is currently enabled + */ +bool getKVDOCompressing(KVDO *kvdo); + +/** + * Gets the latest statistics gathered by the base code. + * + * @param kvdo the KVDO object + * @param stats the statistics struct to fill in + */ +void getKVDOStatistics(KVDO *kvdo, VDOStatistics *stats); + +/** + * Get the current write policy + * + * @param kvdo The KVDO to be queried + * + * @return the write policy in effect + */ +WritePolicy getKVDOWritePolicy(KVDO *kvdo); + +/** + * Dump base code status information to the kernel log for debugging. + * + * @param kvdo The KVDO to be examined + */ +void dumpKVDOStatus(KVDO *kvdo); + +/** + * Request the base code prepare to grow the physical space. + * + * @param kvdo The KVDO to be updated + * @param physicalCount The new size + * + * @return VDO_SUCCESS or error + */ +int kvdoPrepareToGrowPhysical(KVDO *kvdo, BlockCount physicalCount); + +/** + * Notify the base code of resized physical storage. + * + * @param kvdo The KVDO to be updated + * @param physicalCount The new size + * + * @return VDO_SUCCESS or error + */ +int kvdoResizePhysical(KVDO *kvdo, BlockCount physicalCount); + +/** + * Request the base code prepare to grow the logical space. + * + * @param kvdo The KVDO to be updated + * @param logicalCount The new size + * + * @return VDO_SUCCESS or error + */ +int kvdoPrepareToGrowLogical(KVDO *kvdo, BlockCount logicalCount); + +/** + * Request the base code grow the logical space. + * + * @param kvdo The KVDO to be updated + * @param logicalCount The new size + * + * @return VDO_SUCCESS or error + */ +int kvdoResizeLogical(KVDO *kvdo, BlockCount logicalCount); + +/** + * Request the base code go read-only. + * + * @param kvdo The KVDO to be updated + * @param result The error code causing the read only + */ +void setKVDOReadOnly(KVDO *kvdo, int result); + +/** + * Perform an extended base-code command + * + * @param kvdo The KVDO upon which to perform the operation. + * @param argc The number of arguments to the command. + * @param argv The command arguments. Note that all extended + * command argv[0] strings start with "x-". + * + * @return VDO_SUCCESS or an error code + **/ +int performKVDOExtendedCommand(KVDO *kvdo, int argc, char **argv); + +/** + * Enqueue a work item to be processed in the base code context. + * + * @param kvdo The KVDO object in which to run the work item + * @param item The work item to be run + * @param threadID The thread on which to run the work item + **/ +void enqueueKVDOWork(KVDO *kvdo, KvdoWorkItem *item, ThreadID threadID); + +/** + * Set up and enqueue a VIO's work item to be processed in the base code + * context. + * + * @param kvio The VIO with the work item to be run + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + **/ +void enqueueKVIO(KVIO *kvio, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action); + +/** + * Enqueue an arbitrary completion for execution on its indicated + * thread. 
+ * + * @param enqueueable The Enqueueable object containing the completion pointer + **/ +void kvdoEnqueue(Enqueueable *enqueueable); + +/** + * Get the base-code thread index for the current execution context. + * + * @return The thread ID, or (ThreadID)-1 if the current thread is + * not a base-code thread, or in an interrupt context. + **/ +ThreadID kvdoGetCurrentThreadID(void); + +/** + * Do one-time initialization of kernelVDO interface. + **/ +void initKernelVDOOnce(void); + +#endif // KERNEL_VDO_H diff --git a/vdo/kernel/kernelVDOInternals.h b/vdo/kernel/kernelVDOInternals.h new file mode 100644 index 0000000..aefe05a --- /dev/null +++ b/vdo/kernel/kernelVDOInternals.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelVDOInternals.h#1 $ + */ + +#ifndef KERNEL_VDO_INTERNALS_H +#define KERNEL_VDO_INTERNALS_H + +#include "kernelVDO.h" + +/** + * Enqueue a work item to be performed in the base code in a + * particular thread. + * + * @param thread The KVDO thread on which to run the work item + * @param item The work item to be run + **/ +void enqueueKVDOThreadWork(KVDOThread *thread, KvdoWorkItem *item); + +#endif // KERNEL_VDO_INTERNALS_H diff --git a/vdo/kernel/ktrace.c b/vdo/kernel/ktrace.c new file mode 100644 index 0000000..ebc654a --- /dev/null +++ b/vdo/kernel/ktrace.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/ktrace.c#2 $ + */ + +#include "ktrace.h" + +#include "memoryAlloc.h" + +#include "dataVIO.h" + +#include "kvio.h" +#include "logger.h" + +enum { + // How much data from a trace can we log in one call without messing + // up the log or losing data? 
+ TRACE_LOG_MAX = 820, + + // What fraction (1 out of TRACE_SAMPLE_INTERVAL VIOs) to trace + TRACE_SAMPLE_INTERVAL = 3, +}; + +bool traceRecording = false; + +static struct { + char buffer[2000]; + unsigned int counter; + struct mutex lock; +} traceLoggingState; + +/** + * Initialize a SampleCounter structure with the given sampling interval. + * + * @param counter The counter to initialize + * @param interval The desired sampling interval + **/ +static void initializeSampleCounter(SampleCounter *counter, + unsigned int interval) +{ + spin_lock_init(&counter->lock); + counter->tick = 0; + counter->interval = interval; +} + +/*************************************************************************/ +bool sampleThisOne(SampleCounter *counter) +{ + bool wantTracing = false; + spin_lock(&counter->lock); + counter->tick++; + if (counter->tick >= counter->interval) { + counter->tick = 0; + wantTracing = true; + } + spin_unlock(&counter->lock); + return wantTracing; +} + +/*************************************************************************/ +static void freeTraceDataBuffer(void *poolData, void *data) +{ + Trace *trace = (Trace *) data; + FREE(trace); +} + +/*************************************************************************/ +static int allocTraceDataBuffer(void *poolData, void **dataPtr) +{ + Trace *trace; + int result = ALLOCATE(1, Trace, __func__, &trace); + if (result != VDO_SUCCESS) { + logError("trace data allocation failure %d", result); + return result; + } + + *dataPtr = trace; + return VDO_SUCCESS; +} + +/*************************************************************************/ +int allocTraceFromPool(KernelLayer *layer, Trace **tracePointer) +{ + int result = allocBufferFromPool(layer->traceBufferPool, + (void **) tracePointer); + if (result == VDO_SUCCESS) { + (*tracePointer)->used = 0; + } + return result; +} + +/*************************************************************************/ +void freeTraceToPool(KernelLayer *layer, Trace *trace) +{ + freeBufferToPool(layer->traceBufferPool, trace); +} + +/*************************************************************************/ +int traceKernelLayerInit(KernelLayer *layer) +{ + layer->vioTraceRecording = traceRecording; + initializeSampleCounter(&layer->traceSampleCounter, TRACE_SAMPLE_INTERVAL); + unsigned int traceRecordsNeeded = 0; + if (layer->vioTraceRecording) { + traceRecordsNeeded += layer->requestLimiter.limit; + } + if (traceRecordsNeeded > 0) { + return makeBufferPool("KVDO Trace Data Pool", traceRecordsNeeded, + allocTraceDataBuffer, freeTraceDataBuffer, NULL, + layer, &layer->traceBufferPool); + } + return VDO_SUCCESS; +} + +/*************************************************************************/ +void initializeTraceLoggingOnce(void) +{ + mutex_init(&traceLoggingState.lock); +} + +/*************************************************************************/ +void logKvioTrace(KVIO *kvio) +{ + KernelLayer *layer = kvio->layer; + + mutex_lock(&traceLoggingState.lock); + traceLoggingState.counter++; + // Log about 0.1% to avoid spewing data faster than syslog can keep up + // (on certain of Permabit's test machines). + // Yes, the 37 is arbitrary and meaningless. 
+ + if (layer->traceLogging && ((traceLoggingState.counter % 1024) == 37)) { + kvioAddTraceRecord(kvio, THIS_LOCATION(NULL)); + size_t traceLen = 0; + formatTrace(kvio->vio->trace, traceLoggingState.buffer, + sizeof(traceLoggingState.buffer), &traceLen); + + if (isMetadata(kvio)) { + logInfo("finishing kvio %s meta @%" PRIptr " %s", + (isWriteVIO(kvio->vio) ? "read" : "write"), + kvio, traceLoggingState.buffer); + } else if (isCompressedWriter(kvio)) { + logInfo("finishing kvio write comp @%" PRIptr " %s", + kvio, traceLoggingState.buffer); + } else { + const char *dupeLabel = ""; + if (isWriteVIO(kvio->vio)) { + DataVIO *dataVIO = vioAsDataVIO(kvio->vio); + if (isTrimDataVIO(dataVIO)) { + dupeLabel = "trim "; + } else if (dataVIO->isZeroBlock) { + dupeLabel = "zero "; + } else if (dataVIO->isDuplicate) { + dupeLabel = "dupe "; + } else { + dupeLabel = "new "; + } + } + + logInfo("finishing kvio %s data %s@%" PRIptr " %.*s", + (isWriteVIO(kvio->vio) ? "read" : "write"), + dupeLabel, kvio, TRACE_LOG_MAX, traceLoggingState.buffer); + char *buf = traceLoggingState.buffer; + while (traceLen > TRACE_LOG_MAX) { + traceLen -= TRACE_LOG_MAX; + buf += TRACE_LOG_MAX; + logInfo("more kvio %" PRIptr " path: %.*s", kvio, TRACE_LOG_MAX, buf); + } + } + } + + mutex_unlock(&traceLoggingState.lock); +} diff --git a/vdo/kernel/ktrace.h b/vdo/kernel/ktrace.h new file mode 100644 index 0000000..99cda7a --- /dev/null +++ b/vdo/kernel/ktrace.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/ktrace.h#1 $ + */ + +#ifndef KTRACE_H +#define KTRACE_H + +#include + +#include "common.h" +#include "trace.h" + +struct kernelLayer; +struct kvio; + +// Implement event sampling once per N. +typedef struct { + unsigned int interval; + unsigned int tick; + spinlock_t lock; +} SampleCounter; + +/** + * Flag indicating whether newly created VDO devices should record trace info. + **/ +extern bool traceRecording; + +/** + * Updates the counter state and returns true once each time the + * sampling interval is reached. + * + * @param counter The sampling counter info + * + * @return whether to do sampling on this invocation + **/ +bool sampleThisOne(SampleCounter *counter); + +/** + * Initialize trace data in the KernelLayer + * + * @param layer The KernelLayer + * + * @return VDO_SUCCESS, or an error code + **/ +int traceKernelLayerInit(struct kernelLayer *layer); + +/** + * Initialize the mutex used when logging latency tracing data. 
+ **/ +void initializeTraceLoggingOnce(void); + +/** + * Allocate a trace buffer + * + * @param layer The KernelLayer + * @param tracePointer The trace buffer is returned here + * + * @return VDO_SUCCESS or an error code + **/ +int allocTraceFromPool(struct kernelLayer *layer, Trace **tracePointer); + +/** + * Free a trace buffer + * + * @param layer The KernelLayer + * @param trace The trace buffer + **/ +void freeTraceToPool(struct kernelLayer *layer, Trace *trace); + +/** + * Log the trace at kvio freeing time + * + * @param kvio The kvio structure + **/ +void logKvioTrace(struct kvio *kvio); + +#endif /* KTRACE_H */ diff --git a/vdo/kernel/kvdoFlush.c b/vdo/kernel/kvdoFlush.c new file mode 100644 index 0000000..7b38af1 --- /dev/null +++ b/vdo/kernel/kvdoFlush.c @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kvdoFlush.c#6 $ + */ + +#include "kvdoFlush.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "threadConfig.h" + +#include "bio.h" +#include "ioSubmitter.h" + +/** + * A specific (concrete) encapsulation of flush requests. + * + *

We attempt to allocate a KVDOFlush object for each incoming flush bio.
+ * In case the allocation fails, a spare object is pre-allocated by and stored
+ * in the kernel layer. The first time an allocation fails, the spare is used.
+ * If another allocation fails while the spare is in use, it will merely be
+ * queued for later processing.
+ *
+ *

When a KVDOFlush is complete, it will either be freed, immediately + * re-used for queued flushes, or stashed in the kernel layer as the new spare + * object. This ensures that we will always make forward progress. + **/ +struct kvdoFlush { + KvdoWorkItem workItem; + KernelLayer *layer; + struct bio_list bios; + Jiffies arrivalTime; // Time when earliest bio appeared + VDOFlush vdoFlush; +}; + +/**********************************************************************/ +int makeKVDOFlush(KVDOFlush **flushPtr) +{ + return ALLOCATE(1, KVDOFlush, __func__, flushPtr); +} + +/**********************************************************************/ +bool shouldProcessFlush(KernelLayer *layer) +{ + return (getKVDOWritePolicy(&layer->kvdo) != WRITE_POLICY_SYNC); +} + +/** + * Function call to handle an empty flush request from the request queue. + * + * @param item The work item representing the flush request + **/ +static void kvdoFlushWork(KvdoWorkItem *item) +{ + KVDOFlush *kvdoFlush = container_of(item, KVDOFlush, workItem); + flush(kvdoFlush->layer->kvdo.vdo, &kvdoFlush->vdoFlush); +} + +/** + * Initialize a KVDOFlush object, transferring all the bios in the kernel + * layer's waitingFlushes list to it. The caller MUST already hold the layer's + * flushLock. + * + * @param kvdoFlush The flush to initialize + * @param layer The kernel layer on which the flushLock is held + **/ +static void initializeKVDOFlush(KVDOFlush *kvdoFlush, KernelLayer *layer) +{ + kvdoFlush->layer = layer; + bio_list_init(&kvdoFlush->bios); + bio_list_merge(&kvdoFlush->bios, &layer->waitingFlushes); + bio_list_init(&layer->waitingFlushes); + kvdoFlush->arrivalTime = layer->flushArrivalTime; +} + +/**********************************************************************/ +static void enqueueKVDOFlush(KVDOFlush *kvdoFlush) +{ + setupWorkItem(&kvdoFlush->workItem, kvdoFlushWork, NULL, REQ_Q_ACTION_FLUSH); + KVDO *kvdo = &kvdoFlush->layer->kvdo; + enqueueKVDOWork(kvdo, &kvdoFlush->workItem, + getPackerZoneThread(getThreadConfig(kvdo->vdo))); +} + +/**********************************************************************/ +void launchKVDOFlush(KernelLayer *layer, BIO *bio) +{ + // Try to allocate a KVDOFlush to represent the flush request. If the + // allocation fails, we'll deal with it later. + KVDOFlush *kvdoFlush = ALLOCATE_NOWAIT(KVDOFlush, __func__); + + spin_lock(&layer->flushLock); + + // We have a new bio to start. Add it to the list. If it becomes the + // only entry on the list, record the time. + if (bio_list_empty(&layer->waitingFlushes)) { + layer->flushArrivalTime = jiffies; + } + bio_list_add(&layer->waitingFlushes, bio); + + if (kvdoFlush == NULL) { + // The KVDOFlush allocation failed. Try to use the spare KVDOFlush object. + if (layer->spareKVDOFlush == NULL) { + // The spare is already in use. This bio is on waitingFlushes and it + // will be handled by a flush completion or by a bio that can allocate. + spin_unlock(&layer->flushLock); + return; + } + + // Take and use the spare KVDOFlush object. + kvdoFlush = layer->spareKVDOFlush; + layer->spareKVDOFlush = NULL; + } + + // We have flushes to start. Capture them in the KVDOFlush object. + initializeKVDOFlush(kvdoFlush, layer); + + spin_unlock(&layer->flushLock); + + // Finish launching the flushes. + enqueueKVDOFlush(kvdoFlush); +} + +/** + * Release a KVDOFlush object that has completed its work. If there are any + * pending flush requests whose KVDOFlush allocation failed, they will be + * launched by immediately re-using the released KVDOFlush. 
If there is no + * spare KVDOFlush, the released object will become the spare. Otherwise, the + * KVDOFlush will be freed. + * + * @param kvdoFlush The completed flush object to re-use or free + **/ +static void releaseKVDOFlush(KVDOFlush *kvdoFlush) +{ + KernelLayer *layer = kvdoFlush->layer; + bool relaunchFlush = false; + bool freeFlush = false; + + spin_lock(&layer->flushLock); + if (bio_list_empty(&layer->waitingFlushes)) { + // Nothing needs to be started. Save one spare KVDOFlush object. + if (layer->spareKVDOFlush == NULL) { + // Make the new spare all zero, just like a newly allocated one. + memset(kvdoFlush, 0, sizeof(*kvdoFlush)); + layer->spareKVDOFlush = kvdoFlush; + } else { + freeFlush = true; + } + } else { + // We have flushes to start. Capture them in the KVDOFlush object. + initializeKVDOFlush(kvdoFlush, layer); + relaunchFlush = true; + } + spin_unlock(&layer->flushLock); + + if (relaunchFlush) { + // Finish launching the flushes. + enqueueKVDOFlush(kvdoFlush); + } else if (freeFlush) { + FREE(kvdoFlush); + } +} + +/** + * Function called to complete and free a flush request + * + * @param item The flush-request work item + **/ +static void kvdoCompleteFlushWork(KvdoWorkItem *item) +{ + KVDOFlush *kvdoFlush = container_of(item, KVDOFlush, workItem); + KernelLayer *layer = kvdoFlush->layer; + + BIO *bio; + while ((bio = bio_list_pop(&kvdoFlush->bios)) != NULL) { + // We're not acknowledging this bio now, but we'll never touch it + // again, so this is the last chance to account for it. + countBios(&layer->biosAcknowledged, bio); + + // Make sure the bio is a empty flush bio. + prepareFlushBIO(bio, bio->bi_private, getKernelLayerBdev(layer), + bio->bi_end_io); + atomic64_inc(&layer->flushOut); + generic_make_request(bio); + } + + + // Release the KVDOFlush object, freeing it, re-using it as the spare, or + // using it to launch any flushes that had to wait when allocations failed. + releaseKVDOFlush(kvdoFlush); +} + +/**********************************************************************/ +void kvdoCompleteFlush(VDOFlush **kfp) +{ + if (*kfp != NULL) { + KVDOFlush *kvdoFlush = container_of(*kfp, KVDOFlush, vdoFlush); + setupWorkItem(&kvdoFlush->workItem, kvdoCompleteFlushWork, NULL, + BIO_Q_ACTION_FLUSH); + enqueueBioWorkItem(kvdoFlush->layer->ioSubmitter, + &kvdoFlush->workItem); + *kfp = NULL; + } +} + +/**********************************************************************/ +int synchronousFlush(KernelLayer *layer) +{ + BIO bio; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio_init(&bio, 0, 0); +#else + bio_init(&bio); +#endif + int result = 0; + + prepareFlushBIO(&bio, layer, getKernelLayerBdev(layer), NULL); + result = submitBioAndWait(&bio); + atomic64_inc(&layer->flushOut); + if (result != 0) { + logErrorWithStringError(result, "synchronous flush failed"); + result = -EIO; + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,12,0) + bio_uninit(&bio); +#endif + return result; +} diff --git a/vdo/kernel/kvdoFlush.h b/vdo/kernel/kvdoFlush.h new file mode 100644 index 0000000..2d90953 --- /dev/null +++ b/vdo/kernel/kvdoFlush.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kvdoFlush.h#1 $ + */ + +#ifndef KVDO_FLUSH_H +#define KVDO_FLUSH_H + +#include "flush.h" + +#include "kernelLayer.h" + +/** + * Create a KVDOFlush. + * + * @param flushPtr A pointer to hold the new flush + **/ +int makeKVDOFlush(KVDOFlush **flushPtr); + +/** + * Answer the question as to whether VDO should be processing REQ_FLUSH + * requests or not. + * + * @param layer The layer + * + * @return true if VDO should process empty flush requests, or false if + * they should just be forwarded to our storage device. + **/ +bool shouldProcessFlush(KernelLayer *layer); + +/** + * Function called to start processing a flush request. It is called when we + * receive an empty flush bio from the block layer, and before acknowledging a + * non-empty bio with the FUA flag set. + * + * @param layer The physical layer + * @param bio The bio containing an empty flush request + **/ +void launchKVDOFlush(KernelLayer *layer, BIO *bio); + +/** + * Function called from base VDO to complete and free a flush request. + * + * @param kfp Pointer to the flush request + **/ +void kvdoCompleteFlush(VDOFlush **kfp); + +/** + * Issue a flush request and wait for it to complete. + * + * @param layer The kernel layer + * + * @return VDO_SUCCESS or an error + */ +int synchronousFlush(KernelLayer *layer); + +#endif /* KVDO_FLUSH_H */ diff --git a/vdo/kernel/kvio.c b/vdo/kernel/kvio.c new file mode 100644 index 0000000..336f86e --- /dev/null +++ b/vdo/kernel/kvio.c @@ -0,0 +1,415 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kvio.c#7 $ + */ + +#include "kvio.h" + + +#include "logger.h" +#include "memoryAlloc.h" + +#include "numUtils.h" +#include "vdo.h" +#include "waitQueue.h" + +#include "bio.h" +#include "ioSubmitter.h" +#include "kvdoFlush.h" + +/** + * A function to tell vdo that we have completed the requested async + * operation for a vio + * + * @param item The work item of the VIO to complete + **/ +static void kvdoHandleVIOCallback(KvdoWorkItem *item) +{ + KVIO *kvio = workItemAsKVIO(item); + runCallback(vioAsCompletion(kvio->vio)); +} + +/**********************************************************************/ +void kvdoEnqueueVIOCallback(KVIO *kvio) +{ + enqueueKVIO(kvio, kvdoHandleVIOCallback, + (KvdoWorkFunction) vioAsCompletion(kvio->vio)->callback, + REQ_Q_ACTION_VIO_CALLBACK); +} + +/**********************************************************************/ +void kvdoContinueKvio(KVIO *kvio, int error) +{ + if (unlikely(error != VDO_SUCCESS)) { + setCompletionResult(vioAsCompletion(kvio->vio), error); + } + kvdoEnqueueVIOCallback(kvio); +} + +/**********************************************************************/ +// noinline ensures systemtap can hook in here +static noinline void maybeLogKvioTrace(KVIO *kvio) +{ + if (kvio->layer->traceLogging) { + logKvioTrace(kvio); + } +} + +/**********************************************************************/ +static void freeKVIO(KVIO **kvioPtr) +{ + KVIO *kvio = *kvioPtr; + if (kvio == NULL) { + return; + } + + if (unlikely(kvio->vio->trace != NULL)) { + maybeLogKvioTrace(kvio); + FREE(kvio->vio->trace); + } + + freeBio(kvio->bio, kvio->layer); + FREE(kvio); + *kvioPtr = NULL; +} + +/**********************************************************************/ +void freeMetadataKVIO(MetadataKVIO **metadataKVIOPtr) +{ + freeKVIO((KVIO **) metadataKVIOPtr); +} + +/**********************************************************************/ +void freeCompressedWriteKVIO(CompressedWriteKVIO **compressedWriteKVIOPtr) +{ + freeKVIO((KVIO **) compressedWriteKVIOPtr); +} + +/**********************************************************************/ +void kvdoWriteCompressedBlock(AllocatingVIO *allocatingVIO) +{ + // This method assumes that compressed writes never set the flush or FUA + // bits. + CompressedWriteKVIO *compressedWriteKVIO + = allocatingVIOAsCompressedWriteKVIO(allocatingVIO); + KVIO *kvio = compressedWriteKVIOAsKVIO(compressedWriteKVIO); + BIO *bio = kvio->bio; + resetBio(bio, kvio->layer); + setBioOperationWrite(bio); + setBioSector(bio, blockToSector(kvio->layer, kvio->vio->physical)); + submitBio(bio, BIO_Q_ACTION_COMPRESSED_DATA); +} + +/** + * Get the BioQueue action for a metadata VIO based on that VIO's priority. + * + * @param vio The VIO + * + * @return The action with which to submit the VIO's BIO. + **/ +static inline BioQAction getMetadataAction(VIO *vio) +{ + return ((vio->priority == VIO_PRIORITY_HIGH) + ? BIO_Q_ACTION_HIGH : BIO_Q_ACTION_METADATA); +} + +/**********************************************************************/ +void kvdoSubmitMetadataVIO(VIO *vio) +{ + KVIO *kvio = metadataKVIOAsKVIO(vioAsMetadataKVIO(vio)); + BIO *bio = kvio->bio; + resetBio(bio, kvio->layer); + + setBioSector(bio, blockToSector(kvio->layer, vio->physical)); + + // Metadata I/Os bypass the read cache. 
+ if (isReadVIO(vio)) { + ASSERT_LOG_ONLY(!vioRequiresFlushBefore(vio), + "read VIO does not require flush before"); + vioAddTraceRecord(vio, THIS_LOCATION("$F;io=readMeta")); + setBioOperationRead(bio); + } else { + KernelLayerState state = getKernelLayerState(kvio->layer); + ASSERT_LOG_ONLY(((state == LAYER_RUNNING) + || (state == LAYER_RESUMING) + || (state = LAYER_STARTING)), + "write metadata in allowed state %d", state); + if (vioRequiresFlushBefore(vio)) { + setBioOperationWrite(bio); + setBioOperationFlagPreflush(bio); + vioAddTraceRecord(vio, THIS_LOCATION("$F;io=flushWriteMeta")); + } else { + setBioOperationWrite(bio); + vioAddTraceRecord(vio, THIS_LOCATION("$F;io=writeMeta")); + } + } + + if (vioRequiresFlushAfter(vio)) { + setBioOperationFlagFua(bio); + } + submitBio(bio, getMetadataAction(vio)); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) +/** + * Handle the completion of a base-code initiated flush by continuing the flush + * VIO. + * + * @param bio The bio to complete + **/ +static void completeFlushBio(BIO *bio) +#else +/** + * Handle the completion of a base-code initiated flush by continuing the flush + * VIO. + * + * @param bio The bio to complete + * @param error Possible error from underlying block device + **/ +static void completeFlushBio(BIO *bio, int error) +#endif +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) + int error = getBioResult(bio); +#endif + KVIO *kvio = (KVIO *) bio->bi_private; + // XXX This assumes a VDO-created bio around a buffer contains exactly 1 + // page, which we believe is true, but do not assert. + bio->bi_vcnt = 1; + // Restore the bio's notion of its own data. + resetBio(bio, kvio->layer); + kvdoContinueKvio(kvio, error); +} + +/**********************************************************************/ +void kvdoFlushVIO(VIO *vio) +{ + KVIO *kvio = metadataKVIOAsKVIO(vioAsMetadataKVIO(vio)); + BIO *bio = kvio->bio; + KernelLayer *layer = kvio->layer; + resetBio(bio, layer); + prepareFlushBIO(bio, kvio, getKernelLayerBdev(layer), completeFlushBio); + submitBio(bio, getMetadataAction(vio)); +} + +/* + * Hook for a SystemTap probe to potentially restrict the choices + * of which VIOs should have their latencies tracked. + * + * Normally returns true. Even if true is returned, sampleThisOne may + * cut down the monitored VIOs by some fraction so as to reduce the + * impact on system performance. + * + * Must be "noinline" so that SystemTap can find the return + * instruction and modify the return value. + * + * @param kvio The KVIO being initialized + * @param layer The kernel layer + * @param bio The incoming I/O request + * + * @return whether it's useful to track latency for VIOs looking like + * this one + */ +static noinline bool +sampleThisVIO(KVIO *kvio, KernelLayer *layer, BIO *bio) +{ + bool result = true; + // Ensure the arguments and result exist at the same time, for SystemTap. + __asm__ __volatile__("" + : "=g" (result) + : "0" (result), + "g" (kvio), + "g" (layer), + "g" (bio) + : "memory"); + return result; +} + +/**********************************************************************/ +void initializeKVIO(KVIO *kvio, + KernelLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + BIO *bio) +{ + if (layer->vioTraceRecording + && sampleThisVIO(kvio, layer, bio) + && sampleThisOne(&layer->traceSampleCounter)) { + int result = (isDataVIOType(vioType) + ? 
allocTraceFromPool(layer, &kvio->vio->trace) + : ALLOCATE(1, Trace, "trace", &kvio->vio->trace)); + if (result != VDO_SUCCESS) { + logError("trace record allocation failure %d", result); + } + } + + kvio->bio = bio; + kvio->layer = layer; + if (bio != NULL) { + bio->bi_private = kvio; + } + + initializeVIO(kvio->vio, vioType, priority, parent, getVDO(&layer->kvdo), + &layer->common); + + // XXX: The "init" label should be replaced depending on the + // write/read/flush path followed. + kvioAddTraceRecord(kvio, THIS_LOCATION("$F;io=?init;j=normal")); + + VDOCompletion *completion = vioAsCompletion(kvio->vio); + kvio->enqueueable.enqueueable.completion = completion; + completion->enqueueable = &kvio->enqueueable.enqueueable; +} + +/** + * Construct a metadata KVIO. + * + * @param [in] layer The physical layer + * @param [in] vioType The type of VIO to create + * @param [in] priority The relative priority to assign to the + * MetadataKVIO + * @param [in] parent The parent of the MetadataKVIO completion + * @param [in] bio The bio to associate with this MetadataKVIO + * @param [out] metadataKVIOPtr A pointer to hold the new MetadataKVIO + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int makeMetadataKVIO(KernelLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + BIO *bio, + MetadataKVIO **metadataKVIOPtr) +{ + // If MetadataKVIO grows past 256 bytes, we'll lose benefits of VDOSTORY-176. + STATIC_ASSERT(sizeof(MetadataKVIO) <= 256); + + // Metadata VIOs should use direct allocation and not use the buffer pool, + // which is reserved for submissions from the linux block layer. + MetadataKVIO *metadataKVIO; + int result = ALLOCATE(1, MetadataKVIO, __func__, &metadataKVIO); + if (result != VDO_SUCCESS) { + logError("metadata KVIO allocation failure %d", result); + return result; + } + + KVIO *kvio = &metadataKVIO->kvio; + kvio->vio = &metadataKVIO->vio; + initializeKVIO(kvio, layer, vioType, priority, parent, bio); + *metadataKVIOPtr = metadataKVIO; + return VDO_SUCCESS; +} + +/** + * Construct a CompressedWriteKVIO. + * + * @param [in] layer The physical layer + * @param [in] parent The parent of the CompressedWriteKVIO + * completion + * @param [in] bio The bio to associate with this + * CompressedWriteKVIO + * @param [out] compressedWriteKVIOPtr A pointer to hold the new + * CompressedWriteKVIO + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int +makeCompressedWriteKVIO(KernelLayer *layer, + void *parent, + BIO *bio, + CompressedWriteKVIO **compressedWriteKVIOPtr) +{ + // Compressed write VIOs should use direct allocation and not use the buffer + // pool, which is reserved for submissions from the linux block layer. 
+ CompressedWriteKVIO *compressedWriteKVIO; + int result = ALLOCATE(1, CompressedWriteKVIO, __func__, + &compressedWriteKVIO); + if (result != VDO_SUCCESS) { + logError("compressed write KVIO allocation failure %d", result); + return result; + } + + KVIO *kvio = &compressedWriteKVIO->kvio; + kvio->vio = allocatingVIOAsVIO(&compressedWriteKVIO->allocatingVIO); + initializeKVIO(kvio, layer, VIO_TYPE_COMPRESSED_BLOCK, + VIO_PRIORITY_COMPRESSED_DATA, parent, bio); + *compressedWriteKVIOPtr = compressedWriteKVIO; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int kvdoCreateMetadataVIO(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + char *data, + VIO **vioPtr) +{ + int result = ASSERT(isMetadataVIOType(vioType), + "%d is a metadata type", vioType); + if (result != VDO_SUCCESS) { + return result; + } + + BIO *bio; + KernelLayer *kernelLayer = asKernelLayer(layer); + result = createBio(kernelLayer, data, &bio); + if (result != VDO_SUCCESS) { + return result; + } + + MetadataKVIO *metadataKVIO; + result = makeMetadataKVIO(kernelLayer, vioType, priority, parent, bio, + &metadataKVIO); + if (result != VDO_SUCCESS) { + freeBio(bio, kernelLayer); + return result; + } + + *vioPtr = &metadataKVIO->vio; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int kvdoCreateCompressedWriteVIO(PhysicalLayer *layer, + void *parent, + char *data, + AllocatingVIO **allocatingVIOPtr) +{ + BIO *bio; + KernelLayer *kernelLayer = asKernelLayer(layer); + int result = createBio(kernelLayer, data, &bio); + if (result != VDO_SUCCESS) { + return result; + } + + CompressedWriteKVIO *compressedWriteKVIO; + result = makeCompressedWriteKVIO(kernelLayer, parent, bio, + &compressedWriteKVIO); + if (result != VDO_SUCCESS) { + freeBio(bio, kernelLayer); + return result; + } + + *allocatingVIOPtr = &compressedWriteKVIO->allocatingVIO; + return VDO_SUCCESS; +} diff --git a/vdo/kernel/kvio.h b/vdo/kernel/kvio.h new file mode 100644 index 0000000..64200cd --- /dev/null +++ b/vdo/kernel/kvio.h @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kvio.h#3 $ + */ + +#ifndef KVIO_H +#define KVIO_H + +#include "allocatingVIO.h" +#include "vio.h" + +#include "kernelLayer.h" + +/** + * A specific (semi-opaque) encapsulation of a single block + **/ +struct kvio { + KvdoEnqueueable enqueueable; + VIO *vio; + KernelLayer *layer; + BIO *bio; + + /** + * A bio pointer used in enqueueBioMap (used via submitBio etc), to + * pass information -- which bio to submit to the storage device -- + * across a thread switch. This may match another bio pointer in + * this structure, or could point somewhere else. 
+ **/ + BIO *bioToSubmit; + /** + * A list of enqueued bios with consecutive block numbers, stored by + * enqueueBioMap under the first-enqueued KVIO. The other KVIOs are + * found via their bio entries in this list, and are not added to + * the work queue as separate work items. + **/ + struct bio_list biosMerged; + /** A slot for an arbitrary bit of data, for use by systemtap. */ + long debugSlot; +}; + +typedef struct { + KVIO kvio; + VIO vio; +} MetadataKVIO; + +typedef struct { + KVIO kvio; + AllocatingVIO allocatingVIO; +} CompressedWriteKVIO; + +/** + * Determine whether a KVIO is a data VIO or not + * + * @param kvio The KVIO to check + * + * @return true if a data KVIO + */ +static inline bool isData(KVIO *kvio) +{ + return isDataVIO(kvio->vio); +} + +/** + * Determine whether a KVIO is a compressed block write VIO or not + * + * @param kvio The KVIO to check + * + * @return true if a compressed block writer + */ +static inline bool isCompressedWriter(KVIO *kvio) +{ + return isCompressedWriteVIO(kvio->vio); +} + +/** + * Determine whether a KVIO is a metadata VIO or not + * + * @param kvio The KVIO to check + * + * @return true if a metadata KVIO + */ +static inline bool isMetadata(KVIO *kvio) +{ + return isMetadataVIO(kvio->vio); +} + +/** + * Convert a VIO to a MetadataKVIO. + * + * @param vio The VIO to convert + * + * @return the VIO as a KVIO + **/ +static inline MetadataKVIO *vioAsMetadataKVIO(VIO *vio) +{ + ASSERT_LOG_ONLY(isMetadataVIO(vio), "VIO is a metadata VIO"); + return container_of(vio, MetadataKVIO, vio); +} + +/** + * Convert a MetadataKVIO to a KVIO. + * + * @param metadataKVIO The MetadataKVIO to convert + * + * @return The MetadataKVIO as a KVIO + **/ +static inline KVIO *metadataKVIOAsKVIO(MetadataKVIO *metadataKVIO) +{ + return &metadataKVIO->kvio; +} + +/** + * Returns a pointer to the CompressedWriteKVIO wrapping an AllocatingVIO. + * + * @param allocatingVIO The AllocatingVIO to convert + * + * @return the CompressedWriteKVIO + **/ +static inline CompressedWriteKVIO * +allocatingVIOAsCompressedWriteKVIO(AllocatingVIO *allocatingVIO) +{ + ASSERT_LOG_ONLY(isCompressedWriteAllocatingVIO(allocatingVIO), + "AllocatingVIO is a compressed write"); + return container_of(allocatingVIO, CompressedWriteKVIO, allocatingVIO); +} + +/** + * Convert a CompressedWriteKVIO to a KVIO. + * + * @param compressedWriteKVIO The CompressedWriteKVIO to convert + * + * @return The CompressedWriteKVIO as a KVIO + **/ +static inline +KVIO *compressedWriteKVIOAsKVIO(CompressedWriteKVIO *compressedWriteKVIO) +{ + return &compressedWriteKVIO->kvio; +} + +/** + * Returns a pointer to the KVIO wrapping a work item + * + * @param item the work item + * + * @return the KVIO + **/ +static inline KVIO *workItemAsKVIO(KvdoWorkItem *item) +{ + return container_of(item, KVIO, enqueueable.workItem); +} + +/** + * Enqueue a KVIO on a work queue. + * + * @param queue The queue + * @param kvio The KVIO + **/ +static inline void enqueueKVIOWork(KvdoWorkQueue *queue, KVIO *kvio) +{ + enqueueWorkQueue(queue, &kvio->enqueueable.workItem); +} + +/** + * Add a trace record for the current source location. + * + * @param kvio The KVIO structure to be updated + * @param location The source-location descriptor to be recorded + **/ +static inline void kvioAddTraceRecord(KVIO *kvio, TraceLocation location) +{ + vioAddTraceRecord(kvio->vio, location); +} + +/** + * Set up the work item for a KVIO. 
+ * + * @param kvio The KVIO to set up + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + **/ +static inline void setupKVIOWork(KVIO *kvio, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + setupWorkItem(&kvio->enqueueable.workItem, work, statsFunction, action); +} + +/** + * Set up and enqueue a KVIO. + * + * @param kvio The KVIO to set up + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + * @param queue The queue on which to enqueue the KVIO + **/ +static inline void launchKVIO(KVIO *kvio, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action, + KvdoWorkQueue *queue) +{ + setupKVIOWork(kvio, work, statsFunction, action); + enqueueKVIOWork(queue, kvio); +} + +/** + * Move a KVIO back to the base threads. + * + * @param kvio The KVIO to enqueue + **/ +void kvdoEnqueueVIOCallback(KVIO *kvio); + +/** + * Handles kvio-related I/O post-processing. + * + * @param kvio The kvio to finalize + * @param error Possible error + **/ +void kvdoContinueKvio(KVIO *kvio, int error); + +/** + * Initialize a KVIO. + * + * @param kvio The KVIO to initialize + * @param layer The physical layer + * @param vioType The type of VIO to create + * @param priority The relative priority to assign to the KVIO + * @param parent The parent of the KVIO completion + * @param bio The bio to associate with this KVIO + **/ +void initializeKVIO(KVIO *kvio, + KernelLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + BIO *bio); + +/** + * Destroy a MetadataKVIO and NULL out the pointer to it. + * + * @param metadataKVIOPtr A pointer to the MetadataKVIO to destroy + **/ +void freeMetadataKVIO(MetadataKVIO **metadataKVIOPtr); + +/** + * Destroy a CompressedWriteKVIO and NULL out the pointer to it. + * + * @param compressedWriteKVIOPtr A pointer to the CompressedWriteKVIO to + * destroy + **/ +void freeCompressedWriteKVIO(CompressedWriteKVIO **compressedWriteKVIOPtr); + +/** + * Create a new VIO (and its enclosing KVIO) for metadata operations. + * + *

Implements MetadataVIOCreator. + * + * @param [in] layer The physical layer + * @param [in] vioType The type of VIO to create + * @param [in] priority The relative priority to assign to the VIO + * @param [in] parent The parent to assign to the VIO's completion + * @param [in] data The buffer + * @param [out] vioPtr A pointer to hold new VIO + * + * @return VDO_SUCCESS or an error + **/ +int kvdoCreateMetadataVIO(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + char *data, + VIO **vioPtr) + __attribute__((warn_unused_result)); + +/** + * Create a new AllocatingVIO (and its enclosing KVIO) for compressed writes. + * + *

Implements CompressedWriteVIOCreator. + * + * @param [in] layer The physical layer + * @param [in] parent The parent to assign to the AllocatingVIO's + * completion + * @param [in] data The buffer + * @param [out] allocatingVIOPtr A pointer to hold new AllocatingVIO + * + * @return VDO_SUCCESS or an error + **/ +int kvdoCreateCompressedWriteVIO(PhysicalLayer *layer, + void *parent, + char *data, + AllocatingVIO **allocatingVIOPtr) + __attribute__((warn_unused_result)); + +/** + * Submit a compressed block write. + * + *

Implements CompressedWriter. + * + * @param allocatingVIO The AllocatingVIO for the compressed write + **/ +void kvdoWriteCompressedBlock(AllocatingVIO *allocatingVIO); + +/** + * Read or write a single metadata VIO. + * + *

Implements MetadataReader and MetadataWriter + * + * @param vio The VIO to read or write + **/ +void kvdoSubmitMetadataVIO(VIO *vio); + +/** + * Issue an empty flush to the lower layer using the BIO in a metadata VIO. + * + *

Implements MetadataWriter. + * + * @param vio The VIO to flush + **/ +void kvdoFlushVIO(VIO *vio); + +#endif /* KVIO_H */ diff --git a/vdo/kernel/limiter.c b/vdo/kernel/limiter.c new file mode 100644 index 0000000..72a4bb5 --- /dev/null +++ b/vdo/kernel/limiter.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/limiter.c#2 $ + */ + +#include "limiter.h" + +#include + +/**********************************************************************/ +void getLimiterValuesAtomically(Limiter *limiter, + uint32_t *active, + uint32_t *maximum) +{ + spin_lock(&limiter->lock); + *active = limiter->active; + *maximum = limiter->maximum; + spin_unlock(&limiter->lock); +} + +/**********************************************************************/ +void initializeLimiter(Limiter *limiter, uint32_t limit) +{ + limiter->active = 0; + limiter->limit = limit; + limiter->maximum = 0; + init_waitqueue_head(&limiter->waiterQueue); + spin_lock_init(&limiter->lock); +} + +/**********************************************************************/ +bool limiterIsIdle(Limiter *limiter) +{ + spin_lock(&limiter->lock); + bool idle = limiter->active == 0; + spin_unlock(&limiter->lock); + return idle; +} + +/**********************************************************************/ +void limiterReleaseMany(Limiter *limiter, uint32_t count) +{ + spin_lock(&limiter->lock); + limiter->active -= count; + spin_unlock(&limiter->lock); + if (waitqueue_active(&limiter->waiterQueue)) { + wake_up_nr(&limiter->waiterQueue, count); + } +} + +/**********************************************************************/ +void limiterWaitForIdle(Limiter *limiter) +{ + spin_lock(&limiter->lock); + while (limiter->active > 0) { + DEFINE_WAIT(wait); + prepare_to_wait_exclusive(&limiter->waiterQueue, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&limiter->lock); + io_schedule(); + spin_lock(&limiter->lock); + finish_wait(&limiter->waiterQueue, &wait); + }; + spin_unlock(&limiter->lock); +} + +/** + * Take one permit from the limiter, if one is available, and update + * the maximum active count if appropriate. + * + * The limiter's lock must already be locked. 
+ * + * @param limiter The limiter to update + * + * @return true iff the permit was acquired + **/ +static bool takePermitLocked(Limiter *limiter) +{ + if (limiter->active >= limiter->limit) { + return false; + } + limiter->active += 1; + if (limiter->active > limiter->maximum) { + limiter->maximum = limiter->active; + } + return true; +} + +/**********************************************************************/ +void limiterWaitForOneFree(Limiter *limiter) +{ + spin_lock(&limiter->lock); + while (!takePermitLocked(limiter)) { + DEFINE_WAIT(wait); + prepare_to_wait_exclusive(&limiter->waiterQueue, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&limiter->lock); + io_schedule(); + spin_lock(&limiter->lock); + finish_wait(&limiter->waiterQueue, &wait); + }; + spin_unlock(&limiter->lock); +} + +/**********************************************************************/ +bool limiterPoll(Limiter *limiter) +{ + spin_lock(&limiter->lock); + bool acquired = takePermitLocked(limiter); + spin_unlock(&limiter->lock); + return acquired; +} diff --git a/vdo/kernel/limiter.h b/vdo/kernel/limiter.h new file mode 100644 index 0000000..a9ee8fc --- /dev/null +++ b/vdo/kernel/limiter.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/limiter.h#2 $ + */ + +#ifndef LIMITER_H +#define LIMITER_H + +#include + +/* + * A Limiter is a fancy counter used to limit resource usage. We have a + * limit to number of resources that we are willing to use, and a Limiter + * holds us to that limit. 
+ */ + +typedef struct limiter { + // A spinlock controlling access to the contents of this struct + spinlock_t lock; + // The queue of threads waiting for a resource to become available + wait_queue_head_t waiterQueue; + // The number of resources in use + uint32_t active; + // The maximum number number of resources that have ever been in use + uint32_t maximum; + // The limit to the number of resources that are allowed to be used + uint32_t limit; +} Limiter; + +/** + * Get the Limiter variable values (atomically under the lock) + * + * @param limiter The limiter + * @param active The number of requests in progress + * @param maximum The maximum number of requests that have ever been active + **/ +void getLimiterValuesAtomically(Limiter *limiter, + uint32_t *active, + uint32_t *maximum); + +/** + * Initialize a Limiter + * + * @param limiter The limiter + * @param limit The limit to the number of active resources + **/ +void initializeLimiter(Limiter *limiter, uint32_t limit); + +/** + * Determine whether there are any active resources + * + * @param limiter The limiter + * + * @return true if there are no active resources + **/ +bool limiterIsIdle(Limiter *limiter); + +/** + * Release resources, making them available for other uses + * + * @param limiter The limiter + * @param count The number of resources to release + **/ +void limiterReleaseMany(Limiter *limiter, uint32_t count); + +/** + * Release one resource, making it available for another use + * + * @param limiter The limiter + **/ +static inline void limiterRelease(Limiter *limiter) +{ + limiterReleaseMany(limiter, 1); +} + +/** + * Wait until there are no active resources + * + * @param limiter The limiter + **/ +void limiterWaitForIdle(Limiter *limiter); + +/** + * Prepare to start using one resource, waiting if there are too many resources + * already in use. After returning from this routine, the caller may use the + * resource, and must call limiterRelease after freeing the resource. + * + * @param limiter The limiter + **/ +void limiterWaitForOneFree(Limiter *limiter); + +/** + * Attempt to reserve one resource, without waiting. After returning from this + * routine, if allocation was successful, the caller may use the resource, and + * must call limiterRelease after freeing the resource. + * + * @param limiter The limiter + * + * @return true iff the resource was allocated + **/ +bool limiterPoll(Limiter *limiter); + +#endif /* LIMITER_H */ diff --git a/vdo/kernel/logger.c b/vdo/kernel/logger.c new file mode 100644 index 0000000..d18f5ea --- /dev/null +++ b/vdo/kernel/logger.c @@ -0,0 +1,520 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
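To make the intended calling pattern of the Limiter API above concrete, here is a minimal usage sketch; exampleLimiter, the limit of 16, and the example* functions are hypothetical and only illustrate the interface declared in limiter.h:

#include "limiter.h"

static Limiter exampleLimiter;

static void exampleInit(void)
{
  // Allow at most 16 concurrent users of the hypothetical resource.
  initializeLimiter(&exampleLimiter, 16);
}

static void exampleBlockingUse(void)
{
  // Blocks (uninterruptibly) until a permit is available.
  limiterWaitForOneFree(&exampleLimiter);
  /* ... use the resource ... */
  limiterRelease(&exampleLimiter);
}

static bool exampleOpportunisticUse(void)
{
  // Non-blocking variant: gives up if the limit has already been reached.
  if (!limiterPoll(&exampleLimiter)) {
    return false;
  }
  /* ... use the resource ... */
  limiterRelease(&exampleLimiter);
  return true;
}

static void exampleDrain(void)
{
  // Before teardown, wait until every permit has been returned.
  limiterWaitForIdle(&exampleLimiter);
}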
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/logger.c#4 $ + */ + +#include "logger.h" + +#include +#include +#include + +#include "errors.h" +#include "threadDevice.h" + +static const int DEFAULT_PRIORITY = LOG_INFO; + +typedef struct { + const char *name; + const int priority; +} PRIORITY_NAMES; + +static const PRIORITY_NAMES PRIORITIES[] = { + { "ALERT", LOG_ALERT }, + { "CRIT", LOG_CRIT }, + { "CRITICAL", LOG_CRIT }, + { "DEBUG", LOG_DEBUG }, + { "EMERG", LOG_EMERG }, + { "EMERGENCY", LOG_EMERG }, + { "ERR", LOG_ERR }, + { "ERROR", LOG_ERR }, + { "INFO", LOG_INFO }, + { "NOTICE", LOG_NOTICE }, + { "PANIC", LOG_EMERG }, + { "WARN", LOG_WARNING }, + { "WARNING", LOG_WARNING }, + { NULL, -1 }, +}; + +enum { + PRIORITY_COUNT = 8 +}; + +static const char *PRIORITY_STRINGS[] = { + "EMERGENCY", + "ALERT", + "CRITICAL", + "ERROR", + "WARN", + "NOTICE", + "INFO", + "DEBUG", +}; + +static int logLevel = LOG_INFO; + +/**********************************************************************/ +int stringToPriority(const char *string) +{ + for (int i = 0; PRIORITIES[i].name != NULL; i++) { + if (strcasecmp(string, PRIORITIES[i].name) == 0) { + return PRIORITIES[i].priority; + } + } + return DEFAULT_PRIORITY; +} + +/**********************************************************************/ +int getLogLevel(void) +{ + return logLevel; +} + +/**********************************************************************/ +void setLogLevel(int newLogLevel) +{ + logLevel = newLogLevel; +} + +/**********************************************************************/ +const char *priorityToString(int priority) +{ + if ((priority < 0) || (priority >= PRIORITY_COUNT)) { + return "unknown"; + } + return PRIORITY_STRINGS[priority]; +} + +/**********************************************************************/ +static const char *priorityToLogLevel(int priority) +{ + switch (priority) { + case LOG_EMERG: + case LOG_ALERT: + case LOG_CRIT: + return KERN_CRIT; + case LOG_ERR: + return KERN_ERR; + case LOG_WARNING: + return KERN_WARNING; + case LOG_NOTICE: + return KERN_NOTICE; + case LOG_INFO: + return KERN_INFO; + case LOG_DEBUG: + return KERN_DEBUG; + default: + return ""; + } +} + +/**********************************************************************/ +static const char *getCurrentInterruptType(void) +{ + if (in_nmi()) { + return "NMI"; + } + if (in_irq()) { + return "HI"; + } + if (in_softirq()) { + return "SI"; + } + return "INTR"; +} + +/** + * Emit a log message to the kernel log in a format suited to the current + * thread context. Context info formats: + * + * interrupt: kvdo[NMI]: blah + * thread w/dev id: kvdo12:myprog: blah + * kvdo thread: kvdo12:foobarQ: blah + * other thread: kvdo: myprog: blah + * + * Fields: module name, interrupt level, process name, device ID. + * + * @param level A string describing the logging level + * @param moduleName The name of the module doing the logging + * @param prefix The prefix of the log message + * @param vaf1 The first message format descriptor + * @param vaf2 The second message format descriptor + **/ +static void emitLogMessage(const char *level, + const char *moduleName, + const char *prefix, + const struct va_format *vaf1, + const struct va_format *vaf2) +{ + if (in_interrupt()) { + printk("%s%s[%s]: %s%pV%pV\n", + level, moduleName, getCurrentInterruptType(), + prefix, vaf1, vaf2); + return; + } + + // Not at interrupt level; we have a process we can look at, and + // might have a device ID. 
+ int deviceInstance = getThreadDeviceID(); + if (deviceInstance != -1) { + printk("%s%s%u:%s: %s%pV%pV\n", + level, moduleName, deviceInstance, current->comm, + prefix, vaf1, vaf2); + return; + } + + if (((current->flags & PF_KTHREAD) != 0) + && (strncmp(moduleName, current->comm, strlen(moduleName)) == 0)) { + /* + * It's a kernel thread starting with "kvdo" (or whatever). Assume it's + * ours and that its name is sufficient. + */ + printk("%s%s: %s%pV%pV\n", + level, current->comm, + prefix, vaf1, vaf2); + return; + } + + // Identify the module and the process. + printk("%s%s: %s: %s%pV%pV\n", + level, moduleName, current->comm, + prefix, vaf1, vaf2); +} + +/**********************************************************************/ +void logMessagePack(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + va_list args2) +{ + if (priority > getLogLevel()) { + return; + } + + /* + * The kernel's printk has some magic for indirection to a secondary + * va_list. It wants us to supply a pointer to the va_list. + * + * However, va_list varies across platforms and can be an array + * type, which makes passing it around as an argument kind of + * tricky, due to the automatic conversion to a pointer. This makes + * taking the address of the argument a dicey thing; if we use "&a" + * it works fine for non-array types, but for array types we get the + * address of a pointer. Functions like va_copy and sprintf don't + * care as they get "va_list" values passed and are written to do + * the right thing, but printk explicitly wants the address of the + * va_list. + * + * So, we copy the va_list values to ensure that "&" consistently + * works the way we want. + */ + va_list args1Copy; + va_copy(args1Copy, args1); + va_list args2Copy; + va_copy(args2Copy, args2); + struct va_format vaf1 = { + .fmt = (fmt1 != NULL) ? fmt1 : "", + .va = &args1Copy, + }; + struct va_format vaf2 = { + .fmt = (fmt2 != NULL) ? fmt2 : "", + .va = &args2Copy, + }; + + if (prefix == NULL) { + prefix = ""; + } + + emitLogMessage(priorityToLogLevel(priority), THIS_MODULE->name, + prefix, &vaf1, &vaf2); + + va_end(args1Copy); + va_end(args2Copy); +} + +/**********************************************************************/ +void logEmbeddedMessage(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + ...) +{ + va_list ap; + va_start(ap, fmt2); + logMessagePack(priority, prefix, fmt1, args1, fmt2, ap); + va_end(ap); +} + +#pragma GCC diagnostic push +/* + * GCC (version 8.1.1 20180502 (Red Hat 8.1.1-1)) on Fedora 28 seems + * to think that this function should get a printf format + * attribute. But we have no second format string, and no additional + * arguments at the call site, and GCC also gets unhappy trying to + * analyze the format and values when there are none. So we'll just + * shut it up. + */ +#pragma GCC diagnostic ignored "-Wsuggest-attribute=format" +/** + * Log a message. + * + * This helper function exists solely to create a valid va_list with + * no useful info. It does the real work of vLogMessage, which wants a + * second va_list object to pass down. + * + * @param priority The syslog priority value for the message. + * @param format The format of the message (a printf style format) + * @param args The variadic argument list of format parameters. + **/ +static void vLogMessageHelper(int priority, + const char *format, + va_list args, + ...) 
+{ + va_list dummy; + va_start(dummy, args); + logMessagePack(priority, NULL, format, args, NULL, dummy); + va_end(dummy); +} +#pragma GCC diagnostic pop + +/*****************************************************************************/ +void vLogMessage(int priority, const char *format, va_list args) +{ + vLogMessageHelper(priority, format, args); +} + +/**********************************************************************/ +void logMessage(int priority, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(priority, format, args); + va_end(args); +} + +/**********************************************************************/ +__attribute__((format(printf, 2, 3))) +static void logAtLevel(int priority, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(priority, format, args); + va_end(args); +} + +/**********************************************************************/ +void logDebug(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_DEBUG, format, args); + va_end(args); +} + +/**********************************************************************/ +void logInfo(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_INFO, format, args); + va_end(args); +} + +/**********************************************************************/ +void logNotice(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_NOTICE, format, args); + va_end(args); +} + +/**********************************************************************/ +void logWarning(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_WARNING, format, args); + va_end(args); +} + +/**********************************************************************/ +void logError(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_ERR, format, args); + va_end(args); +} + +/**********************************************************************/ +void vLogError(const char *format, va_list args) +{ + vLogMessage(LOG_ERR, format, args); +} + +/**********************************************************************/ +void logBacktrace(int priority) +{ + logAtLevel(priority, "[backtrace]"); + if (priority > logLevel) { + return; + } + dump_stack(); +} + +/**********************************************************************/ +int vLogWithStringError(int priority, + int errnum, + const char *format, + va_list args) +{ + char errbuf[ERRBUF_SIZE] = ""; + logEmbeddedMessage(priority, NULL, format, args, ": %s (%d)", + stringError(errnum, errbuf, sizeof(errbuf)), + errnum); + return errnum; +} + +/**********************************************************************/ +int logWithStringError(int priority, int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(priority, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logErrorWithStringError(int errnum, const char *format, ...) 
+{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_ERR, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int vLogErrorWithStringError(int errnum, const char *format, va_list args) +{ + vLogWithStringError(LOG_ERR, errnum, format, args); + return errnum; +} + +/**********************************************************************/ +int logWarningWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_WARNING, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logDebugWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_DEBUG, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logInfoWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_INFO, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logNoticeWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_NOTICE, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logFatalWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_CRIT, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logUnrecoverable(int errnum, const char *format, ...) +{ + if ((errnum == UDS_SUCCESS || errnum == UDS_QUEUED) || (errnum == 0)) { + return errnum; + } + + va_list args; + va_start(args, format); + vLogWithStringError(LOG_CRIT, errnum, format, args); + va_end(args); + return makeUnrecoverable(errnum); +} + +/**********************************************************************/ +void logFatal(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_CRIT, format, args); + va_end(args); +} + +/**********************************************************************/ +void pauseForLogger(void) +{ + // Hopefully, a few milliseconds of sleep will be large enough + // for the kernel log buffer to be flushed. + msleep(4); +} diff --git a/vdo/kernel/logger.h b/vdo/kernel/logger.h new file mode 100644 index 0000000..6e8088e --- /dev/null +++ b/vdo/kernel/logger.h @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
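As a quick illustration of how the logging helpers above compose, a hypothetical caller might look like the following sketch; exampleStartDevice and its parameters are made-up names, and the result argument stands in for the outcome of some hardware call:

#include "logger.h"

static int exampleStartDevice(const char *name, int result)
{
  // 'result' is assumed to be 0 on success, or an errno/UDS_* style
  // code on failure, as expected by logErrorWithStringError().
  logInfo("starting device %s", name);

  if (result != 0) {
    // Logs at ERROR priority with ": <error text> (<code>)" appended,
    // then hands the error code back unchanged.
    return logErrorWithStringError(result, "cannot start device %s", name);
  }

  // Dropped unless the log level has been raised to LOG_DEBUG
  // (see setLogLevel()).
  logDebug("device %s started", name);
  return 0;
}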
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/logger.h#2 $ + */ + +#ifndef LOGGER_H +#define LOGGER_H 1 + +#include +#include +#include +#include + +#define LOG_EMERG 0 /* system is unusable */ +#define LOG_ALERT 1 /* action must be taken immediately */ +#define LOG_CRIT 2 /* critical conditions */ +#define LOG_ERR 3 /* error conditions */ +#define LOG_WARNING 4 /* warning conditions */ +#define LOG_NOTICE 5 /* normal but significant condition */ +#define LOG_INFO 6 /* informational */ +#define LOG_DEBUG 7 /* debug-level messages */ + +// Make it easy to log real pointer values using %px when in development. +#define PRIptr "pK" + +/** + * @file + * + * The functions in this file are not thread safe in the sense that nothing + * prevents multiple threads from opening or closing loggers out from under + * other threads. In reality this isn't a problem since the only calls in + * production code to openLogger() and closeLogger() are made in uds.c while + * uds mutex is held, and uds does not make any logging calls before it calls + * openLogger or after it calls closeLogger(). + * + * All of the log() functions will preserve the callers value of errno. + **/ + +/** + * Get the current logging level. + * + * @return the current logging priority level. + **/ +int getLogLevel(void); + +/** + * Set the current logging level. + * + * @param newLogLevel the new value for the logging priority level. + **/ +void setLogLevel(int newLogLevel); + +/** + * Return the integer logging priority represented by a name. + * + * @param string the name of the logging priority (case insensitive). + * + * @return the integer priority named by string, or DEFAULT_PRIORITY + * if not recognized. + **/ +int stringToPriority(const char *string); + +/** + * Return the printable name of a logging priority. + * + * @return the priority name + **/ +const char *priorityToString(int priority); + +/** + * Log a debug message. + * + * @param format The format of the message (a printf style format) + **/ +void logDebug(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log an informational message. + * + * @param format The format of the message (a printf style format) + **/ +void logInfo(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a normal (but notable) condition. + * + * @param format The format of the message (a printf style format) + **/ +void logNotice(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a warning. + * + * @param format The format of the message (a printf style format) + **/ +void logWarning(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log an error. + * + * @param format The format of the message (a printf style format) + **/ +void logError(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log an error. + * + * @param format The format of the message (a printf style format) + * @param args args for format. + **/ + +void vLogError(const char *format, va_list args) + __attribute__((format(printf, 1, 0))); + +/** + * Log a message embedded within another message. 
+ * + * @param priority the priority at which to log the message + * @param prefix optional string prefix to message, may be NULL + * @param fmt1 format of message first part, may be NULL + * @param args1 arguments for message first part + * @param fmt2 format of message second part + **/ +void logEmbeddedMessage(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + ...) + __attribute__((format(printf, 3, 0), format(printf, 5, 6))); + +/** + * Log a message pack consisting of multiple variable sections. + * + * @param priority the priority at which to log the message + * @param prefix optional string prefix to message, may be NULL + * @param fmt1 format of message first part, may be NULL + * @param args1 arguments for message first part + * @param fmt2 format of message second part, may be NULL + * @param args2 arguments for message second part + **/ +void logMessagePack(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + va_list args2) + __attribute__((format(printf, 3, 0))); + +/** + * Log a stack backtrace. + * + * @param priority The priority at which to log the backtrace + **/ +void logBacktrace(int priority); + +/** + * Log a message with an error from an error code. + * + * @param priority The priority of the logging entry + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * + * @return errnum + **/ +int logWithStringError(int priority, int errnum, const char *format, ...) + __attribute__((format(printf, 3, 4))); + +/** + * Log a message with an error from an error code. + * + * @param priority The priority of the logging entry + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * @param args The list of arguments with format. + * + * @return errnum + **/ +int vLogWithStringError(int priority, + int errnum, + const char *format, + va_list args) + __attribute__((format(printf, 3, 0))); + +/** + * Log an error prefixed with the string associated with the errnum. + * + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * + * @return errnum + **/ +int logErrorWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logDebugWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logInfoWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logNoticeWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logWarningWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logFatalWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/** + * Log an error prefixed with the string associated with the errnum. + * + * @param errnum Int value of errno or a UDS_* value. 
+ * @param format The format of the message (a printf style format) + * @param args a va_list of args for the format. + * @return errnum + **/ +int vLogErrorWithStringError(int errnum, const char *format, va_list args) + __attribute__((format(printf, 2, 0))); + +/** + * Log an ERROR level message and return makeUnrecoverable(errnum) + * UDS_SUCCESS is ignored and returned. + * + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * @return makeUnrecoverable(errnum) or UDS_SUCCESS. + **/ +int logUnrecoverable(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/** + * Log a fatal error. + * + * @param format The format of the message (a printf style format) + **/ +void logFatal(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a message -- for internal use only. + * + * @param priority The syslog priority value for the message. + * @param format The format of the message (a printf style format) + * @param args The variadic argument list of format parameters. + **/ +void vLogMessage(int priority, const char *format, va_list args) + __attribute__((format(printf, 2, 0))); + +/** + * Log a message. + * + * @param priority The syslog priority value for the message. + * @param format The format of the message (a printf style format) + **/ +void logMessage(int priority, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/** + * Sleep or delay a short time (likely a few milliseconds) in an attempt allow + * the log buffers to be written out in case they might be overrun. This is + * unnecessary in user-space (and is a no-op there), but is needed when + * quickly issuing a lot of log output in the Linux kernel, as when dumping a + * large number of data structures. + **/ +void pauseForLogger(void); + +#endif /* LOGGER_H */ diff --git a/vdo/kernel/memoryUsage.c b/vdo/kernel/memoryUsage.c new file mode 100644 index 0000000..86521a4 --- /dev/null +++ b/vdo/kernel/memoryUsage.c @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/memoryUsage.c#3 $ + */ + +#include "memoryUsage.h" + +#include "memoryAlloc.h" + +#include "kernelStatistics.h" + +/**********************************************************************/ +MemoryUsage getMemoryUsage() +{ + MemoryUsage memoryUsage; + getMemoryStats(&memoryUsage.bytesUsed, &memoryUsage.peakBytesUsed); + return memoryUsage; +} + diff --git a/vdo/kernel/memoryUsage.h b/vdo/kernel/memoryUsage.h new file mode 100644 index 0000000..336ab0a --- /dev/null +++ b/vdo/kernel/memoryUsage.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/memoryUsage.h#1 $ + */ + +#ifndef MEMORY_USAGE_H +#define MEMORY_USAGE_H 1 + +#include "memoryAlloc.h" + +#include "kernelStatistics.h" + +/** + * Get the memory usage for statistics reporting. + * + * @return The memory usage + **/ +MemoryUsage getMemoryUsage(void) + __attribute__((warn_unused_result)); + +#endif /* MEMORY_USAGE_H */ diff --git a/vdo/kernel/poolSysfs.c b/vdo/kernel/poolSysfs.c new file mode 100644 index 0000000..7f37480 --- /dev/null +++ b/vdo/kernel/poolSysfs.c @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
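A small sketch of how the snapshot returned by getMemoryUsage() above might be consumed; exampleLogMemoryUsage() and its message text are illustrative only:

#include "memoryUsage.h"
#include "logger.h"

static void exampleLogMemoryUsage(void)
{
  // getMemoryUsage() returns a by-value snapshot filled in from getMemoryStats().
  MemoryUsage usage = getMemoryUsage();
  logInfo("kvdo memory: %llu bytes in use, %llu bytes at peak",
          (unsigned long long) usage.bytesUsed,
          (unsigned long long) usage.peakBytesUsed);
}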
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/poolSysfs.c#1 $ + */ + +#include "poolSysfs.h" + +#include "memoryAlloc.h" + +#include "vdo.h" + +#include "dedupeIndex.h" + +typedef struct poolAttribute { + struct attribute attr; + ssize_t (*show)(KernelLayer *layer, char *buf); + ssize_t (*store)(KernelLayer *layer, const char *value, size_t count); +} PoolAttribute; + +/**********************************************************************/ +static ssize_t vdoPoolAttrShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + PoolAttribute *poolAttr = container_of(attr, PoolAttribute, attr); + if (poolAttr->show == NULL) { + return -EINVAL; + } + KernelLayer *layer = container_of(kobj, KernelLayer, kobj); + return poolAttr->show(layer, buf); +} + +/**********************************************************************/ +static ssize_t vdoPoolAttrStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + PoolAttribute *poolAttr = container_of(attr, PoolAttribute, attr); + if (poolAttr->store == NULL) { + return -EINVAL; + } + KernelLayer *layer = container_of(kobj, KernelLayer, kobj); + return poolAttr->store(layer, buf, length); +} + +static struct sysfs_ops vdoPoolSysfsOps = { + .show = vdoPoolAttrShow, + .store = vdoPoolAttrStore, +}; + +/**********************************************************************/ +static ssize_t poolCompressingShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%s\n", (getKVDOCompressing(&layer->kvdo) ? "1" : "0")); +} + +/**********************************************************************/ +static ssize_t poolDiscardsActiveShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->discardLimiter.active); +} + +/**********************************************************************/ +static ssize_t poolDiscardsLimitShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->discardLimiter.limit); +} + +/**********************************************************************/ +static ssize_t poolDiscardsLimitStore(KernelLayer *layer, + const char *buf, + size_t length) +{ + unsigned int value; + if ((length > 12) || (sscanf(buf, "%u", &value) != 1) || (value < 1)) { + return -EINVAL; + } + layer->discardLimiter.limit = value; + return length; +} + +/**********************************************************************/ +static ssize_t poolDiscardsMaximumShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->discardLimiter.maximum); +} + +/**********************************************************************/ +static ssize_t poolInstanceShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%u\n", layer->instance); +} + +/**********************************************************************/ +static ssize_t poolRequestsActiveShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->requestLimiter.active); +} + +/**********************************************************************/ +static ssize_t poolRequestsLimitShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->requestLimiter.limit); +} + +/**********************************************************************/ +static ssize_t poolRequestsMaximumShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->requestLimiter.maximum); +} + +/**********************************************************************/ +static void vdoPoolRelease(struct kobject *kobj) +{ + 
KernelLayer *layer = container_of(kobj, KernelLayer, kobj); + freeVDO(&layer->kvdo.vdo); + FREE(layer); +} + +static PoolAttribute vdoPoolCompressingAttr = { + .attr = { .name = "compressing", .mode = 0444, }, + .show = poolCompressingShow, +}; + +static PoolAttribute vdoPoolDiscardsActiveAttr = { + .attr = { .name = "discards_active", .mode = 0444, }, + .show = poolDiscardsActiveShow, +}; + +static PoolAttribute vdoPoolDiscardsLimitAttr = { + .attr = { .name = "discards_limit", .mode = 0644, }, + .show = poolDiscardsLimitShow, + .store = poolDiscardsLimitStore, +}; + +static PoolAttribute vdoPoolDiscardsMaximumAttr = { + .attr = { .name = "discards_maximum", .mode = 0444, }, + .show = poolDiscardsMaximumShow, +}; + +static PoolAttribute vdoPoolInstanceAttr = { + .attr = { .name = "instance", .mode = 0444, }, + .show = poolInstanceShow, +}; + +static PoolAttribute vdoPoolRequestsActiveAttr = { + .attr = { .name = "requests_active", .mode = 0444, }, + .show = poolRequestsActiveShow, +}; + +static PoolAttribute vdoPoolRequestsLimitAttr = { + .attr = { .name = "requests_limit", .mode = 0444, }, + .show = poolRequestsLimitShow, +}; + +static PoolAttribute vdoPoolRequestsMaximumAttr = { + .attr = { .name = "requests_maximum", .mode = 0444, }, + .show = poolRequestsMaximumShow, +}; + +static struct attribute *poolAttrs[] = { + &vdoPoolCompressingAttr.attr, + &vdoPoolDiscardsActiveAttr.attr, + &vdoPoolDiscardsLimitAttr.attr, + &vdoPoolDiscardsMaximumAttr.attr, + &vdoPoolInstanceAttr.attr, + &vdoPoolRequestsActiveAttr.attr, + &vdoPoolRequestsLimitAttr.attr, + &vdoPoolRequestsMaximumAttr.attr, + NULL, +}; + +struct kobj_type kernelLayerKobjType = { + .release = vdoPoolRelease, + .sysfs_ops = &vdoPoolSysfsOps, + .default_attrs = poolAttrs, +}; + +/**********************************************************************/ +static void workQueueDirectoryRelease(struct kobject *kobj) +{ + /* + * The workQueueDirectory holds an implicit reference to its parent, + * the kernelLayer object (->kobj), so even if there are some + * external references held to the workQueueDirectory when work + * queue shutdown calls kobject_put on the kernelLayer object, the + * kernelLayer object won't actually be released and won't free the + * KernelLayer storage until the workQueueDirectory object is + * released first. + * + * So, we don't need to do any additional explicit management here. + * + * (But we aren't allowed to use a NULL function pointer to indicate + * a no-op.) + */ +} + +/**********************************************************************/ +static struct attribute *noAttrs[] = { + NULL, +}; + +static struct sysfs_ops noSysfsOps = { + // These should never be reachable since there are no attributes. + .show = NULL, + .store = NULL, +}; + +struct kobj_type workQueueDirectoryKobjType = { + .release = workQueueDirectoryRelease, + .sysfs_ops = &noSysfsOps, + .default_attrs = noAttrs, +}; diff --git a/vdo/kernel/poolSysfs.h b/vdo/kernel/poolSysfs.h new file mode 100644 index 0000000..85fe11c --- /dev/null +++ b/vdo/kernel/poolSysfs.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/poolSysfs.h#1 $ + */ + +#ifndef POOL_SYSFS_H +#define POOL_SYSFS_H + +#include + +// The kobj_type used for setting up the kernel layer kobject. +extern struct kobj_type kernelLayerKobjType; +// The kobj_type used for the "work_queues" subdirectory. +extern struct kobj_type workQueueDirectoryKobjType; + +// The sysfs_ops used for the "statistics" subdirectory. +extern struct sysfs_ops poolStatsSysfsOps; +// The attribute used for the "statistics" subdirectory. +extern struct attribute *poolStatsAttrs[]; + +#endif /* POOL_SYSFS_H */ diff --git a/vdo/kernel/poolSysfsStats.c b/vdo/kernel/poolSysfsStats.c new file mode 100644 index 0000000..daa0cf0 --- /dev/null +++ b/vdo/kernel/poolSysfsStats.c @@ -0,0 +1,2628 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
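To show how the pieces in poolSysfs.c and poolSysfs.h above fit together, here is a hypothetical wiring sketch; the real registration lives elsewhere in the module, and exampleStatsRelease, exampleStatsDirectoryKobjType, and exampleAddPoolSysfs are illustrative names:

#include "poolSysfs.h"
// Assumes the KernelLayer definition (with its kobj, statsDirectory, and
// instance fields) is in scope, e.g. via kernelLayer.h.

static void exampleStatsRelease(struct kobject *kobj)
{
  // Lifetime is managed by the enclosing KernelLayer; nothing to free here.
}

static struct kobj_type exampleStatsDirectoryKobjType = {
  .release       = exampleStatsRelease,
  .sysfs_ops     = &poolStatsSysfsOps,
  .default_attrs = poolStatsAttrs,
};

static int exampleAddPoolSysfs(KernelLayer *layer, struct kobject *parent)
{
  // The pool's own kobject uses kernelLayerKobjType, so the PoolAttribute
  // show/store callbacks in poolSysfs.c become reachable via sysfs.
  kobject_init(&layer->kobj, &kernelLayerKobjType);
  int result = kobject_add(&layer->kobj, parent, "vdo%u", layer->instance);
  if (result != 0) {
    return result;
  }
  // The per-pool statistics defined in poolSysfsStats.c appear in a
  // "statistics" subdirectory under the pool's kobject.
  kobject_init(&layer->statsDirectory, &exampleStatsDirectoryKobjType);
  return kobject_add(&layer->statsDirectory, &layer->kobj, "statistics");
}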
+ */ + +#include "dedupeIndex.h" +#include "logger.h" +#include "poolSysfs.h" +#include "statistics.h" +#include "statusProcfs.h" +#include "threadDevice.h" +#include "vdo.h" + +typedef struct poolStatsAttribute { + struct attribute attr; + ssize_t (*show)(KernelLayer *layer, char *buf); +} PoolStatsAttribute; + +static ssize_t poolStatsAttrShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + PoolStatsAttribute *poolStatsAttr = container_of(attr, PoolStatsAttribute, + attr); + + if (poolStatsAttr->show == NULL) { + return -EINVAL; + } + KernelLayer *layer = container_of(kobj, KernelLayer, statsDirectory); + return poolStatsAttr->show(layer, buf); +} + +struct sysfs_ops poolStatsSysfsOps = { + .show = poolStatsAttrShow, + .store = NULL, +}; + +/**********************************************************************/ +/** Number of blocks used for data */ +static ssize_t poolStatsDataBlocksUsedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.dataBlocksUsed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsDataBlocksUsedAttr = { + .attr = { .name = "data_blocks_used", .mode = 0444, }, + .show = poolStatsDataBlocksUsedShow, +}; + +/**********************************************************************/ +/** Number of blocks used for VDO metadata */ +static ssize_t poolStatsOverheadBlocksUsedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.overheadBlocksUsed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsOverheadBlocksUsedAttr = { + .attr = { .name = "overhead_blocks_used", .mode = 0444, }, + .show = poolStatsOverheadBlocksUsedShow, +}; + +/**********************************************************************/ +/** Number of logical blocks that are currently mapped to physical blocks */ +static ssize_t poolStatsLogicalBlocksUsedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.logicalBlocksUsed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsLogicalBlocksUsedAttr = { + .attr = { .name = "logical_blocks_used", .mode = 0444, }, + .show = poolStatsLogicalBlocksUsedShow, +}; + +/**********************************************************************/ +/** number of physical blocks */ +static ssize_t poolStatsPhysicalBlocksShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.physicalBlocks); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsPhysicalBlocksAttr = { + .attr = { .name = "physical_blocks", .mode = 0444, }, + .show = poolStatsPhysicalBlocksShow, +}; + +/**********************************************************************/ +/** number of logical blocks */ +static ssize_t poolStatsLogicalBlocksShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, 
"%llu\n", layer->vdoStatsStorage.logicalBlocks); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsLogicalBlocksAttr = { + .attr = { .name = "logical_blocks", .mode = 0444, }, + .show = poolStatsLogicalBlocksShow, +}; + +/**********************************************************************/ +/** Size of the block map page cache, in bytes */ +static ssize_t poolStatsBlockMapCacheSizeShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMapCacheSize); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapCacheSizeAttr = { + .attr = { .name = "block_map_cache_size", .mode = 0444, }, + .show = poolStatsBlockMapCacheSizeShow, +}; + +/**********************************************************************/ +/** String describing the active write policy of the VDO */ +static ssize_t poolStatsWritePolicyShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%s\n", layer->vdoStatsStorage.writePolicy); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsWritePolicyAttr = { + .attr = { .name = "write_policy", .mode = 0444, }, + .show = poolStatsWritePolicyShow, +}; + +/**********************************************************************/ +/** The physical block size */ +static ssize_t poolStatsBlockSizeShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockSize); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockSizeAttr = { + .attr = { .name = "block_size", .mode = 0444, }, + .show = poolStatsBlockSizeShow, +}; + +/**********************************************************************/ +/** Number of times the VDO has successfully recovered */ +static ssize_t poolStatsCompleteRecoveriesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.completeRecoveries); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsCompleteRecoveriesAttr = { + .attr = { .name = "complete_recoveries", .mode = 0444, }, + .show = poolStatsCompleteRecoveriesShow, +}; + +/**********************************************************************/ +/** Number of times the VDO has recovered from read-only mode */ +static ssize_t poolStatsReadOnlyRecoveriesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.readOnlyRecoveries); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsReadOnlyRecoveriesAttr = { + .attr = { .name = "read_only_recoveries", .mode = 0444, }, + .show = poolStatsReadOnlyRecoveriesShow, +}; + +/**********************************************************************/ +/** String describing the operating mode of the VDO */ +static ssize_t poolStatsModeShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + 
mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%s\n", layer->vdoStatsStorage.mode); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsModeAttr = { + .attr = { .name = "mode", .mode = 0444, }, + .show = poolStatsModeShow, +}; + +/**********************************************************************/ +/** Whether the VDO is in recovery mode */ +static ssize_t poolStatsInRecoveryModeShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%d\n", layer->vdoStatsStorage.inRecoveryMode); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsInRecoveryModeAttr = { + .attr = { .name = "in_recovery_mode", .mode = 0444, }, + .show = poolStatsInRecoveryModeShow, +}; + +/**********************************************************************/ +/** What percentage of recovery mode work has been completed */ +static ssize_t poolStatsRecoveryPercentageShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%u\n", layer->vdoStatsStorage.recoveryPercentage); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsRecoveryPercentageAttr = { + .attr = { .name = "recovery_percentage", .mode = 0444, }, + .show = poolStatsRecoveryPercentageShow, +}; + +/**********************************************************************/ +/** Number of compressed data items written since startup */ +static ssize_t poolStatsPackerCompressedFragmentsWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.packer.compressedFragmentsWritten); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsPackerCompressedFragmentsWrittenAttr = { + .attr = { .name = "packer_compressed_fragments_written", .mode = 0444, }, + .show = poolStatsPackerCompressedFragmentsWrittenShow, +}; + +/**********************************************************************/ +/** Number of blocks containing compressed items written since startup */ +static ssize_t poolStatsPackerCompressedBlocksWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.packer.compressedBlocksWritten); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsPackerCompressedBlocksWrittenAttr = { + .attr = { .name = "packer_compressed_blocks_written", .mode = 0444, }, + .show = poolStatsPackerCompressedBlocksWrittenShow, +}; + +/**********************************************************************/ +/** Number of VIOs that are pending in the packer */ +static ssize_t poolStatsPackerCompressedFragmentsInPackerShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.packer.compressedFragmentsInPacker); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute 
poolStatsPackerCompressedFragmentsInPackerAttr = { + .attr = { .name = "packer_compressed_fragments_in_packer", .mode = 0444, }, + .show = poolStatsPackerCompressedFragmentsInPackerShow, +}; + +/**********************************************************************/ +/** The total number of slabs from which blocks may be allocated */ +static ssize_t poolStatsAllocatorSlabCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.allocator.slabCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsAllocatorSlabCountAttr = { + .attr = { .name = "allocator_slab_count", .mode = 0444, }, + .show = poolStatsAllocatorSlabCountShow, +}; + +/**********************************************************************/ +/** The total number of slabs from which blocks have ever been allocated */ +static ssize_t poolStatsAllocatorSlabsOpenedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.allocator.slabsOpened); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsAllocatorSlabsOpenedAttr = { + .attr = { .name = "allocator_slabs_opened", .mode = 0444, }, + .show = poolStatsAllocatorSlabsOpenedShow, +}; + +/**********************************************************************/ +/** The number of times since loading that a slab has been re-opened */ +static ssize_t poolStatsAllocatorSlabsReopenedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.allocator.slabsReopened); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsAllocatorSlabsReopenedAttr = { + .attr = { .name = "allocator_slabs_reopened", .mode = 0444, }, + .show = poolStatsAllocatorSlabsReopenedShow, +}; + +/**********************************************************************/ +/** Number of times the on-disk journal was full */ +static ssize_t poolStatsJournalDiskFullShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.diskFull); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalDiskFullAttr = { + .attr = { .name = "journal_disk_full", .mode = 0444, }, + .show = poolStatsJournalDiskFullShow, +}; + +/**********************************************************************/ +/** Number of times the recovery journal requested slab journal commits. 
*/ +static ssize_t poolStatsJournalSlabJournalCommitsRequestedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.slabJournalCommitsRequested); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalSlabJournalCommitsRequestedAttr = { + .attr = { .name = "journal_slab_journal_commits_requested", .mode = 0444, }, + .show = poolStatsJournalSlabJournalCommitsRequestedShow, +}; + +/**********************************************************************/ +/** The total number of items on which processing has started */ +static ssize_t poolStatsJournalEntriesStartedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.entries.started); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalEntriesStartedAttr = { + .attr = { .name = "journal_entries_started", .mode = 0444, }, + .show = poolStatsJournalEntriesStartedShow, +}; + +/**********************************************************************/ +/** The total number of items for which a write operation has been issued */ +static ssize_t poolStatsJournalEntriesWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.entries.written); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalEntriesWrittenAttr = { + .attr = { .name = "journal_entries_written", .mode = 0444, }, + .show = poolStatsJournalEntriesWrittenShow, +}; + +/**********************************************************************/ +/** The total number of items for which a write operation has completed */ +static ssize_t poolStatsJournalEntriesCommittedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.entries.committed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalEntriesCommittedAttr = { + .attr = { .name = "journal_entries_committed", .mode = 0444, }, + .show = poolStatsJournalEntriesCommittedShow, +}; + +/**********************************************************************/ +/** The total number of items on which processing has started */ +static ssize_t poolStatsJournalBlocksStartedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.blocks.started); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalBlocksStartedAttr = { + .attr = { .name = "journal_blocks_started", .mode = 0444, }, + .show = poolStatsJournalBlocksStartedShow, +}; + +/**********************************************************************/ +/** The total number of items for which a write operation has been issued */ +static ssize_t poolStatsJournalBlocksWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + 
getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.blocks.written); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalBlocksWrittenAttr = { + .attr = { .name = "journal_blocks_written", .mode = 0444, }, + .show = poolStatsJournalBlocksWrittenShow, +}; + +/**********************************************************************/ +/** The total number of items for which a write operation has completed */ +static ssize_t poolStatsJournalBlocksCommittedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.blocks.committed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalBlocksCommittedAttr = { + .attr = { .name = "journal_blocks_committed", .mode = 0444, }, + .show = poolStatsJournalBlocksCommittedShow, +}; + +/**********************************************************************/ +/** Number of times the on-disk journal was full */ +static ssize_t poolStatsSlabJournalDiskFullCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.diskFullCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsSlabJournalDiskFullCountAttr = { + .attr = { .name = "slab_journal_disk_full_count", .mode = 0444, }, + .show = poolStatsSlabJournalDiskFullCountShow, +}; + +/**********************************************************************/ +/** Number of times an entry was added over the flush threshold */ +static ssize_t poolStatsSlabJournalFlushCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.flushCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsSlabJournalFlushCountAttr = { + .attr = { .name = "slab_journal_flush_count", .mode = 0444, }, + .show = poolStatsSlabJournalFlushCountShow, +}; + +/**********************************************************************/ +/** Number of times an entry was added over the block threshold */ +static ssize_t poolStatsSlabJournalBlockedCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.blockedCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsSlabJournalBlockedCountAttr = { + .attr = { .name = "slab_journal_blocked_count", .mode = 0444, }, + .show = poolStatsSlabJournalBlockedCountShow, +}; + +/**********************************************************************/ +/** Number of times a tail block was written */ +static ssize_t poolStatsSlabJournalBlocksWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.blocksWritten); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute 
poolStatsSlabJournalBlocksWrittenAttr = { + .attr = { .name = "slab_journal_blocks_written", .mode = 0444, }, + .show = poolStatsSlabJournalBlocksWrittenShow, +}; + +/**********************************************************************/ +/** Number of times we had to wait for the tail to write */ +static ssize_t poolStatsSlabJournalTailBusyCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.tailBusyCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsSlabJournalTailBusyCountAttr = { + .attr = { .name = "slab_journal_tail_busy_count", .mode = 0444, }, + .show = poolStatsSlabJournalTailBusyCountShow, +}; + +/**********************************************************************/ +/** Number of blocks written */ +static ssize_t poolStatsSlabSummaryBlocksWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabSummary.blocksWritten); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsSlabSummaryBlocksWrittenAttr = { + .attr = { .name = "slab_summary_blocks_written", .mode = 0444, }, + .show = poolStatsSlabSummaryBlocksWrittenShow, +}; + +/**********************************************************************/ +/** Number of reference blocks written */ +static ssize_t poolStatsRefCountsBlocksWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.refCounts.blocksWritten); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsRefCountsBlocksWrittenAttr = { + .attr = { .name = "ref_counts_blocks_written", .mode = 0444, }, + .show = poolStatsRefCountsBlocksWrittenShow, +}; + +/**********************************************************************/ +/** number of dirty (resident) pages */ +static ssize_t poolStatsBlockMapDirtyPagesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.dirtyPages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapDirtyPagesAttr = { + .attr = { .name = "block_map_dirty_pages", .mode = 0444, }, + .show = poolStatsBlockMapDirtyPagesShow, +}; + +/**********************************************************************/ +/** number of clean (resident) pages */ +static ssize_t poolStatsBlockMapCleanPagesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.cleanPages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapCleanPagesAttr = { + .attr = { .name = "block_map_clean_pages", .mode = 0444, }, + .show = poolStatsBlockMapCleanPagesShow, +}; + +/**********************************************************************/ +/** number of free pages */ +static ssize_t poolStatsBlockMapFreePagesShow(KernelLayer *layer, char *buf) 
+{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.freePages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFreePagesAttr = { + .attr = { .name = "block_map_free_pages", .mode = 0444, }, + .show = poolStatsBlockMapFreePagesShow, +}; + +/**********************************************************************/ +/** number of pages in failed state */ +static ssize_t poolStatsBlockMapFailedPagesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.failedPages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFailedPagesAttr = { + .attr = { .name = "block_map_failed_pages", .mode = 0444, }, + .show = poolStatsBlockMapFailedPagesShow, +}; + +/**********************************************************************/ +/** number of pages incoming */ +static ssize_t poolStatsBlockMapIncomingPagesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.incomingPages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapIncomingPagesAttr = { + .attr = { .name = "block_map_incoming_pages", .mode = 0444, }, + .show = poolStatsBlockMapIncomingPagesShow, +}; + +/**********************************************************************/ +/** number of pages outgoing */ +static ssize_t poolStatsBlockMapOutgoingPagesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.outgoingPages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapOutgoingPagesAttr = { + .attr = { .name = "block_map_outgoing_pages", .mode = 0444, }, + .show = poolStatsBlockMapOutgoingPagesShow, +}; + +/**********************************************************************/ +/** how many times free page not avail */ +static ssize_t poolStatsBlockMapCachePressureShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.cachePressure); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapCachePressureAttr = { + .attr = { .name = "block_map_cache_pressure", .mode = 0444, }, + .show = poolStatsBlockMapCachePressureShow, +}; + +/**********************************************************************/ +/** number of getVDOPageAsync() for read */ +static ssize_t poolStatsBlockMapReadCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.readCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapReadCountAttr = { + .attr = { .name = "block_map_read_count", .mode = 0444, }, + .show = 
poolStatsBlockMapReadCountShow, +}; + +/**********************************************************************/ +/** number of getVDOPageAsync() for write */ +static ssize_t poolStatsBlockMapWriteCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.writeCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapWriteCountAttr = { + .attr = { .name = "block_map_write_count", .mode = 0444, }, + .show = poolStatsBlockMapWriteCountShow, +}; + +/**********************************************************************/ +/** number of times pages failed to read */ +static ssize_t poolStatsBlockMapFailedReadsShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.failedReads); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFailedReadsAttr = { + .attr = { .name = "block_map_failed_reads", .mode = 0444, }, + .show = poolStatsBlockMapFailedReadsShow, +}; + +/**********************************************************************/ +/** number of times pages failed to write */ +static ssize_t poolStatsBlockMapFailedWritesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.failedWrites); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFailedWritesAttr = { + .attr = { .name = "block_map_failed_writes", .mode = 0444, }, + .show = poolStatsBlockMapFailedWritesShow, +}; + +/**********************************************************************/ +/** number of gets that are reclaimed */ +static ssize_t poolStatsBlockMapReclaimedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.reclaimed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapReclaimedAttr = { + .attr = { .name = "block_map_reclaimed", .mode = 0444, }, + .show = poolStatsBlockMapReclaimedShow, +}; + +/**********************************************************************/ +/** number of gets for outgoing pages */ +static ssize_t poolStatsBlockMapReadOutgoingShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.readOutgoing); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapReadOutgoingAttr = { + .attr = { .name = "block_map_read_outgoing", .mode = 0444, }, + .show = poolStatsBlockMapReadOutgoingShow, +}; + +/**********************************************************************/ +/** number of gets that were already there */ +static ssize_t poolStatsBlockMapFoundInCacheShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", 
layer->vdoStatsStorage.blockMap.foundInCache); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFoundInCacheAttr = { + .attr = { .name = "block_map_found_in_cache", .mode = 0444, }, + .show = poolStatsBlockMapFoundInCacheShow, +}; + +/**********************************************************************/ +/** number of gets requiring discard */ +static ssize_t poolStatsBlockMapDiscardRequiredShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.discardRequired); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapDiscardRequiredAttr = { + .attr = { .name = "block_map_discard_required", .mode = 0444, }, + .show = poolStatsBlockMapDiscardRequiredShow, +}; + +/**********************************************************************/ +/** number of gets enqueued for their page */ +static ssize_t poolStatsBlockMapWaitForPageShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.waitForPage); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapWaitForPageAttr = { + .attr = { .name = "block_map_wait_for_page", .mode = 0444, }, + .show = poolStatsBlockMapWaitForPageShow, +}; + +/**********************************************************************/ +/** number of gets that have to fetch */ +static ssize_t poolStatsBlockMapFetchRequiredShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.fetchRequired); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFetchRequiredAttr = { + .attr = { .name = "block_map_fetch_required", .mode = 0444, }, + .show = poolStatsBlockMapFetchRequiredShow, +}; + +/**********************************************************************/ +/** number of page fetches */ +static ssize_t poolStatsBlockMapPagesLoadedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.pagesLoaded); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapPagesLoadedAttr = { + .attr = { .name = "block_map_pages_loaded", .mode = 0444, }, + .show = poolStatsBlockMapPagesLoadedShow, +}; + +/**********************************************************************/ +/** number of page saves */ +static ssize_t poolStatsBlockMapPagesSavedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.pagesSaved); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapPagesSavedAttr = { + .attr = { .name = "block_map_pages_saved", .mode = 0444, }, + .show = poolStatsBlockMapPagesSavedShow, +}; + +/**********************************************************************/ +/** the number of flushes issued */ +static ssize_t 
poolStatsBlockMapFlushCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.flushCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFlushCountAttr = { + .attr = { .name = "block_map_flush_count", .mode = 0444, }, + .show = poolStatsBlockMapFlushCountShow, +}; + +/**********************************************************************/ +/** Number of times the UDS advice proved correct */ +static ssize_t poolStatsHashLockDedupeAdviceValidShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.hashLock.dedupeAdviceValid); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsHashLockDedupeAdviceValidAttr = { + .attr = { .name = "hash_lock_dedupe_advice_valid", .mode = 0444, }, + .show = poolStatsHashLockDedupeAdviceValidShow, +}; + +/**********************************************************************/ +/** Number of times the UDS advice proved incorrect */ +static ssize_t poolStatsHashLockDedupeAdviceStaleShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.hashLock.dedupeAdviceStale); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsHashLockDedupeAdviceStaleAttr = { + .attr = { .name = "hash_lock_dedupe_advice_stale", .mode = 0444, }, + .show = poolStatsHashLockDedupeAdviceStaleShow, +}; + +/**********************************************************************/ +/** Number of writes with the same data as another in-flight write */ +static ssize_t poolStatsHashLockConcurrentDataMatchesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.hashLock.concurrentDataMatches); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsHashLockConcurrentDataMatchesAttr = { + .attr = { .name = "hash_lock_concurrent_data_matches", .mode = 0444, }, + .show = poolStatsHashLockConcurrentDataMatchesShow, +}; + +/**********************************************************************/ +/** Number of writes whose hash collided with an in-flight write */ +static ssize_t poolStatsHashLockConcurrentHashCollisionsShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.hashLock.concurrentHashCollisions); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsHashLockConcurrentHashCollisionsAttr = { + .attr = { .name = "hash_lock_concurrent_hash_collisions", .mode = 0444, }, + .show = poolStatsHashLockConcurrentHashCollisionsShow, +}; + +/**********************************************************************/ +/** number of times VDO got an invalid dedupe advice PBN from UDS */ +static ssize_t poolStatsErrorsInvalidAdvicePBNCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + 
getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.errors.invalidAdvicePBNCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsErrorsInvalidAdvicePBNCountAttr = { + .attr = { .name = "errors_invalid_advicePBNCount", .mode = 0444, }, + .show = poolStatsErrorsInvalidAdvicePBNCountShow, +}; + +/**********************************************************************/ +/** number of times a VIO completed with a VDO_NO_SPACE error */ +static ssize_t poolStatsErrorsNoSpaceErrorCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.errors.noSpaceErrorCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsErrorsNoSpaceErrorCountAttr = { + .attr = { .name = "errors_no_space_error_count", .mode = 0444, }, + .show = poolStatsErrorsNoSpaceErrorCountShow, +}; + +/**********************************************************************/ +/** number of times a VIO completed with a VDO_READ_ONLY error */ +static ssize_t poolStatsErrorsReadOnlyErrorCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.errors.readOnlyErrorCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsErrorsReadOnlyErrorCountAttr = { + .attr = { .name = "errors_read_only_error_count", .mode = 0444, }, + .show = poolStatsErrorsReadOnlyErrorCountShow, +}; + +/**********************************************************************/ +/** The VDO instance */ +static ssize_t poolStatsInstanceShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.instance); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsInstanceAttr = { + .attr = { .name = "instance", .mode = 0444, }, + .show = poolStatsInstanceShow, +}; + +/**********************************************************************/ +/** Current number of active VIOs */ +static ssize_t poolStatsCurrentVIOsInProgressShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.currentVIOsInProgress); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsCurrentVIOsInProgressAttr = { + .attr = { .name = "currentVIOs_in_progress", .mode = 0444, }, + .show = poolStatsCurrentVIOsInProgressShow, +}; + +/**********************************************************************/ +/** Maximum number of active VIOs */ +static ssize_t poolStatsMaxVIOsShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.maxVIOs); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsMaxVIOsAttr = { + .attr = { .name = "maxVIOs", .mode = 0444, }, + .show = poolStatsMaxVIOsShow, +}; + 
+/**********************************************************************/ +/** Number of times the UDS index was too slow in responding */ +static ssize_t poolStatsDedupeAdviceTimeoutsShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.dedupeAdviceTimeouts); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsDedupeAdviceTimeoutsAttr = { + .attr = { .name = "dedupe_advice_timeouts", .mode = 0444, }, + .show = poolStatsDedupeAdviceTimeoutsShow, +}; + +/**********************************************************************/ +/** Number of flush requests submitted to the storage device */ +static ssize_t poolStatsFlushOutShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.flushOut); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsFlushOutAttr = { + .attr = { .name = "flush_out", .mode = 0444, }, + .show = poolStatsFlushOutShow, +}; + +/**********************************************************************/ +/** Logical block size */ +static ssize_t poolStatsLogicalBlockSizeShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.logicalBlockSize); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsLogicalBlockSizeAttr = { + .attr = { .name = "logical_block_size", .mode = 0444, }, + .show = poolStatsLogicalBlockSizeShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosInReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInReadAttr = { + .attr = { .name = "bios_in_read", .mode = 0444, }, + .show = poolStatsBiosInReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosInWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInWriteAttr = { + .attr = { .name = "bios_in_write", .mode = 0444, }, + .show = poolStatsBiosInWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosInDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInDiscardAttr = { + .attr = { .name = "bios_in_discard", .mode = 0444, }, + .show = poolStatsBiosInDiscardShow, +}; + 
+/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosInFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInFlushAttr = { + .attr = { .name = "bios_in_flush", .mode = 0444, }, + .show = poolStatsBiosInFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosInFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInFuaAttr = { + .attr = { .name = "bios_in_fua", .mode = 0444, }, + .show = poolStatsBiosInFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosInPartialReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInPartialReadAttr = { + .attr = { .name = "bios_in_partial_read", .mode = 0444, }, + .show = poolStatsBiosInPartialReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosInPartialWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInPartialWriteAttr = { + .attr = { .name = "bios_in_partial_write", .mode = 0444, }, + .show = poolStatsBiosInPartialWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosInPartialDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInPartialDiscardAttr = { + .attr = { .name = "bios_in_partial_discard", .mode = 0444, }, + .show = poolStatsBiosInPartialDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosInPartialFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInPartialFlushAttr = { + .attr = { .name = "bios_in_partial_flush", .mode = 0444, }, + .show = 
poolStatsBiosInPartialFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosInPartialFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInPartialFuaAttr = { + .attr = { .name = "bios_in_partial_fua", .mode = 0444, }, + .show = poolStatsBiosInPartialFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosOutReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutReadAttr = { + .attr = { .name = "bios_out_read", .mode = 0444, }, + .show = poolStatsBiosOutReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosOutWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutWriteAttr = { + .attr = { .name = "bios_out_write", .mode = 0444, }, + .show = poolStatsBiosOutWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosOutDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutDiscardAttr = { + .attr = { .name = "bios_out_discard", .mode = 0444, }, + .show = poolStatsBiosOutDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosOutFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutFlushAttr = { + .attr = { .name = "bios_out_flush", .mode = 0444, }, + .show = poolStatsBiosOutFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosOutFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutFuaAttr = { + .attr = { .name = "bios_out_fua", .mode = 0444, }, + .show = poolStatsBiosOutFuaShow, +}; + 
+/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosMetaReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaReadAttr = { + .attr = { .name = "bios_meta_read", .mode = 0444, }, + .show = poolStatsBiosMetaReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosMetaWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaWriteAttr = { + .attr = { .name = "bios_meta_write", .mode = 0444, }, + .show = poolStatsBiosMetaWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosMetaDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaDiscardAttr = { + .attr = { .name = "bios_meta_discard", .mode = 0444, }, + .show = poolStatsBiosMetaDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosMetaFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaFlushAttr = { + .attr = { .name = "bios_meta_flush", .mode = 0444, }, + .show = poolStatsBiosMetaFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosMetaFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaFuaAttr = { + .attr = { .name = "bios_meta_fua", .mode = 0444, }, + .show = poolStatsBiosMetaFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosJournalReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalReadAttr = { + .attr = { .name = "bios_journal_read", .mode = 0444, }, + .show = poolStatsBiosJournalReadShow, +}; + 
+/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosJournalWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalWriteAttr = { + .attr = { .name = "bios_journal_write", .mode = 0444, }, + .show = poolStatsBiosJournalWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosJournalDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalDiscardAttr = { + .attr = { .name = "bios_journal_discard", .mode = 0444, }, + .show = poolStatsBiosJournalDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosJournalFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalFlushAttr = { + .attr = { .name = "bios_journal_flush", .mode = 0444, }, + .show = poolStatsBiosJournalFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosJournalFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalFuaAttr = { + .attr = { .name = "bios_journal_fua", .mode = 0444, }, + .show = poolStatsBiosJournalFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosPageCacheReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheReadAttr = { + .attr = { .name = "bios_page_cache_read", .mode = 0444, }, + .show = poolStatsBiosPageCacheReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosPageCacheWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheWriteAttr = { + .attr = { .name = "bios_page_cache_write", .mode = 0444, }, 
+ .show = poolStatsBiosPageCacheWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosPageCacheDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheDiscardAttr = { + .attr = { .name = "bios_page_cache_discard", .mode = 0444, }, + .show = poolStatsBiosPageCacheDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosPageCacheFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheFlushAttr = { + .attr = { .name = "bios_page_cache_flush", .mode = 0444, }, + .show = poolStatsBiosPageCacheFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosPageCacheFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheFuaAttr = { + .attr = { .name = "bios_page_cache_fua", .mode = 0444, }, + .show = poolStatsBiosPageCacheFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosOutCompletedReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutCompletedReadAttr = { + .attr = { .name = "bios_out_completed_read", .mode = 0444, }, + .show = poolStatsBiosOutCompletedReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosOutCompletedWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutCompletedWriteAttr = { + .attr = { .name = "bios_out_completed_write", .mode = 0444, }, + .show = poolStatsBiosOutCompletedWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosOutCompletedDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.discard); + mutex_unlock(&layer->statsMutex); + return 
retval; +} + +static PoolStatsAttribute poolStatsBiosOutCompletedDiscardAttr = { + .attr = { .name = "bios_out_completed_discard", .mode = 0444, }, + .show = poolStatsBiosOutCompletedDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosOutCompletedFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutCompletedFlushAttr = { + .attr = { .name = "bios_out_completed_flush", .mode = 0444, }, + .show = poolStatsBiosOutCompletedFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosOutCompletedFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutCompletedFuaAttr = { + .attr = { .name = "bios_out_completed_fua", .mode = 0444, }, + .show = poolStatsBiosOutCompletedFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosMetaCompletedReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaCompletedReadAttr = { + .attr = { .name = "bios_meta_completed_read", .mode = 0444, }, + .show = poolStatsBiosMetaCompletedReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosMetaCompletedWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaCompletedWriteAttr = { + .attr = { .name = "bios_meta_completed_write", .mode = 0444, }, + .show = poolStatsBiosMetaCompletedWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosMetaCompletedDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaCompletedDiscardAttr = { + .attr = { .name = "bios_meta_completed_discard", .mode = 0444, }, + .show = poolStatsBiosMetaCompletedDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosMetaCompletedFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + 
mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaCompletedFlushAttr = { + .attr = { .name = "bios_meta_completed_flush", .mode = 0444, }, + .show = poolStatsBiosMetaCompletedFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosMetaCompletedFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaCompletedFuaAttr = { + .attr = { .name = "bios_meta_completed_fua", .mode = 0444, }, + .show = poolStatsBiosMetaCompletedFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosJournalCompletedReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalCompletedReadAttr = { + .attr = { .name = "bios_journal_completed_read", .mode = 0444, }, + .show = poolStatsBiosJournalCompletedReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosJournalCompletedWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalCompletedWriteAttr = { + .attr = { .name = "bios_journal_completed_write", .mode = 0444, }, + .show = poolStatsBiosJournalCompletedWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosJournalCompletedDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalCompletedDiscardAttr = { + .attr = { .name = "bios_journal_completed_discard", .mode = 0444, }, + .show = poolStatsBiosJournalCompletedDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosJournalCompletedFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalCompletedFlushAttr = { + .attr = { .name = "bios_journal_completed_flush", .mode = 0444, 
}, + .show = poolStatsBiosJournalCompletedFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosJournalCompletedFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalCompletedFuaAttr = { + .attr = { .name = "bios_journal_completed_fua", .mode = 0444, }, + .show = poolStatsBiosJournalCompletedFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosPageCacheCompletedReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheCompletedReadAttr = { + .attr = { .name = "bios_page_cache_completed_read", .mode = 0444, }, + .show = poolStatsBiosPageCacheCompletedReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosPageCacheCompletedWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheCompletedWriteAttr = { + .attr = { .name = "bios_page_cache_completed_write", .mode = 0444, }, + .show = poolStatsBiosPageCacheCompletedWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosPageCacheCompletedDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheCompletedDiscardAttr = { + .attr = { .name = "bios_page_cache_completed_discard", .mode = 0444, }, + .show = poolStatsBiosPageCacheCompletedDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosPageCacheCompletedFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheCompletedFlushAttr = { + .attr = { .name = "bios_page_cache_completed_flush", .mode = 0444, }, + .show = poolStatsBiosPageCacheCompletedFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosPageCacheCompletedFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + 
mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheCompletedFuaAttr = { + .attr = { .name = "bios_page_cache_completed_fua", .mode = 0444, }, + .show = poolStatsBiosPageCacheCompletedFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosAcknowledgedReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedReadAttr = { + .attr = { .name = "bios_acknowledged_read", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosAcknowledgedWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedWriteAttr = { + .attr = { .name = "bios_acknowledged_write", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosAcknowledgedDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedDiscardAttr = { + .attr = { .name = "bios_acknowledged_discard", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosAcknowledgedFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedFlushAttr = { + .attr = { .name = "bios_acknowledged_flush", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosAcknowledgedFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedFuaAttr = { + .attr = { .name = "bios_acknowledged_fua", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedFuaShow, +}; + 
+/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosAcknowledgedPartialReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedPartialReadAttr = { + .attr = { .name = "bios_acknowledged_partial_read", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedPartialReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosAcknowledgedPartialWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedPartialWriteAttr = { + .attr = { .name = "bios_acknowledged_partial_write", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedPartialWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosAcknowledgedPartialDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedPartialDiscardAttr = { + .attr = { .name = "bios_acknowledged_partial_discard", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedPartialDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosAcknowledgedPartialFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedPartialFlushAttr = { + .attr = { .name = "bios_acknowledged_partial_flush", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedPartialFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosAcknowledgedPartialFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedPartialFuaAttr = { + .attr = { .name = "bios_acknowledged_partial_fua", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedPartialFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosInProgressReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + 
getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInProgressReadAttr = { + .attr = { .name = "bios_in_progress_read", .mode = 0444, }, + .show = poolStatsBiosInProgressReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosInProgressWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInProgressWriteAttr = { + .attr = { .name = "bios_in_progress_write", .mode = 0444, }, + .show = poolStatsBiosInProgressWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosInProgressDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInProgressDiscardAttr = { + .attr = { .name = "bios_in_progress_discard", .mode = 0444, }, + .show = poolStatsBiosInProgressDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosInProgressFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInProgressFlushAttr = { + .attr = { .name = "bios_in_progress_flush", .mode = 0444, }, + .show = poolStatsBiosInProgressFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosInProgressFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInProgressFuaAttr = { + .attr = { .name = "bios_in_progress_fua", .mode = 0444, }, + .show = poolStatsBiosInProgressFuaShow, +}; + +/**********************************************************************/ +/** Tracked bytes currently allocated. 
*/ +static ssize_t poolStatsMemoryUsageBytesUsedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.memoryUsage.bytesUsed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsMemoryUsageBytesUsedAttr = { + .attr = { .name = "memory_usage_bytes_used", .mode = 0444, }, + .show = poolStatsMemoryUsageBytesUsedShow, +}; + +/**********************************************************************/ +/** Maximum tracked bytes allocated. */ +static ssize_t poolStatsMemoryUsagePeakBytesUsedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.memoryUsage.peakBytesUsed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsMemoryUsagePeakBytesUsedAttr = { + .attr = { .name = "memory_usage_peak_bytes_used", .mode = 0444, }, + .show = poolStatsMemoryUsagePeakBytesUsedShow, +}; + +/**********************************************************************/ +/** Number of chunk names stored in the index */ +static ssize_t poolStatsIndexEntriesIndexedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.entriesIndexed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexEntriesIndexedAttr = { + .attr = { .name = "index_entries_indexed", .mode = 0444, }, + .show = poolStatsIndexEntriesIndexedShow, +}; + +/**********************************************************************/ +/** Number of post calls that found an existing entry */ +static ssize_t poolStatsIndexPostsFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.postsFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexPostsFoundAttr = { + .attr = { .name = "index_posts_found", .mode = 0444, }, + .show = poolStatsIndexPostsFoundShow, +}; + +/**********************************************************************/ +/** Number of post calls that added a new entry */ +static ssize_t poolStatsIndexPostsNotFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.postsNotFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexPostsNotFoundAttr = { + .attr = { .name = "index_posts_not_found", .mode = 0444, }, + .show = poolStatsIndexPostsNotFoundShow, +}; + +/**********************************************************************/ +/** Number of query calls that found an existing entry */ +static ssize_t poolStatsIndexQueriesFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.queriesFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexQueriesFoundAttr = { 
+ .attr = { .name = "index_queries_found", .mode = 0444, }, + .show = poolStatsIndexQueriesFoundShow, +}; + +/**********************************************************************/ +/** Number of query calls that added a new entry */ +static ssize_t poolStatsIndexQueriesNotFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.queriesNotFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexQueriesNotFoundAttr = { + .attr = { .name = "index_queries_not_found", .mode = 0444, }, + .show = poolStatsIndexQueriesNotFoundShow, +}; + +/**********************************************************************/ +/** Number of update calls that found an existing entry */ +static ssize_t poolStatsIndexUpdatesFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.updatesFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexUpdatesFoundAttr = { + .attr = { .name = "index_updates_found", .mode = 0444, }, + .show = poolStatsIndexUpdatesFoundShow, +}; + +/**********************************************************************/ +/** Number of update calls that added a new entry */ +static ssize_t poolStatsIndexUpdatesNotFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.updatesNotFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexUpdatesNotFoundAttr = { + .attr = { .name = "index_updates_not_found", .mode = 0444, }, + .show = poolStatsIndexUpdatesNotFoundShow, +}; + +/**********************************************************************/ +/** Current number of dedupe queries that are in flight */ +static ssize_t poolStatsIndexCurrDedupeQueriesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.index.currDedupeQueries); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexCurrDedupeQueriesAttr = { + .attr = { .name = "index_curr_dedupe_queries", .mode = 0444, }, + .show = poolStatsIndexCurrDedupeQueriesShow, +}; + +/**********************************************************************/ +/** Maximum number of dedupe queries that have been in flight */ +static ssize_t poolStatsIndexMaxDedupeQueriesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.index.maxDedupeQueries); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexMaxDedupeQueriesAttr = { + .attr = { .name = "index_max_dedupe_queries", .mode = 0444, }, + .show = poolStatsIndexMaxDedupeQueriesShow, +}; + +struct attribute *poolStatsAttrs[] = { + &poolStatsDataBlocksUsedAttr.attr, + &poolStatsOverheadBlocksUsedAttr.attr, + &poolStatsLogicalBlocksUsedAttr.attr, + &poolStatsPhysicalBlocksAttr.attr, + 
&poolStatsLogicalBlocksAttr.attr, + &poolStatsBlockMapCacheSizeAttr.attr, + &poolStatsWritePolicyAttr.attr, + &poolStatsBlockSizeAttr.attr, + &poolStatsCompleteRecoveriesAttr.attr, + &poolStatsReadOnlyRecoveriesAttr.attr, + &poolStatsModeAttr.attr, + &poolStatsInRecoveryModeAttr.attr, + &poolStatsRecoveryPercentageAttr.attr, + &poolStatsPackerCompressedFragmentsWrittenAttr.attr, + &poolStatsPackerCompressedBlocksWrittenAttr.attr, + &poolStatsPackerCompressedFragmentsInPackerAttr.attr, + &poolStatsAllocatorSlabCountAttr.attr, + &poolStatsAllocatorSlabsOpenedAttr.attr, + &poolStatsAllocatorSlabsReopenedAttr.attr, + &poolStatsJournalDiskFullAttr.attr, + &poolStatsJournalSlabJournalCommitsRequestedAttr.attr, + &poolStatsJournalEntriesStartedAttr.attr, + &poolStatsJournalEntriesWrittenAttr.attr, + &poolStatsJournalEntriesCommittedAttr.attr, + &poolStatsJournalBlocksStartedAttr.attr, + &poolStatsJournalBlocksWrittenAttr.attr, + &poolStatsJournalBlocksCommittedAttr.attr, + &poolStatsSlabJournalDiskFullCountAttr.attr, + &poolStatsSlabJournalFlushCountAttr.attr, + &poolStatsSlabJournalBlockedCountAttr.attr, + &poolStatsSlabJournalBlocksWrittenAttr.attr, + &poolStatsSlabJournalTailBusyCountAttr.attr, + &poolStatsSlabSummaryBlocksWrittenAttr.attr, + &poolStatsRefCountsBlocksWrittenAttr.attr, + &poolStatsBlockMapDirtyPagesAttr.attr, + &poolStatsBlockMapCleanPagesAttr.attr, + &poolStatsBlockMapFreePagesAttr.attr, + &poolStatsBlockMapFailedPagesAttr.attr, + &poolStatsBlockMapIncomingPagesAttr.attr, + &poolStatsBlockMapOutgoingPagesAttr.attr, + &poolStatsBlockMapCachePressureAttr.attr, + &poolStatsBlockMapReadCountAttr.attr, + &poolStatsBlockMapWriteCountAttr.attr, + &poolStatsBlockMapFailedReadsAttr.attr, + &poolStatsBlockMapFailedWritesAttr.attr, + &poolStatsBlockMapReclaimedAttr.attr, + &poolStatsBlockMapReadOutgoingAttr.attr, + &poolStatsBlockMapFoundInCacheAttr.attr, + &poolStatsBlockMapDiscardRequiredAttr.attr, + &poolStatsBlockMapWaitForPageAttr.attr, + &poolStatsBlockMapFetchRequiredAttr.attr, + &poolStatsBlockMapPagesLoadedAttr.attr, + &poolStatsBlockMapPagesSavedAttr.attr, + &poolStatsBlockMapFlushCountAttr.attr, + &poolStatsHashLockDedupeAdviceValidAttr.attr, + &poolStatsHashLockDedupeAdviceStaleAttr.attr, + &poolStatsHashLockConcurrentDataMatchesAttr.attr, + &poolStatsHashLockConcurrentHashCollisionsAttr.attr, + &poolStatsErrorsInvalidAdvicePBNCountAttr.attr, + &poolStatsErrorsNoSpaceErrorCountAttr.attr, + &poolStatsErrorsReadOnlyErrorCountAttr.attr, + &poolStatsInstanceAttr.attr, + &poolStatsCurrentVIOsInProgressAttr.attr, + &poolStatsMaxVIOsAttr.attr, + &poolStatsDedupeAdviceTimeoutsAttr.attr, + &poolStatsFlushOutAttr.attr, + &poolStatsLogicalBlockSizeAttr.attr, + &poolStatsBiosInReadAttr.attr, + &poolStatsBiosInWriteAttr.attr, + &poolStatsBiosInDiscardAttr.attr, + &poolStatsBiosInFlushAttr.attr, + &poolStatsBiosInFuaAttr.attr, + &poolStatsBiosInPartialReadAttr.attr, + &poolStatsBiosInPartialWriteAttr.attr, + &poolStatsBiosInPartialDiscardAttr.attr, + &poolStatsBiosInPartialFlushAttr.attr, + &poolStatsBiosInPartialFuaAttr.attr, + &poolStatsBiosOutReadAttr.attr, + &poolStatsBiosOutWriteAttr.attr, + &poolStatsBiosOutDiscardAttr.attr, + &poolStatsBiosOutFlushAttr.attr, + &poolStatsBiosOutFuaAttr.attr, + &poolStatsBiosMetaReadAttr.attr, + &poolStatsBiosMetaWriteAttr.attr, + &poolStatsBiosMetaDiscardAttr.attr, + &poolStatsBiosMetaFlushAttr.attr, + &poolStatsBiosMetaFuaAttr.attr, + &poolStatsBiosJournalReadAttr.attr, + &poolStatsBiosJournalWriteAttr.attr, + &poolStatsBiosJournalDiscardAttr.attr, 
+ &poolStatsBiosJournalFlushAttr.attr, + &poolStatsBiosJournalFuaAttr.attr, + &poolStatsBiosPageCacheReadAttr.attr, + &poolStatsBiosPageCacheWriteAttr.attr, + &poolStatsBiosPageCacheDiscardAttr.attr, + &poolStatsBiosPageCacheFlushAttr.attr, + &poolStatsBiosPageCacheFuaAttr.attr, + &poolStatsBiosOutCompletedReadAttr.attr, + &poolStatsBiosOutCompletedWriteAttr.attr, + &poolStatsBiosOutCompletedDiscardAttr.attr, + &poolStatsBiosOutCompletedFlushAttr.attr, + &poolStatsBiosOutCompletedFuaAttr.attr, + &poolStatsBiosMetaCompletedReadAttr.attr, + &poolStatsBiosMetaCompletedWriteAttr.attr, + &poolStatsBiosMetaCompletedDiscardAttr.attr, + &poolStatsBiosMetaCompletedFlushAttr.attr, + &poolStatsBiosMetaCompletedFuaAttr.attr, + &poolStatsBiosJournalCompletedReadAttr.attr, + &poolStatsBiosJournalCompletedWriteAttr.attr, + &poolStatsBiosJournalCompletedDiscardAttr.attr, + &poolStatsBiosJournalCompletedFlushAttr.attr, + &poolStatsBiosJournalCompletedFuaAttr.attr, + &poolStatsBiosPageCacheCompletedReadAttr.attr, + &poolStatsBiosPageCacheCompletedWriteAttr.attr, + &poolStatsBiosPageCacheCompletedDiscardAttr.attr, + &poolStatsBiosPageCacheCompletedFlushAttr.attr, + &poolStatsBiosPageCacheCompletedFuaAttr.attr, + &poolStatsBiosAcknowledgedReadAttr.attr, + &poolStatsBiosAcknowledgedWriteAttr.attr, + &poolStatsBiosAcknowledgedDiscardAttr.attr, + &poolStatsBiosAcknowledgedFlushAttr.attr, + &poolStatsBiosAcknowledgedFuaAttr.attr, + &poolStatsBiosAcknowledgedPartialReadAttr.attr, + &poolStatsBiosAcknowledgedPartialWriteAttr.attr, + &poolStatsBiosAcknowledgedPartialDiscardAttr.attr, + &poolStatsBiosAcknowledgedPartialFlushAttr.attr, + &poolStatsBiosAcknowledgedPartialFuaAttr.attr, + &poolStatsBiosInProgressReadAttr.attr, + &poolStatsBiosInProgressWriteAttr.attr, + &poolStatsBiosInProgressDiscardAttr.attr, + &poolStatsBiosInProgressFlushAttr.attr, + &poolStatsBiosInProgressFuaAttr.attr, + &poolStatsMemoryUsageBytesUsedAttr.attr, + &poolStatsMemoryUsagePeakBytesUsedAttr.attr, + &poolStatsIndexEntriesIndexedAttr.attr, + &poolStatsIndexPostsFoundAttr.attr, + &poolStatsIndexPostsNotFoundAttr.attr, + &poolStatsIndexQueriesFoundAttr.attr, + &poolStatsIndexQueriesNotFoundAttr.attr, + &poolStatsIndexUpdatesFoundAttr.attr, + &poolStatsIndexUpdatesNotFoundAttr.attr, + &poolStatsIndexCurrDedupeQueriesAttr.attr, + &poolStatsIndexMaxDedupeQueriesAttr.attr, + NULL, +}; diff --git a/vdo/kernel/statusCodeBlocks.h b/vdo/kernel/statusCodeBlocks.h new file mode 100644 index 0000000..bca19c5 --- /dev/null +++ b/vdo/kernel/statusCodeBlocks.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/statusCodeBlocks.h#1 $ + */ + +#ifndef STATUS_CODE_BLOCKS_H +#define STATUS_CODE_BLOCKS_H + +enum { + UDS_BLOCK_SIZE = UDS_ERROR_CODE_BLOCK_END - UDS_ERROR_CODE_BASE, + VDO_BLOCK_START = UDS_ERROR_CODE_BLOCK_END, + VDO_BLOCK_END = VDO_BLOCK_START + UDS_BLOCK_SIZE, + PRP_BLOCK_START = VDO_BLOCK_END, + PRP_BLOCK_END = PRP_BLOCK_START + UDS_BLOCK_SIZE, +}; + +#endif // STATUS_CODE_BLOCKS_H diff --git a/vdo/kernel/statusProcfs.c b/vdo/kernel/statusProcfs.c new file mode 100644 index 0000000..70e8c9b --- /dev/null +++ b/vdo/kernel/statusProcfs.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/statusProcfs.c#4 $ + * + * Proc filesystem interface to the old GET_DEDUPE_STATS and + * GET_KERNEL_STATS ioctls, which can no longer be supported in 4.4 + * and later kernels. These files return the same data as the old + * ioctls do, in order to require minimal changes to our (and + * customers') utilities and test code.
+ * + * +--+----- /proc/vdo procfsRoot + * | + * +-+----- vdo config->poolName + * | + * +------- dedupe_stats GET_DEDUPE_STATS ioctl + * +------- kernel_stats GET_KERNEL_STATS ioctl + * + */ +#include "statusProcfs.h" + +#include + +#include "memoryAlloc.h" + +#include "releaseVersions.h" +#include "statistics.h" +#include "vdo.h" + +#include "dedupeIndex.h" +#include "ioSubmitter.h" +#include "kernelStatistics.h" +#include "logger.h" +#include "memoryUsage.h" +#include "threadDevice.h" +#include "vdoCommon.h" + +static struct proc_dir_entry *procfsRoot = NULL; + +/**********************************************************************/ +static int statusDedupeShow(struct seq_file *m, void *v) +{ + KernelLayer *layer = (KernelLayer *) m->private; + VDOStatistics *stats; + size_t len = sizeof(VDOStatistics); + RegisteredThread allocatingThread, instanceThread; + registerAllocatingThread(&allocatingThread, NULL); + registerThreadDevice(&instanceThread, layer); + int result = ALLOCATE(1, VDOStatistics, __func__, &stats); + if (result == VDO_SUCCESS) { + getKVDOStatistics(&layer->kvdo, stats); + seq_write(m, stats, len); + FREE(stats); + } + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + return result; +} + +/**********************************************************************/ +static int statusDedupeOpen(struct inode *inode, struct file *file) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) + return single_open(file, statusDedupeShow, PDE_DATA(inode)); +#else + return single_open(file, statusDedupeShow, PDE(inode)->data); +#endif +} + +static const struct file_operations vdoProcfsDedupeOps = { + .open = statusDedupeOpen, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/**********************************************************************/ +static void copyBioStat(BioStats *b, const AtomicBioStats *a) +{ + b->read = atomic64_read(&a->read); + b->write = atomic64_read(&a->write); + b->discard = atomic64_read(&a->discard); + b->flush = atomic64_read(&a->flush); + b->fua = atomic64_read(&a->fua); +} + +/**********************************************************************/ +static BioStats subtractBioStats(BioStats minuend, BioStats subtrahend) +{ + return (BioStats) { + .read = minuend.read - subtrahend.read, + .write = minuend.write - subtrahend.write, + .discard = minuend.discard - subtrahend.discard, + .flush = minuend.flush - subtrahend.flush, + .fua = minuend.fua - subtrahend.fua, + }; +} + +/**********************************************************************/ +void getKernelStats(KernelLayer *layer, KernelStatistics *stats) +{ + stats->version = STATISTICS_VERSION; + stats->releaseVersion = CURRENT_RELEASE_VERSION_NUMBER; + stats->instance = layer->instance; + getLimiterValuesAtomically(&layer->requestLimiter, + &stats->currentVIOsInProgress, &stats->maxVIOs); + // albireoTimeoutReport gives the number of timeouts, and dedupeContextBusy + // gives the number of queries not made because of earlier timeouts. 
+ stats->dedupeAdviceTimeouts = (getEventCount(&layer->albireoTimeoutReporter) + + atomic64_read(&layer->dedupeContextBusy)); + stats->flushOut = atomic64_read(&layer->flushOut); + stats->logicalBlockSize = layer->deviceConfig->logicalBlockSize; + copyBioStat(&stats->biosIn, &layer->biosIn); + copyBioStat(&stats->biosInPartial, &layer->biosInPartial); + copyBioStat(&stats->biosOut, &layer->biosOut); + copyBioStat(&stats->biosMeta, &layer->biosMeta); + copyBioStat(&stats->biosJournal, &layer->biosJournal); + copyBioStat(&stats->biosPageCache, &layer->biosPageCache); + copyBioStat(&stats->biosOutCompleted, &layer->biosOutCompleted); + copyBioStat(&stats->biosMetaCompleted, &layer->biosMetaCompleted); + copyBioStat(&stats->biosJournalCompleted, &layer->biosJournalCompleted); + copyBioStat(&stats->biosPageCacheCompleted, + &layer->biosPageCacheCompleted); + copyBioStat(&stats->biosAcknowledged, &layer->biosAcknowledged); + copyBioStat(&stats->biosAcknowledgedPartial, + &layer->biosAcknowledgedPartial); + stats->biosInProgress = subtractBioStats(stats->biosIn, + stats->biosAcknowledged); + stats->memoryUsage = getMemoryUsage(); + getIndexStatistics(layer->dedupeIndex, &stats->index); +} + +/**********************************************************************/ +static int statusKernelShow(struct seq_file *m, void *v) +{ + KernelLayer *layer = (KernelLayer *) m->private; + KernelStatistics *stats; + size_t len = sizeof(KernelStatistics); + RegisteredThread allocatingThread, instanceThread; + registerAllocatingThread(&allocatingThread, NULL); + registerThreadDevice(&instanceThread, layer); + int result = ALLOCATE(1, KernelStatistics, __func__, &stats); + if (result == VDO_SUCCESS) { + getKernelStats(layer, stats); + seq_write(m, stats, len); + FREE(stats); + } + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + return result; +} + +/**********************************************************************/ +static int statusKernelOpen(struct inode *inode, struct file *file) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) + return single_open(file, statusKernelShow, PDE_DATA(inode)); +#else + return single_open(file, statusKernelShow, PDE(inode)->data); +#endif +} + +static const struct file_operations vdoProcfsKernelOps = { + .open = statusKernelOpen, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/**********************************************************************/ +int vdoInitProcfs() +{ + const char *procfsName = getProcRoot(); + procfsRoot = proc_mkdir(procfsName, NULL); + if (procfsRoot == NULL) { + logWarning("Could not create proc filesystem root %s\n", procfsName); + return -ENOMEM; + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +void vdoDestroyProcfs() +{ + remove_proc_entry(getProcRoot(), NULL); + procfsRoot = NULL; +} + +/**********************************************************************/ +int vdoCreateProcfsEntry(KernelLayer *layer, const char *name, void **private) +{ + int result = VDO_SUCCESS; + + if (procfsRoot != NULL) { + struct proc_dir_entry *fsDir; + fsDir = proc_mkdir(name, procfsRoot); + if (fsDir == NULL) { + result = -ENOMEM; + } else { + if (proc_create_data(getVDOStatisticsProcFile(), 0644, fsDir, + &vdoProcfsDedupeOps, layer) == NULL) { + result = -ENOMEM; + } else if (proc_create_data(getKernelStatisticsProcFile(), 0644, fsDir, + &vdoProcfsKernelOps, layer) == NULL) { + result = -ENOMEM; + } + } + if (result < 0) { + vdoDestroyProcfsEntry(name, fsDir); + } 
else { + *private = fsDir; + } + } else { + logWarning("No proc filesystem root set, skipping %s\n", name); + } + return result; +} + +/**********************************************************************/ +void vdoDestroyProcfsEntry(const char *name, void *private) +{ + if (procfsRoot != NULL) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) + remove_proc_subtree(name, procfsRoot); +#else + struct proc_dir_entry *fsDir = (struct proc_dir_entry *) private; + remove_proc_entry(getVDOStatisticsProcFile(), fsDir); + remove_proc_entry(getKernelStatisticsProcFile(), fsDir); + remove_proc_entry(name, procfsRoot); +#endif + } +} diff --git a/vdo/kernel/statusProcfs.h b/vdo/kernel/statusProcfs.h new file mode 100644 index 0000000..a884c8e --- /dev/null +++ b/vdo/kernel/statusProcfs.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/statusProcfs.h#1 $ + * + */ + +#ifndef STATUS_PROC_H +#define STATUS_PROC_H + +#include +#include +#include "kernelLayer.h" + +/** + * Initializes the /proc/vdo directory. Should be called once when the + * module is loaded. + * + * @return 0 on success, nonzero on failure + */ +int vdoInitProcfs(void); + +/** + * Destroys the /proc/vdo directory. Should be called once when the + * module is unloaded. + */ +void vdoDestroyProcfs(void); + +/** + * Creates a subdirectory in the /proc/vdo filesystem for a particular + * vdo. + * + * @param layer the kernel layer + * @param name the subdirectory name + * @param private pointer to private storage for procfs data + * + * @return 0 on success, nonzero on failure + */ +int vdoCreateProcfsEntry(KernelLayer *layer, const char *name, void **private); + +/** + * Destroys a subdirectory in the /proc/vdo filesystem for a + * particular vdo. + * + * @param name the subdirectory name + * @param private private storage for procfs data + */ +void vdoDestroyProcfsEntry(const char *name, void *private); + +/** + * Retrieves the current kernel statistics. + * + * @param layer the kernel layer + * @param stats pointer to the structure to fill in + */ +void getKernelStats(KernelLayer *layer, KernelStatistics *stats); + +#endif /* STATUS_PROC_H */ diff --git a/vdo/kernel/sysfs.c b/vdo/kernel/sysfs.c new file mode 100644 index 0000000..9244bf1 --- /dev/null +++ b/vdo/kernel/sysfs.c @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/sysfs.c#5 $ + */ + +#include "sysfs.h" + +#include +#include + +#include "dedupeIndex.h" +#include "dmvdo.h" +#include "logger.h" + +extern int defaultMaxRequestsActive; + +typedef struct vdoAttribute { + struct attribute attr; + ssize_t (*show)(struct kvdoDevice *d, struct attribute *attr, char *buf); + ssize_t (*store)(struct kvdoDevice *d, const char *value, size_t count); + // Location of value, if .show == showInt or showUInt or showBool. + void *valuePtr; +} VDOAttribute; + +static char *statusStrings[] = { + "UNINITIALIZED", + "READY", + "SHUTTING DOWN", +}; + +/**********************************************************************/ +static ssize_t vdoStatusShow(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", statusStrings[device->status]); +} + +/**********************************************************************/ +static ssize_t vdoLogLevelShow(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", priorityToString(getLogLevel())); +} + +/**********************************************************************/ +static ssize_t vdoLogLevelStore(struct kvdoDevice *device, + const char *buf, size_t n) +{ + static char internalBuf[11]; + + if (n > 10) { + return -EINVAL; + } + + memset(internalBuf, '\000', sizeof(internalBuf)); + memcpy(internalBuf, buf, n); + if (internalBuf[n - 1] == '\n') { + internalBuf[n - 1] = '\000'; + } + setLogLevel(stringToPriority(internalBuf)); + return n; +} + +/**********************************************************************/ +static ssize_t scanInt(const char *buf, + size_t n, + int *valuePtr, + int minimum, + int maximum) +{ + if (n > 12) { + return -EINVAL; + } + unsigned int value; + if (sscanf(buf, "%d", &value) != 1) { + return -EINVAL; + } + if (value < minimum) { + value = minimum; + } else if (value > maximum) { + value = maximum; + } + *valuePtr = value; + return n; +} + +/**********************************************************************/ +static ssize_t showInt(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); + + return sprintf(buf, "%d\n", *(int *)vdoAttr->valuePtr); +} + +/**********************************************************************/ +static ssize_t scanUInt(const char *buf, + size_t n, + unsigned int *valuePtr, + unsigned int minimum, + unsigned int maximum) +{ + if (n > 12) { + return -EINVAL; + } + unsigned int value; + if (sscanf(buf, "%u", &value) != 1) { + return -EINVAL; + } + if (value < minimum) { + value = minimum; + } else if (value > maximum) { + value = maximum; + } + *valuePtr = value; + return n; +} + +/**********************************************************************/ +static ssize_t showUInt(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); + + return sprintf(buf, "%u\n", *(unsigned int *)vdoAttr->valuePtr); 
+} + +/**********************************************************************/ +static ssize_t scanBool(const char *buf, size_t n, bool *valuePtr) +{ + unsigned int intValue = 0; + n = scanUInt(buf, n, &intValue, 0, 1); + if (n > 0) { + *valuePtr = (intValue != 0); + } + return n; +} + +/**********************************************************************/ +static ssize_t showBool(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); + + return sprintf(buf, "%u\n", *(bool *)vdoAttr->valuePtr ? 1 : 0); +} + +/**********************************************************************/ +static ssize_t vdoTraceRecordingStore(struct kvdoDevice *device, + const char *buf, + size_t n) +{ + return scanBool(buf, n, &traceRecording); +} + +/**********************************************************************/ +static ssize_t vdoMaxReqActiveStore(struct kvdoDevice *device, + const char *buf, + size_t n) +{ + /* + * The base code has some hardcoded assumptions about the maximum + * number of requests that can be in progress. Maybe someday we'll + * do calculations with the actual number; for now, just make sure + * the assumption holds. + */ + return scanInt(buf, n, &defaultMaxRequestsActive, 1, MAXIMUM_USER_VIOS); +} + +/**********************************************************************/ +static ssize_t vdoAlbireoTimeoutIntervalStore(struct kvdoDevice *device, + const char *buf, + size_t n) +{ + unsigned int value; + ssize_t result = scanUInt(buf, n, &value, 0, UINT_MAX); + if (result > 0) { + setAlbireoTimeoutInterval(value); + } + return result; +} + +/**********************************************************************/ +static ssize_t vdoMinAlbireoTimerIntervalStore(struct kvdoDevice *device, + const char *buf, + size_t n) +{ + unsigned int value; + ssize_t result = scanUInt(buf, n, &value, 0, UINT_MAX); + if (result > 0) { + setMinAlbireoTimerInterval(value); + } + return result; +} + +/**********************************************************************/ +static ssize_t vdoVersionShow(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", CURRENT_VERSION); +} + +/**********************************************************************/ +static ssize_t vdoAttrShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); + if (vdoAttr->show == NULL) { + return -EINVAL; + } + + struct kvdoDevice *device = container_of(kobj, struct kvdoDevice, kobj); + return (*vdoAttr->show)(device, attr, buf); +} + +/**********************************************************************/ +static ssize_t vdoAttrStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); + if (vdoAttr->store == NULL) { + return -EINVAL; + } + + struct kvdoDevice *device = container_of(kobj, struct kvdoDevice, kobj); + return (*vdoAttr->store)(device, buf, length); +} + +static VDOAttribute vdoStatusAttr = { + .attr = { .name = "status", .mode = 0444, }, + .show = vdoStatusShow, +}; + +static VDOAttribute vdoLogLevelAttr = { + .attr = {.name = "log_level", .mode = 0644, }, + .show = vdoLogLevelShow, + .store = vdoLogLevelStore, +}; + +static VDOAttribute vdoMaxReqActiveAttr = { + .attr = {.name = "max_requests_active", .mode = 0644, }, + .show = showInt, + .store = vdoMaxReqActiveStore, + .valuePtr = &defaultMaxRequestsActive, +}; + 
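These module-level attributes all combine a generic show helper (showInt, showUInt, showBool) with a store hook that parses and range-clamps the written value through scanInt/scanUInt/scanBool. A rough sketch (not part of the patch itself) of one more writable tunable follows; exampleRetryLimit, its 1..100 bounds, and the attribute name are invented for illustration, and a real attribute would also have to be listed in the defaultAttrs[] array defined just below.

/* Sketch only: exampleRetryLimit and "example_retry_limit" are hypothetical;
 * showUInt and scanUInt are the helpers defined earlier in this file. */
static unsigned int exampleRetryLimit = 4;

static ssize_t exampleRetryLimitStore(struct kvdoDevice *device,
                                      const char *buf,
                                      size_t n)
{
  // Parse an unsigned decimal value and clamp it to the 1..100 range.
  return scanUInt(buf, n, &exampleRetryLimit, 1, 100);
}

static VDOAttribute vdoExampleRetryLimitAttr = {
  .attr = {.name = "example_retry_limit", .mode = 0644, },
  .show = showUInt,
  .store = exampleRetryLimitStore,
  .valuePtr = &exampleRetryLimit,
};

Once listed in defaultAttrs[], such a tunable would appear as a writable file under the kobject registered by vdoInitSysfs() (added with a NULL parent and THIS_MODULE->name, so typically /sys/kvdo/ for this module).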
+static VDOAttribute vdoAlbireoTimeoutInterval = { + .attr = {.name = "deduplication_timeout_interval", .mode = 0644, }, + .show = showUInt, + .store = vdoAlbireoTimeoutIntervalStore, + .valuePtr = &albireoTimeoutInterval, +}; + +static VDOAttribute vdoMinAlbireoTimerInterval = { + .attr = {.name = "min_deduplication_timer_interval", .mode = 0644, }, + .show = showUInt, + .store = vdoMinAlbireoTimerIntervalStore, + .valuePtr = &minAlbireoTimerInterval, +}; + +static VDOAttribute vdoTraceRecording = { + .attr = {.name = "trace_recording", .mode = 0644, }, + .show = showBool, + .store = vdoTraceRecordingStore, + .valuePtr = &traceRecording, +}; + +static VDOAttribute vdoVersionAttr = { + .attr = { .name = "version", .mode = 0444, }, + .show = vdoVersionShow, +}; + +static struct attribute *defaultAttrs[] = { + &vdoStatusAttr.attr, + &vdoLogLevelAttr.attr, + &vdoMaxReqActiveAttr.attr, + &vdoAlbireoTimeoutInterval.attr, + &vdoMinAlbireoTimerInterval.attr, + &vdoTraceRecording.attr, + &vdoVersionAttr.attr, + NULL +}; + +static struct sysfs_ops vdoSysfsOps = { + .show = vdoAttrShow, + .store = vdoAttrStore, +}; + +/**********************************************************************/ +static void vdoRelease(struct kobject *kobj) +{ + return; +} + +struct kobj_type vdo_ktype = { + .release = vdoRelease, + .sysfs_ops = &vdoSysfsOps, + .default_attrs = defaultAttrs, +}; + +/**********************************************************************/ +int vdoInitSysfs(struct kobject *deviceObject) +{ + kobject_init(deviceObject, &vdo_ktype); + int result = kobject_add(deviceObject, NULL, THIS_MODULE->name); + if (result < 0) { + logError("kobject_add failed with status %d", -result); + kobject_put(deviceObject); + } + logDebug("added sysfs objects"); + return result; +}; + +/**********************************************************************/ +void vdoPutSysfs(struct kobject *deviceObject) +{ + kobject_put(deviceObject); +} diff --git a/vdo/kernel/sysfs.h b/vdo/kernel/sysfs.h new file mode 100644 index 0000000..3dbac04 --- /dev/null +++ b/vdo/kernel/sysfs.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/sysfs.h#2 $ + */ + +#ifndef ALBIREO_SYSFS_H +#define ALBIREO_SYSFS_H + +#include "kernelLayer.h" + +struct kvdoDevice; + +/** +* Initializes the sysfs objects global to all vdo devices. +* +* @param deviceObject the kobject of the kvdoDevice to initialize. +*/ +int vdoInitSysfs(struct kobject *deviceObject); + +/** + * Releases the global sysfs objects. + * + * @param deviceObject the kobject of the kvdoDevice to release. 
+ */ +void vdoPutSysfs(struct kobject *deviceObject); + +#endif /* ALBIREO_SYSFS_H */ diff --git a/vdo/kernel/threadDevice.c b/vdo/kernel/threadDevice.c new file mode 100644 index 0000000..49fb909 --- /dev/null +++ b/vdo/kernel/threadDevice.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threadDevice.c#1 $ + */ + +#include "threadDevice.h" + +#include "threadRegistry.h" + +/* + * A registry of all threads temporarily associated with particular + * VDO devices. + */ +static ThreadRegistry deviceIDThreadRegistry; + +/**********************************************************************/ +void registerThreadDeviceID(RegisteredThread *newThread, unsigned int *idPtr) +{ + registerThread(&deviceIDThreadRegistry, newThread, idPtr); +} + +/**********************************************************************/ +void unregisterThreadDeviceID(void) +{ + unregisterThread(&deviceIDThreadRegistry); +} + +/**********************************************************************/ +int getThreadDeviceID(void) +{ + const unsigned int *pointer = lookupThread(&deviceIDThreadRegistry); + return pointer ? *pointer : -1; +} + +/**********************************************************************/ +void initializeThreadDeviceRegistry(void) +{ + initializeThreadRegistry(&deviceIDThreadRegistry); +} diff --git a/vdo/kernel/threadDevice.h b/vdo/kernel/threadDevice.h new file mode 100644 index 0000000..61b4ce6 --- /dev/null +++ b/vdo/kernel/threadDevice.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threadDevice.h#1 $ + */ + +#include "kernelLayer.h" + +/** + * Temporarily register the current thread as being associated with a + * VDO device id number, for logging purposes. + * + * Any such registered thread must later be unregistered via + * unregisterThreadDeviceID. + * + * The pointed-to ID number should be nonzero. 
+ * + * @param newThread RegisteredThread structure to use for the current thread + * @param idPtr Location where the ID number is stored + **/ +void registerThreadDeviceID(RegisteredThread *newThread, unsigned int *idPtr); + +/** + * Temporarily register the current thread as being associated with an + * existing VDO device, for logging purposes. + * + * Any such registered thread must later be unregistered via + * unregisterThreadDeviceID. + * + * @param newThread RegisteredThread structure to use for the current thread + * @param layer The KernelLayer object for the VDO device + **/ +static inline void registerThreadDevice(RegisteredThread *newThread, + KernelLayer *layer) +{ + registerThreadDeviceID(newThread, &layer->instance); +} + +/** + * Cancel registration of the current thread as being associated with + * a VDO device or device ID number. + **/ +void unregisterThreadDeviceID(void); + +/** + * Get the VDO device ID number temporarily associated with the + * current thread, if any. + * + * @return the device ID number, if any, or -1 + **/ +int getThreadDeviceID(void); + +/** + * Initialize the thread device-ID registry. + **/ +void initializeThreadDeviceRegistry(void); diff --git a/vdo/kernel/threadRegistry.c b/vdo/kernel/threadRegistry.c new file mode 100644 index 0000000..6184d3c --- /dev/null +++ b/vdo/kernel/threadRegistry.c @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threadRegistry.c#1 $ + */ + +#include "threadRegistry.h" + +#include +#include + +#include "permassert.h" + +/* + * We need to be careful when using other facilities that may use + * threadRegistry functions in their normal operation. For example, + * we do not want to invoke the logger while holding a lock. + */ + +/*****************************************************************************/ +void registerThread(ThreadRegistry *registry, + RegisteredThread *newThread, + const void *pointer) +{ + INIT_LIST_HEAD(&newThread->links); + newThread->pointer = pointer; + newThread->task = current; + + bool foundIt = false; + RegisteredThread *thread; + write_lock(®istry->lock); + list_for_each_entry(thread, ®istry->links, links) { + if (thread->task == current) { + // This should not have been there. + // We'll complain after releasing the lock. 
+ list_del_init(&thread->links); + foundIt = true; + break; + } + } + list_add_tail(&newThread->links, ®istry->links); + write_unlock(®istry->lock); + ASSERT_LOG_ONLY(!foundIt, "new thread not already in registry"); +} + +/*****************************************************************************/ +void unregisterThread(ThreadRegistry *registry) +{ + bool foundIt = false; + RegisteredThread *thread; + write_lock(®istry->lock); + list_for_each_entry(thread, ®istry->links, links) { + if (thread->task == current) { + list_del_init(&thread->links); + foundIt = true; + break; + } + } + write_unlock(®istry->lock); + ASSERT_LOG_ONLY(foundIt, "thread found in registry"); +} + +/*****************************************************************************/ +void initializeThreadRegistry(ThreadRegistry *registry) +{ + INIT_LIST_HEAD(®istry->links); + rwlock_init(®istry->lock); +} + +/*****************************************************************************/ +const void *lookupThread(ThreadRegistry *registry) +{ + const void *result = NULL; + read_lock(®istry->lock); + RegisteredThread *thread; + list_for_each_entry(thread, ®istry->links, links) { + if (thread->task == current) { + result = thread->pointer; + break; + } + } + read_unlock(®istry->lock); + return result; +} diff --git a/vdo/kernel/threadRegistry.h b/vdo/kernel/threadRegistry.h new file mode 100644 index 0000000..f32325e --- /dev/null +++ b/vdo/kernel/threadRegistry.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threadRegistry.h#1 $ + */ + +#ifndef THREAD_REGISTRY_H +#define THREAD_REGISTRY_H 1 + +#include +#include + +/* + * We don't expect this set to ever get really large, so a linked list + * is adequate. + */ + +typedef struct threadRegistry { + struct list_head links; + rwlock_t lock; +} ThreadRegistry; + +typedef struct registeredThread { + struct list_head links; + const void *pointer; + struct task_struct *task; +} RegisteredThread; + +/*****************************************************************************/ + +/** + * Initialize a registry of threads and associated data pointers. + * + * @param registry The registry to initialize + **/ +void initializeThreadRegistry(ThreadRegistry *registry); + +/** + * Register the current thread and associate it with a data pointer. + * + * This call will log messages if the thread is already registered. + * + * @param registry The thread registry + * @param newThread RegisteredThread structure to use for the current thread + * @param pointer The value to associated with the current thread + **/ +void registerThread(ThreadRegistry *registry, + RegisteredThread *newThread, + const void *pointer); + +/** + * Remove the registration for the current thread. 
+ * + * A message may be logged if the thread was not registered. + * + * @param registry The thread registry + **/ +void unregisterThread(ThreadRegistry *registry); + +/** + * Fetch a pointer that may have been registered for the current + * thread. If the thread is not registered, a null pointer is + * returned. + * + * @param registry The thread registry + * + * @return the registered pointer, if any, or NULL + **/ +const void *lookupThread(ThreadRegistry *registry); + +#endif /* THREAD_REGISTRY_H */ diff --git a/vdo/kernel/threads.c b/vdo/kernel/threads.c new file mode 100644 index 0000000..2f905ed --- /dev/null +++ b/vdo/kernel/threads.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threads.c#1 $ + */ + +#include "threads.h" + +#include +#include + +/**********************************************************************/ +pid_t getThreadId(void) +{ + return in_interrupt() ? -1 : current->pid; +} diff --git a/vdo/kernel/threads.h b/vdo/kernel/threads.h new file mode 100644 index 0000000..25f8b47 --- /dev/null +++ b/vdo/kernel/threads.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threads.h#1 $ + */ + +#ifndef THREADS_H +#define THREADS_H + +#include + +/** + * Return the id of the current thread. + * In kernel interrupt context, returns -1. + * + * @return the thread id + **/ +pid_t getThreadId(void) + __attribute__((warn_unused_result)); + +#endif /* THREADS_H */ diff --git a/vdo/kernel/udsIndex.c b/vdo/kernel/udsIndex.c new file mode 100644 index 0000000..a202446 --- /dev/null +++ b/vdo/kernel/udsIndex.c @@ -0,0 +1,835 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/udsIndex.c#16 $ + */ + +#include "udsIndex.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "murmur/MurmurHash3.h" +#include "numeric.h" +#include "stringUtils.h" +#include "uds-block.h" + +/*****************************************************************************/ + +typedef struct udsAttribute { + struct attribute attr; + const char *(*showString)(DedupeIndex *); +} UDSAttribute; + +/*****************************************************************************/ + +enum { UDS_Q_ACTION }; + +/*****************************************************************************/ + +// These are the values in the atomic dedupeContext.requestState field +enum { + // The UdsRequest object is not in use. + UR_IDLE = 0, + // The UdsRequest object is in use, and VDO is waiting for the result. + UR_BUSY = 1, + // The UdsRequest object is in use, but has timed out. + UR_TIMED_OUT = 2, +}; + +/*****************************************************************************/ + +typedef enum { + // The UDS index is closed + IS_CLOSED = 0, + // The UDS index session is opening or closing + IS_CHANGING = 1, + // The UDS index is open. There is a UDS index session. + IS_OPENED = 2, +} IndexState; + +/*****************************************************************************/ + +typedef struct udsIndex { + DedupeIndex common; + struct kobject dedupeObject; + RegisteredThread allocatingThread; + char *indexName; + UdsConfiguration configuration; + struct uds_parameters udsParams; + struct uds_index_session *indexSession; + atomic_t active; + // This spinlock protects the state fields and the starting of dedupe + // requests.
+ spinlock_t stateLock; + KvdoWorkItem workItem; // protected by stateLock + KvdoWorkQueue *udsQueue; // protected by stateLock + unsigned int maximum; // protected by stateLock + IndexState indexState; // protected by stateLock + IndexState indexTarget; // protected by stateLock + bool changing; // protected by stateLock + bool createFlag; // protected by stateLock + bool dedupeFlag; // protected by stateLock + bool deduping; // protected by stateLock + bool errorFlag; // protected by stateLock + bool suspended; // protected by stateLock + // This spinlock protects the pending list, the pending flag in each KVIO, + // and the timeout list. + spinlock_t pendingLock; + struct list_head pendingHead; // protected by pendingLock + struct timer_list pendingTimer; // protected by pendingLock + bool startedTimer; // protected by pendingLock +} UDSIndex; + +/*****************************************************************************/ + +// Version 1: user space albireo index (limited to 32 bytes) +// Version 2: kernel space albireo index (limited to 16 bytes) +enum { + UDS_ADVICE_VERSION = 2, + // version byte + state byte + 64-bit little-endian PBN + UDS_ADVICE_SIZE = 1 + 1 + sizeof(uint64_t), +}; + +/*****************************************************************************/ + + // We want to ensure that there is only one copy of the following constants. +static const char *CLOSED = "closed"; +static const char *CLOSING = "closing"; +static const char *ERROR = "error"; +static const char *OFFLINE = "offline"; +static const char *ONLINE = "online"; +static const char *OPENING = "opening"; +static const char *SUSPENDED = "suspended"; +static const char *UNKNOWN = "unknown"; + +/*****************************************************************************/ +static const char *indexStateToString(UDSIndex *index, IndexState state) +{ + if (index->suspended) { + return SUSPENDED; + } + + switch (state) { + case IS_CLOSED: + // Closed. The errorFlag tells if it is because of an error. + return index->errorFlag ? ERROR : CLOSED; + case IS_CHANGING: + // The indexTarget tells if we are opening or closing the index. + return index->indexTarget == IS_OPENED ? OPENING : CLOSING; + case IS_OPENED: + // Opened. The dedupeFlag tells if we are online or offline. + return index->dedupeFlag ? ONLINE : OFFLINE; + default: + return UNKNOWN; + } +} + +/** + * Encode VDO duplicate advice into the newMetadata field of a UDS request. + * + * @param request The UDS request to receive the encoding + * @param advice The advice to encode + **/ +static void encodeUDSAdvice(UdsRequest *request, DataLocation advice) +{ + size_t offset = 0; + struct udsChunkData *encoding = &request->newMetadata; + encoding->data[offset++] = UDS_ADVICE_VERSION; + encoding->data[offset++] = advice.state; + encodeUInt64LE(encoding->data, &offset, advice.pbn); + BUG_ON(offset != UDS_ADVICE_SIZE); +} + +/** + * Decode VDO duplicate advice from the oldMetadata field of a UDS request. 
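+ *
+ * The encoded advice occupies UDS_ADVICE_SIZE (10) bytes: byte 0 is
+ * UDS_ADVICE_VERSION, byte 1 is the mapping state, and bytes 2-9 hold
+ * the physical block number in little-endian order, mirroring the
+ * layout produced by encodeUDSAdvice() above.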
+ * + * @param request The UDS request containing the encoding + * @param advice The DataLocation to receive the decoded advice + * + * @return true if valid advice was found and decoded + **/ +static bool decodeUDSAdvice(const UdsRequest *request, DataLocation *advice) +{ + if ((request->status != UDS_SUCCESS) || !request->found) { + return false; + } + + size_t offset = 0; + const struct udsChunkData *encoding = &request->oldMetadata; + byte version = encoding->data[offset++]; + if (version != UDS_ADVICE_VERSION) { + logError("invalid UDS advice version code %u", version); + return false; + } + + advice->state = encoding->data[offset++]; + decodeUInt64LE(encoding->data, &offset, &advice->pbn); + BUG_ON(offset != UDS_ADVICE_SIZE); + return true; +} + +/*****************************************************************************/ +static void finishIndexOperation(UdsRequest *udsRequest) +{ + DataKVIO *dataKVIO = container_of(udsRequest, DataKVIO, + dedupeContext.udsRequest); + DedupeContext *dedupeContext = &dataKVIO->dedupeContext; + if (compareAndSwap32(&dedupeContext->requestState, UR_BUSY, UR_IDLE)) { + KVIO *kvio = dataKVIOAsKVIO(dataKVIO); + UDSIndex *index = container_of(kvio->layer->dedupeIndex, UDSIndex, common); + + spin_lock_bh(&index->pendingLock); + if (dedupeContext->isPending) { + list_del(&dedupeContext->pendingList); + dedupeContext->isPending = false; + } + spin_unlock_bh(&index->pendingLock); + + dedupeContext->status = udsRequest->status; + if ((udsRequest->type == UDS_POST) || (udsRequest->type == UDS_QUERY)) { + DataLocation advice; + if (decodeUDSAdvice(udsRequest, &advice)) { + setDedupeAdvice(dedupeContext, &advice); + } else { + setDedupeAdvice(dedupeContext, NULL); + } + } + invokeDedupeCallback(dataKVIO); + atomic_dec(&index->active); + } else { + compareAndSwap32(&dedupeContext->requestState, UR_TIMED_OUT, UR_IDLE); + } +} + +/*****************************************************************************/ +static void startExpirationTimer(UDSIndex *index, DataKVIO *dataKVIO) +{ + if (!index->startedTimer) { + index->startedTimer = true; + mod_timer(&index->pendingTimer, + getAlbireoTimeout(dataKVIO->dedupeContext.submissionTime)); + } +} + +/*****************************************************************************/ +static void startIndexOperation(KvdoWorkItem *item) +{ + KVIO *kvio = workItemAsKVIO(item); + DataKVIO *dataKVIO = kvioAsDataKVIO(kvio); + UDSIndex *index = container_of(kvio->layer->dedupeIndex, UDSIndex, common); + DedupeContext *dedupeContext = &dataKVIO->dedupeContext; + + spin_lock_bh(&index->pendingLock); + list_add_tail(&dedupeContext->pendingList, &index->pendingHead); + dedupeContext->isPending = true; + startExpirationTimer(index, dataKVIO); + spin_unlock_bh(&index->pendingLock); + + UdsRequest *udsRequest = &dedupeContext->udsRequest; + int status = udsStartChunkOperation(udsRequest); + if (status != UDS_SUCCESS) { + udsRequest->status = status; + finishIndexOperation(udsRequest); + } +} + +/*****************************************************************************/ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) +static void timeoutIndexOperations(struct timer_list *t) +#else +static void timeoutIndexOperations(unsigned long arg) +#endif +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) + UDSIndex *index = from_timer(index, t, pendingTimer); +#else + UDSIndex *index = (UDSIndex *) arg; +#endif + LIST_HEAD(expiredHead); + uint64_t timeoutJiffies = msecs_to_jiffies(albireoTimeoutInterval); + unsigned long 
earliestSubmissionAllowed = jiffies - timeoutJiffies; + spin_lock_bh(&index->pendingLock); + index->startedTimer = false; + while (!list_empty(&index->pendingHead)) { + DataKVIO *dataKVIO = list_first_entry(&index->pendingHead, DataKVIO, + dedupeContext.pendingList); + DedupeContext *dedupeContext = &dataKVIO->dedupeContext; + if (earliestSubmissionAllowed <= dedupeContext->submissionTime) { + startExpirationTimer(index, dataKVIO); + break; + } + list_del(&dedupeContext->pendingList); + dedupeContext->isPending = false; + list_add_tail(&dedupeContext->pendingList, &expiredHead); + } + spin_unlock_bh(&index->pendingLock); + while (!list_empty(&expiredHead)) { + DataKVIO *dataKVIO = list_first_entry(&expiredHead, DataKVIO, + dedupeContext.pendingList); + DedupeContext *dedupeContext = &dataKVIO->dedupeContext; + list_del(&dedupeContext->pendingList); + if (compareAndSwap32(&dedupeContext->requestState, + UR_BUSY, UR_TIMED_OUT)) { + dedupeContext->status = ETIMEDOUT; + invokeDedupeCallback(dataKVIO); + atomic_dec(&index->active); + kvdoReportDedupeTimeout(dataKVIOAsKVIO(dataKVIO)->layer, 1); + } + } +} + +/*****************************************************************************/ +static void enqueueIndexOperation(DataKVIO *dataKVIO, + UdsCallbackType operation) +{ + KVIO *kvio = dataKVIOAsKVIO(dataKVIO); + DedupeContext *dedupeContext = &dataKVIO->dedupeContext; + UDSIndex *index = container_of(kvio->layer->dedupeIndex, UDSIndex, common); + dedupeContext->status = UDS_SUCCESS; + dedupeContext->submissionTime = jiffies; + if (compareAndSwap32(&dedupeContext->requestState, UR_IDLE, UR_BUSY)) { + UdsRequest *udsRequest = &dataKVIO->dedupeContext.udsRequest; + udsRequest->chunkName = *dedupeContext->chunkName; + udsRequest->callback = finishIndexOperation; + udsRequest->session = index->indexSession; + udsRequest->type = operation; + udsRequest->update = true; + if ((operation == UDS_POST) || (operation == UDS_UPDATE)) { + encodeUDSAdvice(udsRequest, getDedupeAdvice(dedupeContext)); + } + + setupWorkItem(&kvio->enqueueable.workItem, startIndexOperation, NULL, + UDS_Q_ACTION); + + spin_lock(&index->stateLock); + if (index->deduping) { + enqueueWorkQueue(index->udsQueue, &kvio->enqueueable.workItem); + unsigned int active = atomic_inc_return(&index->active); + if (active > index->maximum) { + index->maximum = active; + } + kvio = NULL; + } else { + atomicStore32(&dedupeContext->requestState, UR_IDLE); + } + spin_unlock(&index->stateLock); + } else { + // A previous user of the KVIO had a dedupe timeout + // and its request is still outstanding. + atomic64_inc(&kvio->layer->dedupeContextBusy); + } + if (kvio != NULL) { + invokeDedupeCallback(dataKVIO); + } +} + +/*****************************************************************************/ +static void closeIndex(UDSIndex *index) +{ + // Change the index state so that getIndexStatistics will not try to + // use the index session we are closing. + index->indexState = IS_CHANGING; + spin_unlock(&index->stateLock); + int result = udsCloseIndex(index->indexSession); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Error closing index %s", + index->indexName); + } + spin_lock(&index->stateLock); + index->indexState = IS_CLOSED; + index->errorFlag |= result != UDS_SUCCESS; + // ASSERTION: We leave in IS_CLOSED state. +} + +/*****************************************************************************/ +static void openIndex(UDSIndex *index) +{ + // ASSERTION: We enter in IS_CLOSED state. 
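+  //
+  // The stateLock is held on entry. It is dropped around the blocking
+  // udsOpenIndex() call below and re-acquired before the state fields
+  // are updated. If loading fails with UDS_NO_INDEX or
+  // UDS_CORRUPT_COMPONENT, createFlag is set so that the next pass of
+  // the changeDedupeState() loop creates a fresh index instead.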
+ bool createFlag = index->createFlag; + index->createFlag = false; + // Change the index state so that the it will be reported to the outside + // world as "opening". + index->indexState = IS_CHANGING; + index->errorFlag = false; + // Open the index session, while not holding the stateLock + spin_unlock(&index->stateLock); + + int result = udsOpenIndex(createFlag ? UDS_CREATE : UDS_LOAD, + index->indexName, &index->udsParams, + index->configuration, index->indexSession); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Error opening index %s", + index->indexName); + } + spin_lock(&index->stateLock); + if (!createFlag) { + switch (result) { + case UDS_CORRUPT_COMPONENT: + case UDS_NO_INDEX: + // Either there is no index, or there is no way we can recover the index. + // We will be called again and try to create a new index. + index->indexState = IS_CLOSED; + index->createFlag = true; + return; + default: + break; + } + } + if (result == UDS_SUCCESS) { + index->indexState = IS_OPENED; + } else { + index->indexState = IS_CLOSED; + index->indexTarget = IS_CLOSED; + index->errorFlag = true; + spin_unlock(&index->stateLock); + logInfo("Setting UDS index target state to error"); + spin_lock(&index->stateLock); + } + // ASSERTION: On success, we leave in IS_OPEN state. + // ASSERTION: On failure, we leave in IS_CLOSED state. +} + +/*****************************************************************************/ +static void changeDedupeState(KvdoWorkItem *item) +{ + UDSIndex *index = container_of(item, UDSIndex, workItem); + spin_lock(&index->stateLock); + // Loop until the index is in the target state and the create flag is + // clear. + while (!index->suspended && + ((index->indexState != index->indexTarget) || + index->createFlag)) { + if (index->indexState == IS_OPENED) { + closeIndex(index); + } else { + openIndex(index); + } + } + index->changing = false; + index->deduping = index->dedupeFlag && (index->indexState == IS_OPENED); + spin_unlock(&index->stateLock); +} + + +/*****************************************************************************/ +static void launchDedupeStateChange(UDSIndex *index) +{ + // ASSERTION: We enter with the state_lock held. + if (index->changing || index->suspended) { + // Either a change is already in progress, or changes are + // not allowed. + return; + } + + if (index->createFlag || + (index->indexState != index->indexTarget)) { + index->changing = true; + index->deduping = false; + setupWorkItem(&index->workItem, + changeDedupeState, + NULL, + UDS_Q_ACTION); + enqueueWorkQueue(index->udsQueue, &index->workItem); + return; + } + + // Online vs. offline changes happen immediately + index->deduping = (index->dedupeFlag && !index->suspended && + (index->indexState == IS_OPENED)); + + // ASSERTION: We exit with the state_lock held. 
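+  //
+  // Note that deduping can become true only when the index is open,
+  // the dedupe flag is set, and the index is not suspended. All
+  // open/close transitions are funneled through changeDedupeState()
+  // on the udsQueue, and the changing flag keeps more than one such
+  // work item from being queued at a time.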
+} + +/*****************************************************************************/ +static void setTargetState(UDSIndex *index, + IndexState target, + bool changeDedupe, + bool dedupe, + bool setCreate) +{ + spin_lock(&index->stateLock); + const char *oldState = indexStateToString(index, index->indexTarget); + if (changeDedupe) { + index->dedupeFlag = dedupe; + } + if (setCreate) { + index->createFlag = true; + } + index->indexTarget = target; + launchDedupeStateChange(index); + const char *newState = indexStateToString(index, index->indexTarget); + spin_unlock(&index->stateLock); + if (oldState != newState) { + logInfo("Setting UDS index target state to %s", newState); + } +} + +/*****************************************************************************/ +static void suspendUDSIndex(DedupeIndex *dedupeIndex, bool saveFlag) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + spin_lock(&index->stateLock); + index->suspended = true; + IndexState indexState = index->indexState; + spin_unlock(&index->stateLock); + if (indexState != IS_CLOSED) { + int result = udsSuspendIndexSession(index->indexSession, saveFlag); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Error suspending dedupe index"); + } + } +} + +/*****************************************************************************/ +static void resumeUDSIndex(DedupeIndex *dedupeIndex) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + int result = udsResumeIndexSession(index->indexSession); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Error resuming dedupe index"); + } + spin_lock(&index->stateLock); + index->suspended = false; + launchDedupeStateChange(index); + spin_unlock(&index->stateLock); +} + +/*****************************************************************************/ + +/*****************************************************************************/ +static void dumpUDSIndex(DedupeIndex *dedupeIndex, bool showQueue) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + spin_lock(&index->stateLock); + const char *state = indexStateToString(index, index->indexState); + const char *target = (index->changing + ? 
indexStateToString(index, index->indexTarget) + : NULL); + spin_unlock(&index->stateLock); + logInfo("UDS index: state: %s", state); + if (target != NULL) { + logInfo("UDS index: changing to state: %s", target); + } + if (showQueue) { + dumpWorkQueue(index->udsQueue); + } +} + +/*****************************************************************************/ +static void finishUDSIndex(DedupeIndex *dedupeIndex) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + setTargetState(index, IS_CLOSED, false, false, false); + udsDestroyIndexSession(index->indexSession); + finishWorkQueue(index->udsQueue); +} + +/*****************************************************************************/ +static void freeUDSIndex(DedupeIndex *dedupeIndex) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + freeWorkQueue(&index->udsQueue); + spin_lock_bh(&index->pendingLock); + if (index->startedTimer) { + del_timer_sync(&index->pendingTimer); + } + spin_unlock_bh(&index->pendingLock); + kobject_put(&index->dedupeObject); +} + +/*****************************************************************************/ +static const char *getUDSStateName(DedupeIndex *dedupeIndex) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + spin_lock(&index->stateLock); + const char *state = indexStateToString(index, index->indexState); + spin_unlock(&index->stateLock); + return state; +} + +/*****************************************************************************/ +static void getUDSStatistics(DedupeIndex *dedupeIndex, IndexStatistics *stats) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + spin_lock(&index->stateLock); + IndexState indexState = index->indexState; + stats->maxDedupeQueries = index->maximum; + spin_unlock(&index->stateLock); + stats->currDedupeQueries = atomic_read(&index->active); + if (indexState == IS_OPENED) { + UdsIndexStats indexStats; + int result = udsGetIndexStats(index->indexSession, &indexStats); + if (result == UDS_SUCCESS) { + stats->entriesIndexed = indexStats.entriesIndexed; + } else { + logErrorWithStringError(result, "Error reading index stats"); + } + UdsContextStats contextStats; + result = udsGetIndexSessionStats(index->indexSession, &contextStats); + if (result == UDS_SUCCESS) { + stats->postsFound = contextStats.postsFound; + stats->postsNotFound = contextStats.postsNotFound; + stats->queriesFound = contextStats.queriesFound; + stats->queriesNotFound = contextStats.queriesNotFound; + stats->updatesFound = contextStats.updatesFound; + stats->updatesNotFound = contextStats.updatesNotFound; + } else { + logErrorWithStringError(result, "Error reading context stats"); + } + } +} + + +/*****************************************************************************/ +static int processMessage(DedupeIndex *dedupeIndex, const char *name) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + if (strcasecmp(name, "index-close") == 0) { + setTargetState(index, IS_CLOSED, false, false, false); + return 0; + } else if (strcasecmp(name, "index-create") == 0) { + setTargetState(index, IS_OPENED, false, false, true); + return 0; + } else if (strcasecmp(name, "index-disable") == 0) { + setTargetState(index, IS_OPENED, true, false, false); + return 0; + } else if (strcasecmp(name, "index-enable") == 0) { + setTargetState(index, IS_OPENED, true, true, false); + return 0; + } + return -EINVAL; +} + +/*****************************************************************************/ +static void udsPost(DataKVIO *dataKVIO) +{ + 
enqueueIndexOperation(dataKVIO, UDS_POST); +} + +/*****************************************************************************/ +static void udsQuery(DataKVIO *dataKVIO) +{ + enqueueIndexOperation(dataKVIO, UDS_QUERY); +} + +/*****************************************************************************/ +static void startUDSIndex(DedupeIndex *dedupeIndex, bool createFlag) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + setTargetState(index, IS_OPENED, true, true, createFlag); +} + +/*****************************************************************************/ +static void stopUDSIndex(DedupeIndex *dedupeIndex) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + setTargetState(index, IS_CLOSED, false, false, false); +} + +/*****************************************************************************/ +static void udsUpdate(DataKVIO *dataKVIO) +{ + enqueueIndexOperation(dataKVIO, UDS_UPDATE); +} + +/*****************************************************************************/ +static void dedupeKobjRelease(struct kobject *kobj) +{ + UDSIndex *index = container_of(kobj, UDSIndex, dedupeObject); + udsFreeConfiguration(index->configuration); + FREE(index->indexName); + FREE(index); +} + +/*****************************************************************************/ +static ssize_t dedupeStatusShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + UDSAttribute *ua = container_of(attr, UDSAttribute, attr); + UDSIndex *index = container_of(kobj, UDSIndex, dedupeObject); + if (ua->showString != NULL) { + return sprintf(buf, "%s\n", ua->showString(&index->common)); + } else { + return -EINVAL; + } +} + +/*****************************************************************************/ +static ssize_t dedupeStatusStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + return -EINVAL; +} + +/*****************************************************************************/ + +static struct sysfs_ops dedupeSysfsOps = { + .show = dedupeStatusShow, + .store = dedupeStatusStore, +}; + +static UDSAttribute dedupeStatusAttribute = { + .attr = {.name = "status", .mode = 0444, }, + .showString = getUDSStateName, +}; + +static struct attribute *dedupeAttributes[] = { + &dedupeStatusAttribute.attr, + NULL, +}; + +static struct kobj_type dedupeKobjType = { + .release = dedupeKobjRelease, + .sysfs_ops = &dedupeSysfsOps, + .default_attrs = dedupeAttributes, +}; + +/*****************************************************************************/ +static void startUDSQueue(void *ptr) +{ + /* + * Allow the UDS dedupe worker thread to do memory allocations. It will + * only do allocations during the UDS calls that open or close an index, + * but those allocations can safely sleep while reserving a large amount + * of memory. We could use an allocationsAllowed boolean (like the base + * threads do), but it would be an unnecessary embellishment. 
+ */ + UDSIndex *index = ptr; + registerAllocatingThread(&index->allocatingThread, NULL); +} + +/*****************************************************************************/ +static void finishUDSQueue(void *ptr) +{ + unregisterAllocatingThread(); +} + +/*****************************************************************************/ +int makeUDSIndex(KernelLayer *layer, DedupeIndex **indexPtr) +{ + UDSIndex *index; + int result = ALLOCATE(1, UDSIndex, "UDS index data", &index); + if (result != UDS_SUCCESS) { + return result; + } + + result = allocSprintf("index name", &index->indexName, + "dev=%s offset=4096 size=%llu", + layer->deviceConfig->parentDeviceName, + getIndexRegionSize(layer->geometry) * VDO_BLOCK_SIZE); + if (result != UDS_SUCCESS) { + logError("Creating index name failed (%d)", result); + FREE(index); + return result; + } + + index->udsParams = (struct uds_parameters) UDS_PARAMETERS_INITIALIZER; + indexConfigToUdsParameters(&layer->geometry.indexConfig, &index->udsParams); + result = indexConfigToUdsConfiguration(&layer->geometry.indexConfig, + &index->configuration); + if (result != VDO_SUCCESS) { + FREE(index->indexName); + FREE(index); + return result; + } + udsConfigurationSetNonce(index->configuration, + (UdsNonce) layer->geometry.nonce); + + result = udsCreateIndexSession(&index->indexSession); + if (result != UDS_SUCCESS) { + udsFreeConfiguration(index->configuration); + FREE(index->indexName); + FREE(index); + return result; + } + + static const KvdoWorkQueueType udsQueueType = { + .start = startUDSQueue, + .finish = finishUDSQueue, + .actionTable = { + { .name = "uds_action", .code = UDS_Q_ACTION, .priority = 0 }, + }, + }; + result = makeWorkQueue(layer->threadNamePrefix, "dedupeQ", + &layer->wqDirectory, layer, index, &udsQueueType, 1, + &index->udsQueue); + if (result != VDO_SUCCESS) { + logError("UDS index queue initialization failed (%d)", result); + udsDestroyIndexSession(index->indexSession); + udsFreeConfiguration(index->configuration); + FREE(index->indexName); + FREE(index); + return result; + } + + kobject_init(&index->dedupeObject, &dedupeKobjType); + result = kobject_add(&index->dedupeObject, &layer->kobj, "dedupe"); + if (result != VDO_SUCCESS) { + freeWorkQueue(&index->udsQueue); + udsDestroyIndexSession(index->indexSession); + udsFreeConfiguration(index->configuration); + FREE(index->indexName); + FREE(index); + return result; + } + + index->common.dump = dumpUDSIndex; + index->common.free = freeUDSIndex; + index->common.getDedupeStateName = getUDSStateName; + index->common.getStatistics = getUDSStatistics; + index->common.message = processMessage; + index->common.post = udsPost; + index->common.query = udsQuery; + index->common.resume = resumeUDSIndex; + index->common.start = startUDSIndex; + index->common.stop = stopUDSIndex; + index->common.suspend = suspendUDSIndex; + index->common.finish = finishUDSIndex; + index->common.update = udsUpdate; + + INIT_LIST_HEAD(&index->pendingHead); + spin_lock_init(&index->pendingLock); + spin_lock_init(&index->stateLock); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) + timer_setup(&index->pendingTimer, timeoutIndexOperations, 0); +#else + setup_timer(&index->pendingTimer, timeoutIndexOperations, + (unsigned long) index); +#endif + + *indexPtr = &index->common; + return VDO_SUCCESS; +} diff --git a/vdo/kernel/udsIndex.h b/vdo/kernel/udsIndex.h new file mode 100644 index 0000000..19a7470 --- /dev/null +++ b/vdo/kernel/udsIndex.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/udsIndex.h#1 $ + */ + +#ifndef UDS_INDEX_H +#define UDS_INDEX_H + +#include "dedupeIndex.h" + +/** + * Make a UDS index + * + * @param layer the kernel layer + * @param indexPtr dedupe index returned here + * + * @return VDO_SUCCESS or an error code + **/ +int makeUDSIndex(KernelLayer *layer, DedupeIndex **indexPtr) + __attribute__ ((__warn_unused_result__)); + +#endif /* UDS_INDEX_H */ diff --git a/vdo/kernel/vdoCommon.h b/vdo/kernel/vdoCommon.h new file mode 100644 index 0000000..c83e066 --- /dev/null +++ b/vdo/kernel/vdoCommon.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/vdoCommon.h#1 $ + */ + +#ifndef VDO_COMMON_H +#define VDO_COMMON_H + +enum { + // Whether the bio acknowledgement queue is used for acks of reads. + USE_BIO_ACK_QUEUE_FOR_READ = 0, +}; + +#endif /* VDO_COMMON_H */ diff --git a/vdo/kernel/vdoStringUtils.c b/vdo/kernel/vdoStringUtils.c new file mode 100644 index 0000000..d12580c --- /dev/null +++ b/vdo/kernel/vdoStringUtils.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/vdoStringUtils.c#1 $ + */ + +#include "vdoStringUtils.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "stringUtils.h" + +#include "statusCodes.h" + +/**********************************************************************/ +char *vAppendToBuffer(char *buffer, + char *bufEnd, + const char *fmt, + va_list args) +{ + size_t n = vsnprintf(buffer, bufEnd - buffer, fmt, args); + if (n >= (size_t) (bufEnd - buffer)) { + buffer = bufEnd; + } else { + buffer += n; + } + return buffer; +} + +/**********************************************************************/ +char *appendToBuffer(char *buffer, char *bufEnd, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + char *pos = vAppendToBuffer(buffer, bufEnd, fmt, ap); + va_end(ap); + return pos; +} + +/**********************************************************************/ +void freeStringArray(char **stringArray) +{ + for (unsigned int offset = 0; stringArray[offset] != NULL; offset++) { + FREE(stringArray[offset]); + } + FREE(stringArray); +} + +/**********************************************************************/ +int splitString(const char *string, char separator, char ***substringArrayPtr) +{ + unsigned int substringCount = 1; + for (const char *s = string; *s != 0; s++) { + if (*s == separator) { + substringCount++; + } + } + + char **substrings; + int result = ALLOCATE(substringCount + 1, char *, "string-splitting array", + &substrings); + if (result != UDS_SUCCESS) { + return result; + } + unsigned int currentSubstring = 0; + for (const char *s = string; *s != 0; s++) { + if (*s == separator) { + ptrdiff_t length = s - string; + result = ALLOCATE(length + 1, char, "split string", + &substrings[currentSubstring]); + if (result != UDS_SUCCESS) { + freeStringArray(substrings); + return result; + } + // Trailing NUL is already in place after allocation; deal with + // the zero or more non-NUL bytes in the string. + if (length > 0) { + memcpy(substrings[currentSubstring], string, length); + } + string = s + 1; + currentSubstring++; + BUG_ON(currentSubstring >= substringCount); + } + } + // Process final string, with no trailing separator. 
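+  // (For example, splitting "a,,b" on ',' has, at this point, copied
+  // "a" and "" in the loop above; string now points at "b", and the
+  // final copy below yields the array { "a", "", "b", NULL }.)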
+ BUG_ON(currentSubstring != (substringCount - 1)); + ptrdiff_t length = strlen(string); + result = ALLOCATE(length + 1, char, "split string", + &substrings[currentSubstring]); + if (result != UDS_SUCCESS) { + freeStringArray(substrings); + return result; + } + memcpy(substrings[currentSubstring], string, length); + currentSubstring++; + // substrings[currentSubstring] is NULL already + *substringArrayPtr = substrings; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int joinStrings(char **substringArray, + size_t arrayLength, + char separator, + char **stringPtr) +{ + size_t stringLength = 0; + for (size_t i = 0; (i < arrayLength) && (substringArray[i] != NULL); i++) { + stringLength += strlen(substringArray[i]) + 1; + } + + char *output; + int result = ALLOCATE(stringLength, char, __func__, &output); + if (result != VDO_SUCCESS) { + return result; + } + + char *currentPosition = &output[0]; + for (size_t i = 0; (i < arrayLength) && (substringArray[i] != NULL); i++) { + currentPosition = appendToBuffer(currentPosition, output + stringLength, + "%s", substringArray[i]); + *currentPosition = separator; + currentPosition++; + } + + // We output one too many separators; replace the last with a zero byte. + if (currentPosition != output) { + *(currentPosition - 1) = '\0'; + } + + *stringPtr = output; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int stringToUInt(const char *input, unsigned int *valuePtr) +{ + unsigned long longValue; + int result = kstrtoul(input, 10, &longValue); + if (result != 0) { + return result; + } + + if (longValue > UINT_MAX) { + return -ERANGE; + } + + *valuePtr = longValue; + return UDS_SUCCESS; +} diff --git a/vdo/kernel/vdoStringUtils.h b/vdo/kernel/vdoStringUtils.h new file mode 100644 index 0000000..067ed9e --- /dev/null +++ b/vdo/kernel/vdoStringUtils.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/vdoStringUtils.h#1 $ + */ + +#ifndef VDO_STRING_UTILS_H +#define VDO_STRING_UTILS_H + +#include +#include + +/** + * Helper to append a string to a buffer. + * + * @param buffer the place at which to append the string + * @param bufEnd pointer to the end of the buffer + * @param fmt a printf format string + * + * @return the updated buffer position after the append + * + * if insufficient space is available, the contents are silently truncated + **/ +char *appendToBuffer(char *buffer, char *bufEnd, const char *fmt, ...); + +/** + * Variable-arglist helper to append a string to a buffer. + * If insufficient space is available, the contents are silently truncated. 
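+ *
+ * A typical pattern (illustrative only; buf, name, and count are
+ * arbitrary caller variables) chains calls through the returned
+ * position, usually via the appendToBuffer() wrapper above, so that
+ * truncation can never overrun the buffer:
+ *
+ *   char buf[64];
+ *   char *end = buf + sizeof(buf);
+ *   char *pos = buf;
+ *   pos = appendToBuffer(pos, end, "%s:", name);
+ *   pos = appendToBuffer(pos, end, " %u items", count);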
+ * + * @param buffer the place at which to append the string + * @param bufEnd pointer to the end of the buffer + * @param fmt a printf format string + * @param args printf arguments + * + * @return the updated buffer position after the append + **/ +char *vAppendToBuffer(char *buffer, + char *bufEnd, + const char *fmt, + va_list args); + +/** + * Split the input string into substrings, separated at occurrences of + * the indicated character, returning a null-terminated list of string + * pointers. + * + * The string pointers and the pointer array itself should both be + * freed with FREE() when no longer needed. This can be done with + * freeStringArray (below) if the pointers in the array are not + * changed. Since the array and copied strings are allocated by this + * function, it may only be used in contexts where allocation is + * permitted. + * + * Empty substrings are not ignored; that is, returned substrings may + * be empty strings if the separator occurs twice in a row. + * + * @param [in] string The input string to be broken apart + * @param [in] separator The separator character + * @param [out] substringArrayPtr The NULL-terminated substring array + * + * @return UDS_SUCCESS or -ENOMEM + **/ +int splitString(const char *string, char separator, char ***substringArrayPtr) + __attribute__((warn_unused_result)); + +/** + * Join the input substrings into one string, joined with the indicated + * character, returning a string. + * + * @param [in] substringArray The NULL-terminated substring array + * @param [in] arrayLength A bound on the number of valid elements + * in substringArray, in case it is not + * NULL-terminated. + * @param [in] separator The separator character + * @param [out] stringPtr A pointer to hold the joined string + * + * @return VDO_SUCCESS or an error + **/ +int joinStrings(char **substringArray, + size_t arrayLength, + char separator, + char **stringPtr) + __attribute__((warn_unused_result)); + +/** + * Free a list of non-NULL string pointers, and then the list itself. + * + * @param stringArray The string list + **/ +void freeStringArray(char **stringArray); + +/** + * Parse a string as an "unsigned int" value, yielding the value. + * On overflow, -ERANGE is returned. On invalid number, -EINVAL is + * returned. + * + * @param [in] input The string to be processed + * @param [out] valuePtr The value of the number read + * + * @return UDS_SUCCESS or -EINVAL or -ERANGE. + **/ +int stringToUInt(const char *input, unsigned int *valuePtr) + __attribute__((warn_unused_result)); + +#endif /* VDO_STRING_UTILS_H */ diff --git a/vdo/kernel/verify.c b/vdo/kernel/verify.c new file mode 100644 index 0000000..672ac91 --- /dev/null +++ b/vdo/kernel/verify.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/verify.c#3 $ + */ + +#include "verify.h" + +#include "logger.h" + +#include "dataKVIO.h" +#include "numeric.h" + +/** + * Compare blocks of memory for equality. + * + * This assumes the blocks are likely to be large; it's not well + * optimized for comparing just a few bytes. This is desirable + * because the Linux kernel memcmp() routine on x86 is not well + * optimized for large blocks, and the performance penalty turns out + * to be significant if you're doing lots of 4KB comparisons. + * + * @param pointerArgument1 first data block + * @param pointerArgument2 second data block + * @param length length of the data block + * + * @return true iff the two blocks are equal + **/ +__attribute__((warn_unused_result)) +static bool memoryEqual(void *pointerArgument1, + void *pointerArgument2, + size_t length) +{ + byte *pointer1 = pointerArgument1; + byte *pointer2 = pointerArgument2; + while (length >= sizeof(uint64_t)) { + /* + * GET_UNALIGNED is just for paranoia. (1) On x86_64 it is + * treated the same as an aligned access. (2) In this use case, + * one or both of the inputs will almost(?) always be aligned. + */ + if (GET_UNALIGNED(uint64_t, pointer1) + != GET_UNALIGNED(uint64_t, pointer2)) { + return false; + } + pointer1 += sizeof(uint64_t); + pointer2 += sizeof(uint64_t); + length -= sizeof(uint64_t); + } + while (length > 0) { + if (*pointer1 != *pointer2) { + return false; + } + pointer1++; + pointer2++; + length--; + } + return true; +} + +/** + * Verify the Albireo-provided deduplication advice, and invoke a + * callback once the answer is available. + * + * After we've compared the stored data with the data to be written, + * or after we've failed to be able to do so, the stored VIO callback + * is queued to be run in the main (kvdoReqQ) thread. + * + * If the advice turns out to be stale and the deduplication session + * is still active, submit a correction. (Currently the correction + * must be sent before the callback can be invoked, if the dedupe + * session is still live.) + * + * @param item The workitem from the queue + **/ +static void verifyDuplicationWork(KvdoWorkItem *item) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(item); + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION("$F;j=dedupe;cb=verify")); + + if (likely(memoryEqual(dataKVIO->dataBlock, dataKVIO->readBlock.data, + VDO_BLOCK_SIZE))) { + // Leave dataKVIO->dataVIO.isDuplicate set to true. + } else { + dataKVIO->dataVIO.isDuplicate = false; + } + + kvdoEnqueueDataVIOCallback(dataKVIO); +} + +/** + * Verify the Albireo-provided deduplication advice, and invoke a + * callback once the answer is available. + * + * @param dataKVIO The DataKVIO that we are looking to dedupe. 
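+ *
+ * If the read fails, the advice is simply dropped (isDuplicate is
+ * cleared) and the DataVIO continues as a non-duplicate rather than
+ * reporting an error.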
+ **/ +static void verifyReadBlockCallback(DataKVIO *dataKVIO) +{ + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + int err = dataKVIO->readBlock.status; + if (unlikely(err != 0)) { + logDebug("%s: err %d", __func__, err); + dataKVIO->dataVIO.isDuplicate = false; + kvdoEnqueueDataVIOCallback(dataKVIO); + return; + } + + launchDataKVIOOnCPUQueue(dataKVIO, verifyDuplicationWork, NULL, + CPU_Q_ACTION_COMPRESS_BLOCK); +} + +/**********************************************************************/ +void kvdoVerifyDuplication(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(dataVIO->isDuplicate, "advice to verify must be valid"); + ASSERT_LOG_ONLY(dataVIO->duplicate.state != MAPPING_STATE_UNMAPPED, + "advice to verify must not be a discard"); + ASSERT_LOG_ONLY(dataVIO->duplicate.pbn != ZERO_BLOCK, + "advice to verify must not point to the zero block"); + ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, + "zeroed block should not have advice to verify"); + + TraceLocation location + = THIS_LOCATION("verifyDuplication;dup=update(verify);io=verify"); + dataVIOAddTraceRecord(dataVIO, location); + kvdoReadBlock(dataVIO, dataVIO->duplicate.pbn, dataVIO->duplicate.state, + BIO_Q_ACTION_VERIFY, verifyReadBlockCallback); +} + +/**********************************************************************/ +bool kvdoCompareDataVIOs(DataVIO *first, DataVIO *second) +{ + dataVIOAddTraceRecord(second, THIS_LOCATION(NULL)); + DataKVIO *a = dataVIOAsDataKVIO(first); + DataKVIO *b = dataVIOAsDataKVIO(second); + return memoryEqual(a->dataBlock, b->dataBlock, VDO_BLOCK_SIZE); +} diff --git a/vdo/kernel/verify.h b/vdo/kernel/verify.h new file mode 100644 index 0000000..5b03dd7 --- /dev/null +++ b/vdo/kernel/verify.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/verify.h#1 $ + */ + +#include "kernelLayer.h" + +/** + * Verify the Albireo-provided deduplication advice, and invoke a callback once + * the answer is available. This is done through a call to kvdoReadBlock() + * which will eventually call back to verifyDuplication() once the block is + * read and possibly uncompressed. + * + * @param dataVIO The DataVIO with advice filled in. + **/ +void kvdoVerifyDuplication(DataVIO *dataVIO); + +/** + * Implements DataVIOComparator. + * + * @param first The first DataVIO to compare + * @param second The second DataVIO to compare + * + * @return true if the contents of the two DataVIOs are the same + **/ +bool kvdoCompareDataVIOs(DataVIO *first, DataVIO *second) + __attribute__((warn_unused_result)); diff --git a/vdo/kernel/workItemStats.c b/vdo/kernel/workItemStats.c new file mode 100644 index 0000000..2027cd8 --- /dev/null +++ b/vdo/kernel/workItemStats.c @@ -0,0 +1,357 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workItemStats.c#4 $ + */ + +#include "workItemStats.h" + +#include "atomic.h" +#include "logger.h" + +/** + * Scan the work queue stats table for the provided work function and + * priority value. If it's not found, see if an empty slot is + * available. + * + * @param table The work queue's function table + * @param work The function we want to record stats for + * @param priority The priority of the work item + * + * @return The index of the slot to use (matching or empty), or + * NUM_WORK_QUEUE_ITEM_STATS if the table is full of + * non-matching entries. + **/ +static inline unsigned int scanStatTable(const KvdoWorkFunctionTable *table, + KvdoWorkFunction work, + unsigned int priority) +{ + unsigned int i; + /* + * See comments in getStatTableIndex regarding order of memory + * accesses. Work function first, then a barrier, then priority. + */ + for (i = 0; i < NUM_WORK_QUEUE_ITEM_STATS; i++) { + if (table->functions[i] == NULL) { + return i; + } else if (table->functions[i] == work) { + smp_rmb(); + if (table->priorities[i] == priority) { + return i; + } + } + } + return NUM_WORK_QUEUE_ITEM_STATS; +} + +/** + * Scan the work queue stats table for the provided work function and + * priority value. Assign an empty slot if necessary. + * + * @param stats The stats structure + * @param work The function we want to record stats for + * @param priority The priority of the work item + * + * @return The index of the matching slot, or NUM_WORK_QUEUE_ITEM_STATS + * if the table is full of non-matching entries. + **/ +static unsigned int getStatTableIndex(KvdoWorkItemStats *stats, + KvdoWorkFunction work, + unsigned int priority) +{ + KvdoWorkFunctionTable *functionTable = &stats->functionTable; + + unsigned int index = scanStatTable(functionTable, work, priority); + if (unlikely(index == NUM_WORK_QUEUE_ITEM_STATS) + || likely(functionTable->functions[index] != NULL)) { + return index; + } + + unsigned long flags = 0; + // The delayed-work-item processing uses queue->lock in some cases, + // and one case may call into this function, so we can't reuse + // queue->lock here. + spin_lock_irqsave(&functionTable->lock, flags); + // Recheck now that we've got the lock... + index = scanStatTable(functionTable, work, priority); + if ((index == NUM_WORK_QUEUE_ITEM_STATS) + || (functionTable->functions[index] != NULL)) { + spin_unlock_irqrestore(&functionTable->lock, flags); + return index; + } + + /* + * An uninitialized priority is indistinguishable from a zero + * priority. So store the priority first, and enforce the ordering, + * so that a non-null work function pointer indicates we've finished + * filling in the value. 
(And, to make this work, we have to read + * the work function first and priority second, when comparing.) + */ + functionTable->priorities[index] = priority; + smp_wmb(); + functionTable->functions[index] = work; + spin_unlock_irqrestore(&functionTable->lock, flags); + return index; +} + +/** + * Get counters on work items, identified by index into the internal + * array. + * + * @param [in] stats The collected statistics + * @param [in] index The index + * @param [out] enqueuedPtr The total work items enqueued + * @param [out] processedPtr The number of work items processed + * @param [out] pendingPtr The number of work items still pending + **/ +static void getWorkItemCountsByItem(const KvdoWorkItemStats *stats, + unsigned int index, + uint64_t *enqueuedPtr, + uint64_t *processedPtr, + unsigned int *pendingPtr) +{ + uint64_t enqueued = atomic64_read(&stats->enqueued[index]); + uint64_t processed = stats->times[index].count; + unsigned int pending; + if (enqueued < processed) { + // Probably just out of sync. + pending = 1; + } else { + pending = enqueued - processed; + // Pedantic paranoia: Check for overflow of the 32-bit "pending". + if ((pending + processed) < enqueued) { + pending = UINT_MAX; + } + } + *enqueuedPtr = enqueued; + *processedPtr = processed; + *pendingPtr = pending; +} + +/** + * Get counters on work items not covered by any index value. + * + * @param [in] stats The collected statistics + * @param [out] enqueuedPtr The total work items enqueued + * @param [out] processedPtr The number of work items processed + **/ +static void getOtherWorkItemCounts(const KvdoWorkItemStats *stats, + uint64_t *enqueuedPtr, + uint64_t *processedPtr) +{ + unsigned int pending; + getWorkItemCountsByItem(stats, NUM_WORK_QUEUE_ITEM_STATS, + enqueuedPtr, processedPtr, &pending); +} + +/** + * Get timing stats on work items, identified by index into the + * internal array. + * + * @param [in] stats The collected statistics + * @param [in] index The index into the array + * @param [out] min The minimum execution time + * @param [out] mean The mean execution time + * @param [out] max The maximum execution time + **/ +static void getWorkItemTimesByItem(const KvdoWorkItemStats *stats, + unsigned int index, + uint64_t *min, + uint64_t *mean, + uint64_t *max) +{ + *min = stats->times[index].min; + *mean = getSampleAverage(&stats->times[index]); + *max = stats->times[index].max; +} + +/**********************************************************************/ +void updateWorkItemStatsForEnqueue(KvdoWorkItemStats *stats, + KvdoWorkItem *item, + int priority) +{ + item->statTableIndex = getStatTableIndex(stats, item->statsFunction, + priority); + atomic64_add(1, &stats->enqueued[item->statTableIndex]); +} + +/**********************************************************************/ +char *getFunctionName(void *pointer, char *buffer, size_t bufferLength) +{ + if (pointer == NULL) { + /* + * Format "%ps" logs a null pointer as "(null)" with a bunch of + * leading spaces. We sometimes use this when logging lots of + * data; don't be so verbose. + */ + strncpy(buffer, "-", bufferLength); + } else { + /* + * Use a non-const array instead of a string literal below to + * defeat gcc's format checking, which doesn't understand that + * "%ps" actually does support a precision spec in Linux kernel + * code. 
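+     * ("%ps" prints the symbol name for a kernel pointer; the "%.*ps"
+     * form used here bounds the printed name to bufferLength - 1
+     * characters.)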
+ */ + static char truncatedFunctionNameFormatString[] = "%.*ps"; + snprintf(buffer, bufferLength, + truncatedFunctionNameFormatString, + bufferLength - 1, + pointer); + + char *space = strchr(buffer, ' '); + if (space != NULL) { + *space = '\0'; + } + } + + return buffer; +} + +/**********************************************************************/ +size_t formatWorkItemStats(const KvdoWorkItemStats *stats, + char *buffer, + size_t length) +{ + const KvdoWorkFunctionTable *functionIDs = &stats->functionTable; + size_t currentOffset = 0; + + uint64_t enqueued, processed; + int i; + for (i = 0; i < NUM_WORK_QUEUE_ITEM_STATS; i++) { + if (functionIDs->functions[i] == NULL) { + break; + } + if (atomic64_read(&stats->enqueued[i]) == 0) { + continue; + } + /* + * The reporting of all of "pending", "enqueued" and "processed" + * here seems redundant, but "pending" is limited to 0 in the case + * where "processed" exceeds "enqueued", either through current + * activity and a lack of synchronization when fetching stats, or + * a coding bug. This report is intended largely for debugging, so + * we'll go ahead and print the not-necessarily-redundant values. + */ + unsigned int pending; + getWorkItemCountsByItem(stats, i, &enqueued, &processed, &pending); + + // Format: fn prio enq proc timeo [ min max mean ] + if (ENABLE_PER_FUNCTION_TIMING_STATS) { + uint64_t min, mean, max; + getWorkItemTimesByItem(stats, i, &min, &mean, &max); + currentOffset += snprintf(buffer + currentOffset, + length - currentOffset, + "%-36ps %d %10llu %10" PRIu64 + " %10llu %10llu %10" PRIu64 + "\n", + functionIDs->functions[i], + functionIDs->priorities[i], + enqueued, processed, + min, max, mean); + } else { + currentOffset += snprintf(buffer + currentOffset, + length - currentOffset, + "%-36ps %d %10llu %10" PRIu64 + "\n", + functionIDs->functions[i], + functionIDs->priorities[i], + enqueued, processed); + } + if (currentOffset >= length) { + break; + } + } + if ((i == NUM_WORK_QUEUE_ITEM_STATS) && (currentOffset < length)) { + uint64_t enqueued, processed; + getOtherWorkItemCounts(stats, &enqueued, &processed); + if (enqueued > 0) { + currentOffset += snprintf(buffer + currentOffset, + length - currentOffset, + "%-36s %d %10llu %10" PRIu64 + "\n", + "OTHER", 0, + enqueued, processed); + } + } + return currentOffset; +} + +/**********************************************************************/ +void logWorkItemStats(const KvdoWorkItemStats *stats) +{ + uint64_t totalEnqueued = 0; + uint64_t totalProcessed = 0; + + const KvdoWorkFunctionTable *functionIDs = &stats->functionTable; + + int i; + for (i = 0; i < NUM_WORK_QUEUE_ITEM_STATS; i++) { + if (functionIDs->functions[i] == NULL) { + break; + } + if (atomic64_read(&stats->enqueued[i]) == 0) { + continue; + } + /* + * The reporting of all of "pending", "enqueued" and "processed" + * here seems redundant, but "pending" is limited to 0 in the case + * where "processed" exceeds "enqueued", either through current + * activity and a lack of synchronization when fetching stats, or + * a coding bug. This report is intended largely for debugging, so + * we'll go ahead and print the not-necessarily-redundant values. 
+ */ + uint64_t enqueued, processed; + unsigned int pending; + getWorkItemCountsByItem(stats, i, &enqueued, &processed, &pending); + totalEnqueued += enqueued; + totalProcessed += processed; + + static char work[256]; // arbitrary size + getFunctionName(functionIDs->functions[i], work, sizeof(work)); + + if (ENABLE_PER_FUNCTION_TIMING_STATS) { + uint64_t min, mean, max; + getWorkItemTimesByItem(stats, i, &min, &mean, &max); + logInfo(" priority %d: %u pending" + " %llu enqueued %llu processed" + " %s" + " times %llu/%llu/%lluns", + functionIDs->priorities[i], + pending, enqueued, processed, work, + min, mean, max); + } else { + logInfo(" priority %d: %u pending" + " %llu enqueued %llu processed" + " %s", + functionIDs->priorities[i], + pending, enqueued, processed, work); + } + } + if (i == NUM_WORK_QUEUE_ITEM_STATS) { + uint64_t enqueued, processed; + getOtherWorkItemCounts(stats, &enqueued, &processed); + if (enqueued > 0) { + totalEnqueued += enqueued; + totalProcessed += processed; + logInfo(" ... others: %llu enqueued %llu processed", + enqueued, processed); + } + } + logInfo(" total: %llu enqueued %llu processed", + totalEnqueued, totalProcessed); +} diff --git a/vdo/kernel/workItemStats.h b/vdo/kernel/workItemStats.h new file mode 100644 index 0000000..0898f3b --- /dev/null +++ b/vdo/kernel/workItemStats.h @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workItemStats.h#2 $ + */ + +#ifndef WORK_ITEM_STATS_H +#define WORK_ITEM_STATS_H + +#include "timeUtils.h" + +#include "workQueue.h" + +enum { + // Whether to enable tracking of per-work-function run-time stats. + ENABLE_PER_FUNCTION_TIMING_STATS = 0, + // How many work function/priority pairs to track call stats for + NUM_WORK_QUEUE_ITEM_STATS = 18, +}; + +typedef struct simpleStats { + uint64_t count; + uint64_t sum; + uint64_t min; + uint64_t max; +} SimpleStats; + +/* + * We track numbers of work items handled (and optionally the + * wall-clock time to run the work functions), broken down by + * individual work functions (or alternate functions that the caller + * wants recorded, like the VIO completion callback function if we're + * just enqueueing a work function that invokes that indirectly) and + * priority. + * + * The first part of this structure manages the function/priority + * pairs, and is read frequently but updated rarely (once for each + * pair, plus possibly spin lock contention). + * + * The second part holds counters, and is updated often; different + * parts are updated by various threads as described below. The last + * element of each array, index NUM_WORK_QUEUE_ITEM_STATS, is updated + * only if we have filled the arrays and can't add the current work + * function/priority. 
See how the statTableIndex field is set in + * workItemStats.c. + * + * All fields may additionally be read when reporting statistics + * (including optionally reporting stats when the worker thread shuts + * down), but that's rare and shouldn't significantly affect cache + * contention issues. + * + * There is no "pending" count per work function here. For reporting + * statistics, it can be approximated by looking at the other fields. + * Do not rely on them being precise and synchronized, though. + */ +typedef struct kvdoWorkItemStatsFunctionTable { + /* + * The spin lock is used to protect .functions and .priorities + * during updates. All three are modified by producers (enqueueing + * threads) but only rarely. The .functions and .priorities arrays + * are read by producers very frequently. + */ + spinlock_t lock; + KvdoWorkFunction functions[NUM_WORK_QUEUE_ITEM_STATS]; + uint8_t priorities[NUM_WORK_QUEUE_ITEM_STATS]; +} KvdoWorkFunctionTable; + +typedef struct kvdoWorkItemStats { + /* + * Table of functions and priorities, for determining the index to + * use into the counter arrays below. + * + * This table is read by producers (usually multiple entries) for + * every work item enqueued, and when reporting stats. It is updated + * by producers, and only the first time a new (work-function, + * priority) combination is seen. + */ + KvdoWorkFunctionTable functionTable; + // Skip to (somewhere on) the next cache line + char pad[CACHE_LINE_BYTES - sizeof(atomic64_t)]; + /* + * The .enqueued field is updated by producers only, once per work + * item processed; __sync operations are used to update these + * values. + */ + atomic64_t enqueued[NUM_WORK_QUEUE_ITEM_STATS + 1]; + // Skip to (somewhere on) the next cache line + char pad2[CACHE_LINE_BYTES - sizeof(atomic64_t)]; + /* + * These values are updated only by the consumer (worker thread). We + * overload the .times[].count field as a count of items processed, + * so if we're not doing the optional processing-time tracking + * (controlled via an option in workQueue.c), we need to explicitly + * update the count. + * + * Since only one thread can ever update these values, no + * synchronization is used. + */ + SimpleStats times[NUM_WORK_QUEUE_ITEM_STATS + 1]; +} KvdoWorkItemStats; + +/** + * Initialize a statistics structure for tracking sample + * values. Assumes the storage was already zeroed out at allocation + * time. + * + * @param stats The statistics structure + **/ +static inline void initSimpleStats(SimpleStats *stats) +{ + // Assume other fields are initialized to zero at allocation. + stats->min = UINT64_MAX; +} + +/** + * Update the statistics being tracked for a new sample value. + * + * @param stats The statistics structure + * @param value The new value to be folded in + **/ +static inline void addSample(SimpleStats *stats, uint64_t value) +{ + stats->count++; + stats->sum += value; + if (stats->min > value) { + stats->min = value; + } + if (stats->max < value) { + stats->max = value; + } +} + +/** + * Return the average of the samples collected. + * + * @param stats The statistics structure + * + * @return The average sample value + **/ +static inline uint64_t getSampleAverage(const SimpleStats *stats) +{ + uint64_t slop = stats->count / 2; + return (stats->sum + slop) / stats->count; +} + +/** + * Update all work queue statistics (work-item and otherwise) after + * enqueueing a work item. 
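+ *
+ * As an illustrative sketch only (the real callers go through the
+ * workQueueStats wrappers in workQueue.c), the hooks in this header are
+ * meant to bracket a work item's life cycle roughly like this:
+ *
+ *   updateWorkItemStatsForEnqueue(stats, item, priority);  // producer side
+ *   ...
+ *   updateWorkItemStatsForDequeue(stats, item);            // worker thread
+ *   unsigned int index = item->statTableIndex;
+ *   uint64_t start = recordStartTime(index);
+ *   item->work(item);               // item may be freed by its work function
+ *   updateWorkItemStatsForWorkTime(stats, index, start);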
+ * + * @param stats The statistics structure + * @param item The work item enqueued + * @param priority The work item's priority + **/ +void updateWorkItemStatsForEnqueue(KvdoWorkItemStats *stats, + KvdoWorkItem *item, + int priority); + +/** + * Update all work queue statistics (work-item and otherwise) after enqueueing + * a work item. + * + * This is a very lightweight function (after optimizing away conditionals and + * no-ops) and is called for every work item processed, hence the inline + * definition. + * + * This function requires that recordStartTime and + * updateWorkItemStatsForWorkTime below both get called as well; in some cases + * counters may be updated in updateWorkItemStatsForWorkTime rather than here. + * + * @param stats The statistics structure + * @param item The work item enqueued + **/ +static inline void updateWorkItemStatsForDequeue(KvdoWorkItemStats *stats, + KvdoWorkItem *item) +{ + // The times[].count field is overloaded as a count of items + // processed. + if (!ENABLE_PER_FUNCTION_TIMING_STATS) { + stats->times[item->statTableIndex].count++; + } else { + // In this case, updateWorkItemStatsForWorkTime will bump the counter. + } +} + +/** + * Record the starting time for processing a work item, if timing + * stats are enabled and if we haven't run out of room for recording + * stats in the table. + * + * @param index The work item's index into the internal array + * + * @return The current time, or zero + **/ +static inline uint64_t recordStartTime(unsigned int index) +{ + return (ENABLE_PER_FUNCTION_TIMING_STATS ? currentTime(CLOCK_MONOTONIC) : 0); +} + +/** + * Update the work queue statistics with the wall-clock time for + * processing a work item, if timing stats are enabled and if we + * haven't run out of room for recording stats in the table. + * + * @param stats The statistics structure + * @param index The work item's index into the internal array + * @param startTime The start time as reported by recordStartTime + **/ +static inline void updateWorkItemStatsForWorkTime(KvdoWorkItemStats *stats, + unsigned int index, + uint64_t startTime) +{ + if (ENABLE_PER_FUNCTION_TIMING_STATS) { + uint64_t endTime = currentTime(CLOCK_MONOTONIC); + addSample(&stats->times[index], endTime - startTime); + } +} + +/** + * Convert the pointer into a string representation, using a function + * name if available. + * + * @param pointer The pointer to be converted + * @param buffer The output buffer + * @param bufferLength The size of the output buffer + **/ +char *getFunctionName(void *pointer, char *buffer, size_t bufferLength); + +/** + * Dump statistics broken down by work function and priority into the + * kernel log. + * + * @param stats The statistics structure + **/ +void logWorkItemStats(const KvdoWorkItemStats *stats); + +/** + * Format counters for per-work-function stats for reporting via /sys. + * + * @param [in] stats The statistics structure + * @param [out] buffer The output buffer + * @param [in] length The size of the output buffer + * + * @return The size of the string actually written + **/ +size_t formatWorkItemStats(const KvdoWorkItemStats *stats, + char *buffer, + size_t length); + +#endif // WORK_ITEM_STATS_H diff --git a/vdo/kernel/workQueue.c b/vdo/kernel/workQueue.c new file mode 100644 index 0000000..8be3285 --- /dev/null +++ b/vdo/kernel/workQueue.c @@ -0,0 +1,1152 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueue.c#11 $ + */ + +#include "workQueue.h" + +#include +#include +#include + +#include "atomic.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "stringUtils.h" + +#include "numeric.h" +#include "workItemStats.h" +#include "workQueueHandle.h" +#include "workQueueInternals.h" +#include "workQueueStats.h" +#include "workQueueSysfs.h" + +enum { + // Time between work queue heartbeats in usec. The default kernel + // configurations generally have 1ms or 4ms tick rates, so let's make this a + // multiple for accuracy. + FUNNEL_HEARTBEAT_INTERVAL = 4000, + + // Time to wait for a work queue to flush remaining items during shutdown. + // Specified in milliseconds. + FUNNEL_FINISH_SLEEP = 5000, +}; + +static struct mutex queueDataLock; +static SimpleWorkQueue queueData; + +static void freeSimpleWorkQueue(SimpleWorkQueue *queue); +static void finishSimpleWorkQueue(SimpleWorkQueue *queue); + +// work item lists (used for delayed work items) + +/**********************************************************************/ +static void initializeWorkItemList(KvdoWorkItemList *list) +{ + list->tail = NULL; +} + +/**********************************************************************/ +static void addToWorkItemList(KvdoWorkItemList *list, KvdoWorkItem *item) +{ + if (list->tail == NULL) { + item->next = item; + } else { + KvdoWorkItem *head = list->tail->next; + list->tail->next = item; + item->next = head; + } + list->tail = item; +} + +/**********************************************************************/ +static bool isWorkItemListEmpty(KvdoWorkItemList *list) +{ + return list->tail == NULL; +} + +/**********************************************************************/ +static KvdoWorkItem *workItemListPoll(KvdoWorkItemList *list) +{ + KvdoWorkItem *tail = list->tail; + if (tail == NULL) { + return NULL; + } + // Extract and return head of list. + KvdoWorkItem *head = tail->next; + // Only one entry? + if (head == tail) { + list->tail = NULL; + } else { + tail->next = head->next; + } + head->next = NULL; + return head; +} + +/**********************************************************************/ +static KvdoWorkItem *workItemListPeek(KvdoWorkItemList *list) +{ + KvdoWorkItem *tail = list->tail; + return tail ? tail->next : NULL; +} + +// Finding the SimpleWorkQueue to actually operate on. + +/** + * Pick the next subordinate service queue in rotation. + * + * This doesn't need to be 100% precise in distributing work items around, so + * playing loose with concurrent field modifications isn't going to hurt us. + * (Avoiding the atomic ops may help us a bit in performance, but we'll still + * have contention over the fields.) 
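+ *
+ * For example, with numServiceQueues == 3 a single producer sees the
+ * unsynchronized rotor select indices 0, 1, 2, 0, ...; concurrent producers
+ * may occasionally repeat or skip an index, which is acceptable here.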
+ * + * @param queue The round-robin-type work queue + * + * @return A subordinate work queue + **/ +static inline SimpleWorkQueue *nextServiceQueue(RoundRobinWorkQueue *queue) +{ + unsigned int index = (queue->serviceQueueRotor++ % queue->numServiceQueues); + return queue->serviceQueues[index]; +} + +/** + * Find a simple work queue on which to operate. + * + * If the argument is already a simple work queue, use it. If it's a + * round-robin work queue, pick the next subordinate service queue and use it. + * + * @param queue a work queue (round-robin or simple) + * + * @return a simple work queue + **/ +static inline SimpleWorkQueue *pickSimpleQueue(KvdoWorkQueue *queue) +{ + return (queue->roundRobinMode + ? nextServiceQueue(asRoundRobinWorkQueue(queue)) + : asSimpleWorkQueue(queue)); +} + +// Processing normal work items. + +/** + * Scan the work queue's work item lists, and dequeue and return the next + * waiting work item, if any. + * + * We scan the funnel queues from highest priority to lowest, once; there is + * therefore a race condition where a high-priority work item can be enqueued + * followed by a lower-priority one, and we'll grab the latter (but we'll catch + * the high-priority item on the next call). If strict enforcement of + * priorities becomes necessary, this function will need fixing. + * + * @param queue the work queue + * + * @return a work item pointer, or NULL + **/ +static KvdoWorkItem *pollForWorkItem(SimpleWorkQueue *queue) +{ + KvdoWorkItem *item = NULL; + for (int i = READ_ONCE(queue->numPriorityLists) - 1; i >= 0; i--) { + FunnelQueueEntry *link = funnelQueuePoll(queue->priorityLists[i]); + if (link != NULL) { + item = container_of(link, KvdoWorkItem, workQueueEntryLink); + break; + } + } + + return item; +} + +/** + * Add a work item into the queue, and inform the caller of any additional + * processing necessary. + * + * If the worker thread may not be awake, true is returned, and the caller + * should attempt a wakeup. + * + * @param queue The work queue + * @param item The work item to add + * + * @return true iff the caller should wake the worker thread + **/ +__attribute__((warn_unused_result)) +static bool enqueueWorkQueueItem(SimpleWorkQueue *queue, KvdoWorkItem *item) +{ + ASSERT_LOG_ONLY(item->myQueue == NULL, + "item %" PRIptr " (fn %" PRIptr "/%" PRIptr + ") to enqueue (%" PRIptr + ") is not already queued (%" PRIptr ")", + item, item->work, item->statsFunction, queue, + item->myQueue); + if (ASSERT(item->action < WORK_QUEUE_ACTION_COUNT, + "action is in range for queue") != VDO_SUCCESS) { + item->action = 0; + } + unsigned int priority = READ_ONCE(queue->priorityMap[item->action]); + + // Update statistics. + updateStatsForEnqueue(&queue->stats, item, priority); + + item->myQueue = &queue->common; + + // Funnel queue handles the synchronization for the put. + funnelQueuePut(queue->priorityLists[priority], &item->workQueueEntryLink); + + /* + * Due to how funnel-queue synchronization is handled (just atomic + * operations), the simplest safe implementation here would be to wake-up any + * waiting threads after enqueueing each item. Even if the funnel queue is + * not empty at the time of adding an item to the queue, the consumer thread + * may not see this since it is not guaranteed to have the same view of the + * queue as a producer thread. + * + * However, the above is wasteful so instead we attempt to minimize the + * number of thread wakeups. This is normally unsafe due to the above + * consumer-producer synchronization constraints. 
To correct this a timeout + * mechanism is used to wake the thread periodically to handle the occasional + * race condition that triggers and results in this thread not being woken + * properly. + * + * In most cases, the above timeout will not occur prior to some other work + * item being added after the queue is set to idle state, so thread wakeups + * will generally be triggered much faster than this interval. The timeout + * provides protection against the cases where more work items are either not + * added or are added too infrequently. + * + * This is also why we can get away with the normally-unsafe optimization for + * the common case by checking queue->idle first without synchronization. The + * race condition exists, but another work item getting enqueued can wake us + * up, and if we don't get that either, we still have the timeout to fall + * back on. + * + * Developed and tuned for some x86 boxes; untested whether this is any + * better or worse for other platforms, with or without the explicit memory + * barrier. + */ + smp_mb(); + return ((atomic_read(&queue->idle) == 1) + && (atomic_cmpxchg(&queue->idle, 1, 0) == 1)); +} + +/** + * Compute an approximate indication of the number of pending work items. + * + * No synchronization is used, so it's guaranteed to be correct only if there + * is no activity. + * + * @param queue The work queue to examine + * + * @return the estimate of the number of pending work items + **/ +static unsigned int getPendingCount(SimpleWorkQueue *queue) +{ + KvdoWorkItemStats *stats = &queue->stats.workItemStats; + long long pending = 0; + for (int i = 0; i < NUM_WORK_QUEUE_ITEM_STATS + 1; i++) { + pending += atomic64_read(&stats->enqueued[i]); + pending -= stats->times[i].count; + } + if (pending < 0) { + /* + * If we fetched numbers that were changing, we can get negative results. + * Just return an indication that there's some activity. + */ + pending = 1; + } + return pending; +} + +/** + * Run any start hook that may be defined for the work queue. + * + * @param queue The work queue + **/ +static void runStartHook(SimpleWorkQueue *queue) +{ + if (queue->type->start != NULL) { + queue->type->start(queue->private); + } +} + +/** + * Run any finish hook that may be defined for the work queue. + * + * @param queue The work queue + **/ +static void runFinishHook(SimpleWorkQueue *queue) +{ + if (queue->type->finish != NULL) { + queue->type->finish(queue->private); + } +} + +/** + * If the work queue has a suspend hook, invoke it, and when it finishes, check + * again for any pending work items. + * + * We assume a check for pending work items has just been done and turned up + * empty; so, if no suspend hook exists, we can just return NULL without doing + * another check. + * + * @param [in] queue The work queue preparing to suspend + * + * @return the newly found work item, if any + **/ +static KvdoWorkItem *runSuspendHook(SimpleWorkQueue *queue) +{ + if (queue->type->suspend == NULL) { + return NULL; + } + + queue->type->suspend(queue->private); + return pollForWorkItem(queue); +} + +/** + * Check whether a work queue has delayed work items pending. 
+ * + * @param queue The work queue + * + * @return true iff delayed work items are pending + **/ +static bool hasDelayedWorkItems(SimpleWorkQueue *queue) +{ + bool result; + unsigned long flags; + spin_lock_irqsave(&queue->lock, flags); + result = !isWorkItemListEmpty(&queue->delayedItems); + spin_unlock_irqrestore(&queue->lock, flags); + return result; +} + +/** + * Wait for the next work item to process, or until kthread_should_stop + * indicates that it's time for us to shut down. + * + * If kthread_should_stop says it's time to stop but we have pending work + * items, return a work item. + * + * Update statistics relating to scheduler interactions. + * + * @param [in] queue The work queue to wait on + * @param [in] timeoutInterval How long to wait each iteration + * + * @return the next work item, or NULL to indicate shutdown is requested + **/ +static KvdoWorkItem *waitForNextWorkItem(SimpleWorkQueue *queue, + TimeoutJiffies timeoutInterval) +{ + KvdoWorkItem *item = runSuspendHook(queue); + if (item != NULL) { + return item; + } + + DEFINE_WAIT(wait); + while (true) { + atomic64_set(&queue->firstWakeup, 0); + prepare_to_wait(&queue->waitingWorkerThreads, &wait, TASK_INTERRUPTIBLE); + /* + * Don't set the idle flag until a wakeup will not be lost. + * + * Force synchronization between setting the idle flag and checking the + * funnel queue; the producer side will do them in the reverse order. + * (There's still a race condition we've chosen to allow, because we've got + * a timeout below that unwedges us if we hit it, but this may narrow the + * window a little.) + */ + atomic_set(&queue->idle, 1); + memoryFence(); // store-load barrier between "idle" and funnel queue + + item = pollForWorkItem(queue); + if (item != NULL) { + break; + } + + /* + * We need to check for thread-stop after setting TASK_INTERRUPTIBLE state + * up above. Otherwise, schedule() will put the thread to sleep and might + * miss a wakeup from kthread_stop() call in finishWorkQueue(). + * + * If there are delayed work items, we need to wait for them to + * get run. Then, when we check kthread_should_stop again, we'll + * finally exit. + */ + if (kthread_should_stop() && !hasDelayedWorkItems(queue)) { + /* + * Recheck once again in case we *just* converted a delayed work item to + * a regular enqueued work item. + * + * It's important that processDelayedWorkItems holds the spin lock until + * it finishes enqueueing the work item to run. + * + * Funnel queues aren't synchronized between producers and consumer. + * Normally a producer interrupted mid-update can hide a later producer's + * entry until the first completes. This would be a problem, except that + * when kthread_stop is called, we should already have ceased adding new + * work items and have waited for all the regular work items to finish; + * (recurring) delayed work items should be the only exception. + * + * Worker thread shutdown would be simpler if even the delayed work items + * were required to be completed and not re-queued before shutting down a + * work queue. + */ + item = pollForWorkItem(queue); + break; + } + + /* + * We don't need to update the wait count atomically since this is the only + * place it is modified and there is only one thread involved. + */ + queue->stats.waits++; + uint64_t timeBeforeSchedule = currentTime(CLOCK_MONOTONIC); + atomic64_add(timeBeforeSchedule - queue->mostRecentWakeup, + &queue->stats.runTime); + // Wake up often, to address the missed-wakeup race. 
+ schedule_timeout(timeoutInterval); + queue->mostRecentWakeup = currentTime(CLOCK_MONOTONIC); + uint64_t callDurationNS = queue->mostRecentWakeup - timeBeforeSchedule; + enterHistogramSample(queue->stats.scheduleTimeHistogram, + callDurationNS / 1000); + + /* + * Check again before resetting firstWakeup for more accurate + * stats. (It's still racy, which can't be fixed without requiring + * tighter synchronization between producer and consumer sides.) + */ + item = pollForWorkItem(queue); + if (item != NULL) { + break; + } + } + + if (item != NULL) { + uint64_t firstWakeup = atomic64_read(&queue->firstWakeup); + /* + * We sometimes register negative wakeup latencies without this fencing. + * Whether it's forcing full serialization between the read of firstWakeup + * and the "rdtsc" that might be used depending on the clock source that + * helps, or some extra nanoseconds of delay covering for high-resolution + * clocks not being quite in sync between CPUs, is not yet clear. + */ + loadFence(); + if (firstWakeup != 0) { + enterHistogramSample(queue->stats.wakeupLatencyHistogram, + (currentTime(CLOCK_MONOTONIC) - firstWakeup) / 1000); + enterHistogramSample(queue->stats.wakeupQueueLengthHistogram, + getPendingCount(queue)); + } + } + finish_wait(&queue->waitingWorkerThreads, &wait); + atomic_set(&queue->idle, 0); + + return item; +} + +/** + * Get the next work item to process, possibly waiting for one, unless + * kthread_should_stop indicates that it's time for us to shut down. + * + * If kthread_should_stop says it's time to stop but we have pending work + * items, return a work item. + * + * @param [in] queue The work queue to wait on + * @param [in] timeoutInterval How long to wait each iteration + * + * @return the next work item, or NULL to indicate shutdown is requested + **/ +static KvdoWorkItem *getNextWorkItem(SimpleWorkQueue *queue, + TimeoutJiffies timeoutInterval) +{ + KvdoWorkItem *item = pollForWorkItem(queue); + if (item != NULL) { + return item; + } + return waitForNextWorkItem(queue, timeoutInterval); +} + +/** + * Execute a work item from a work queue, and do associated bookkeeping. + * + * @param [in] queue the work queue the item is from + * @param [in] item the work item to run + **/ +static void processWorkItem(SimpleWorkQueue *queue, + KvdoWorkItem *item) +{ + if (ASSERT(item->myQueue == &queue->common, + "item %" PRIptr " from queue %" PRIptr + " marked as being in this queue (%" PRIptr ")", + item, queue, item->myQueue) == UDS_SUCCESS) { + updateStatsForDequeue(&queue->stats, item); + item->myQueue = NULL; + } + + // Save the index, so we can use it after the work function. + unsigned int index = item->statTableIndex; + uint64_t workStartTime = recordStartTime(index); + item->work(item); + // We just surrendered control of the work item; no more access. + item = NULL; + updateWorkItemStatsForWorkTime(&queue->stats.workItemStats, index, + workStartTime); + + /* + * Be friendly to a CPU that has other work to do, if the kernel has told us + * to. This speeds up some performance tests; that "other work" might include + * other VDO threads. + * + * N.B.: We compute the pending count info here without any synchronization, + * but it's for stats reporting only, so being imprecise isn't too big a + * deal, as long as reads and writes are atomic operations. + */ + if (need_resched()) { + uint64_t timeBeforeReschedule = currentTime(CLOCK_MONOTONIC); + // Record the queue length we have *before* rescheduling. 
+ unsigned int queueLen = getPendingCount(queue); + cond_resched(); + uint64_t timeAfterReschedule = currentTime(CLOCK_MONOTONIC); + + enterHistogramSample(queue->stats.rescheduleQueueLengthHistogram, + queueLen); + uint64_t runTimeNS = timeBeforeReschedule - queue->mostRecentWakeup; + enterHistogramSample(queue->stats.runTimeBeforeRescheduleHistogram, + runTimeNS / 1000); + atomic64_add(runTimeNS, &queue->stats.runTime); + uint64_t callTimeNS = timeAfterReschedule - timeBeforeReschedule; + enterHistogramSample(queue->stats.rescheduleTimeHistogram, + callTimeNS / 1000); + atomic64_add(callTimeNS, &queue->stats.rescheduleTime); + queue->mostRecentWakeup = timeAfterReschedule; + } +} + +/** + * Main loop of the work queue worker thread. + * + * Waits for work items and runs them, until told to stop. + * + * @param queue The work queue to run + **/ +static void serviceWorkQueue(SimpleWorkQueue *queue) +{ + TimeoutJiffies timeoutInterval = + maxLong(2, usecs_to_jiffies(FUNNEL_HEARTBEAT_INTERVAL + 1) - 1); + + runStartHook(queue); + + while (true) { + KvdoWorkItem *item = getNextWorkItem(queue, timeoutInterval); + if (item == NULL) { + // No work items but kthread_should_stop was triggered. + break; + } + // Process the work item + processWorkItem(queue, item); + } + + runFinishHook(queue); +} + +/** + * Initialize per-thread data for a new worker thread and run the work queue. + * Called in a new thread created by kthread_run(). + * + * @param ptr A pointer to the KvdoWorkQueue to run. + * + * @return 0 (indicating success to kthread_run()) + **/ +static int workQueueRunner(void *ptr) +{ + SimpleWorkQueue *queue = ptr; + kobject_get(&queue->common.kobj); + + WorkQueueStackHandle queueHandle; + initializeWorkQueueStackHandle(&queueHandle, queue); + queue->stats.startTime = queue->mostRecentWakeup = currentTime(CLOCK_MONOTONIC); + unsigned long flags; + spin_lock_irqsave(&queue->lock, flags); + queue->started = true; + spin_unlock_irqrestore(&queue->lock, flags); + wake_up(&queue->startWaiters); + serviceWorkQueue(queue); + + // Zero out handle structure for safety. + memset(&queueHandle, 0, sizeof(queueHandle)); + + kobject_put(&queue->common.kobj); + return 0; +} + +// Preparing work items + +/**********************************************************************/ +void setupWorkItem(KvdoWorkItem *item, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + ASSERT_LOG_ONLY(item->myQueue == NULL, + "setupWorkItem not called on enqueued work item"); + item->work = work; + item->statsFunction = ((statsFunction == NULL) ? work : statsFunction); + item->statTableIndex = 0; + item->action = action; + item->myQueue = NULL; + item->executionTime = 0; + item->next = NULL; +} + +// Thread management + +/**********************************************************************/ +static inline void wakeWorkerThread(SimpleWorkQueue *queue) +{ + smp_mb(); + atomic64_cmpxchg(&queue->firstWakeup, 0, currentTime(CLOCK_MONOTONIC)); + // Despite the name, there's a maximum of one thread in this list. + wake_up(&queue->waitingWorkerThreads); +} + +// Delayed work items + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) +/** + * Timer function invoked when a delayed work item is ready to run. + * + * @param timer The timer which has just finished + **/ +static void processDelayedWorkItems(struct timer_list *timer) +#else +/** + * Timer function invoked when a delayed work item is ready to run. 
+ * + * @param data The queue pointer, as an unsigned long + **/ +static void processDelayedWorkItems(unsigned long data) +#endif +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) + SimpleWorkQueue *queue = from_timer(queue, timer, delayedItemsTimer); +#else + SimpleWorkQueue *queue = (SimpleWorkQueue *) data; +#endif + Jiffies nextExecutionTime = 0; + bool reschedule = false; + bool needsWakeup = false; + + unsigned long flags; + spin_lock_irqsave(&queue->lock, flags); + while (!isWorkItemListEmpty(&queue->delayedItems)) { + KvdoWorkItem *item = workItemListPeek(&queue->delayedItems); + if (item->executionTime > jiffies) { + nextExecutionTime = item->executionTime; + reschedule = true; + break; + } + workItemListPoll(&queue->delayedItems); + item->executionTime = 0; // not actually looked at... + item->myQueue = NULL; + needsWakeup |= enqueueWorkQueueItem(queue, item); + } + spin_unlock_irqrestore(&queue->lock, flags); + if (reschedule) { + mod_timer(&queue->delayedItemsTimer, nextExecutionTime); + } + if (needsWakeup) { + wakeWorkerThread(queue); + } +} + +// Creation & teardown + +/**********************************************************************/ +static bool queueStarted(SimpleWorkQueue *queue) +{ + unsigned long flags; + spin_lock_irqsave(&queue->lock, flags); + bool started = queue->started; + spin_unlock_irqrestore(&queue->lock, flags); + return started; +} + +/** + * Create a simple work queue with a worker thread. + * + * @param [in] threadNamePrefix The per-device prefix to use in thread names + * @param [in] name The queue name + * @param [in] parentKobject The parent sysfs node + * @param [in] owner The kernel layer owning the work queue + * @param [in] private Private data of the queue for use by work + * items or other queue-specific functions + * @param [in] type The work queue type defining the lifecycle + * functions, queue actions, priorities, and + * timeout behavior + * @param [out] queuePtr Where to store the queue handle + * + * @return VDO_SUCCESS or an error code + **/ +static int makeSimpleWorkQueue(const char *threadNamePrefix, + const char *name, + struct kobject *parentKobject, + KernelLayer *owner, + void *private, + const KvdoWorkQueueType *type, + SimpleWorkQueue **queuePtr) +{ + SimpleWorkQueue *queue; + int result = ALLOCATE(1, SimpleWorkQueue, "simple work queue", &queue); + if (result != UDS_SUCCESS) { + return result; + } + + queue->type = type; + queue->private = private; + queue->common.owner = owner; + + unsigned int numPriorityLists = 1; + for (int i = 0; i < WORK_QUEUE_ACTION_COUNT; i++) { + const KvdoWorkQueueAction *action = &queue->type->actionTable[i]; + if (action->name == NULL) { + break; + } + unsigned int code = action->code; + unsigned int priority = action->priority; + + result = ASSERT(code < WORK_QUEUE_ACTION_COUNT, + "invalid action code %u in work queue initialization", + code); + if (result != VDO_SUCCESS) { + FREE(queue); + return result; + } + result = ASSERT(priority < WORK_QUEUE_PRIORITY_COUNT, + "invalid action priority %u in work queue initialization", + priority); + if (result != VDO_SUCCESS) { + FREE(queue); + return result; + } + queue->priorityMap[code] = priority; + if (numPriorityLists <= priority) { + numPriorityLists = priority + 1; + } + } + + result = duplicateString(name, "queue name", &queue->common.name); + if (result != VDO_SUCCESS) { + FREE(queue); + return -ENOMEM; + } + + init_waitqueue_head(&queue->waitingWorkerThreads); + init_waitqueue_head(&queue->startWaiters); + spin_lock_init(&queue->lock); + + 
initializeWorkItemList(&queue->delayedItems); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) + timer_setup(&queue->delayedItemsTimer, processDelayedWorkItems, 0); +#else + setup_timer(&queue->delayedItemsTimer, processDelayedWorkItems, + (unsigned long) queue); +#endif + + kobject_init(&queue->common.kobj, &simpleWorkQueueKobjType); + result = kobject_add(&queue->common.kobj, parentKobject, queue->common.name); + if (result != 0) { + logError("Cannot add sysfs node: %d", result); + freeSimpleWorkQueue(queue); + return result; + } + queue->numPriorityLists = numPriorityLists; + for (int i = 0; i < WORK_QUEUE_PRIORITY_COUNT; i++) { + result = makeFunnelQueue(&queue->priorityLists[i]); + if (result != UDS_SUCCESS) { + freeSimpleWorkQueue(queue); + return result; + } + } + result = initializeWorkQueueStats(&queue->stats, &queue->common.kobj); + if (result != 0) { + logError("Cannot initialize statistics tracking: %d", result); + freeSimpleWorkQueue(queue); + return result; + } + + queue->started = false; + struct task_struct *thread = NULL; + thread = kthread_run(workQueueRunner, queue, "%s:%s", threadNamePrefix, + queue->common.name); + + if (IS_ERR(thread)) { + freeSimpleWorkQueue(queue); + return (int) PTR_ERR(thread); + } + queue->thread = thread; + atomic_set(&queue->threadID, thread->pid); + /* + * If we don't wait to ensure the thread is running VDO code, a + * quick kthread_stop (due to errors elsewhere) could cause it to + * never get as far as running VDO, skipping the cleanup code. + * + * Eventually we should just make that path safe too, and then we + * won't need this synchronization. + */ + wait_event(queue->startWaiters, queueStarted(queue) == true); + *queuePtr = queue; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int makeWorkQueue(const char *threadNamePrefix, + const char *name, + struct kobject *parentKobject, + KernelLayer *owner, + void *private, + const KvdoWorkQueueType *type, + unsigned int threadCount, + KvdoWorkQueue **queuePtr) +{ + if (threadCount == 1) { + SimpleWorkQueue *simpleQueue; + int result = makeSimpleWorkQueue(threadNamePrefix, name, parentKobject, + owner, private, type, &simpleQueue); + if (result == VDO_SUCCESS) { + *queuePtr = &simpleQueue->common; + } + return result; + } + + RoundRobinWorkQueue *queue; + int result = ALLOCATE(1, RoundRobinWorkQueue, "round-robin work queue", + &queue); + if (result != UDS_SUCCESS) { + return result; + } + + result = ALLOCATE(threadCount, SimpleWorkQueue *, "subordinate work queues", + &queue->serviceQueues); + if (result != UDS_SUCCESS) { + FREE(queue); + return result; + } + + queue->numServiceQueues = threadCount; + queue->common.roundRobinMode = true; + queue->common.owner = owner; + + result = duplicateString(name, "queue name", &queue->common.name); + if (result != VDO_SUCCESS) { + FREE(queue->serviceQueues); + FREE(queue); + return -ENOMEM; + } + + kobject_init(&queue->common.kobj, &roundRobinWorkQueueKobjType); + result = kobject_add(&queue->common.kobj, parentKobject, queue->common.name); + if (result != 0) { + logError("Cannot add sysfs node: %d", result); + finishWorkQueue(&queue->common); + kobject_put(&queue->common.kobj); + return result; + } + + *queuePtr = &queue->common; + + char threadName[TASK_COMM_LEN]; + for (unsigned int i = 0; i < threadCount; i++) { + snprintf(threadName, sizeof(threadName), "%s%u", name, i); + result = makeSimpleWorkQueue(threadNamePrefix, threadName, + &queue->common.kobj, owner, private, type, + 
&queue->serviceQueues[i]); + if (result != VDO_SUCCESS) { + queue->numServiceQueues = i; + // Destroy previously created subordinates. + finishWorkQueue(*queuePtr); + freeWorkQueue(queuePtr); + return result; + } + queue->serviceQueues[i]->parentQueue = *queuePtr; + } + + return VDO_SUCCESS; +} + +/** + * Shut down a simple work queue's worker thread. + * + * @param queue The work queue to shut down + **/ +static void finishSimpleWorkQueue(SimpleWorkQueue *queue) +{ + // Tell the worker thread to shut down. + if (queue->thread != NULL) { + atomic_set(&queue->threadID, 0); + // Waits for thread to exit. + kthread_stop(queue->thread); + } + + queue->thread = NULL; +} + +/** + * Shut down a round-robin work queue's service queues. + * + * @param queue The work queue to shut down + **/ +static void finishRoundRobinWorkQueue(RoundRobinWorkQueue *queue) +{ + SimpleWorkQueue **queueTable = queue->serviceQueues; + unsigned int count = queue->numServiceQueues; + + for (unsigned int i = 0; i < count; i++) { + finishSimpleWorkQueue(queueTable[i]); + } +} + +/**********************************************************************/ +void finishWorkQueue(KvdoWorkQueue *queue) +{ + if (queue->roundRobinMode) { + finishRoundRobinWorkQueue(asRoundRobinWorkQueue(queue)); + } else { + finishSimpleWorkQueue(asSimpleWorkQueue(queue)); + } +} + +/** + * Tear down a simple work queue, and decrement the kobject reference + * count on it. + * + * @param queue The work queue + **/ +static void freeSimpleWorkQueue(SimpleWorkQueue *queue) +{ + for (unsigned int i = 0; i < WORK_QUEUE_PRIORITY_COUNT; i++) { + freeFunnelQueue(queue->priorityLists[i]); + } + cleanupWorkQueueStats(&queue->stats); + kobject_put(&queue->common.kobj); +} + +/** + * Tear down a round-robin work queue and its service queues, and + * decrement the kobject reference count on it. + * + * @param queue The work queue + **/ +static void freeRoundRobinWorkQueue(RoundRobinWorkQueue *queue) +{ + SimpleWorkQueue **queueTable = queue->serviceQueues; + unsigned int count = queue->numServiceQueues; + + queue->serviceQueues = NULL; + for (unsigned int i = 0; i < count; i++) { + freeSimpleWorkQueue(queueTable[i]); + } + FREE(queueTable); + kobject_put(&queue->common.kobj); +} + +/**********************************************************************/ +void freeWorkQueue(KvdoWorkQueue **queuePtr) +{ + KvdoWorkQueue *queue = *queuePtr; + if (queue == NULL) { + return; + } + *queuePtr = NULL; + + finishWorkQueue(queue); + + if (queue->roundRobinMode) { + freeRoundRobinWorkQueue(asRoundRobinWorkQueue(queue)); + } else { + freeSimpleWorkQueue(asSimpleWorkQueue(queue)); + } +} + +// Debugging dumps + +/**********************************************************************/ +static void dumpSimpleWorkQueue(SimpleWorkQueue *queue) +{ + mutex_lock(&queueDataLock); + // Take a snapshot to reduce inconsistency in logged numbers. 
+ queueData = *queue; + const char *threadStatus; + + char taskStateReport = '-'; + if (queueData.thread != NULL) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,14,0) + taskStateReport = task_state_to_char(queue->thread); +#else + unsigned int taskState = queue->thread->state & TASK_REPORT; + taskState &= 0x1ff; + unsigned int taskStateIndex; + if (taskState != 0) { + taskStateIndex = __ffs(taskState)+1; + BUG_ON(taskStateIndex >= sizeof(TASK_STATE_TO_CHAR_STR)); + } else { + taskStateIndex = 0; + } + taskStateReport = TASK_STATE_TO_CHAR_STR[taskStateIndex]; +#endif + } + + if (queueData.thread == NULL) { + threadStatus = "no threads"; + } else if (atomic_read(&queueData.idle)) { + threadStatus = "idle"; + } else { + threadStatus = "running"; + } + logInfo("workQ %" PRIptr " (%s) %u entries %llu waits, %s (%c)", + &queue->common, + queueData.common.name, + getPendingCount(&queueData), + queueData.stats.waits, + threadStatus, + taskStateReport); + + logWorkItemStats(&queueData.stats.workItemStats); + logWorkQueueStats(queue); + + mutex_unlock(&queueDataLock); + + // ->lock spin lock status? + // ->waitingWorkerThreads wait queue status? anyone waiting? +} + +/**********************************************************************/ +void dumpWorkQueue(KvdoWorkQueue *queue) +{ + if (queue->roundRobinMode) { + RoundRobinWorkQueue *roundRobinQueue = asRoundRobinWorkQueue(queue); + for (unsigned int i = 0; i < roundRobinQueue->numServiceQueues; i++) { + dumpSimpleWorkQueue(roundRobinQueue->serviceQueues[i]); + } + } else { + dumpSimpleWorkQueue(asSimpleWorkQueue(queue)); + } +} + +/**********************************************************************/ +void dumpWorkItemToBuffer(KvdoWorkItem *item, char *buffer, size_t length) +{ + size_t currentLength + = snprintf(buffer, length, "%.*s/", TASK_COMM_LEN, + item->myQueue == NULL ? "-" : item->myQueue->name); + if (currentLength < length) { + getFunctionName(item->statsFunction, buffer + currentLength, + length - currentLength); + } +} + +// Work submission + +/**********************************************************************/ +void enqueueWorkQueue(KvdoWorkQueue *kvdoWorkQueue, KvdoWorkItem *item) +{ + SimpleWorkQueue *queue = pickSimpleQueue(kvdoWorkQueue); + + item->executionTime = 0; + + if (enqueueWorkQueueItem(queue, item)) { + wakeWorkerThread(queue); + } +} + +/**********************************************************************/ +void enqueueWorkQueueDelayed(KvdoWorkQueue *kvdoWorkQueue, + KvdoWorkItem *item, + Jiffies executionTime) +{ + if (executionTime <= jiffies) { + enqueueWorkQueue(kvdoWorkQueue, item); + return; + } + + SimpleWorkQueue *queue = pickSimpleQueue(kvdoWorkQueue); + bool rescheduleTimer = false; + unsigned long flags; + + item->executionTime = executionTime; + + // Lock if the work item is delayed. All delayed items are handled via a + // single linked list. + spin_lock_irqsave(&queue->lock, flags); + + if (isWorkItemListEmpty(&queue->delayedItems)) { + rescheduleTimer = true; + } + /* + * XXX We should keep the list sorted, but at the moment the list won't + * grow above a single entry anyway. 
+ */ + item->myQueue = &queue->common; + addToWorkItemList(&queue->delayedItems, item); + + spin_unlock_irqrestore(&queue->lock, flags); + + if (rescheduleTimer) { + mod_timer(&queue->delayedItemsTimer, executionTime); + } +} + +// Misc + + +/**********************************************************************/ +KvdoWorkQueue *getCurrentWorkQueue(void) +{ + SimpleWorkQueue *queue = getCurrentThreadWorkQueue(); + return (queue == NULL) ? NULL : &queue->common; +} + +/**********************************************************************/ +KernelLayer *getWorkQueueOwner(KvdoWorkQueue *queue) +{ + return queue->owner; +} + +/**********************************************************************/ +void *getWorkQueuePrivateData(void) +{ + SimpleWorkQueue *queue = getCurrentThreadWorkQueue(); + return (queue != NULL) ? queue->private : NULL; +} + +/**********************************************************************/ +void setWorkQueuePrivateData(void *newData) +{ + SimpleWorkQueue *queue = getCurrentThreadWorkQueue(); + BUG_ON(queue == NULL); + queue->private = newData; +} + +/**********************************************************************/ +void initWorkQueueOnce(void) +{ + // We can't use DEFINE_MUTEX because it's not compatible with c99 mode. + mutex_init(&queueDataLock); + initWorkQueueStackHandleOnce(); +} diff --git a/vdo/kernel/workQueue.h b/vdo/kernel/workQueue.h new file mode 100644 index 0000000..4043295 --- /dev/null +++ b/vdo/kernel/workQueue.h @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueue.h#2 $ + */ + +#ifndef ALBIREO_WORK_QUEUE_H +#define ALBIREO_WORK_QUEUE_H + +#include +#include /* for TASK_COMM_LEN */ + +#include "kernelTypes.h" +#include "util/funnelQueue.h" + +enum { + MAX_QUEUE_NAME_LEN = TASK_COMM_LEN, + /** Maximum number of action definitions per work queue type */ + WORK_QUEUE_ACTION_COUNT = 8, + /** Number of priority values available */ + WORK_QUEUE_PRIORITY_COUNT = 4, +}; + +struct kvdoWorkItem { + /** Entry link for lock-free work queue */ + FunnelQueueEntry workQueueEntryLink; + /** Function to be called */ + KvdoWorkFunction work; + /** Optional alternate function for display in queue stats */ + void *statsFunction; + /** An index into the statistics table; filled in by workQueueStats code */ + unsigned int statTableIndex; + /** + * The action code given to setupWorkItem, from which a priority will be + * determined. + **/ + unsigned int action; + /** The work queue in which the item is enqueued, or NULL if not enqueued. */ + KvdoWorkQueue *myQueue; + /** + * Time at which to execute in jiffies for a delayed work item, or zero to + * queue for execution ASAP. 
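+   * (Set by enqueueWorkQueueDelayed; enqueueWorkQueue resets it to zero.)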
+ **/ + Jiffies executionTime; + /** List management for delayed or expired work items */ + KvdoWorkItem *next; + /** Time of enqueueing, in ns, for recording queue (waiting) time stats */ + uint64_t enqueueTime; +}; + +/** + * Table entries defining an action. + * + * Actions are intended to distinguish general classes of activity for + * prioritization purposes, but not necessarily to indicate specific work + * functions. They are indicated to setupWorkItem numerically, using an + * enumerator defined per kind of work queue -- bio submission work queue + * actions use BioQAction, cpu actions use CPUQAction, etc. For example, for + * the CPU work queues, data compression can be prioritized separately from + * final cleanup processing of a KVIO or from dedupe verification; base code + * threads prioritize all VIO callback invocation the same, but separate from + * sync or heartbeat operations. The bio acknowledgement work queue, on the + * other hand, only does one thing, so it only defines one action code. + * + * Action codes values must be small integers, 0 through + * WORK_QUEUE_ACTION_COUNT-1, and should not be duplicated for a queue type. + * + * A table of KvdoWorkQueueAction entries embedded in KvdoWorkQueueType + * specifies the name, code, and priority for each type of action in the work + * queue. The table can have at most WORK_QUEUE_ACTION_COUNT entries, but a + * NULL name indicates an earlier end to the table. + * + * Priorities may be specified as values from 0 through + * WORK_QUEUE_PRIORITY_COUNT-1, higher values indicating higher priority. + * Priorities are just strong suggestions; it's possible for a lower-priority + * work item scheduled right after a high-priority one to be run first, if the + * worker thread happens to be scanning its queues at just the wrong moment, + * but the high-priority item will be picked up next. + * + * Internally, the priorities in this table are used to initialize another + * table in the constructed work queue object, and in internal builds, + * device-mapper messages can be sent to change the priority for an action, + * identified by name, in a running VDO device. Doing so does not affect the + * priorities for other devices, or for future VDO device creation. + **/ +typedef struct kvdoWorkQueueAction { + /** Name of the action */ + char *name; + + /** The action code (per-type enum) */ + unsigned int code; + + /** The initial priority for this action */ + unsigned int priority; +} KvdoWorkQueueAction; + +typedef void (*KvdoWorkQueueFunction)(void *); + +/** + * Static attributes of a work queue that are fixed at compile time + * for a given call site. (Attributes that may be computed at run time + * are passed as separate arguments.) + **/ +typedef struct kvdoWorkQueueType { + /** A function to call in the new thread before servicing requests */ + KvdoWorkQueueFunction start; + + /** A function to call in the new thread when shutting down */ + KvdoWorkQueueFunction finish; + + /** A function to call in the new thread after running out of work */ + KvdoWorkQueueFunction suspend; + + /** Table of actions for this work queue */ + KvdoWorkQueueAction actionTable[WORK_QUEUE_ACTION_COUNT]; +} KvdoWorkQueueType; + +/** + * Create a work queue. + * + * If multiple threads are requested, work items will be distributed to them in + * round-robin fashion. 
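+ *
+ * As a purely illustrative sketch, a caller might set up a two-thread queue
+ * and feed it a work item as follows; the action table contents and the
+ * parentKobject, layer, item, and doSomeWork names are invented for this
+ * example and do not come from this header:
+ *
+ *   static const KvdoWorkQueueType cpuQueueType = {
+ *     .actionTable = {
+ *       { .name = "cpu_normal", .code = 0, .priority = 1 },
+ *     },
+ *   };
+ *
+ *   KvdoWorkQueue *cpuQueue;
+ *   int result = makeWorkQueue("vdo0", "cpuQ", parentKobject, layer, NULL,
+ *                              &cpuQueueType, 2, &cpuQueue);
+ *   if (result == VDO_SUCCESS) {
+ *     // item must start out all-zero; see setupWorkItem's documentation.
+ *     setupWorkItem(&item, doSomeWork, NULL, 0);
+ *     enqueueWorkQueue(cpuQueue, &item);
+ *   }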
+ * + * @param [in] threadNamePrefix The per-device prefix to use in thread names + * @param [in] name The queue name + * @param [in] parentKobject The parent sysfs node + * @param [in] owner The kernel layer owning the work queue + * @param [in] private Private data of the queue for use by work + * items or other queue-specific functions + * @param [in] type The work queue type defining the lifecycle + * functions, queue actions, priorities, and + * timeout behavior + * @param [in] threadCount Number of service threads to set up + * @param [out] queuePtr Where to store the queue handle + * + * @return VDO_SUCCESS or an error code + **/ +int makeWorkQueue(const char *threadNamePrefix, + const char *name, + struct kobject *parentKobject, + KernelLayer *owner, + void *private, + const KvdoWorkQueueType *type, + unsigned int threadCount, + KvdoWorkQueue **queuePtr); + +/** + * Set up the fields of a work queue item. + * + * Before the first setup call (setupWorkItem or setupWorkItemWithTimeout), the + * work item must have been initialized to all-zero. Resetting a + * previously-used work item does not require another memset. + * + * The action code is typically defined in a work-queue-type-specific + * enumeration; see the description of KvdoWorkQueueAction. + * + * @param item The work item to initialize + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, for determination of priority + **/ +void setupWorkItem(KvdoWorkItem *item, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action); + +/** + * Add a work item to a work queue. + * + * If the work item has a timeout that has already passed, the timeout + * handler function may be invoked at this time. + * + * @param queue The queue handle + * @param item The work item to be processed + **/ +void enqueueWorkQueue(KvdoWorkQueue *queue, KvdoWorkItem *item); + +/** + * Add a work item to a work queue, to be run at a later point in time. + * + * Currently delayed work items are used only in a very limited fashion -- at + * most one at a time for any of the work queue types that use them -- and some + * shortcuts have been taken that assume that that's the case. Multiple delayed + * work items should work, but they will execute in the order they were + * enqueued. + * + * @param queue The queue handle + * @param item The work item to be processed + * @param executionTime When to run the work item (jiffies) + **/ +void enqueueWorkQueueDelayed(KvdoWorkQueue *queue, + KvdoWorkItem *item, + Jiffies executionTime); + +/** + * Shut down a work queue's worker thread. + * + * Alerts the worker thread that it should shut down, and then waits + * for it to do so. + * + * There should not be any new enqueueing of work items done once this + * function is called. Any pending delayed work items will be + * processed, as scheduled, before the worker thread shuts down, but + * they must not re-queue themselves to run again. + * + * @param queue The work queue to shut down + **/ +void finishWorkQueue(KvdoWorkQueue *queue); + +/** + * Free a work queue and null out the reference to it. + * + * @param queuePtr Where the queue handle is found + **/ +void freeWorkQueue(KvdoWorkQueue **queuePtr); + +/** + * Print work queue state and statistics to the kernel log. + * + * @param queue The work queue to examine + **/ +void dumpWorkQueue(KvdoWorkQueue *queue); + +/** + * Write to the buffer some info about the work item, for logging. 
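+ * The result is the owning queue's name (or "-" if the item is not
+ * enqueued), followed by a slash and the name of the item's stats function.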
+ * Since the common use case is dumping info about a lot of work items + * to syslog all at once, the format favors brevity over readability. + * + * @param item The work item + * @param buffer The message buffer to fill in + * @param length The length of the message buffer + **/ +void dumpWorkItemToBuffer(KvdoWorkItem *item, char *buffer, size_t length); + + +/** + * Initialize work queue internals at module load time. + **/ +void initWorkQueueOnce(void); + +/** + * Checks whether two work items have the same action codes + * + * @param item1 The first item + * @param item2 The second item + * + * @return TRUE if the actions are the same, FALSE otherwise + */ +static inline bool areWorkItemActionsEqual(KvdoWorkItem *item1, + KvdoWorkItem *item2) +{ + return item1->action == item2->action; +} + +/** + * Returns the private data for the current thread's work queue. + * + * @return The private data pointer, or NULL if none or if the current + * thread is not a work queue thread. + **/ +void *getWorkQueuePrivateData(void); + +/** + * Updates the private data pointer for the current thread's work queue. + * + * @param newData The new private data pointer + **/ +void setWorkQueuePrivateData(void *newData); + +/** + * Returns the work queue pointer for the current thread, if any. + * + * @return The work queue pointer or NULL + **/ +KvdoWorkQueue *getCurrentWorkQueue(void); + +/** + * Returns the kernel layer that owns the work queue. + * + * @param queue The work queue + * + * @return The owner pointer supplied at work queue creation + **/ +KernelLayer *getWorkQueueOwner(KvdoWorkQueue *queue); + +#endif /* ALBIREO_WORK_QUEUE_H */ diff --git a/vdo/kernel/workQueueHandle.c b/vdo/kernel/workQueueHandle.c new file mode 100644 index 0000000..65b3e02 --- /dev/null +++ b/vdo/kernel/workQueueHandle.c @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueHandle.c#2 $ + */ + +#include "workQueueHandle.h" + +WorkQueueStackHandleGlobals workQueueStackHandleGlobals; + +/**********************************************************************/ +void initializeWorkQueueStackHandle(WorkQueueStackHandle *handle, + SimpleWorkQueue *queue) +{ + handle->nonce = workQueueStackHandleGlobals.nonce; + handle->queue = queue; + + long offset = (char *) handle - (char *) task_stack_page(current); + spin_lock(&workQueueStackHandleGlobals.offsetLock); + if (workQueueStackHandleGlobals.offset == 0) { + workQueueStackHandleGlobals.offset = offset; + spin_unlock(&workQueueStackHandleGlobals.offsetLock); + } else { + long foundOffset = workQueueStackHandleGlobals.offset; + spin_unlock(&workQueueStackHandleGlobals.offsetLock); + BUG_ON(foundOffset != offset); + } +} + +/**********************************************************************/ +void initWorkQueueStackHandleOnce(void) +{ + spin_lock_init(&workQueueStackHandleGlobals.offsetLock); + workQueueStackHandleGlobals.nonce = currentTime(CLOCK_MONOTONIC); +} diff --git a/vdo/kernel/workQueueHandle.h b/vdo/kernel/workQueueHandle.h new file mode 100644 index 0000000..e72ce42 --- /dev/null +++ b/vdo/kernel/workQueueHandle.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueHandle.h#1 $ + */ + +#ifndef WORK_QUEUE_HANDLE_H +#define WORK_QUEUE_HANDLE_H + +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,11,0) +#include +#else +#include +#endif + +#include "workQueueInternals.h" + +/* + * Layout of a special structure stored at a consistent place on the + * stack in work queue threads. + */ +typedef struct workQueueStackHandle { + unsigned long nonce; + SimpleWorkQueue *queue; +} WorkQueueStackHandle; + +typedef struct workQueueStackHandleGlobals { + /* + * Location in the stack, relative to the task structure which is + * contained in the same memory allocation. + */ + long offset; + /* + * A lock is used to guard against multiple updaters, but once an + * update is done, the offset variable will be read-only. + */ + spinlock_t offsetLock; + /* + * A nonce chosen differently each time the module is loaded, used + * as a marker so we can check that the current thread really is a + * work queue thread. Set at module initialization time, before any + * work queues are created. + */ + unsigned long nonce; +} WorkQueueStackHandleGlobals; + +extern WorkQueueStackHandleGlobals workQueueStackHandleGlobals; + +/** + * Initialize a stack handle associated with a work queue. 
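+ *
+ * A worker thread is expected to use this roughly the way workQueueRunner in
+ * workQueue.c does; as an illustrative sketch:
+ *
+ *   static int workerThread(void *ptr)
+ *   {
+ *     SimpleWorkQueue *queue = ptr;
+ *     WorkQueueStackHandle handle;
+ *     initializeWorkQueueStackHandle(&handle, queue);
+ *     // ... service the queue; getCurrentThreadWorkQueue() now finds it ...
+ *     memset(&handle, 0, sizeof(handle)); // scrub before the stack is reused
+ *     return 0;
+ *   }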
+ * + * @param [out] handle The handle to be initialized + * @param [in] queue The work queue pointer + **/ +void initializeWorkQueueStackHandle(WorkQueueStackHandle *handle, + SimpleWorkQueue *queue); + +/** + * Return the work queue pointer recorded at initialization time in + * the work-queue stack handle initialized on the stack of the current + * thread, if any. + * + * @return the work queue pointer, or NULL + **/ +static inline SimpleWorkQueue *getCurrentThreadWorkQueue(void) +{ + WorkQueueStackHandle *handle + = (WorkQueueStackHandle *)(task_stack_page(current) + + workQueueStackHandleGlobals.offset); + if (likely(handle->nonce == workQueueStackHandleGlobals.nonce)) { + return handle->queue; + } else { + return NULL; + } +} + +/** + * Initialize the global state used by the work-queue stack-handle + * code. + **/ +void initWorkQueueStackHandleOnce(void); + +#endif // WORK_QUEUE_HANDLE_H diff --git a/vdo/kernel/workQueueInternals.h b/vdo/kernel/workQueueInternals.h new file mode 100644 index 0000000..fc7a2a3 --- /dev/null +++ b/vdo/kernel/workQueueInternals.h @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueInternals.h#4 $ + */ + +#ifndef WORK_QUEUE_INTERNALS_H +#define WORK_QUEUE_INTERNALS_H + +#include +#include +#include +#include +#include + +#include "workItemStats.h" +#include "workQueueStats.h" + +typedef struct kvdoWorkItemList { + KvdoWorkItem *tail; +} KvdoWorkItemList; + +/** + * Work queue definition. + * + * There are two types of work queues: simple, with one worker thread, and + * round-robin, which uses a group of the former to do the work, and assigns + * work to them in -- you guessed it -- round-robin fashion. Externally, both + * are represented via the same common sub-structure, though there's actually + * not a great deal of overlap between the two types internally. + **/ +struct kvdoWorkQueue { + /** Name of just the work queue (e.g., "cpuQ12") */ + char *name; + /** + * Whether this is a round-robin work queue or a simple (one-thread) + * work queue. + **/ + bool roundRobinMode; + /** A handle to a sysfs tree for reporting stats and other info */ + struct kobject kobj; + /** The kernel layer owning this work queue */ + KernelLayer *owner; +}; + +typedef struct simpleWorkQueue SimpleWorkQueue; +typedef struct roundRobinWorkQueue RoundRobinWorkQueue; + +struct simpleWorkQueue { + /** Common work queue bits */ + KvdoWorkQueue common; + /** A copy of .thread->pid, for safety in the sysfs support */ + atomic_t threadID; + /** + * Number of priorities actually used, so we don't keep re-checking unused + * funnel queues. + **/ + unsigned int numPriorityLists; + /** + * Map from action codes to priorities. 
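+ *
+ * (Presumably consulted by the enqueue path in workQueue.c, roughly as
+ * priorityMap[item->action], to choose which of the funnel queues in
+ * priorityLists below receives the item.)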
+ * + * This mapping can be changed at run time in internal builds, for tuning + * purposes. + **/ + uint8_t priorityMap[WORK_QUEUE_ACTION_COUNT]; + /** The funnel queues */ + FunnelQueue *priorityLists[WORK_QUEUE_PRIORITY_COUNT]; + /** The kernel thread */ + struct task_struct *thread; + /** Life cycle functions, etc */ + const KvdoWorkQueueType *type; + /** Opaque private data pointer, defined by higher level code */ + void *private; + /** In a subordinate work queue, a link back to the round-robin parent */ + KvdoWorkQueue *parentQueue; + /** Padding for cache line separation */ + char pad[CACHE_LINE_BYTES - sizeof(KvdoWorkQueue *)]; + /** Lock protecting delayedItems, priorityMap, numPriorityLists, started */ + spinlock_t lock; + /** Any worker threads (zero or one) waiting for new work to do */ + wait_queue_head_t waitingWorkerThreads; + /** + * Hack to reduce wakeup calls if the worker thread is running. See comments + * in workQueue.c. + * + * There is a lot of redundancy with "firstWakeup", though, and the pair + * should be re-examined. + **/ + atomic_t idle; + /** Wait list for synchronization during worker thread startup */ + wait_queue_head_t startWaiters; + /** Worker thread status (boolean) */ + bool started; + + /** List of delayed work items; usually only one, if any */ + KvdoWorkItemList delayedItems; + /** + * Timer for pulling delayed work items off their list and submitting them to + * run. + * + * If the spinlock "lock" above is not held, this timer is scheduled (or + * currently firing and the callback about to acquire the lock) iff + * delayedItems is nonempty. + **/ + struct timer_list delayedItemsTimer; + + /** + * Timestamp (ns) from the submitting thread that decided to wake us up; also + * used as a flag to indicate whether a wakeup is needed. + * + * Written by submitting threads with atomic64_cmpxchg, and by the worker + * thread setting to 0. + * + * If the value is 0, the worker is probably asleep; the submitting thread + * stores a non-zero value and becomes responsible for calling wake_up on the + * worker thread. If the value is non-zero, either the worker is running or + * another thread has the responsibility for issuing the wakeup. + * + * The "sleep" mode has periodic wakeups and the worker thread may happen to + * wake up while a work item is being enqueued. If that happens, the wakeup + * may be unneeded but will be attempted anyway. + * + * So the return value from cmpxchg(firstWakeup,0,nonzero) can always be + * done, and will tell the submitting thread whether to issue the wakeup or + * not; cmpxchg is atomic, so no other synchronization is needed. + * + * A timestamp is used rather than, say, 1, so that the worker thread can + * record stats on how long it takes to actually get the worker thread + * running. + * + * There is some redundancy between this and "idle" above. 
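+ *
+ * As an illustrative sketch only (the real submit and worker-thread code
+ * lives in workQueue.c, not in this hunk), a submitting thread would do
+ * roughly:
+ *
+ *   if (atomic64_cmpxchg(&queue->firstWakeup, 0,
+ *                        currentTime(CLOCK_MONOTONIC)) == 0) {
+ *     wake_up(&queue->waitingWorkerThreads);
+ *   }
+ *
+ * while the worker, once running, reads the timestamp for the wakeup
+ * latency histogram and stores 0 to re-arm the mechanism.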
+ **/ + atomic64_t firstWakeup; + /** Padding for cache line separation */ + char pad2[CACHE_LINE_BYTES - sizeof(atomic64_t)]; + /** Scheduling and work-function statistics */ + KvdoWorkQueueStats stats; + /** Last time (ns) the scheduler actually woke us up */ + uint64_t mostRecentWakeup; +}; + +struct roundRobinWorkQueue { + /** Common work queue bits */ + KvdoWorkQueue common; + /** Simple work queues, for actually getting stuff done */ + SimpleWorkQueue **serviceQueues; + /** Number of subordinate work queues */ + unsigned int numServiceQueues; + /** Padding for cache line separation */ + char pad[CACHE_LINE_BYTES - sizeof(unsigned int)]; + /** + * Rotor used for dispatching across subordinate service queues. + * + * Used and updated by submitting threads. (Not atomically or with locking, + * because we don't really care about it being precise, only about getting a + * roughly even spread; if an increment is missed here and there, it's not a + * problem.) + **/ + unsigned int serviceQueueRotor; +}; + +static inline SimpleWorkQueue *asSimpleWorkQueue(KvdoWorkQueue *queue) +{ + return ((queue == NULL) + ? NULL + : container_of(queue, SimpleWorkQueue, common)); +} + +static inline const SimpleWorkQueue * +asConstSimpleWorkQueue(const KvdoWorkQueue *queue) +{ + return ((queue == NULL) + ? NULL + : container_of(queue, SimpleWorkQueue, common)); +} + +static inline RoundRobinWorkQueue *asRoundRobinWorkQueue(KvdoWorkQueue *queue) +{ + return ((queue == NULL) + ? NULL + : container_of(queue, RoundRobinWorkQueue, common)); +} + +#endif // WORK_QUEUE_INTERNALS_H diff --git a/vdo/kernel/workQueueStats.c b/vdo/kernel/workQueueStats.c new file mode 100644 index 0000000..d5a38ae --- /dev/null +++ b/vdo/kernel/workQueueStats.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueStats.c#6 $ + */ + +#include "workQueueStats.h" + +#include "atomic.h" +#include "logger.h" +#include "workItemStats.h" +#include "workQueueInternals.h" + +/**********************************************************************/ +int initializeWorkQueueStats(KvdoWorkQueueStats *stats, + struct kobject *queueKObject) +{ + spin_lock_init(&stats->workItemStats.functionTable.lock); + if (ENABLE_PER_FUNCTION_TIMING_STATS) { + for (int i = 0; i < NUM_WORK_QUEUE_ITEM_STATS + 1; i++) { + initSimpleStats(&stats->workItemStats.times[i]); + } + } + + stats->queueTimeHistogram + = makeLogarithmicHistogram(queueKObject, "queue_time", + "Queue Time", "work items", "wait time", + "microseconds", 9); + if (stats->queueTimeHistogram == NULL) { + return -ENOMEM; + } + + stats->rescheduleQueueLengthHistogram + = makeLogarithmicHistogram(queueKObject, "reschedule_queue_length", + "Reschedule Queue Length", "calls", + "queued work items", NULL, 4); + if (stats->rescheduleQueueLengthHistogram == NULL) { + return -ENOMEM; + } + + stats->rescheduleTimeHistogram + = makeLogarithmicHistogram(queueKObject, "reschedule_time", + "Reschedule Time", "calls", + "sleep interval", "microseconds", 9); + if (stats->rescheduleTimeHistogram == NULL) { + return -ENOMEM; + } + + stats->runTimeBeforeRescheduleHistogram + = makeLogarithmicHistogram(queueKObject, "run_time_before_reschedule", + "Run Time Before Reschedule", + "calls", "run time", "microseconds", 9); + if (stats->runTimeBeforeRescheduleHistogram == NULL) { + return -ENOMEM; + } + + stats->scheduleTimeHistogram + = makeLogarithmicHistogram(queueKObject, "schedule_time", + "Schedule Time", + "calls", "sleep interval", "microseconds", 9); + if (stats->scheduleTimeHistogram == NULL) { + return -ENOMEM; + } + + stats->wakeupLatencyHistogram + = makeLogarithmicHistogram(queueKObject, "wakeup_latency", + "Wakeup Latency", + "wakeups", "latency", "microseconds", 9); + if (stats->wakeupLatencyHistogram == NULL) { + return -ENOMEM; + } + + stats->wakeupQueueLengthHistogram + = makeLogarithmicHistogram(queueKObject, "wakeup_queue_length", + "Wakeup Queue Length", "wakeups", + "queued work items", NULL, 4); + if (stats->wakeupQueueLengthHistogram == NULL) { + return -ENOMEM; + } + + return 0; +} + +/**********************************************************************/ +void cleanupWorkQueueStats(KvdoWorkQueueStats *stats) +{ + freeHistogram(&stats->queueTimeHistogram); + freeHistogram(&stats->rescheduleQueueLengthHistogram); + freeHistogram(&stats->rescheduleTimeHistogram); + freeHistogram(&stats->runTimeBeforeRescheduleHistogram); + freeHistogram(&stats->scheduleTimeHistogram); + freeHistogram(&stats->wakeupLatencyHistogram); + freeHistogram(&stats->wakeupQueueLengthHistogram); +} + +/**********************************************************************/ +static uint64_t getTotalProcessed(const SimpleWorkQueue *queue) +{ + uint64_t totalProcessed = 0; + for (int i = 0; i < NUM_WORK_QUEUE_ITEM_STATS + 1; i++) { + totalProcessed += queue->stats.workItemStats.times[i].count; + } + return totalProcessed; +} + +/**********************************************************************/ +void logWorkQueueStats(const SimpleWorkQueue *queue) +{ + uint64_t runtimeNS = 0; + if (queue->thread != NULL) { + runtimeNS += queue->thread->se.sum_exec_runtime; + } + + unsigned long nsPerWorkItem = 0; + uint64_t totalProcessed = getTotalProcessed(queue); + if (totalProcessed > 0) { + nsPerWorkItem = runtimeNS / totalProcessed; 
+ } + unsigned long runtimeMS = runtimeNS / 1000; + logInfo("workQ %" PRIptr " (%s) thread cpu usage %lu.%06lus, %" PRIu64 + " tasks, %lu.%03luus/task", + queue, + queue->common.name, + runtimeMS / 1000000, runtimeMS % 1000000, + totalProcessed, + nsPerWorkItem / 1000, nsPerWorkItem % 1000); +} + +/**********************************************************************/ +ssize_t formatRunTimeStats(const KvdoWorkQueueStats *stats, char *buffer) +{ + // Get snapshots of all three at approximately the same time. + uint64_t startTime = stats->startTime; + uint64_t runTime = atomic64_read(&stats->runTime); + uint64_t rescheduleTime = atomic64_read(&stats->rescheduleTime); + loadFence(); // rdtsc barrier + uint64_t now = currentTime(CLOCK_MONOTONIC); + uint64_t lifetime = now - startTime; + + return sprintf(buffer, + "%llu %llu %llu\n", + lifetime, runTime, rescheduleTime); +} diff --git a/vdo/kernel/workQueueStats.h b/vdo/kernel/workQueueStats.h new file mode 100644 index 0000000..914f5f4 --- /dev/null +++ b/vdo/kernel/workQueueStats.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueStats.h#2 $ + */ + +#ifndef WORK_QUEUE_STATS_H +#define WORK_QUEUE_STATS_H + +#include "workQueue.h" + +#include "timeUtils.h" + +#include "histogram.h" +#include "workItemStats.h" + +// Defined in workQueueInternals.h after inclusion of workQueueStats.h. +struct simpleWorkQueue; + +/* + * Tracking statistics. + * + * Cache line contention issues: + * + * In workItemStats, there are read-only fields accessed mostly by + * work submitters, then fields updated by the work submitters (for + * which there will be contention), then fields rarely if ever updated + * (more than two cache lines' worth), then fields updated only by the + * worker thread. The trailing fields here are updated only by the + * worker thread. + */ +typedef struct kvdoWorkQueueStats { + // Per-work-function counters and optional nanosecond timing data + KvdoWorkItemStats workItemStats; + // How often we go to sleep waiting for work + uint64_t waits; + + // Run time data, for monitoring utilization levels. + + // Thread start time, from which we can compute lifetime thus far. + uint64_t startTime; + /* + * Time the thread has not been blocked waiting for a new work item, + * nor in cond_resched(). This will include time the thread has been + * blocked by some kernel function invoked by the work functions + * (e.g., waiting for socket buffer space). + * + * This is not redundant with runTimeBeforeRescheduleHistogram, as + * the latter doesn't count run time not followed by a cond_resched + * call. + */ + atomic64_t runTime; + // Time the thread has been suspended via cond_resched(). + // (Duplicates data hidden within rescheduleTimeHistogram.) 
+ atomic64_t rescheduleTime; + + // Histogram of the queue times of work items (microseconds) + Histogram *queueTimeHistogram; + // How busy we are when cond_resched is called + Histogram *rescheduleQueueLengthHistogram; + // Histogram of the time cond_resched makes us sleep for (microseconds) + Histogram *rescheduleTimeHistogram; + // Histogram of the run time between cond_resched calls (microseconds) + Histogram *runTimeBeforeRescheduleHistogram; + // Histogram of the time schedule_timeout lets us sleep for (microseconds) + Histogram *scheduleTimeHistogram; + // How long from thread wakeup call to thread actually running (microseconds) + Histogram *wakeupLatencyHistogram; + // How much work is pending by the time we start running + Histogram *wakeupQueueLengthHistogram; +} KvdoWorkQueueStats; + +/** + * Initialize the work queue's statistics tracking. + * + * @param stats The statistics structure + * @param queueKObject The sysfs directory kobject for the work queue + * + * @return 0 or a kernel error code + **/ +int initializeWorkQueueStats(KvdoWorkQueueStats *stats, + struct kobject *queueKObject) + __attribute__((warn_unused_result)); + +/** + * Tear down any allocated storage or objects for statistics tracking. + * + * @param stats The statistics structure + **/ +void cleanupWorkQueueStats(KvdoWorkQueueStats *stats); + +/** + * Update the work queue statistics tracking to note the enqueueing of + * a work item. + * + * @param stats The statistics structure + * @param item The work item being enqueued + * @param priority The priority of the work item + **/ +static inline void updateStatsForEnqueue(KvdoWorkQueueStats *stats, + KvdoWorkItem *item, + int priority) +{ + updateWorkItemStatsForEnqueue(&stats->workItemStats, item, priority); + item->enqueueTime = currentTime(CLOCK_MONOTONIC); +} + +/** + * Update the work queue statistics tracking to note the dequeueing of + * a work item. + * + * @param stats The statistics structure + * @param item The work item being enqueued + **/ +static inline void updateStatsForDequeue(KvdoWorkQueueStats *stats, + KvdoWorkItem *item) +{ + updateWorkItemStatsForDequeue(&stats->workItemStats, item); + enterHistogramSample(stats->queueTimeHistogram, + (currentTime(CLOCK_MONOTONIC) - item->enqueueTime) / 1000); + item->enqueueTime = 0; +} + +/** + * Write the work queue's accumulated statistics to the kernel log. + * + * The queue pointer is needed so that its address and name can be + * logged along with the statistics. + * + * @param queue The work queue + **/ +void logWorkQueueStats(const struct simpleWorkQueue *queue); + +/** + * Format the thread lifetime, run time, and suspend time into a + * supplied buffer for reporting via sysfs. + * + * @param [in] stats The stats structure containing the run-time info + * @param [out] buffer The buffer in which to report the info + **/ +ssize_t formatRunTimeStats(const KvdoWorkQueueStats *stats, char *buffer); + +#endif // WORK_QUEUE_STATS_H diff --git a/vdo/kernel/workQueueSysfs.c b/vdo/kernel/workQueueSysfs.c new file mode 100644 index 0000000..f9dd9cb --- /dev/null +++ b/vdo/kernel/workQueueSysfs.c @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueSysfs.c#1 $
+ */
+
+#include "workQueueSysfs.h"
+
+#include <linux/kobject.h>
+
+#include "logger.h"
+#include "memoryAlloc.h"
+
+#include "workQueueInternals.h"
+
+typedef struct workQueueAttribute {
+  struct attribute attr;
+  ssize_t (*show)(const KvdoWorkQueue *queue, char *buf);
+  ssize_t (*store)(KvdoWorkQueue *queue, const char *buf, size_t length);
+} WorkQueueAttribute;
+
+/**********************************************************************/
+static ssize_t nameShow(const KvdoWorkQueue *queue, char *buf)
+{
+  return sprintf(buf, "%s\n", queue->name);
+}
+
+/**********************************************************************/
+static ssize_t pidShow(const KvdoWorkQueue *queue, char *buf)
+{
+  return sprintf(buf, "%ld\n",
+                 (long) atomic_read(&asConstSimpleWorkQueue(queue)->threadID));
+}
+
+/**********************************************************************/
+static ssize_t timesShow(const KvdoWorkQueue *queue, char *buf)
+{
+  return formatRunTimeStats(&asConstSimpleWorkQueue(queue)->stats, buf);
+}
+
+/**********************************************************************/
+static ssize_t typeShow(const KvdoWorkQueue *queue, char *buf)
+{
+  strcpy(buf, queue->roundRobinMode ? "round-robin\n" : "simple\n");
+  return strlen(buf);
+}
+
+/**********************************************************************/
+static ssize_t workFunctionsShow(const KvdoWorkQueue *queue, char *buf)
+{
+  const SimpleWorkQueue *simpleQueue = asConstSimpleWorkQueue(queue);
+  return formatWorkItemStats(&simpleQueue->stats.workItemStats, buf,
+                             PAGE_SIZE);
+}
+
+/**********************************************************************/
+static WorkQueueAttribute nameAttr = {
+  .attr = { .name = "name", .mode = 0444, },
+  .show = nameShow,
+};
+
+/**********************************************************************/
+static WorkQueueAttribute pidAttr = {
+  .attr = { .name = "pid", .mode = 0444, },
+  .show = pidShow,
+};
+
+/**********************************************************************/
+static WorkQueueAttribute timesAttr = {
+  .attr = { .name = "times", .mode = 0444 },
+  .show = timesShow,
+};
+
+/**********************************************************************/
+static WorkQueueAttribute typeAttr = {
+  .attr = { .name = "type", .mode = 0444, },
+  .show = typeShow,
+};
+
+/**********************************************************************/
+static WorkQueueAttribute workFunctionsAttr = {
+  .attr = { .name = "work_functions", .mode = 0444, },
+  .show = workFunctionsShow,
+};
+
+/**********************************************************************/
+static struct attribute *simpleWorkQueueAttrs[] = {
+  &nameAttr.attr,
+  &pidAttr.attr,
+  &timesAttr.attr,
+  &typeAttr.attr,
+  &workFunctionsAttr.attr,
+  NULL,
+};
+
+/**********************************************************************/
+static struct attribute *roundRobinWorkQueueAttrs[] = {
+  &nameAttr.attr,
+  &typeAttr.attr,
+  NULL,
+};
+
+/**********************************************************************/
+static ssize_t workQueueAttrShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + WorkQueueAttribute *wqAttr = container_of(attr, WorkQueueAttribute, attr); + if (wqAttr->show == NULL) { + return -EINVAL; + } + KvdoWorkQueue *queue = container_of(kobj, KvdoWorkQueue, kobj); + return wqAttr->show(queue, buf); +} + +/**********************************************************************/ +static ssize_t workQueueAttrStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + WorkQueueAttribute *wqAttr = container_of(attr, WorkQueueAttribute, attr); + if (wqAttr->store == NULL) { + return -EINVAL; + } + KvdoWorkQueue *queue = container_of(kobj, KvdoWorkQueue, kobj); + return wqAttr->store(queue, buf, length); +} + +/**********************************************************************/ +static struct sysfs_ops workQueueSysfsOps = { + .show = workQueueAttrShow, + .store = workQueueAttrStore, +}; + +/**********************************************************************/ +static void workQueueRelease(struct kobject *kobj) +{ + KvdoWorkQueue *queue = container_of(kobj, KvdoWorkQueue, kobj); + FREE(queue->name); + if (queue->roundRobinMode) { + FREE(asRoundRobinWorkQueue(queue)); + } else { + FREE(asSimpleWorkQueue(queue)); + } +} + +/**********************************************************************/ +struct kobj_type simpleWorkQueueKobjType = { + .default_attrs = simpleWorkQueueAttrs, + .release = workQueueRelease, + .sysfs_ops = &workQueueSysfsOps, +}; + +/**********************************************************************/ +struct kobj_type roundRobinWorkQueueKobjType = { + .default_attrs = roundRobinWorkQueueAttrs, + .release = workQueueRelease, + .sysfs_ops = &workQueueSysfsOps, +}; diff --git a/vdo/kernel/workQueueSysfs.h b/vdo/kernel/workQueueSysfs.h new file mode 100644 index 0000000..41f6af5 --- /dev/null +++ b/vdo/kernel/workQueueSysfs.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueSysfs.h#1 $ + */ + +#ifndef WORK_QUEUE_SYSFS_H +#define WORK_QUEUE_SYSFS_H + +#include + +extern struct kobj_type roundRobinWorkQueueKobjType; +extern struct kobj_type simpleWorkQueueKobjType; + +#endif // WORK_QUEUE_SYSFS_H
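The kobj_type definitions above only take effect once a queue's embedded kobject is registered with sysfs; that registration happens elsewhere in the module (workQueue.c, not part of this patch). The sketch below is a minimal illustration of how such a registration could look, not the module's actual code: the function name registerWorkQueueSysfsNode and the parentKobject argument are assumptions made here for the example.

/*
 * Illustrative sketch only: hooking a simple work queue's kobject up to
 * simpleWorkQueueKobjType so that the "name", "pid", "times", "type", and
 * "work_functions" attributes from workQueueSysfs.c appear in sysfs under
 * the (hypothetical) parent directory.
 */
#include <linux/kobject.h>

#include "workQueueInternals.h"
#include "workQueueSysfs.h"

static int registerWorkQueueSysfsNode(SimpleWorkQueue *queue,
                                      struct kobject  *parentKobject)
{
  kobject_init(&queue->common.kobj, &simpleWorkQueueKobjType);
  int result = kobject_add(&queue->common.kobj, parentKobject, "%s",
                           queue->common.name);
  if (result != 0) {
    // Dropping the reference on failure also triggers workQueueRelease(),
    // which frees the queue's name and the queue structure itself.
    kobject_put(&queue->common.kobj);
  }
  return result;
}

Once registered, reading the "times" attribute invokes timesShow() and thus formatRunTimeStats(), while reading "work_functions" reports the per-work-function counters via formatWorkItemStats().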